In [1]:
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import re
import spacy

# Load pre-trained transformer model for summarization
tokenizer = AutoTokenizer.from_pretrained("t5-base")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
summarization_pipeline = pipeline("summarization", model=model, tokenizer=tokenizer)

# Load spaCy for PII detection and redaction
nlp = spacy.load("en_core_web_sm")

def redact_pii(text):
    # Function to redact PII (Personal Identifiable Information) from text
    doc = nlp(text)
    redacted_text = text
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "NORP", "ORG", "GPE", "DATE", "TIME", "CARDINAL", "QUANTITY"]:
            redacted_text = redacted_text.replace(ent.text, "<REDACTED>")
    return redacted_text

def summarize_and_redact(text):
    # Summarize the text
    summarized_text = summarization_pipeline(text, max_length=5000, min_length=30, do_sample=False)[0]['summary_text']
    # Redact PII from the summarized text
    redacted_summarized_text = redact_pii(summarized_text)
    return redacted_summarized_text

# Example usage
input_text = """

"""

redacted_summary = summarize_and_redact(input_text)
print(redacted_summary)


2024-04-26 20:12:25.113802: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  from pandas.core import (
  torch.utils._pytree._register_pytree_node(
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Token indices sequence length is longer than the specified maximum sequence length for this model (834 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 5000,

quality control in the pharmaceutical industry aims to verify and test the medicine at various stages of production to ensure every product is of the highest quality . quality assurance is the process of making sure quality requirements have been fulfilled . type I borosilicate glass is used for laboratory glass apparatus, water for injection and for parenteral and non-parenteral use . Type III regular soda-lime glass is fairly resistant to attack by water .


In [4]:
pip install fitz

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting fitz
  Obtaining dependency information for fitz from https://files.pythonhosted.org/packages/7e/28/27f27d66eb82f24e6595deb26c0a875e62431878c416e38eac515023abb2/fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata
  Downloading fitz-0.0.1.dev2-py2.py3-none-any.whl.metadata (816 bytes)
Collecting configobj (from fitz)
  Obtaining dependency information for configobj from https://files.pythonhosted.org/packages/d3/bb/d10e531b297dd1d46f6b1fd11d018247af9f2d460037554bb7bb9011c6ac/configobj-5.0.8-py2.py3-none-any.whl.metadata
  Downloading configobj-5.0.8-py2.py3-none-any.whl.metadata (3.4 kB)
Collecting configparser (from fitz)
  Obtaining dependency information for configparser from https://files.pythonhoste

In [6]:
pip install frontend

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting frontend
  Obtaining dependency information for frontend from https://files.pythonhosted.org/packages/98/02/a9c4713b26ae11464ac334b59504ef6fe66e2c323ee44520ca1cd6dc4929/frontend-0.0.3-py3-none-any.whl.metadata
  Downloading frontend-0.0.3-py3-none-any.whl.metadata (847 bytes)
Collecting starlette>=0.12.0 (from frontend)
  Obtaining dependency information for starlette>=0.12.0 from https://files.pythonhosted.org/packages/fd/18/31fa32ed6c68ba66220204ef0be798c349d0a20c1901f9d4a794e08c76d8/starlette-0.37.2-py3-none-any.whl.metadata
  Downloading starlette-0.37.2-py3-none-any.whl.metadata (5.9 kB)
Collecting uvicorn>=0.7.1 (from frontend)
  Obtaining dependency information for uvicorn>=0.7.1 from https