In [None]:
pip install transformers pdf_annotate pymupdf


Collecting pdf_annotate
  Downloading pdf_annotate-0.12.0-py3-none-any.whl.metadata (721 bytes)
Collecting pymupdf
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting pdfrw>=0.4 (from pdf_annotate)
  Downloading pdfrw-0.4-py2.py3-none-any.whl.metadata (32 kB)
Downloading pdf_annotate-0.12.0-py3-none-any.whl (198 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.0/199.0 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfrw-0.4-py2.py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.5/69.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pdfrw, pymupdf, pdf_annotate
Successfully installed pdf_annotate-0.1

In [None]:
# Mount Google Drive at /content/drive; the model folders loaded later in this
# notebook are read from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import fitz  # PyMuPDF for text extraction and clause location
from pdf_annotate import PdfAnnotator, Location, Appearance
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load NER and summarization models
def load_models():
    """Load the clause classifier and the T5 summarizer from Google Drive.

    Despite the "NER" naming used in this notebook, the first checkpoint is
    loaded as a sequence-classification model and wrapped in a
    ``text-classification`` pipeline.

    Returns:
        A ``(classifier_pipeline, t5_model, t5_tokenizer)`` tuple.
    """
    classifier_dir = "/content/drive/MyDrive/final_model_folder"
    t5_dir = "/content/drive/MyDrive/t5_summarizer_finetuned-20250121T101029Z-001/t5_summarizer_finetuned"

    # Clause classifier: tokenizer and weights live in the same folder.
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_dir)
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_dir)

    # Summarizer: fine-tuned T5 checkpoint.
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_dir)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_dir)

    classifier_pipeline = pipeline(
        "text-classification",
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
    return classifier_pipeline, t5_model, t5_tokenizer

# Extract text and coordinates from PDF
def extract_pdf_text_with_coords(pdf_path):
    """Collect the text blocks of a PDF with their bounding box and page index.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of dicts, in page order, with keys ``"text"`` (the block text),
        ``"bbox"`` (the first four items of the PyMuPDF block tuple, i.e.
        ``(x0, y0, x1, y1)``) and ``"page"`` (0-based page index). Image
        blocks and blocks containing only whitespace are left out.
    """
    text_data = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as document:
        for page_num, page in enumerate(document):
            for block in page.get_text("blocks"):
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
                # block_type 0 is text, anything else (images) carries no
                # clause text worth classifying.
                if block[6] != 0 or not block[4].strip():
                    continue
                text_data.append({"text": block[4], "bbox": block[:4], "page": page_num})
    return text_data

# Highlight clauses in the PDF
def highlight_pdf(input_pdf, output_pdf, highlighted_clauses):
    """Draw a yellow rectangle around each selected block and save a copy.

    Args:
        input_pdf: Path of the PDF to annotate.
        output_pdf: Path the annotated PDF is written to.
        highlighted_clauses: Iterable of ``(clause_text, bbox, page_num)``
            tuples, where ``bbox`` is ``(x0, y0, x1, y1)`` as produced by
            PyMuPDF and ``page_num`` is the 0-based page index.
    """
    # PyMuPDF measures y downwards from the top of the page, while
    # pdf_annotate places annotations in PDF user space (y upwards from the
    # bottom), so each box has to be flipped using the page height.
    with fitz.open(input_pdf) as document:
        page_heights = [page.rect.height for page in document]

    annotator = PdfAnnotator(input_pdf)
    for _clause, bbox, page_num in highlighted_clauses:
        left, top, right, bottom = bbox
        page_height = page_heights[page_num]
        annotator.add_annotation(
            "square",
            Location(
                x1=left,
                y1=page_height - bottom,
                x2=right,
                y2=page_height - top,
                page=page_num,
            ),
            Appearance(stroke_color=(1, 1, 0), stroke_width=2),
        )
    annotator.write(output_pdf)

# Pre-process clauses (enhanced for context retention)
def preprocess_clause(clause):
    """Drop the square brackets around template sections and tidy whitespace.

    Args:
        clause: Raw clause text, possibly spanning several lines.

    Returns:
        The clause with each ``[...]`` section replaced by its content and
        every run of whitespace collapsed to a single space.
    """
    # DOTALL lets a bracketed section span line breaks; without it a section
    # wrapped over two lines of the PDF kept its brackets.
    clause = re.sub(
        r'\[.*?\]',
        lambda match: match.group(0).strip('[]'),
        clause,
        flags=re.DOTALL,
    )
    return re.sub(r'\s+', ' ', clause.strip())

# Summarize clauses using the fine-tuned T5 model with anti-repeat settings
def summarize_clauses(clauses, model, tokenizer):
    """Generate one summary per clause with the supplied seq2seq model.

    Each clause is cleaned with ``preprocess_clause``, prefixed with
    ``"summarize: "``, encoded with truncation and decoded from the first
    generated sequence.

    Args:
        clauses: Iterable of clause strings.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.

    Returns:
        A list of summary strings, one per clause, in input order.
    """
    # Beam search with strong repetition penalties, shared by every clause.
    generation_settings = dict(
        max_length=70,
        min_length=20,
        length_penalty=2.0,
        repetition_penalty=5.0,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True,
    )

    def _summarize(clause):
        prompt = "summarize: " + preprocess_clause(clause)
        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True)
        generated = model.generate(input_ids, **generation_settings)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(clause) for clause in clauses]

def postprocess_summary(summary):
    """Tidy a generated summary into readable, period-terminated text.

    Args:
        summary: Raw text decoded from the summarizer.

    Returns:
        The cleaned summary. An empty or whitespace-only input yields ``""``.
    """
    # Remove stand-alone list markers ("a." .. "e."). The marker must be
    # delimited by whitespace or the string edges so that word endings such as
    # "area." or "site." are left intact.
    summary = re.sub(r'(?<!\S)[a-e]\.(?!\S)\s?', '', summary)
    # Semicolons separated list items; turn them into sentence breaks.
    summary = re.sub(r'[;]+', '.', summary)
    # Delete these literal phrases wherever they appear (case-insensitive).
    summary = re.sub(r'\b(u\.s\.|france|other countries)\b', '', summary, flags=re.IGNORECASE)

    # Repair word pairs that come out fused in the generated text.
    summary = re.sub(r'noto:', 'not to:', summary)
    summary = re.sub(r'inserthose', 'insert those', summary)

    # Collapse repeated dots and runs of whitespace.
    summary = re.sub(r'\.\.+', '.', summary)
    summary = re.sub(r'\s+', ' ', summary.strip())

    # Insert the missing space after a period/comma only when a letter follows,
    # so the final period gets no trailing space and numbers such as "1,000"
    # stay intact.
    summary = re.sub(r'([.,])(?=[^\W\d_])', r'\1 ', summary)

    if not summary:
        return summary

    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest and destroy proper nouns.
    summary = summary[0].upper() + summary[1:]

    # Ensure lowercase for a trailing 'Other.' placeholder.
    if summary.endswith('Other.'):
        summary = summary[:-len('Other.')] + 'other.'

    if not summary.endswith('.'):
        summary += '.'

    return summary
# Validate Non-Compete Clauses
def validate_non_compete_clause(clause):
    """Return True when the clause contains any non-compete indicator phrase.

    Matching is a case-insensitive substring test against a fixed phrase list.
    """
    lowered = clause.lower()
    indicators = ("compete", "restrict", "solicit", "goods or services", "employment termination")
    for indicator in indicators:
        if indicator in lowered:
            return True
    return False

# Main pipeline function
def process_pdf(input_pdf, output_pdf, target_label="LABEL_2", min_confidence=0.85):
    """Classify the text blocks of a PDF, highlight accepted ones and summarize them.

    A block is accepted when the classifier's top prediction is
    ``target_label`` with a score above ``min_confidence`` and the block also
    passes ``validate_non_compete_clause``. The models are loaded on every call.

    Args:
        input_pdf: Path of the PDF to analyse.
        output_pdf: Path the annotated copy is written to.
        target_label: Classifier label that marks a candidate clause.
        min_confidence: Score the top prediction must exceed.

    Returns:
        ``(extracted_clauses, cleaned_summaries)``: the accepted block texts
        and their post-processed summaries, in document order.
    """
    ner_pipeline, summarizer_model, summarizer_tokenizer = load_models()

    text_data = extract_pdf_text_with_coords(input_pdf)

    extracted_clauses = []
    highlighted_clauses = []
    for data in text_data:
        text = data["text"]
        if not text.strip():
            # Nothing to classify in a blank block.
            continue

        # Truncate to the tokenizer's maximum length so a very long block
        # cannot overflow the encoder.
        top_prediction = ner_pipeline(text, truncation=True)[0]
        if top_prediction["label"] != target_label or top_prediction["score"] <= min_confidence:
            continue

        if validate_non_compete_clause(text):
            extracted_clauses.append(text)
            highlighted_clauses.append((text, data["bbox"], data["page"]))

    highlight_pdf(input_pdf, output_pdf, highlighted_clauses)

    summaries = summarize_clauses(extracted_clauses, summarizer_model, summarizer_tokenizer)
    cleaned_summaries = [postprocess_summary(summary) for summary in summaries]

    return extracted_clauses, cleaned_summaries

# Example usage: annotate one contract and print what was extracted.
input_pdf_path = "/content/contract3.pdf"
output_pdf_path = "highlighted_output.pdf"

extracted_clauses, cleaned_summaries = process_pdf(input_pdf_path, output_pdf_path)

# One item per line under each heading.
print("Extracted Clauses:", *extracted_clauses, sep="\n")
print("\nCleaned Summaries:", *cleaned_summaries, sep="\n")

Device set to use cuda:0


Extracted Clauses:
[MONTHS] months following the voluntary of involuntary termination of Employee’s 
employment], not to: [INSERT THOSE THAT APPLY] 
a. [Provide goods or services which directly or indirectly compete with Company]; 
b. [Invest either directly or indirectly in a business that directly or indirectly competes with 

Company];  
c. [Solicit Company employees to leave their employment]; 
d. [Engage in any other activities that result in injury to Company]; 
e. [Other].  


Cleaned Summaries:
Months months after involuntary termination of employee’s employment, not to: insert those that apply. provide goods or services which directly or indirectly compete with company. .
Solicit company employees to leave their employment. engage in any other activities that result in injury to company. other. .


In [None]:
import re
import fitz  # PyMuPDF for text extraction and clause location
from pdf_annotate import PdfAnnotator, Location, Appearance
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load NER and summarization models
def load_models():
    """Load the clause classifier and the T5 summarizer from Google Drive.

    Despite the "NER" naming used in this notebook, the first checkpoint is
    loaded as a sequence-classification model and wrapped in a
    ``text-classification`` pipeline.

    Returns:
        A ``(classifier_pipeline, t5_model, t5_tokenizer)`` tuple.
    """
    classifier_dir = "/content/drive/MyDrive/final_model_folder"
    t5_dir = "/content/drive/MyDrive/t5_summarizer_finetuned-20250121T101029Z-001/t5_summarizer_finetuned"

    # Clause classifier: tokenizer and weights live in the same folder.
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_dir)
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_dir)

    # Summarizer: fine-tuned T5 checkpoint.
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_dir)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_dir)

    classifier_pipeline = pipeline(
        "text-classification",
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
    return classifier_pipeline, t5_model, t5_tokenizer

# Extract text and coordinates from PDF
def extract_pdf_text_with_coords(pdf_path):
    """Collect the text blocks of a PDF with their bounding box and page index.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of dicts, in page order, with keys ``"text"`` (the block text),
        ``"bbox"`` (the first four items of the PyMuPDF block tuple, i.e.
        ``(x0, y0, x1, y1)``) and ``"page"`` (0-based page index). Image
        blocks and blocks containing only whitespace are left out.
    """
    text_data = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as document:
        for page_num, page in enumerate(document):
            for block in page.get_text("blocks"):
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
                # block_type 0 is text, anything else (images) carries no
                # clause text worth classifying.
                if block[6] != 0 or not block[4].strip():
                    continue
                text_data.append({"text": block[4], "bbox": block[:4], "page": page_num})
    return text_data

# Highlight clauses in the PDF
def highlight_pdf(input_pdf, output_pdf, clauses):
    """Outline every occurrence of each clause in yellow and save a copy.

    Occurrences are located with PyMuPDF's text search and drawn with
    pdf_annotate.

    Args:
        input_pdf: Path of the PDF to annotate.
        output_pdf: Path the annotated PDF is written to.
        clauses: Iterable of clause strings to search for on every page.
    """
    annotator = PdfAnnotator(input_pdf)

    # The context manager closes the search document even if annotating raises.
    with fitz.open(input_pdf) as doc:
        for clause in clauses:
            for page_num, page in enumerate(doc):
                # PyMuPDF measures y downwards from the top of the page, while
                # pdf_annotate places annotations in PDF user space (y upwards
                # from the bottom), so each hit is flipped with the page height.
                page_height = page.rect.height
                for inst in page.search_for(clause):
                    annotator.add_annotation(
                        "square",
                        Location(
                            x1=inst[0],
                            y1=page_height - inst[3],
                            x2=inst[2],
                            y2=page_height - inst[1],
                            page=page_num,
                        ),
                        Appearance(stroke_color=(1, 1, 0), stroke_width=2),
                    )

    annotator.write(output_pdf)

# Pre-process clauses (enhanced for context retention)
def preprocess_clause(clause):
    """Drop the square brackets around template sections and tidy whitespace.

    Args:
        clause: Raw clause text, possibly spanning several lines.

    Returns:
        The clause with each ``[...]`` section replaced by its content and
        every run of whitespace collapsed to a single space.
    """
    # DOTALL lets a bracketed section span line breaks; without it a section
    # wrapped over two lines of the PDF kept its brackets.
    clause = re.sub(
        r'\[.*?\]',
        lambda match: match.group(0).strip('[]'),
        clause,
        flags=re.DOTALL,
    )
    return re.sub(r'\s+', ' ', clause.strip())

# Summarize clauses using the fine-tuned T5 model with anti-repeat settings
def summarize_clauses(clauses, model, tokenizer):
    """Generate one summary per clause with the supplied seq2seq model.

    Each clause is cleaned with ``preprocess_clause``, prefixed with
    ``"summarize: "``, encoded with truncation and decoded from the first
    generated sequence.

    Args:
        clauses: Iterable of clause strings.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.

    Returns:
        A list of summary strings, one per clause, in input order.
    """
    # Beam search with strong repetition penalties, shared by every clause.
    generation_settings = dict(
        max_length=70,
        min_length=20,
        length_penalty=2.0,
        repetition_penalty=5.0,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True,
    )

    def _summarize(clause):
        prompt = "summarize: " + preprocess_clause(clause)
        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True)
        generated = model.generate(input_ids, **generation_settings)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(clause) for clause in clauses]

def postprocess_summary(summary):
    """Tidy a generated summary into readable, period-terminated text.

    Args:
        summary: Raw text decoded from the summarizer.

    Returns:
        The cleaned summary. An empty or whitespace-only input yields ``""``.
    """
    # Remove stand-alone list markers ("a." .. "e."). The marker must be
    # delimited by whitespace or the string edges so that word endings such as
    # "area." or "site." are left intact.
    summary = re.sub(r'(?<!\S)[a-e]\.(?!\S)\s?', '', summary)
    # Semicolons separated list items; turn them into sentence breaks.
    summary = re.sub(r'[;]+', '.', summary)
    # Delete these literal phrases wherever they appear (case-insensitive).
    summary = re.sub(r'\b(u\.s\.|france|other countries)\b', '', summary, flags=re.IGNORECASE)

    # Repair word pairs that come out fused in the generated text.
    summary = re.sub(r'noto:', 'not to:', summary)
    summary = re.sub(r'inserthose', 'insert those', summary)

    # Collapse repeated dots and runs of whitespace.
    summary = re.sub(r'\.\.+', '.', summary)
    summary = re.sub(r'\s+', ' ', summary.strip())

    # Insert the missing space after a period/comma only when a letter follows,
    # so the final period gets no trailing space and numbers such as "1,000"
    # stay intact.
    summary = re.sub(r'([.,])(?=[^\W\d_])', r'\1 ', summary)

    if not summary:
        return summary

    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest and destroy proper nouns.
    summary = summary[0].upper() + summary[1:]

    # Ensure lowercase for a trailing 'Other.' placeholder.
    if summary.endswith('Other.'):
        summary = summary[:-len('Other.')] + 'other.'

    if not summary.endswith('.'):
        summary += '.'

    return summary

# Validate Non-Compete Clauses
def validate_non_compete_clause(clause):
    """Return True when the clause contains any non-compete indicator phrase.

    Matching is a case-insensitive substring test against a fixed phrase list.
    """
    lowered = clause.lower()
    indicators = ("compete", "restrict", "solicit", "goods or services", "employment termination")
    for indicator in indicators:
        if indicator in lowered:
            return True
    return False

# Main pipeline function
def process_pdf(input_pdf, output_pdf, target_label="LABEL_2", min_confidence=0.85):
    """Classify the text blocks of a PDF, highlight accepted ones and summarize them.

    A block is accepted when the classifier's top prediction is
    ``target_label`` with a score above ``min_confidence`` and the block also
    passes ``validate_non_compete_clause``. The models are loaded on every call.

    Args:
        input_pdf: Path of the PDF to analyse.
        output_pdf: Path the annotated copy is written to.
        target_label: Classifier label that marks a candidate clause.
        min_confidence: Score the top prediction must exceed.

    Returns:
        ``(extracted_clauses, cleaned_summaries)``: the accepted block texts
        and their post-processed summaries, in document order.
    """
    ner_pipeline, summarizer_model, summarizer_tokenizer = load_models()

    text_data = extract_pdf_text_with_coords(input_pdf)

    extracted_clauses = []
    for data in text_data:
        text = data["text"]
        if not text.strip():
            # Nothing to classify in a blank block.
            continue

        # Truncate to the tokenizer's maximum length so a very long block
        # cannot overflow the encoder.
        top_prediction = ner_pipeline(text, truncation=True)[0]
        if top_prediction["label"] != target_label or top_prediction["score"] <= min_confidence:
            continue

        if validate_non_compete_clause(text):
            extracted_clauses.append(text)

    highlight_pdf(input_pdf, output_pdf, extracted_clauses)

    summaries = summarize_clauses(extracted_clauses, summarizer_model, summarizer_tokenizer)
    cleaned_summaries = [postprocess_summary(summary) for summary in summaries]

    return extracted_clauses, cleaned_summaries

# Example usage: annotate one contract and print what was extracted.
input_pdf_path = "/content/contract3.pdf"
output_pdf_path = "highlighted_output.pdf"

extracted_clauses, cleaned_summaries = process_pdf(input_pdf_path, output_pdf_path)

# One item per line under each heading.
print("Extracted Clauses:", *extracted_clauses, sep="\n")
print("\nCleaned Summaries:", *cleaned_summaries, sep="\n")

Device set to use cuda:0


Extracted Clauses:
[MONTHS] months following the voluntary of involuntary termination of Employee’s 
employment], not to: [INSERT THOSE THAT APPLY] 
a. [Provide goods or services which directly or indirectly compete with Company]; 
b. [Invest either directly or indirectly in a business that directly or indirectly competes with 

Company];  
c. [Solicit Company employees to leave their employment]; 
d. [Engage in any other activities that result in injury to Company]; 
e. [Other].  


Cleaned Summaries:
Months months after involuntary termination of employee’s employment, not to: insert those that apply. provide goods or services which directly or indirectly compete with company. .
Solicit company employees to leave their employment. engage in any other activities that result in injury to company. other. .


In [None]:
import re
import fitz  # PyMuPDF for text extraction and clause location
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load NER and summarization models
def load_models():
    """Load the clause classifier and the T5 summarizer from Google Drive.

    Despite the "NER" naming used in this notebook, the first checkpoint is
    loaded as a sequence-classification model and wrapped in a
    ``text-classification`` pipeline.

    Returns:
        A ``(classifier_pipeline, t5_model, t5_tokenizer)`` tuple.
    """
    classifier_dir = "/content/drive/MyDrive/final_model_folder"
    t5_dir = "/content/drive/MyDrive/t5_summarizer_finetuned-20250121T101029Z-001/t5_summarizer_finetuned"

    # Clause classifier: tokenizer and weights live in the same folder.
    classifier_tokenizer = AutoTokenizer.from_pretrained(classifier_dir)
    classifier_model = AutoModelForSequenceClassification.from_pretrained(classifier_dir)

    # Summarizer: fine-tuned T5 checkpoint.
    t5_tokenizer = T5Tokenizer.from_pretrained(t5_dir)
    t5_model = T5ForConditionalGeneration.from_pretrained(t5_dir)

    classifier_pipeline = pipeline(
        "text-classification",
        model=classifier_model,
        tokenizer=classifier_tokenizer,
    )
    return classifier_pipeline, t5_model, t5_tokenizer

# Extract text and coordinates from PDF
def extract_pdf_text_with_coords(pdf_path):
    """Collect the text blocks of a PDF with their bounding box and page index.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        A list of dicts, in page order, with keys ``"text"`` (the block text),
        ``"bbox"`` (the first four items of the PyMuPDF block tuple, i.e.
        ``(x0, y0, x1, y1)``) and ``"page"`` (0-based page index). Image
        blocks and blocks containing only whitespace are left out.
    """
    text_data = []
    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as document:
        for page_num, page in enumerate(document):
            for block in page.get_text("blocks"):
                # Block tuples are (x0, y0, x1, y1, text, block_no, block_type);
                # block_type 0 is text, anything else (images) carries no
                # clause text worth classifying.
                if block[6] != 0 or not block[4].strip():
                    continue
                text_data.append({"text": block[4], "bbox": block[:4], "page": page_num})
    return text_data

# Highlight clauses in the PDF
def highlight_pdf(input_pdf, output_pdf, clauses):
    """Highlight each clause in yellow with PyMuPDF and save the result.

    For every clause the whole text is searched on all pages first. Only when
    no page contains it is the clause broken into overlapping three-word
    phrases that are searched instead.

    Args:
        input_pdf: Path of the PDF to annotate.
        output_pdf: Path the annotated PDF is written to.
        clauses: Iterable of clause strings as extracted from the PDF.
    """
    window = 3  # words per fallback phrase

    # The context manager closes the document even if annotating raises.
    with fitz.open(input_pdf) as doc:
        for clause in clauses:
            # Only whitespace is normalised: brackets and punctuation are part
            # of the page text, so removing them would defeat the exact search.
            words = clause.split()
            if not words:
                continue
            clause_text = " ".join(words)

            # Pass 1: exact occurrences anywhere in the document.
            hits = [
                (page_num, rect)
                for page_num, page in enumerate(doc)
                for rect in page.search_for(clause_text)
            ]

            # Pass 2: partial matches, used only when the clause was found on
            # no page at all, so pages that merely lack the clause are not
            # littered with fragment highlights.
            if not hits:
                # len(words) - window + 1 windows cover the clause up to its
                # last word; shorter clauses yield one phrase of all words.
                window_count = max(len(words) - window + 1, 1)
                phrases = list(dict.fromkeys(
                    " ".join(words[start:start + window]) for start in range(window_count)
                ))
                hits = [
                    (page_num, rect)
                    for page_num, page in enumerate(doc)
                    for phrase in phrases
                    for rect in page.search_for(phrase)
                ]

            for page_num, rect in hits:
                highlight = doc[page_num].add_highlight_annot(rect)
                # Highlight annotations take a stroke colour only.
                highlight.set_colors(stroke=(1, 1, 0))  # Yellow color
                highlight.update()

        doc.save(output_pdf)  # Save the annotated PDF

# Pre-process clauses (enhanced for context retention)
def preprocess_clause(clause):
    """Drop the square brackets around template sections and tidy whitespace.

    Args:
        clause: Raw clause text, possibly spanning several lines.

    Returns:
        The clause with each ``[...]`` section replaced by its content and
        every run of whitespace collapsed to a single space.
    """
    # DOTALL lets a bracketed section span line breaks; without it a section
    # wrapped over two lines of the PDF kept its brackets.
    clause = re.sub(
        r'\[.*?\]',
        lambda match: match.group(0).strip('[]'),
        clause,
        flags=re.DOTALL,
    )
    return re.sub(r'\s+', ' ', clause.strip())

# Summarize clauses using the fine-tuned T5 model with anti-repeat settings
def summarize_clauses(clauses, model, tokenizer):
    """Generate one summary per clause with the supplied seq2seq model.

    Each clause is cleaned with ``preprocess_clause``, prefixed with
    ``"summarize: "``, encoded with truncation and decoded from the first
    generated sequence.

    Args:
        clauses: Iterable of clause strings.
        model: Object exposing ``generate``.
        tokenizer: Object exposing ``encode`` and ``decode``.

    Returns:
        A list of summary strings, one per clause, in input order.
    """
    # Beam search with strong repetition penalties, shared by every clause.
    generation_settings = dict(
        max_length=70,
        min_length=20,
        length_penalty=2.0,
        repetition_penalty=5.0,
        no_repeat_ngram_size=3,
        num_beams=5,
        early_stopping=True,
    )

    def _summarize(clause):
        prompt = "summarize: " + preprocess_clause(clause)
        input_ids = tokenizer.encode(prompt, return_tensors="pt", truncation=True)
        generated = model.generate(input_ids, **generation_settings)
        return tokenizer.decode(generated[0], skip_special_tokens=True)

    return [_summarize(clause) for clause in clauses]

def postprocess_summary(summary):
    """Tidy a generated summary into readable, period-terminated text.

    Args:
        summary: Raw text decoded from the summarizer.

    Returns:
        The cleaned summary. An empty or whitespace-only input yields ``""``.
    """
    # Remove stand-alone list markers ("a." .. "e."). The marker must be
    # delimited by whitespace or the string edges so that word endings such as
    # "area." or "site." are left intact.
    summary = re.sub(r'(?<!\S)[a-e]\.(?!\S)\s?', '', summary)
    # Semicolons separated list items; turn them into sentence breaks.
    summary = re.sub(r'[;]+', '.', summary)
    # Delete these literal phrases wherever they appear (case-insensitive).
    summary = re.sub(r'\b(u\.s\.|france|other countries)\b', '', summary, flags=re.IGNORECASE)

    # Repair word pairs that come out fused in the generated text.
    summary = re.sub(r'noto:', 'not to:', summary)
    summary = re.sub(r'inserthose', 'insert those', summary)

    # Collapse repeated dots and runs of whitespace.
    summary = re.sub(r'\.\.+', '.', summary)
    summary = re.sub(r'\s+', ' ', summary.strip())

    # Insert the missing space after a period/comma only when a letter follows,
    # so the final period gets no trailing space and numbers such as "1,000"
    # stay intact.
    summary = re.sub(r'([.,])(?=[^\W\d_])', r'\1 ', summary)

    if not summary:
        return summary

    # Upper-case only the first character; str.capitalize() would also
    # lower-case the rest and destroy proper nouns.
    summary = summary[0].upper() + summary[1:]

    # Ensure lowercase for a trailing 'Other.' placeholder.
    if summary.endswith('Other.'):
        summary = summary[:-len('Other.')] + 'other.'

    if not summary.endswith('.'):
        summary += '.'

    return summary

# Validate Non-Compete Clauses
def validate_non_compete_clause(clause):
    """Return True when the clause contains any non-compete indicator phrase.

    Matching is a case-insensitive substring test against a fixed phrase list.
    """
    lowered = clause.lower()
    indicators = ("compete", "restrict", "solicit", "goods or services", "employment termination")
    for indicator in indicators:
        if indicator in lowered:
            return True
    return False

# Main pipeline function
def process_pdf(input_pdf, output_pdf, target_label="LABEL_2", min_confidence=0.85):
    """Classify the text blocks of a PDF, highlight accepted ones and summarize them.

    A block is accepted when the classifier's top prediction is
    ``target_label`` with a score above ``min_confidence`` and the block also
    passes ``validate_non_compete_clause``. The models are loaded on every call.

    Args:
        input_pdf: Path of the PDF to analyse.
        output_pdf: Path the annotated copy is written to.
        target_label: Classifier label that marks a candidate clause.
        min_confidence: Score the top prediction must exceed.

    Returns:
        ``(extracted_clauses, cleaned_summaries)``: the accepted block texts
        and their post-processed summaries, in document order.
    """
    ner_pipeline, summarizer_model, summarizer_tokenizer = load_models()

    text_data = extract_pdf_text_with_coords(input_pdf)

    extracted_clauses = []
    for data in text_data:
        text = data["text"]
        if not text.strip():
            # Nothing to classify in a blank block.
            continue

        # Truncate to the tokenizer's maximum length so a very long block
        # cannot overflow the encoder.
        top_prediction = ner_pipeline(text, truncation=True)[0]
        if top_prediction["label"] != target_label or top_prediction["score"] <= min_confidence:
            continue

        if validate_non_compete_clause(text):
            extracted_clauses.append(text)

    highlight_pdf(input_pdf, output_pdf, extracted_clauses)

    summaries = summarize_clauses(extracted_clauses, summarizer_model, summarizer_tokenizer)
    cleaned_summaries = [postprocess_summary(summary) for summary in summaries]

    return extracted_clauses, cleaned_summaries

# Example usage: annotate one contract and print what was extracted.
input_pdf_path = "/content/This Employment Agreement.pdf"
output_pdf_path = "highlighted_output.pdf"

extracted_clauses, cleaned_summaries = process_pdf(input_pdf_path, output_pdf_path)

# One item per line under each heading.
print("Extracted Clauses:", *extracted_clauses, sep="\n")
print("\nCleaned Summaries:", *cleaned_summaries, sep="\n")


Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Extracted Clauses:
any company that competes with the business of the Company in 
the geographic area of [Geographic Scope]. 

The Employee agrees not to solicit, directly or indirectly, any 
customers, clients, or employees of the Company for the 
purpose of diverting business away from the Company, both 
during the term of employment and for a period of [Time 
Period] following the termination of employment. 


Cleaned Summaries:
A company that competes with the business of geographic scope in geographic areit is not affiliated with other companies, such as those listed on this sit.
The employee agrees not to solicit any customers, clients or employees of the company for the purpose of diverting business away from the company. .
