In [11]:
import argparse
import pdfplumber
import spacy
import re
import subprocess
import json

# Extract text
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF, handed straight to ``pdfplumber.open``.

    Returns:
        The per-page texts joined with newlines. A page whose
        ``extract_text()`` result is falsy contributes an empty string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Pages are separated by a newline: concatenating them directly would
        # fuse the last word of one page with the first word of the next and
        # corrupt entity detection across page boundaries.
        return "\n".join(page.extract_text() or "" for page in pdf.pages)

# NER with spaCy and custom rules
# The "en_core_web_sm" pipeline is loaded once, at module level, and the
# resulting `nlp` object is shared by every detect_entities() call below.
nlp = spacy.load("en_core_web_sm")

def detect_entities(text):
    """Run the shared spaCy pipeline over ``text`` and collect its entities.

    Returns:
        A list of ``(entity_text, entity_label)`` tuples, in the order the
        pipeline reports them in ``doc.ents``.
    """
    found = []
    for span in nlp(text).ents:
        found.append((span.text, span.label_))
    return found

def add_custom_rules(text):
    """Find Aadhaar and PAN identifiers in ``text`` using regular expressions.

    Args:
        text: The string to scan.

    Returns:
        A list of ``(matched_text, label)`` tuples, where ``label`` is
        ``"AADHAAR"`` or ``"PAN"``. All Aadhaar matches come first, then all
        PAN matches, each group in order of appearance.
    """
    patterns = {
        # 12 digits, either contiguous or in 4-4-4 groups separated by a
        # single space or hyphen (e.g. "1234 5678 9012"). Every string the
        # contiguous-only form matched is still matched; the grouped form is
        # deliberately permissive, because for an anonymizer a missed ID is
        # worse than an over-eager match.
        "AADHAAR": r"\b\d{4}[ -]?\d{4}[ -]?\d{4}\b",
        # 5 uppercase letters, 4 digits, 1 uppercase letter.
        "PAN": r"\b[A-Z]{5}[0-9]{4}[A-Z]\b",
    }
    custom_entities = []
    for label, pattern in patterns.items():
        custom_entities.extend(
            (match.group(), label) for match in re.finditer(pattern, text)
        )
    return custom_entities

def get_all_entities(text):
    """Combine model-detected and rule-detected entities for ``text``.

    Returns:
        The list from ``detect_entities(text)`` followed by the list from
        ``add_custom_rules(text)``.
    """
    model_entities = detect_entities(text)
    rule_entities = add_custom_rules(text)
    return model_entities + rule_entities

# LLaMA via Ollama
def generate_pseudonym(prompt="Generate a fake name: "):
    """Ask the local ``llama3:8b`` model (via the Ollama CLI) for a pseudonym.

    Args:
        prompt: Text passed to ``ollama run llama3:8b`` as the prompt.

    Returns:
        The model's stripped standard output, or ``"Fake Name"`` when the
        ``ollama`` command cannot be started, exits with a non-zero status,
        or prints nothing.
    """
    fallback = "Fake Name"
    try:
        # Argument list (no shell), so the prompt is never shell-interpreted.
        result = subprocess.run(
            ["ollama", "run", "llama3:8b", prompt],
            capture_output=True,
            text=True,
        )
    except OSError:
        # Covers a missing or non-executable `ollama` binary; without this the
        # whole anonymization run would abort instead of using the fallback.
        return fallback

    if result.returncode != 0:
        # A failed run may leave partial output on stdout; do not use it.
        return fallback

    generated_text = result.stdout.strip()
    return generated_text or fallback

# Anonymization
def anonymize_text(text, entities, method="redact"):
    """Replace detected entities in ``text`` according to ``method``.

    Args:
        text: The document text.
        entities: Iterable of ``(entity_text, label)`` pairs.
        method: One of
            ``"redact"`` - PERSON, AADHAAR, PAN, ORG and GPE entities become
            ``"[REDACTED]"``;
            ``"mask"`` - AADHAAR keeps its last 4 characters and PAN its last
            5, the rest is replaced by ``X`` characters;
            ``"pseudonymize"`` - PERSON entities are replaced by the result
            of ``generate_pseudonym()``.
            Any other value leaves the text unchanged.

    Returns:
        The text with every occurrence of each applicable entity replaced.
    """
    redact_labels = {"PERSON", "AADHAAR", "PAN", "ORG", "GPE"}

    # De-duplicate (keeping first-seen order) and drop empty entity strings:
    # str.replace("", x) would insert x between every character of the text.
    unique_entities = dict.fromkeys(
        (entity, label) for entity, label in entities if entity
    )
    # Longest first, so "John" cannot break up "John Smith" before the full
    # name has been replaced. sorted() is stable, so ties keep input order.
    ordered = sorted(unique_entities, key=lambda pair: len(pair[0]), reverse=True)

    anonymized_text = text
    for entity, label in ordered:
        if entity not in anonymized_text:
            # Nothing left to replace (e.g. consumed by a longer entity);
            # skipping here also avoids a pointless pseudonym generation.
            continue

        replacement = None
        if method == "redact":
            if label in redact_labels:
                replacement = "[REDACTED]"
        elif method == "mask":
            if label == "AADHAAR":
                replacement = "XXXX-XXXX-" + entity[-4:]
            elif label == "PAN":
                replacement = "XXXXX" + entity[-5:]
        elif method == "pseudonymize":
            if label == "PERSON":
                replacement = generate_pseudonym()

        if replacement is not None:
            anonymized_text = anonymized_text.replace(entity, replacement)
    return anonymized_text

# Main function
def main(input_file, output_file, method):
    """Anonymize a PDF and write the result to a text file.

    Args:
        input_file: Path of the PDF to read.
        output_file: Path of the text file to write (overwritten if present).
        method: Anonymization method forwarded to ``anonymize_text``.
    """
    text = extract_text_from_pdf(input_file)
    entities = get_all_entities(text)
    anonymized_text = anonymize_text(text, entities, method=method)
    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent characters extracted from the PDF and would then raise
    # UnicodeEncodeError part-way through the write.
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(anonymized_text)
    print(f"Anonymized document saved to {output_file}")

if __name__ == "__main__":
    # Command-line entry point. parse_args() reads the process arguments, so
    # when this cell runs inside a Jupyter kernel the kernel's own argv is
    # parsed and argparse exits with "the following arguments are required";
    # in a notebook, call main(input_file, output_file, method) directly.
    cli = argparse.ArgumentParser(description="Anonymize legal documents")
    cli.add_argument("input_file", help="Path to input PDF")
    cli.add_argument("output_file", help="Path to output text file")
    cli.add_argument(
        "--method",
        choices=["redact", "mask", "pseudonymize"],
        default="redact",
        help="Anonymization method",
    )
    options = cli.parse_args()

    main(options.input_file, options.output_file, options.method)

usage: ipykernel_launcher.py [-h] [--method {redact,mask,pseudonymize}]
                             input_file output_file
ipykernel_launcher.py: error: the following arguments are required: input_file, output_file


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
