Intelligent Document Processing System for Compliance and Security

In [3]:
# Import required libraries
pip install spacy pdfplumber scikit-learn transformers torch pandas

Collecting pdfplumber
  Downloading pdfplumber-0.11.5-py3-none-any.whl.metadata (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.5/42.5 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.2/48.2 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_c

In [5]:
import pdfplumber
import spacy
import re
import json
import random
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from transformers import pipeline as hf_pipeline

In [6]:
# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")

In [7]:
# Sample training data for text classification
training_data = [
    ("This contract is legally binding and pertains to financial obligations.", "Legal"),
    ("The patient was diagnosed with diabetes and prescribed medication.", "Medical"),
    ("Your bank account balance is $5,000 as of today.", "Finance"),
]


In [30]:
# Train a simple text classifier using TF-IDF and Naive Bayes
vectorizer = TfidfVectorizer()
classifier = MultinomialNB()
train_texts, train_labels = zip(*training_data)
text_clf_pipeline = Pipeline([("tfidf", vectorizer), ("clf", classifier)])
text_clf_pipeline.fit(train_texts, train_labels)


In [9]:
# Hugging Face model for section classification
hf_text_classification = hf_pipeline("zero-shot-classification")


No model was supplied, defaulted to facebook/bart-large-mnli and revision d7645e1 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu


In [31]:
# Categories for section classification
section_labels = ["Personal Information", "Financial Data", "Legal Information", "Medical Record"]

In [11]:
def extract_text_from_pdf(pdf_path):
    #Extracts text from a PDF file using pdfplumber.
    extracted_text = ""
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            extracted_text += page.extract_text() + "\n"
    return extracted_text.strip()


In [12]:
def detect_pii(text):
    #Detects PII using spaCy's Named Entity Recognition (NER).
    doc = nlp(text)
    pii_entities = []
    for ent in doc.ents:
        if ent.label_ in ["PERSON", "GPE", "DATE", "ORG", "MONEY", "CARDINAL"]:
            pii_entities.append((ent.text, ent.label_))
    return pii_entities

In [13]:
def redact_pii(text, pii_entities):
    #Redacts PII from text by replacing it with [REDACTED].
    for entity, _ in pii_entities:
        text = text.replace(entity, "[REDACTED]")
    return text


In [14]:
def classify_document(text):
    #Classifies document type (Legal, Medical, Finance).
    return text_clf_pipeline.predict([text])[0]


In [15]:
def classify_sections(text):
    #Classifies different sections of the document using NLP.
    sentences = text.split("\n")
    section_classification = {}

    for sentence in sentences:
        if sentence.strip():
            result = hf_text_classification(sentence, candidate_labels=section_labels)
            section_classification[sentence] = result["labels"][0]  # Take the top label

    return section_classification

In [28]:
def etl_pipeline(pdf_path):
    #Runs the full ETL pipeline on a given PDF document."""
    print("\n Starting ETL Pipeline...\n")

    # Step 1: Extract Text
    extracted_text = extract_text_from_pdf(pdf_path)
    print("Extracted Text")

    # Step 2: Detect PII
    pii_entities = detect_pii(extracted_text)
    print(f"Detected PII: {pii_entities}")

    # Step 3: Redact PII
    redacted_text = redact_pii(extracted_text, pii_entities)
    print("PII Redacted")

    # Step 4: Classify Document Type
    doc_type = classify_document(redacted_text)
    print(f"Document Classified as: {doc_type}")

    # Step 5: Classify Sections
    section_data = classify_sections(redacted_text)
    print("Sections Classified")

    # Step 6: Save results
    output_data = {
        "document_type": doc_type,
        "redacted_text": redacted_text,
        "section_classification": section_data,
        "pii_entities": pii_entities
    }

    output_file = "processed_document.json"
    with open(output_file, "w") as json_file:
        json.dump(output_data, json_file, indent=4)

    print(f"Processed document saved as {output_file}")

In [29]:
# Run the pipeline on a sample PDF
pdf_path = "AI_OnTheFrontLines_SMR.pdf"
etl_pipeline(pdf_path)


 Starting ETL Pipeline...

Extracted Text
Detected PII: [('Title 17', 'DATE'), ('201.14', 'CARDINAL'), ('the Code of Federal Regulations:\nhttps://www.law.cornell.edu/cfr/text/37/201.14\nThe', 'ORG'), ('the United States', 'GPE'), ('United States', 'ORG'), ('One', 'CARDINAL'), ('University of Texas', 'ORG'), ('Arlington Libraries’ Resource Delivery Team', 'ORG'), ('The University of Texas', 'ORG'), ('Arlington Libraries', 'ORG'), ('Arlington', 'GPE'), ('Texas', 'GPE'), ('817', 'CARDINAL'), ('library-ill@listserv.uta.edu\nTECHNOLOGY IMPLEMENTATION\nAI', 'ORG'), ('Front Lines', 'ORG'), ('AI', 'ORG'), ('KATHERINE C. KELLOGG', 'PERSON'), ('MARK SENDAK', 'PERSON'), ('Monday', 'DATE'), ('Aman', 'PERSON'), ('one', 'CARDINAL'), ('that day', 'DATE'), ('Duke University Hospital’s', 'ORG'), ('Aman', 'PERSON'), ('AI', 'ORG'), ('ICU', 'ORG'), ('AI', 'ORG'), ('Aman', 'PERSON'), ('AI', 'ORG'), ('ICU', 'ORG'), ('a year', 'DATE'), ('three weeks', 'DATE'), ('One', 'CARDINAL'), ('ER', 'ORG'), ('AI', 'OR