Install dependencies

In [90]:
!pip install PyMuPDF

Collecting PyMuPDF
  Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl.metadata (3.4 kB)
Using cached pymupdf-1.25.5-cp39-abi3-macosx_11_0_arm64.whl (18.6 MB)
Installing collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.5


In [71]:
import os
import fitz  # PyMuPDF
import csv

In [72]:
def SafeExtractText(file_path):
    """
    Safely extract text from a PDF using PyMuPDF (fitz).
    Returns None if the file is invalid, empty, or unreadable.
    """
    try:
        if not os.path.exists(file_path):
            print(f"File not found: {file_path}")
            return None, "File not found"
        if os.path.getsize(file_path) == 0:
            print(f"Empty file: {file_path}")
            return None, "Empty file"

        doc = fitz.open(file_path)
        text = " ".join(page.get_text() for page in doc)
        doc.close()

        if not text.strip():
            print(f"No text extracted: {file_path}")
            return None, "No text extracted"

        return text.strip(), None

    except fitz.FileDataError as e:  # Specific catch for PDF corruption-related issues
        print(f"Corrupted PDF or file data issue: {file_path}")
        return None, f"Corrupted PDF: {str(e)}"

    except fitz.PdfError as e:  # More general catch for PDF-specific errors
        print(f"PDF error in {file_path}: {str(e)}")
        return None, f"PDF error: {str(e)}"

    except Exception as e:
        print(f"Unexpected error in {file_path}: {e}")
        return None, f"Unexpected error: {e}"

def LoadValidDocuments(folder_path, log_file="skipped_files.csv"):
    """
    Load and extract text from all valid PDF files in the given folder.
    Logs failed files with reasons to a .csv file.
    Returns a list of texts and corresponding filenames.
    """
    documents = []
    filenames = []
    skipped = []

    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            full_path = os.path.join(folder_path, filename)
            print(f"Processing: {filename}")

            text, error_reason = SafeExtractText(full_path)

            if text:
                documents.append(text)
                filenames.append(filename)
            else:
                print(f"Skipping: {filename} ({error_reason})")
                skipped.append([filename, error_reason])

    # Write skipped files with reasons to .csv
    if skipped:
        with open(log_file, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["filename", "error_reason"])
            writer.writerows(skipped)
        print(f"Skipped files logged in: {log_file}")

    print(f"Successfully loaded {len(documents)} documents.\n")
    return documents, filenames

# === Example Usage ===
if __name__ == "__main__":
    folder_path = "../data/raw_data"  # Adjust to your folder path
    documents, filenames = LoadValidDocuments(folder_path, log_file="skipped_files.csv")

    # Show sample output of loaded documents
    for i, (name, text) in enumerate(zip(filenames, documents[:3])):
        print(f"--- Document {i+1}: {name} ---\n{text[:500]}...\n")

Processing: document_459.pdf
No text extracted: ../data/raw_data/document_459.pdf
Skipping: document_459.pdf (No text extracted)
Processing: document_317.pdf
Processing: document_471.pdf
Processing: document_465.pdf
Processing: document_303.pdf
Processing: document_854.pdf
Processing: document_1105.pdf
Processing: document_840.pdf
Processing: document_1111.pdf
No text extracted: ../data/raw_data/document_1111.pdf
Skipping: document_1111.pdf (No text extracted)
Processing: document_698.pdf
No text extracted: ../data/raw_data/document_698.pdf
Skipping: document_698.pdf (No text extracted)
Processing: document_868.pdf
No text extracted: ../data/raw_data/document_868.pdf
Skipping: document_868.pdf (No text extracted)
Processing: document_1139.pdf
Processing: document_897.pdf
No text extracted: ../data/raw_data/document_897.pdf
Skipping: document_897.pdf (No text extracted)
Processing: document_129.pdf
No text extracted: ../data/raw_data/document_129.pdf
Skipping: document_129.pdf (No text 

In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import joblib  # to save/load the model

In [80]:
# Extract text using fitz
def SafeExtractText(file_path):
    try:
        if not os.path.exists(file_path):
            return None, "File not found"
        if os.path.getsize(file_path) == 0:
            return None, "Empty file"

        doc = fitz.open(file_path)
        text = " ".join(page.get_text() for page in doc)
        doc.close()

        if not text.strip():
            return None, "No text extracted"

        return text.strip(), None

    except fitz.FileDataError as e:
        return None, f"FileDataError: {str(e)}"

    except RuntimeError as e:
        return None, f"RuntimeError: {str(e)}"

    except Exception as e:
        return None, f"Unexpected error: {e}"

# Load and filter documents
def LoadValidDocuments(folder_path, log_file="skipped_files.csv"):
    documents, filenames, skipped = [], [], []

    for filename in os.listdir(folder_path):
        if filename.lower().endswith(".pdf"):
            full_path = os.path.join(folder_path, filename)
            print(f"Processing: {filename}")

            text, error = SafeExtractText(full_path)

            if text:
                documents.append(text)
                filenames.append(filename)
            else:
                print(f"Skipping: {filename} ({error})")
                skipped.append([filename, error])

    if skipped:
        with open(log_file, "w", newline="") as f:
            writer = csv.writer(f)
            writer.writerow(["filename", "error_reason"])
            writer.writerows(skipped)
        print(f"Logged skipped files to {log_file}")

    print(f"\nLoaded {len(documents)} valid documents.")
    return documents, filenames

# Train or Load Classifier
def TrainSampleClassifier():
    # Sample training data
    data = [
        ("This agreement is made between the parties...", "contract"),
        ("Your electricity usage this month is 250 kWh.", "bill"),
        ("The contract term shall be effective until...", "contract"),
        ("Meter reading: 350 kWh. Amount due: $90.25", "bill"),
        ("This contract is made between...", "contract"),
        ("Invoice number: 12345, payment due...", 'bill'),
        ("The agreement shall remain valid for...", 'contract'),
        ("Electricity consumption details for March...", 'bill')
    ]
    texts, labels = zip(*data)

    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(texts)

    clf = MultinomialNB()
    clf.fit(X, labels)

    # Save model and vectorizer
    joblib.dump(clf, "doc_classifier.joblib")
    joblib.dump(vectorizer, "tfidf_vectorizer.joblib")

# Run Classification
def ClassifyDocuments(documents):
    clf = joblib.load("doc_classifier.joblib")
    vectorizer = joblib.load("tfidf_vectorizer.joblib")

    X = vectorizer.transform(documents)
    preds = clf.predict(X)
    return preds

# Main Execution
if __name__ == "__main__":
    folder_path = "../data/raw_data"
    documents, filenames = LoadValidDocuments(folder_path)

    # Train the classifier
    TrainSampleClassifier()

    # Classify the documents loaded
    predictions = ClassifyDocuments(documents)

    # Output classification results
    for fname, pred in zip(filenames, predictions):
        print(f"{fname} -> Predicted category: {pred}")

Processing: document_459.pdf
Skipping: document_459.pdf (No text extracted)
Processing: document_317.pdf
Processing: document_471.pdf
Processing: document_465.pdf
Processing: document_303.pdf
Processing: document_854.pdf
Processing: document_1105.pdf
Processing: document_840.pdf
Processing: document_1111.pdf
Skipping: document_1111.pdf (No text extracted)
Processing: document_698.pdf
Skipping: document_698.pdf (No text extracted)
Processing: document_868.pdf
Skipping: document_868.pdf (No text extracted)
Processing: document_1139.pdf
Processing: document_897.pdf
Skipping: document_897.pdf (No text extracted)
Processing: document_129.pdf
Skipping: document_129.pdf (No text extracted)
Processing: document_883.pdf
Skipping: document_883.pdf (No text extracted)
Processing: document_673.pdf
Processing: document_115.pdf
Skipping: document_115.pdf (No text extracted)
Processing: document_101.pdf
Processing: document_667.pdf
Processing: document_1071.pdf
Skipping: document_1071.pdf (No text ex

In [82]:
from collections import Counter

In [84]:
Counter(predictions)

Counter({'bill': 485, 'contract': 206})

In [86]:
# Show sample output of loaded documents
for i, (name, text) in enumerate(zip(filenames, documents[:3])):
    print(f"--- Document {i+1}: {name} ---\n{text[:500]}...\n")

--- Document 1: document_317.pdf ---
1 M
J
EVERSSURCE
Total Amount Due
by 02/16/24
$2,222.16
Account Number:
Statement Date:
Service Provided To:
TOWN OF BERLIN
5160 051 3022
12/18/23
j Amount Due 0n12/11/23
j Last Payment Received On 11/15/23
I Balance Forward
E Total Current Charges
$2,376.90
-$1,162.11
■^,007.37
Electric Usage History - Kilowatt Hours (kwh)
Current Charges for Electricity
Supply
$422.39
Delivery
$584.98
kWWOay
350-1
300-
Costof electricity from NEXTERA
ENERGY SERVICES CONN
Cost to deliver electricity
from Everso...

--- Document 2: document_471.pdf ---
Account Number:
Amount Enclosed
Please make your check payable to Eversource and consider adding $1 for Good Neighbor.
WM_230404PROD.TXT
You may be subject to a 1.02% late payment charge if
the "Total Amount Due" is not received by 04/29/23
Eversource
PO Box 56005
Boston, MA 02205-6005
5441420701937 0011569675 0000120463
Visit Eversource.com to make your payment today. If mailing payment, please allow up to 5 business