In [2]:
import os
from pdf2image import convert_from_path
import easyocr
from PIL import Image
import numpy as np
import pandas as pd

# Initialize the EasyOCR reader once at module level; perform_ocr() reuses this
# single instance for every page. ['en'] is the list of languages to recognise.
reader = easyocr.Reader(['en'])

# Section keywords: maps each section label to the substrings that identify it.
# Keywords are written in lower case because perform_ocr() lower-cases the OCR
# text before matching.
# NOTE: process_pdf_for_dataset() walks this dict in insertion order and stops
# at the first section with a matching keyword, so earlier entries take
# priority when a page contains keywords from several sections.
sections = {
    "Application Form": ["processing", "application form for internship"],
    "Request Letter": ["director", "the hrm", "request letter", "guide recommendation"],
    "Consent Email": ["regards", "approval email"],
    "Resume": ["languages", "curriculum vitae", "resume", "skills", "certifications", "activities", "hobbies"],
    "ID Card": ["valid upto", "hosteller", "blood", "roll", "identity card", "student id", "id number", "address"]
}

def perform_ocr(image):
    """Run OCR on one page image and return its text in lower case.

    Args:
        image: The page image. It is converted with ``np.array`` before being
            handed to the module-level EasyOCR ``reader``.

    Returns:
        A single string with every recognised text fragment on its own line,
        lower-cased so that keyword matching is case-insensitive.
    """
    pixels = np.array(image)
    detections = reader.readtext(pixels)

    # Each detection unpacks into three items; only the middle one (the
    # recognised text) is kept.
    fragments = []
    for _box, fragment, _score in detections:
        fragments.append(fragment)

    joined = "\n".join(fragments)
    return joined.lower()

def process_pdf_for_dataset(pdf_path, dataset_rows):
    """Append a header row and one labelled row per page of a PDF.

    Args:
        pdf_path: Path of the PDF file to process.
        dataset_rows: List that is extended in place with dicts of the form
            ``{"data": ..., "label": ...}``.

    Returns:
        None. The results are delivered by mutating ``dataset_rows``.
    """
    pdf_name = os.path.basename(pdf_path)

    # Marker row so the pages of different PDFs can be told apart in the CSV.
    header_row = {"data": f"=== Start of {pdf_name} ===", "label": "FileHeader"}
    dataset_rows.append(header_row)

    for page in convert_from_path(pdf_path):
        page_text = perform_ocr(page)

        # The first section (in dict order) with any keyword present in the
        # page text wins; pages matching nothing are labelled "Unknown".
        page_label = next(
            (
                name
                for name, words in sections.items()
                if any(word in page_text for word in words)
            ),
            "Unknown",
        )

        dataset_rows.append({"data": page_text, "label": page_label})

def build_labeled_dataset(folder_path, output_csv="labeled_dataset_test2.csv"):
    """Build a labelled dataset from every PDF in a folder and save it as CSV.

    Each file in ``folder_path`` whose name ends in ``.pdf`` (case-insensitive)
    is passed to ``process_pdf_for_dataset``, which appends its rows to a
    shared list. The rows are then written to ``output_csv``.

    Args:
        folder_path: Directory to scan (not recursive).
        output_csv: Path of the CSV file to write.

    Returns:
        A ``pandas.DataFrame`` with the columns ``data`` and ``label``. The
        columns are present even when the folder contains no PDF.

    Raises:
        OSError: If ``folder_path`` cannot be listed or ``output_csv`` cannot
            be written.
    """
    dataset_rows = []

    # os.listdir() returns entries in arbitrary order. Sort case-insensitively
    # (original name as tie-breaker) so the row order of the dataset is
    # reproducible across runs and platforms.
    filenames = sorted(
        os.listdir(folder_path), key=lambda name: (name.casefold(), name)
    )

    for filename in filenames:
        if not filename.lower().endswith(".pdf"):
            continue
        print(f'{filename}')
        pdf_path = os.path.join(folder_path, filename)
        process_pdf_for_dataset(pdf_path, dataset_rows)

    # Naming the columns explicitly keeps the "data,label" header in the CSV
    # (and on the returned frame) even when no rows were collected.
    df = pd.DataFrame(dataset_rows, columns=["data", "label"])
    df.to_csv(output_csv, index=False)
    print(f"✅ Labeled dataset saved to '{output_csv}' ({len(df)} entries)")
    return df

# --- Run ---
# Process every PDF found in `folder_path` and keep the resulting DataFrame in
# `df_labeled`. No output path is passed, so the CSV goes to the default
# `output_csv` of build_labeled_dataset().
folder_path = "input"  # Replace with your actual folder path
df_labeled = build_labeled_dataset(folder_path)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


2 eisha Patrikar nio final.pdf
Aakriti Tank Resume (2).pdf
Application for Dissertation at NIO Visakhapatnam_Molugu Harini.pdf
Application Form for Internship.pdf
CSIR-NIO .pdf
CV RESUME.pdf
Md Jeeshan Form NIO.pdf
meghana_nio.pdf
Merged NIO-merged-Compressed.pdf
NIO Dissertation.pdf
NIO_Application_BharathPrasanth.pdf
PDF.pdf
Silpa_NIO_appln.pdf
✅ Labeled dataset saved to 'labeled_dataset_test2.csv' (86 entries)
