### MODEL TRAINING

In [1]:
# Step 1: Load and preprocess data
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load dataset
# The CSV supplies the input text in column "data" and the class name in
# column "label".
df = pd.read_csv("MAIN_DATASET.csv")
texts = df["data"].tolist()
labels = df["label"].tolist()

# Encode labels
# Number the classes in sorted order. Enumerating a bare set() follows the
# set's iteration order, which for string labels can differ from one
# interpreter session to the next; the same class would then receive a
# different id on every retrain, and any id -> name mapping rebuilt later
# could disagree with a saved checkpoint. Sorting makes the mapping depend
# only on the label values themselves. (Labels of mixed, non-comparable types
# now raise a TypeError here instead of being numbered arbitrarily.)
label2id = {label: idx for idx, label in enumerate(sorted(set(labels)))}
id2label = {idx: label for label, idx in label2id.items()}
encoded_labels = [label2id[label] for label in labels]

# Split data
# 80/20 train/validation split with a fixed seed so the partition is repeatable.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, encoded_labels, test_size=0.2, random_state=42
)

# Step 2: Tokenize with RoBERTa
# The training and validation texts are tokenized in two separate calls that
# use identical settings (truncation on, padding on, max_length=256).
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256)

# Step 3: Create PyTorch Dataset
import torch

class CustomDataset(torch.utils.data.Dataset):
    """Dataset that pairs tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example sequence and
    ``labels`` is an indexable sequence holding one label per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the length of the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label stored under "labels"."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each tokenized split together with its encoded labels.
train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)

# Step 4: Train the model
# Start from the "roberta-base" checkpoint. The number of output classes comes
# from the label mapping, and both mapping directions are passed to
# from_pretrained alongside it.
model = RobertaForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)

# Training configuration: 3 epochs, a per-device training batch size of 8,
# output directory ./results and logging directory ./logs. Every other option
# is left at the library default.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    logging_dir="./logs",
)
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation result.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class of
    each row is the index of its largest logit along the last axis. Returns a
    dict with the single key "accuracy".
    """
    raw_scores, references = eval_pred
    predicted_ids = raw_scores.argmax(axis=-1)
    score = accuracy_score(references, predicted_ids)
    return {"accuracy": score}


# Wire the model, training configuration, both dataset splits and the accuracy
# metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)


# Fine-tune, then save the model to ./results, the same directory the
# PDF VALIDATION cell loads its model from.
trainer.train()
trainer.save_model("./results")

# Evaluate (val_dataset was registered as eval_dataset above) and show the
# resulting metrics dict.
results = trainer.evaluate()
print(results)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss




{'eval_loss': 0.10260962694883347, 'eval_accuracy': 0.9636363636363636, 'eval_runtime': 19.6534, 'eval_samples_per_second': 2.799, 'eval_steps_per_second': 0.356, 'epoch': 3.0}


### PDF VALIDATION

In [8]:
import easyocr
import torch
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import numpy as np
from PIL import Image
from pdf2image import convert_from_path
import os
import pandas as pd

# Load tokenizer (required for prediction)
# Loaded from the "roberta-base" checkpoint name, the same name the training
# cell passes to RobertaTokenizer.from_pretrained.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")

# Load trained model
# Use the GPU when torch reports CUDA as available, otherwise the CPU. The
# model is read from ./results, moved to that device and switched to eval mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = RobertaForSequenceClassification.from_pretrained("./results")
model.to(device)
model.eval()

# Initialize EasyOCR reader
# Created once for English ('en') and reused for every page.
reader = easyocr.Reader(['en'])

# Expected key points in Request Letter
# All entries are lower-case because they are matched by substring against OCR
# text that perform_ocr has already lower-cased.
request_letter_points = [
    "consent", "moral", "evaluated", "name of the supervising",
    "confidentiality", "follow", "negligence"
]

# Predict section using the model
def predict_section(text):
    """Classify ``text`` with the loaded model and return the predicted class id as a Python int.

    The text is tokenized (truncated to at most 256 tokens), moved to the
    module-level ``device`` and run through the module-level ``model`` with
    gradient tracking disabled.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    batch = {}
    for name, tensor in encoded.items():
        batch[name] = tensor.to(device)
    with torch.no_grad():
        logits = model(**batch).logits
        class_probs = torch.nn.functional.softmax(logits, dim=-1)
        best = torch.argmax(class_probs, dim=-1)
    return best.item()

# Perform OCR on a single image
def perform_ocr(image):
    """Run OCR on one page image and return the recognized text.

    Each detection returned by the reader contributes its text element; the
    pieces are joined with newlines and the whole string is lower-cased.
    """
    pixels = np.array(image)
    detections = reader.readtext(pixels)
    fragments = []
    for detection in detections:
        _, fragment, _ = detection
        fragments.append(fragment)
    return "\n".join(fragments).lower()

# Process a single PDF
def process_pdf(pdf_path, label_map):
    """Classify every page of one PDF and collect the sections it contains.

    Args:
        pdf_path: Path of the PDF to convert into page images.
        label_map: Mapping from predicted class id to section name. A page
            whose predicted id is absent from the mapping (or maps to a falsy
            name) is reported and ignored.

    Returns:
        A ``(found_sections, extra_info)`` tuple. ``found_sections`` is the
        set of distinct section names seen. ``extra_info`` contains the key
        'RequestLetterMissingPoints' only when at least one page was
        classified as "Request Letter"; its value is a comma-separated list
        of the expected key points that appear on none of those pages, or the
        string "None" when every point was found.
    """
    images = convert_from_path(pdf_path)
    found_sections = set()
    extra_info = {}
    # Key points seen on ANY "Request Letter" page; stays None until the first
    # such page. Checking each page on its own and overwriting the result would
    # report points from page 1 of a two-page letter as missing just because
    # page 2 does not repeat them.
    request_letter_hits = None

    for i, image in enumerate(images):
        print(f"\n Page {i+1}: {os.path.basename(pdf_path)}")
        text = perform_ocr(image)

        predicted_label_id = predict_section(text)
        predicted_section = label_map.get(predicted_label_id, None)

        if not predicted_section:
            print(" Predicted section not in known label_map (ignored).")
            continue

        if predicted_section not in found_sections:
            found_sections.add(predicted_section)
            print(f" Predicted: {predicted_section}")
        else:
            print(f" Duplicate: {predicted_section}")

        if predicted_section == "Request Letter":
            if request_letter_hits is None:
                request_letter_hits = set()
            request_letter_hits.update(pt for pt in request_letter_points if pt in text)

    if request_letter_hits is not None:
        # Preserve the declared order of the key points in the report.
        missing = [pt for pt in request_letter_points if pt not in request_letter_hits]
        extra_info['RequestLetterMissingPoints'] = ", ".join(missing) if missing else "None"

    return found_sections, extra_info

# Validate all PDFs in a folder

def validate_all_pdfs(folder_path, label_map):
    """Validate every PDF in a folder and write a per-file summary CSV.

    Args:
        folder_path: Directory to scan. Only entries whose name ends with
            ".pdf" (case-insensitive) are processed.
        label_map: Mapping from class id to section name; its values define
            the full set of sections each PDF is expected to contain.

    Returns:
        A DataFrame with one row per PDF and the columns "PDF File",
        "Sections Found", "Missing Sections" and
        "Request Letter Missing Points". The same table is written to
        application_validation_results_classified.csv in the current working
        directory.
    """
    # Fixed column list so that a folder without any PDF still produces a
    # frame (and a CSV) with the expected header instead of an empty one.
    columns = [
        "PDF File",
        "Sections Found",
        "Missing Sections",
        "Request Letter Missing Points",
    ]
    results = []
    known_labels = set(label_map.values())

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order of the report stable from run to run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        pdf_path = os.path.join(folder_path, filename)
        print(f"\n=> Checking File: {filename}")
        found_sections, extra_info = process_pdf(pdf_path, label_map)

        # Only count known sections as missing
        missing = known_labels - found_sections

        results.append({
            "PDF File": filename,
            "Sections Found": ", ".join(sorted(found_sections)),
            "Missing Sections": ", ".join(sorted(missing)) if missing else "",
            "Request Letter Missing Points": extra_info.get('RequestLetterMissingPoints', '')
        })

    df = pd.DataFrame(results, columns=columns)
    df.to_csv("application_validation_results_classified.csv", index=False)
    print("\n Saved results to: application_validation_results_classified.csv")
    return df

# Run validation
# Take the id -> section-name mapping from the checkpoint that was just loaded
# rather than from the `label2id` variable of the training cell. That variable
# only exists if the training cell ran in this kernel session, and nothing
# guarantees it matches the model in ./results; the mapping stored with the
# model is the one its output ids actually refer to.
label_map = dict(model.config.id2label)
df_results = validate_all_pdfs("input", label_map)
df_results

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.



=> Checking File: NIO Dissertation.pdf

 Page 1: NIO Dissertation.pdf




 Predicted: Request Letter

 Page 2: NIO Dissertation.pdf
 Predicted: Consent Email

 Page 3: NIO Dissertation.pdf
 Predicted: Application Form

 Page 4: NIO Dissertation.pdf
 Predicted: Resume

 Page 5: NIO Dissertation.pdf
 Duplicate: Resume

 Page 6: NIO Dissertation.pdf
 Predicted: ID Card

 Saved results to: application_validation_results_classified.csv


Unnamed: 0,PDF File,Sections Found,Missing Sections,Request Letter Missing Points
0,NIO Dissertation.pdf,"Application Form, Consent Email, ID Card, Requ...",,


### ACCURACY

In [4]:
from sklearn.metrics import accuracy_score, classification_report

# Predict using your model and tokenizer
def predict_label(text):
    """Return the predicted label name for ``text``.

    Inference is delegated to ``predict_section`` (defined in the PDF
    VALIDATION cell), which performs the same tokenize / forward / argmax
    steps this function used to repeat inline. The resulting class id is
    translated through ``label_map``; an id without an entry yields "Unknown".
    """
    return label_map.get(predict_section(text), "Unknown")

df["predicted_label"] = df["data"].apply(predict_label)

# Evaluation
# NOTE: `df` is the complete dataset, and 80% of it was used to train the
# model (test_size=0.2 in the split), so this figure is optimistic. It is kept
# for continuity with earlier runs.
accuracy = accuracy_score(df["label"], df["predicted_label"])
print(" ACCURACY:", accuracy)

# Accuracy on the held-out validation split only, i.e. on texts the model was
# not trained on. `val_labels` holds encoded ids, so they are mapped back to
# label names to be comparable with what predict_label returns.
val_true = [label_map.get(label_id, "Unknown") for label_id in val_labels]
val_pred = [predict_label(text) for text in val_texts]
val_accuracy = accuracy_score(val_true, val_pred)
print(" HELD-OUT ACCURACY:", val_accuracy)


 ACCURACY: 0.9927007299270073
