In [17]:
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [18]:
def upload_file():
    """Prompt the user for a file path and return it in a usable form.

    Typed or pasted paths often carry stray leading/trailing whitespace,
    and paths copied from a file manager are frequently wrapped in quotes.
    Both are removed so the result can be handed straight to a
    file-opening call.

    Returns:
        str: The entered path without surrounding whitespace and without
        one matching pair of wrapping quotes. Empty if nothing was entered.
    """
    path = input("Enter the file path: ").strip()
    # Drop a single matching pair of wrapping quotes ('...' or "...").
    if len(path) >= 2 and path[0] == path[-1] and path[0] in "'\"":
        path = path[1:-1].strip()
    return path

In [19]:
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF to read; passed to ``fitz.open``.

    Returns:
        str: The text of all pages joined with newlines, or the sentinel
        string "No text extracted" when the document yields nothing but
        whitespace.
    """
    # The context manager closes the document (and its file handle) even
    # if reading one of the pages raises.
    with fitz.open(pdf_path) as doc:
        text = "\n".join(page.get_text("text") for page in doc)
    return text if text.strip() else "No text extracted"

In [20]:
def get_bert_embedding(text, model, tokenizer):
    """Embed a text as the mean of the model's final-layer token vectors.

    Args:
        text: String to embed.
        model: Encoder invoked as ``model(**inputs)`` whose output exposes
            ``last_hidden_state``.
        tokenizer: Callable that converts ``text`` into the model's keyword
            inputs as PyTorch tensors (input is truncated to 512 tokens).

    Returns:
        numpy.ndarray: A ``(1, hidden_size)`` array. It is all zeros when
        ``text`` is empty or whitespace-only.
    """
    if not text.strip():
        # Size the placeholder from the model so it stays comparable with
        # real embeddings; fall back to 768 (the previous fixed size) when
        # the model does not advertise a hidden size.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
        return np.zeros((1, hidden_size))
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    # Run the inputs on whatever device the model reports, if it reports one.
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    # Mean-pool over the token axis; .cpu() keeps the NumPy conversion valid
    # when the model ran on an accelerator.
    return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

In [21]:
def grade_answers():
    """Grade the first two student answers against the first two reference lines.

    Prompts for a reference PDF and an answers PDF, extracts their text,
    and pairs the first two non-blank lines of each in order. Each pair is
    scored as the cosine similarity of the two BERT embeddings, expressed
    as a percentage. Per-answer scores and their average are printed.

    Blank lines are ignored on both sides, so pairing follows the order of
    the non-blank lines. A reference line with no corresponding answer
    line is scored 0.

    Returns:
        list: One float score per graded answer, or an empty list when
        either PDF has no extractable text or the reference has fewer than
        two non-blank lines.
    """
    no_text_marker = "No text extracted"  # sentinel returned by extract_text_from_pdf
    max_items = 2  # only the first two question/answer pairs are graded

    print("Enter the reference PDF path")
    reference_pdf = upload_file()
    print("Enter the answers PDF path")
    answers_pdf = upload_file()

    reference_raw = extract_text_from_pdf(reference_pdf)
    answers_raw = extract_text_from_pdf(answers_pdf)

    # Validate BOTH documents against the sentinel; otherwise the sentinel
    # string of an unreadable answers PDF would be graded as an answer.
    for raw in (reference_raw, answers_raw):
        if not raw.strip() or raw == no_text_marker:
            print("Error: One or both PDFs contain no extractable text.")
            return []

    # Blank lines come from page joins and trailing newlines; they are not
    # questions or answers, so they must not occupy a slot.
    reference_text = [line for line in reference_raw.split("\n") if line.strip()]
    student_answers = [line for line in answers_raw.split("\n") if line.strip()]

    if len(reference_text) < max_items:
        print("Error: Reference PDF should contain at least two questions.")
        return []

    # Load the model only after the inputs are known to be usable, so an
    # invalid PDF fails fast without paying for the model load.
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    scores = []
    for idx, question in enumerate(reference_text[:max_items], start=1):
        if idx > len(student_answers):
            # The student supplied fewer answers than there are questions.
            print(f"Answer {idx}: No valid text detected.")
            scores.append(0.0)
            continue

        ref_embedding = get_bert_embedding(question, model, tokenizer)
        ans_embedding = get_bert_embedding(student_answers[idx - 1], model, tokenizer)
        similarity = cosine_similarity(ref_embedding, ans_embedding)
        # Convert similarity to a percentage; float() keeps the returned
        # list free of NumPy scalar types.
        score = float(np.mean(similarity) * 100)
        scores.append(score)
        print(f"Answer {idx}: {score:.2f}% relevance")

    avg_score = np.mean(scores) if scores else 0
    print(f"\nFinal Score: {avg_score:.2f}%")

    return scores

In [22]:
# Example usage: run the interactive grader (it prompts for the reference
# and answers PDF paths) and print the list of per-answer scores it returns.
if __name__ == "__main__":
    scores = grade_answers()
    print("Scores:", scores)

Enter the reference PDF path
Enter the file path: /content/answers.pdf
Enter the answers PDF path
Enter the file path: /content/student.pdf
Answer 1: 76.48% relevance
Answer 2: 35.81% relevance

Final Score: 56.14%
Scores: [76.47531032562256, 35.808539390563965]
