In [2]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m57.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.25.3


In [3]:
import fitz  # PyMuPDF
import re
import spacy
import numpy as np
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Load spaCy's English model for sentence tokenization
nlp = spacy.load("en_core_web_sm")

# Load BERT model and tokenizer once to avoid repeated initialization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")

def get_bert_embedding(text):
    """Generate BERT embeddings by averaging all token embeddings."""
    if not text.strip():
        return np.zeros((1, 768))  # Return a zero vector if text is empty
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Mean pooling over all tokens

def remove_stopwords(sentence):
    """Remove stopwords from a sentence."""
    words = sentence.split()
    return " ".join([word for word in words if word.lower() not in ENGLISH_STOP_WORDS])

def sentence_similarity(reference_text, student_text):
    """Compute similarity between reference and student explanations with stricter thresholding and stopword removal."""
    # Tokenize sentences and remove stopwords
    reference_sentences = [sent.text.strip() for sent in nlp(reference_text).sents if sent.text.strip()]
    student_sentences = [sent.text.strip() for sent in nlp(student_text).sents if sent.text.strip()]

    if not reference_sentences or not student_sentences:
        return 0

    # Remove stopwords from each sentence
    reference_sentences = [remove_stopwords(sentence) for sentence in reference_sentences]
    student_sentences = [remove_stopwords(sentence) for sentence in student_sentences]

    # Get BERT embeddings for each sentence
    reference_embeddings = np.vstack([get_bert_embedding(sentence) for sentence in reference_sentences])
    student_embeddings = np.vstack([get_bert_embedding(sentence) for sentence in student_sentences])

    # Compute the cosine similarity matrix between reference and student embeddings
    similarity_matrix = cosine_similarity(reference_embeddings, student_embeddings)

    # Max similarity between each reference sentence and all student sentences
    max_similarities = np.max(similarity_matrix, axis=1)

    # Apply a stricter threshold (e.g., 0.75 instead of 0.7) to prevent inflated scores
    filtered_similarities = [sim for sim in max_similarities if sim > 0.75]

    # Return the mean similarity as the final score (in percentage)
    return (np.mean(filtered_similarities) * 100) if filtered_similarities else 0

def point_wise_grading(reference_points, student_points):
    """Compare individual points in the reference and student answers."""
    if not reference_points:
        return 0

    match_count = sum(any(ref.lower() in student.lower() for student in student_points) for ref in reference_points)
    return (match_count / len(reference_points)) * 100

def keyword_check(reference_answer, student_answer):
    """Keyword matching between reference and student answer."""
    reference_keywords = set(re.findall(r'\w+', reference_answer.lower()))
    student_keywords = set(re.findall(r'\w+', student_answer.lower()))
    common_keywords = reference_keywords.intersection(student_keywords)
    return (len(common_keywords) / len(reference_keywords)) * 100 if reference_keywords else 0

def split_answer(answer):
    """Dynamically split answers into explanations and bullet points."""
    lines = [line.strip() for line in answer.split("\n") if line.strip()]

    if not lines:
        return "", []

    explanation = lines[0]
    points = [line for line in lines[1:] if re.match(r'^\d+\.', line)]  # Extract numbered subpoints
    return explanation, points

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file using multiple strategies."""
    try:
        doc = fitz.open(pdf_path)
        text = ""

        # Try extracting as plain text (default method)
        for page in doc:
            text += page.get_text("text")

        if not text.strip():  # If plain text extraction didn't work
            # Try HTML-based extraction for a more robust result
            text = ""
            for page in doc:
                text += page.get_text("html")  # Can be more reliable for complex formatting

        # If still empty, try dictionary-based extraction for structure (less readable, but might help)
        if not text.strip():
            text = ""
            for page in doc:
                text += str(page.get_text("dict"))  # This returns structured text, including coordinates

        return text if text.strip() else "No text extracted"
    except Exception as e:
        return f"Error extracting text: {e}"

def grade_answers():
    """Grade student answers against reference answers."""
    print("Enter the reference PDF path (correct answers PDF)")
    reference_pdf = input("Enter the file path: ")
    print("Enter the answers PDF path (student answers PDF)")
    answers_pdf = input("Enter the file path: ")

    reference_text = extract_text_from_pdf(reference_pdf)
    student_text = extract_text_from_pdf(answers_pdf)

    if "No text extracted" in reference_text or "No text extracted" in student_text:
        print("Error: One or both PDFs contain no extractable text.")
        return

    reference_answers = re.split(r'(?=\nA\d+)', reference_text)
    student_answers = re.split(r'(?=\nA\d+)', student_text)

    reference_answers = [ans.strip() for ans in reference_answers if ans.strip()]
    student_answers = [ans.strip() for ans in student_answers if ans.strip()]

    if len(student_answers) != len(reference_answers):
        print(f"Warning: Mismatch in number of answers detected. ({len(student_answers)} vs {len(reference_answers)})")

    total_score = 0
    total_questions = min(len(reference_answers), len(student_answers))

    print(f"Total Questions to Grade: {total_questions}")

    for i in range(total_questions):
        ref_explanation, ref_points = split_answer(reference_answers[i])
        student_explanation, student_points = split_answer(student_answers[i])

        # No extracted text is displayed here, only scores
        score_explanation = sentence_similarity(ref_explanation, student_explanation)
        score_points = point_wise_grading(ref_points, student_points)
        score_keywords = keyword_check(ref_explanation, student_explanation)

        # Lower weight for explanation and stricter matching threshold
        question_score = (0.3 * score_explanation) + (0.3 * score_points) + (0.4 * score_keywords)
        total_score += question_score

        print(f"Question {i+1} Explanation Score: {score_explanation:.2f}%")
        print(f"Question {i+1} Points Score: {score_points:.2f}%")
        print(f"Question {i+1} Keyword Match Score: {score_keywords:.2f}%")
        print(f"Total Score for Question {i+1}: {question_score:.2f}%\n")

    average_score = total_score / total_questions if total_questions else 0
    print(f"Final Average Score: {average_score:.2f}%")
    return average_score

# Running the grading function
if __name__ == "__main__":
    grade_answers()



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Enter the reference PDF path (correct answers PDF)
Enter the file path: /content/reference.pdf
Enter the answers PDF path (student answers PDF)
Enter the file path: /content/answers.pdf
Total Questions to Grade: 2
Question 1 Explanation Score: 100.00%
Question 1 Points Score: 100.00%
Question 1 Keyword Match Score: 100.00%
Total Score for Question 1: 100.00%

Question 2 Explanation Score: 100.00%
Question 2 Points Score: 0.00%
Question 2 Keyword Match Score: 13.33%
Total Score for Question 2: 35.33%

Final Average Score: 67.67%
