In [2]:
!pip install PyMuPDF

Collecting PyMuPDF
  Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.3-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.3


In [23]:
import fitz  # PyMuPDF
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import spacy
import re

In [24]:
# Load spaCy's English model for sentence tokenization
nlp = spacy.load("en_core_web_sm")

def upload_file():
    """Prompt user to enter a file path manually."""
    return input("Enter the file path: ")

def extract_text_from_pdf(pdf_path):
    """Extract text from a PDF file."""
    doc = fitz.open(pdf_path)
    text = "\n".join([page.get_text("text") for page in doc])
    return text if text.strip() else "No text extracted"

def get_bert_embedding(text, model, tokenizer):
    """Generate BERT embeddings for the given text."""
    if not text.strip():
        return np.zeros((1, 768))  # Return a zero vector if text is empty
    inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).numpy()  # Get mean pooled embedding

def sentence_similarity(reference_text, student_text, model, tokenizer):
    """Compare sentences between reference and student answers using BERT embeddings."""
    reference_sentences = [sent.text for sent in nlp(reference_text).sents]
    student_sentences = [sent.text for sent in nlp(student_text).sents]

    reference_embeddings = [get_bert_embedding(sentence, model, tokenizer) for sentence in reference_sentences]
    student_embeddings = [get_bert_embedding(sentence, model, tokenizer) for sentence in student_sentences]

    total_similarity = 0
    matches = 0

    for ref_embedding in reference_embeddings:
        for student_embedding in student_embeddings:
            similarity = cosine_similarity(ref_embedding, student_embedding)
            if similarity > 0.75:  # Use a more strict threshold for high similarity
                total_similarity += similarity
                matches += 1

    if matches == 0:
        return 0  # No matches found, so score is 0

    # Ensure total_similarity is a scalar before performing the division
    return float(total_similarity) / matches * 100  # Return average similarity as percentage






In [25]:
def point_wise_grading(reference_points, student_points):
    """Grading based on specific points in the reference answer."""
    total_points = len(reference_points)
    if total_points == 0:  # Avoid division by zero
        return 0  # Return 0 if no points are present in the reference answer

    correct_points = 0

    for i, ref_point in enumerate(reference_points):
        if i < len(student_points):
            student_point = student_points[i]
            if ref_point.lower() in student_point.lower():  # Check for a partial match
                correct_points += 1  # Full or partial match

    return (correct_points / total_points) * 100  # Percentage of correct points

def keyword_check(reference_answer, student_answer):
    """Check if student answer contains essential keywords from the reference answer."""
    reference_keywords = set(re.findall(r'\w+', reference_answer.lower()))  # Extract words
    student_keywords = set(re.findall(r'\w+', student_answer.lower()))  # Extract words
    common_keywords = reference_keywords.intersection(student_keywords)
    return len(common_keywords) / len(reference_keywords)  # Ratio of common words to total reference words

def grade_answers():
    """Grade answers by comparing them with reference material."""
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    model = BertModel.from_pretrained("bert-base-uncased")

    print("Enter the reference PDF path (correct answers PDF)")
    reference_pdf = upload_file()
    print("Enter the answers PDF path (student answers PDF)")
    answers_pdf = upload_file()

    reference_text = extract_text_from_pdf(reference_pdf)
    student_answers = extract_text_from_pdf(answers_pdf)

    if "No text extracted" in reference_text or not student_answers.strip():
        print("Error: One or both PDFs contain no extractable text.")
        return []

    # Split reference and student answers into individual answers (A1, A2)
    reference_answers = reference_text.split("\n")[:2]  # Assuming there are two reference answers
    student_answers = student_answers.split("\n")[:2]  # Assuming there are two student answers

    # Split each answer into the basic explanation and numbered points
    def split_answer(answer):
        basic_explanation = answer.split("\n")[0]  # First line is the basic explanation
        points = [point.strip() for point in answer.split("\n")[1:] if point.strip()]  # Rest are points
        return basic_explanation, points

    ref_explanation_A1, ref_points_A1 = split_answer(reference_answers[0])
    student_explanation_A1, student_points_A1 = split_answer(student_answers[0])

    ref_explanation_A2, ref_points_A2 = split_answer(reference_answers[1])
    student_explanation_A2, student_points_A2 = split_answer(student_answers[1])

    # Grading the explanation (basic explanation comparison)
    print("Grading Answer 1 (A1) Explanation...")
    score_A1_explanation = sentence_similarity(ref_explanation_A1, student_explanation_A1, model, tokenizer)
    print(f"Answer 1 Explanation Score: {score_A1_explanation:.2f}%")

    print("Grading Answer 2 (A2) Explanation...")
    score_A2_explanation = sentence_similarity(ref_explanation_A2, student_explanation_A2, model, tokenizer)
    print(f"Answer 2 Explanation Score: {score_A2_explanation:.2f}%")

    # Point-wise grading for A1 and A2
    print("Grading Answer 1 (A1) Points...")
    score_A1_points = point_wise_grading(ref_points_A1, student_points_A1)
    print(f"Answer 1 Points Score: {score_A1_points:.2f}%")

    print("Grading Answer 2 (A2) Points...")
    score_A2_points = point_wise_grading(ref_points_A2, student_points_A2)
    print(f"Answer 2 Points Score: {score_A2_points:.2f}%")

    # Combine explanation and point scores for each answer
    total_score_A1 = (score_A1_explanation + score_A1_points) / 2
    total_score_A2 = (score_A2_explanation + score_A2_points) / 2

    print(f"Total Score for Answer 1: {total_score_A1:.2f}%")
    print(f"Total Score for Answer 2: {total_score_A2:.2f}%")

    # Calculate the final average score
    final_score = (total_score_A1 + total_score_A2) / 2
    print(f"Final Average Score: {final_score:.2f}%")

    return final_score

# Example Usage
if __name__ == "__main__":
    score = grade_answers()
    print("Average Score:", score)

Enter the reference PDF path (correct answers PDF)
Enter the file path: /content/reference.pdf
Enter the answers PDF path (student answers PDF)
Enter the file path: /content/answers.pdf
Grading Answer 1 (A1) Explanation...


  return float(total_similarity) / matches * 100  # Return average similarity as percentage


Answer 1 Explanation Score: 100.00%
Grading Answer 2 (A2) Explanation...
Answer 2 Explanation Score: 98.21%
Grading Answer 1 (A1) Points...
Answer 1 Points Score: 0.00%
Grading Answer 2 (A2) Points...
Answer 2 Points Score: 0.00%
Total Score for Answer 1: 50.00%
Total Score for Answer 2: 49.10%
Final Average Score: 49.55%
Average Score: 49.55212622880936


  return float(total_similarity) / matches * 100  # Return average similarity as percentage
