In [15]:
# ✅ Step 1: Import Libraries
import json
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from transformers import BertTokenizer, BertForSequenceClassification

# ✅ Step 2: Load the Model and Tokenizer
# Location handed to from_pretrained() for both the tokenizer and the model.
MODEL_DIR = "edu_feedback_bert_model"

tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Place the model on the selected device and put it in evaluation mode;
# this script only runs inference.
model.to(device)
model.eval()

# ✅ Step 3: Load the Test Dataset
# Read the evaluation samples from disk. Only the file access and the JSON
# parsing sit inside the `try`, so unrelated errors are not misreported as a
# dataset problem.
try:
    with open("test_dataset.json", "r", encoding="utf-8") as f:
        test_data = json.load(f)
except FileNotFoundError:
    print("❌ Test dataset not found! Please verify the file path.")
    # `exit()` is a helper for interactive shells and reports success
    # (status 0); raise SystemExit with a non-zero status instead so the
    # failure is visible to callers and the run stops here.
    raise SystemExit(1)
except json.JSONDecodeError as err:
    # A truncated or hand-edited file would otherwise surface as a raw
    # traceback; report it the same way as a missing file.
    print(f"❌ Test dataset is not valid JSON: {err}")
    raise SystemExit(1)
else:
    print(f"✅ Loaded {len(test_data)} test samples.")

# ✅ Step 4: Normalize Grades
def normalize_grade(grade):
    """Return *grade* in canonical form: surrounding whitespace removed, upper-cased.

    Args:
        grade: The grade label. Normally a string such as ``" b+ "``; ``None``
            and non-string values (e.g. a number read from JSON) are accepted
            too so that one malformed sample does not abort the evaluation.

    Returns:
        The normalized grade string. ``None`` yields ``""``; any other
        non-string value is converted with ``str()`` before normalizing.
    """
    if grade is None:
        return ""
    return str(grade).strip().upper()

# ✅ Step 5: Expanded Grade Mapping
# Letter grade -> numeric score used for the regression metrics.
# The scale is not uniform: "A+" sits a full point above "A", every other
# adjacent pair is 0.5 apart. Grades without an entry here (for example
# "C-" or "D+") are not mapped by this table.
grade_to_score = {
    "A+": 6,
    "A": 5,
    "A-": 4.5,
    "B+": 4,
    "B": 3.5,
    "B-": 3,
    "C+": 2.5,
    "C": 2,
    "D": 1.5,
    "F": 1,
}

# ✅ Handle Unknown Grades Gracefully
def get_score(grade):
    """Translate a letter grade into its numeric score.

    The grade is passed through ``normalize_grade`` and then looked up in
    ``grade_to_score``. A grade with no entry in that table yields 2.5,
    which is the same value the table assigns to "C+".
    """
    fallback_score = 2.5
    key = normalize_grade(grade)
    if key in grade_to_score:
        return grade_to_score[key]
    return fallback_score

# ✅ Step 6: Function to Make Predictions with Debugging Prints
def predict(student_answer, model_answer, *, verbose=True):
    """Predict a score and letter grade for one student answer.

    The two texts are joined as ``student_answer + " [SEP] " + model_answer``,
    tokenized (truncated and padded to 512 tokens) and run through the
    module-level ``model`` on ``device``. The index of the largest logit is
    used as the score and converted to a grade with ``map_score_to_grade``.

    NOTE(review): ``map_score_to_grade`` is not defined in this part of the
    file; it is presumably provided by an earlier notebook cell - confirm.
    NOTE(review): because the joined text is truncated from the end, a long
    student answer can push the reference answer out of the 512-token window.

    Args:
        student_answer: The student's answer text.
        model_answer: The reference answer text it is compared against.
        verbose: When true (the default) print a short debug summary of the
            inputs and the prediction. Pass ``verbose=False`` to keep large
            evaluation runs quiet.

    Returns:
        A ``(score, grade)`` tuple: the arg-max class index as an ``int`` and
        the normalized letter grade. If tokenization, inference or the grade
        mapping raises, the error is printed and ``(0, "F")`` is returned;
        callers cannot tell that fallback apart from a genuine "F" prediction.
    """
    try:
        inputs = tokenizer(
            student_answer + " [SEP] " + model_answer,
            return_tensors="pt",
            max_length=512,
            padding="max_length",
            truncation=True
        ).to(device)

        # Inference only: no gradients are needed.
        with torch.no_grad():
            outputs = model(**inputs)
            score = torch.argmax(outputs.logits, dim=1).item()

        grade = normalize_grade(map_score_to_grade(score))
    except Exception as e:
        # Deliberate best-effort: one failing sample must not abort the run.
        print(f"❌ Error during prediction: {e}")
        return 0, "F"

    # Debug output lives outside the `try` so a reporting problem is never
    # mistaken for a prediction failure.
    if verbose:
        print(f"➡️ Student: {student_answer[:50]}...")
        print(f"➡️ Model: {model_answer[:50]}...")
        print(f"➡️ Predicted Grade: {grade}")
        print(f"➡️ Predicted Score: {score}\n")

    return score, grade

# ✅ Step 7: Run Evaluation on the Test Dataset with Debugging Prints
# Parallel result lists: index i in each one belongs to test sample i.
true_scores, pred_scores = [], []
true_grades, pred_grades = [], []

for sample in test_data:
    # Pull the two texts to compare out of the sample's "input" section.
    sample_input = sample["input"]
    student_answer = sample_input["student_answer"]
    model_answer = sample_input["model_answer"]

    # Ask the model for its score and grade.
    pred_score, pred_grade = predict(student_answer, model_answer)

    # Bring both grades into the same canonical form before comparing them.
    sample_output = sample["output"]
    true_grade = normalize_grade(sample_output["grade"])
    pred_grade = normalize_grade(pred_grade)

    # Read the reference score first so that nothing is recorded for a
    # sample that turns out to be incomplete.
    true_score = sample_output["score"]

    true_scores.append(true_score)
    pred_scores.append(pred_score)
    true_grades.append(true_grade)
    pred_grades.append(pred_grade)

# ✅ Step 8: Map Grades to Numeric Scores
# Numeric counterparts of the letter grades, in the same sample order,
# used as input for the regression metrics.
true_numeric = list(map(get_score, true_grades))
pred_numeric = list(map(get_score, pred_grades))

# ✅ Step 9: Calculate Both Classification and Regression Metrics
# Compute every metric first and report only if all of them succeeded, so a
# failure never leaves a half-printed results table.
try:
    # ✅ Classification Metrics (on the normalized letter grades)
    accuracy = accuracy_score(true_grades, pred_grades)
    # zero_division=0: a grade that occurs in the ground truth but is never
    # predicted has an undefined precision. Scoring it as 1 would count it as
    # perfect and inflate the support-weighted average, so it is scored 0.
    precision = precision_score(true_grades, pred_grades, average="weighted", zero_division=0)
    recall = recall_score(true_grades, pred_grades, average="weighted", zero_division=0)
    f1 = f1_score(true_grades, pred_grades, average="weighted", zero_division=0)

    # ✅ Regression Metrics (on the grades mapped to numeric scores)
    mae = mean_absolute_error(true_numeric, pred_numeric)
    mse = mean_squared_error(true_numeric, pred_numeric)
    r2 = r2_score(true_numeric, pred_numeric)

except Exception as e:
    # Best-effort reporting: e.g. an empty test set makes the metric
    # functions raise; show the reason instead of a traceback.
    print(f"❌ Error calculating evaluation metrics: {e}")

else:
    # ✅ Print Results
    print("\n✅ Model Evaluation Results:")
    print("\n🔥 Classification Metrics:")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

    print("\n🔥 Regression Metrics:")
    print(f"Mean Absolute Error (MAE): {mae:.2f}")
    print(f"Mean Squared Error (MSE): {mse:.2f}")
    print(f"R2 Score: {r2:.2f}")


✅ Loaded 26 test samples.
➡️ Student: 1.  **The nervous system and endocrine system work...
➡️ Model: ### Summary ###
The neural system coordinates and ...
➡️ Predicted Grade: A
➡️ Predicted Score: 90

➡️ Student: 1.  **Conceptual Understanding & Application of Su...
➡️ Model: ### Summary ###
Okay, here's a breakdown of the ch...
➡️ Predicted Grade: A
➡️ Predicted Score: 90

➡️ Student: 1.  **Conceptual Understanding of Spontaneity:** E...
➡️ Model: ### Summary ###
Thermodynamics deals with energy c...
➡️ Predicted Grade: A
➡️ Predicted Score: 90

➡️ Student: 1.  A copper wire carries a steady current. If the...
➡️ Model: ### Summary ###
1.

### Definitions ###
[Eq. (3.2)...
➡️ Predicted Grade: A
➡️ Predicted Score: 90

➡️ Student: 1.  A triangle is formed by the x-axis and two lin...
➡️ Model: ### Summary ###
®Slope (m) of a non-vertical line ...
➡️ Predicted Grade: A
➡️ Predicted Score: 90

➡️ Student: 1.  **Scenario:** A small town is experiencing a r...
➡️ Model: ### Summary ###
Ok