In [1]:
!pip install torch transformers scikit-learn pandas




In [3]:
import json
import torch
import pandas as pd
from sklearn.model_selection import train_test_split

# === Load the dataset ===
with open("training_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# === Prepare the data ===
student_answers = [item["input"]["student_answer"] for item in dataset]
model_answers = [item["input"]["model_answer"] for item in dataset]
grades = [item["output"]["grade"].strip().upper() for item in dataset]  # Clean grades
scores = [item["output"]["score"] for item in dataset]

# === Print unique grades for debugging ===
print("Unique grades:", set(grades))  

# === Convert grades to numeric labels ===
grade_to_score = {"A": 5, "B": 4, "C": 3, "D": 2, "F": 1}

# Map grades to scores with a default value for unknown grades
y_labels = [grade_to_score.get(g, 0) for g in grades]  # 0 for unknown grades

# === Split the dataset into training and validation sets ===
train_texts, val_texts, train_labels, val_labels = train_test_split(
    [f"{s} [SEP] {m}" for s, m in zip(student_answers, model_answers)],
    y_labels,
    test_size=0.2,
    random_state=42
)

print(f"✅ Loaded {len(train_texts)} training samples and {len(val_texts)} validation samples.")


Unique grades: {'** A', 'A**', '** A-', 'B**', '** B', '** B+', '** A+', 'B+**', 'F', '**', 'A+**'}
✅ Loaded 80 training samples and 20 validation samples.


In [5]:
# === Imports ===
import os
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# === Configuration ===
MODEL_DIR = "./fine_tuned_edu_feedback_bert_model"  # Directory to save fine-tuned model
OUTPUT_DIR = "./results"                            # Output directory for logs and checkpoints
LOG_DIR = "./logs"                                  # Logging directory

# === Tokenizer ===
print("🚀 Loading tokenizer...")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# === Dataset Class ===
class GradingDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# === Sample Dataset (For Testing) ===
train_texts = ["The student gave a good answer.", "The answer was incorrect."]
train_labels = [4, 1]

val_texts = ["The student demonstrated excellent understanding.", "Incomplete explanation."]
val_labels = [5, 2]

# === Create Dataset Objects ===
print("📊 Creating dataset objects...")
train_dataset = GradingDataset(train_texts, train_labels)
val_dataset = GradingDataset(val_texts, val_labels)

# === Load BERT Model with Correct Classifier Shape ===
num_labels = 5  # Set the correct number of labels
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)

# === Conditional Loading of Fine-Tuned Weights ===
print("🔧 Loading model weights...")
try:
    checkpoint_path = os.path.join(MODEL_DIR, "pytorch_model.bin")
    if os.path.exists(checkpoint_path):
        checkpoint = torch.load(checkpoint_path, map_location="cpu")

        # Remove classifier mismatch keys
        for key in list(checkpoint.keys()):
            if "classifier" in key:
                del checkpoint[key]

        # Load remaining compatible weights
        model.load_state_dict(checkpoint, strict=False)
        print("✅ Loaded fine-tuned weights successfully.")
    else:
        print("⚠️ No pre-trained weights found. Starting with base BERT model.")
except Exception as e:
    print(f"❌ Error loading pre-trained weights: {e}")

model.train()

# === Training Arguments ===
print("⚙️ Setting training arguments...")
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,                # Optimized learning rate
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,                 # Increased epochs for better training
    weight_decay=0.01,                  # L2 regularization
    logging_dir=LOG_DIR,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True
)

# === Trainer ===
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# === Fine-Tuning ===
try:
    print("🚀 Starting training...")
    trainer.train()
    print("✅ Training completed successfully.")
except Exception as e:
    print(f"❌ Error during training: {e}")

# === Save the Fine-Tuned Model ===
try:
    print("💾 Saving the fine-tuned model...")
    os.makedirs(MODEL_DIR, exist_ok=True)
    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print(f"✅ Model saved successfully in {MODEL_DIR}")
except Exception as e:
    print(f"❌ Error while saving model: {e}")



🚀 Loading tokenizer...
📊 Creating dataset objects...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


🔧 Loading model weights...
⚠️ No pre-trained weights found. Starting with base BERT model.
⚙️ Setting training arguments...
🚀 Starting training...


Epoch,Training Loss,Validation Loss


❌ Error during training: Target 5 is out of bounds.
💾 Saving the fine-tuned model...
✅ Model saved successfully in ./fine_tuned_edu_feedback_bert_model


In [19]:
!pip install torch transformers scikit-learn pandas




In [6]:
import json
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score, precision_score, recall_score, f1_score

# === Load dataset ===
with open("training_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# === Extract answers and labels ===
try:
    student_answers = [item['input']['student_answer'] for item in dataset]
    model_answers = [item['input']['model_answer'] for item in dataset]
    grades = [item['output']['grade'] for item in dataset]
    scores = [item['output']['score'] for item in dataset]
    
    # Map grades to numeric scores for classification
    grade_to_score = {"A": 5, "B": 4, "C": 3, "D": 2, "F": 1}
    y_labels = [grade_to_score.get(g, 0) for g in grades]

    print(f"✅ Loaded {len(student_answers)} samples.")
except KeyError as e:
    print(f"❌ Error extracting data: Missing key {e}")


✅ Loaded 100 samples.


In [9]:
import torch
from transformers import BertTokenizer

# === Tokenizer ===
MODEL_DIR = "edu_feedback_bert_model"
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

# === Dataset Class ===
class GradingDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = tokenizer(
            self.texts[idx],
            max_length=512,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx])
        }


In [11]:
# ✅ Import Libraries
import json
import numpy as np
import torch
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, mean_absolute_error, mean_squared_error, r2_score
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# ✅ Step 1: Load and Prepare Dataset
print("✅ Loading Dataset...")

# ✅ Load the training dataset
with open("training_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# ✅ Map grades to 101 labels
label_mapping = {f"Class_{i}": i for i in range(1, 102)}  # 1 to 101 labels

# ✅ Extract student and model answers
student_answers = [item['input']['student_answer'] for item in dataset]
model_answers = [item['input']['model_answer'] for item in dataset]

# ✅ Map grades to numerical labels (101 classes)
y_labels = [label_mapping.get(item['output']['grade'], 0) for item in dataset]

print(f"✅ Loaded {len(dataset)} samples with 101 labels!")

# ✅ Step 2: Tokenize Data
print("\n🔥 Tokenizing the Dataset...")

MODEL_DIR = "edu_feedback_bert_model"   # ✅ Path to your fine-tuned model

# ✅ Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

# ✅ Tokenize student and model answers
encoded_inputs = tokenizer(
    student_answers,
    model_answers,
    max_length=512,
    padding="max_length",
    truncation=True,
    return_tensors="pt"
)

# ✅ Create Dataset class
class GradingDataset(torch.utils.data.Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return {
            "input_ids": self.texts["input_ids"][idx],
            "attention_mask": self.texts["attention_mask"][idx],
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# ✅ Step 3: Perform K-Fold Cross-Validation
print("\n🚀 Running 5-Fold Cross-Validation...")

# ✅ K-Fold setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# ✅ Initialize metrics
fold_results = []

# ✅ Loop through each fold
for fold, (train_idx, val_idx) in enumerate(kf.split(student_answers)):
    print(f"\n➡️ Fold {fold + 1}")

    # ✅ Prepare training and validation sets
    train_texts = {key: val[train_idx] for key, val in encoded_inputs.items()}
    val_texts = {key: val[val_idx] for key, val in encoded_inputs.items()}

    train_labels = [y_labels[i] for i in train_idx]
    val_labels = [y_labels[i] for i in val_idx]

    # ✅ Create Dataset objects
    train_dataset = GradingDataset(train_texts, train_labels)
    val_dataset = GradingDataset(val_texts, val_labels)

    # ✅ Load BERT model with 101 labels
    model = BertForSequenceClassification.from_pretrained(
        MODEL_DIR,
        num_labels=101,                  # ✅ Use 101 labels
        ignore_mismatched_sizes=False    # ✅ Ensure it matches the pre-trained checkpoint
    )
    model.train()

    # ✅ Define hyperparameters
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold + 1}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        logging_dir=f"./logs_fold_{fold + 1}",
        logging_steps=10,
        save_steps=100,
        save_total_limit=2,
        load_best_model_at_end=True
    )

    # ✅ Initialize Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset
    )

    # ✅ Train the model
    trainer.train()

    # ✅ Evaluate the model
    eval_results = trainer.evaluate(eval_dataset=val_dataset)
    print(f"✅ Fold {fold + 1} Evaluation: {eval_results}")

    # ✅ Store the evaluation results
    fold_results.append(eval_results)

# ✅ Step 4: Calculate Metrics
print("\n🔥 Cross-Validation Completed!")

# ✅ Aggregate metrics
avg_accuracy = np.mean([result['eval_accuracy'] for result in fold_results])
avg_precision = np.mean([result['eval_precision'] for result in fold_results])
avg_recall = np.mean([result['eval_recall'] for result in fold_results])
avg_f1 = np.mean([result['eval_f1'] for result in fold_results])

print("\n✅ Cross-Validation Results:")
print(f"🔥 Average Accuracy: {avg_accuracy:.4f}")
print(f"🔥 Average Precision: {avg_precision:.4f}")
print(f"🔥 Average Recall: {avg_recall:.4f}")
print(f"🔥 Average F1-Score: {avg_f1:.4f}")


✅ Loading Dataset...
✅ Loaded 100 samples with 101 labels!

🔥 Tokenizing the Dataset...


Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai


🚀 Running 5-Fold Cross-Validation...

➡️ Fold 1


Epoch,Training Loss,Validation Loss
1,No log,4.195947
2,No log,4.058129
3,No log,3.992083


✅ Fold 1 Evaluation: {'eval_loss': 3.9920833110809326, 'eval_runtime': 4.5466, 'eval_samples_per_second': 0.66, 'eval_steps_per_second': 0.22, 'epoch': 3.0}

➡️ Fold 2




Epoch,Training Loss,Validation Loss
1,No log,4.186403
2,No log,4.048894
3,No log,3.983248


✅ Fold 2 Evaluation: {'eval_loss': 3.983248472213745, 'eval_runtime': 4.4475, 'eval_samples_per_second': 0.675, 'eval_steps_per_second': 0.225, 'epoch': 3.0}

➡️ Fold 3




Epoch,Training Loss,Validation Loss
1,No log,4.193542
2,No log,4.060963
3,No log,3.999822


✅ Fold 3 Evaluation: {'eval_loss': 3.999821901321411, 'eval_runtime': 4.5101, 'eval_samples_per_second': 0.665, 'eval_steps_per_second': 0.222, 'epoch': 3.0}

➡️ Fold 4




Epoch,Training Loss,Validation Loss
1,No log,4.1929
2,No log,4.053015
3,No log,3.98715


✅ Fold 4 Evaluation: {'eval_loss': 3.987150192260742, 'eval_runtime': 4.7506, 'eval_samples_per_second': 0.631, 'eval_steps_per_second': 0.21, 'epoch': 3.0}

➡️ Fold 5




Epoch,Training Loss,Validation Loss
1,No log,4.177279
2,No log,4.039158
3,No log,3.971989


✅ Fold 5 Evaluation: {'eval_loss': 3.971989393234253, 'eval_runtime': 4.4784, 'eval_samples_per_second': 0.67, 'eval_steps_per_second': 0.223, 'epoch': 3.0}

🔥 Cross-Validation Completed!


KeyError: 'eval_accuracy'

In [13]:
# Print the fold results to inspect available keys
print(fold_results)


[{'eval_loss': 3.9920833110809326, 'eval_runtime': 4.5466, 'eval_samples_per_second': 0.66, 'eval_steps_per_second': 0.22, 'epoch': 3.0}, {'eval_loss': 3.983248472213745, 'eval_runtime': 4.4475, 'eval_samples_per_second': 0.675, 'eval_steps_per_second': 0.225, 'epoch': 3.0}, {'eval_loss': 3.999821901321411, 'eval_runtime': 4.5101, 'eval_samples_per_second': 0.665, 'eval_steps_per_second': 0.222, 'epoch': 3.0}, {'eval_loss': 3.987150192260742, 'eval_runtime': 4.7506, 'eval_samples_per_second': 0.631, 'eval_steps_per_second': 0.21, 'epoch': 3.0}, {'eval_loss': 3.971989393234253, 'eval_runtime': 4.4784, 'eval_samples_per_second': 0.67, 'eval_steps_per_second': 0.223, 'epoch': 3.0}]


In [15]:
# ✅ Aggregate metrics with .get() to avoid KeyError
avg_accuracy = np.mean([result.get('eval_accuracy', 0.0) for result in fold_results])
avg_precision = np.mean([result.get('eval_precision', 0.0) for result in fold_results])
avg_recall = np.mean([result.get('eval_recall', 0.0) for result in fold_results])
avg_f1 = np.mean([result.get('eval_f1', 0.0) for result in fold_results])


In [17]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    
    accuracy = accuracy_score(labels, predictions)
    precision = precision_score(labels, predictions, average='weighted')
    recall = recall_score(labels, predictions, average='weighted')
    f1 = f1_score(labels, predictions, average='weighted')
    
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }


In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics  # ✅ Add the metrics function
)


In [21]:
# ✅ Load the fine-tuned model
from transformers import BertTokenizer, BertForSequenceClassification

# Load model and tokenizer
model_path = "edu_feedback_bert_model_tuned"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()

# ✅ Load test dataset
import json
with open("test_dataset.json", "r", encoding="utf-8") as f:
    test_data = json.load(f)

# ✅ Make predictions
for sample in test_data:
    student_answer = sample["input"]["student_answer"]
    model_answer = sample["input"]["model_answer"]

    # Tokenize inputs
    inputs = tokenizer(
        student_answer + " [SEP] " + model_answer,
        return_tensors="pt",
        max_length=512,
        padding="max_length",
        truncation=True
    )

    # Get predictions
    with torch.no_grad():
        outputs = model(**inputs)
        score = torch.argmax(outputs.logits, dim=1).item()

    grade = (
        "A" if score >= 90 else
        "B" if score >= 75 else
        "C" if score >= 60 else
        "D" if score >= 45 else
        "F"
    )

    print(f"\n➡️ Student: {student_answer}")
    print(f"➡️ Model: {model_answer}")
    print(f"➡️ Predicted Grade: {grade}")
    print(f"➡️ Predicted Score: {score}")



➡️ Student: 1.  **The nervous system and endocrine system work together to maintain homeostasis. Compare and contrast the mechanisms by which each system achieves coordination, highlighting the advantages and disadvantages of each approach.** (This question assesses understanding of the roles of both systems and their relative speeds and specificities).
Sample Answer:
Here's a sample answer to the assignment question, suitable for a CBSE Class 11-12 student:

**Answer:**

Both the nervous system and the endocrine system are crucial for maintaining homeostasis, the stable internal environment essential for our survival. They act as the body's primary communication and coordination networks, but they differ significantly in their mechanisms, speed, and specificity.

**Nervous System: Rapid and Targeted Coordination**

*   **Mechanism:** The nervous system uses electrical and chemical signals to transmit information along specialized cells called neurons. These signals travel rapidly alo

In [1]:
from transformers import BertForSequenceClassification, BertTokenizer

MODEL_DIR = "edu_feedback_bert_model"

# ✅ Reload the model
model = BertForSequenceClassification.from_pretrained(MODEL_DIR)
tokenizer = BertTokenizer.from_pretrained(MODEL_DIR)

# ✅ Save the fine-tuned model
model.save_pretrained("edu_feedback_bert_model_tuned")
tokenizer.save_pretrained("edu_feedback_bert_model_tuned")

print("\n✅ Fine-tuned model saved successfully!")



✅ Fine-tuned model saved successfully!
