In [1]:
import os
import json
import pandas as pd

# Folder path containing structured JSON files
TRAINING_DATA_FOLDER = "training_data"

# Load JSON samples into a DataFrame
data = []

# Iterate over JSON files and load them
for file_name in os.listdir(TRAINING_DATA_FOLDER):
    if file_name.endswith(".json"):
        with open(os.path.join(TRAINING_DATA_FOLDER, file_name), "r", encoding="utf-8") as f:
            sample = json.load(f)
            data.append({
                "student_answer": sample["input"]["student_answer"],
                "model_answer": sample["input"]["model_answer"],
                "score": sample["output"]["score"],
                "grade": sample["output"]["grade"],
                "feedback": sample["output"]["feedback"]
            })

# Convert to DataFrame
df = pd.DataFrame(data)

# Display dataset sample
print(df.head())


                                      student_answer  \
0  1.  Discuss how the concept of biodiversity ne...   
1  1.  A scientist discovers a new chemical that ...   
2  1.  **Comparative Analysis:** C3 and C4 plants...   
3  1.  A scientist is studying two plant cells: C...   
4  1.  A scientist is studying a plant species th...   

                                        model_answer  score  grade  \
0  ### Summary ###\nThe living world is rich in v...     92    A**   
1  ### Summary ###\nAccording to the cell theory,...     95    A**   
2  ### Summary ###\ncould be drawn?\n11.2 EARLY E...     88  ** B+   
3  ### Summary ###\nequation for this\n\n### Defi...     95   ** A   
4  ### Summary ###\nGrowth is one of the most con...     88    A**   

                                            feedback  
0  **\n\nThis is an exceptionally well-written an...  
1  **\n\n**Strengths:**\n\n*   Excellent understa...  
2  No feedback provided.\nOkay, let's evaluate th...  
3  **\n\n*   **Strengt

In [3]:
import google.generativeai as genai
import time

# Set up Gemini API key
GOOGLE_API_KEY = "AIzaSyDxtvmaGC9iB53VkvyYbtcBZKgOVg9Z2S8"  
genai.configure(api_key=GOOGLE_API_KEY)

# Function to generate feedback
def generate_feedback_gemini(student_answer, model_answer, retries=3, delay=5):
    """
    Uses Google Gemini to generate feedback by comparing student and model answers.
    """
    prompt = f"""
    You are an AI teacher grading a CBSE class 11-12 student's answer.
    Compare the student's answer with the model answer and provide:
    - A score between 0-100
    - A grade (A, B, C, D, F)
    - Detailed feedback on strengths and areas for improvement.

    **Model Answer:**
    {model_answer}

    **Student Answer:**
    {student_answer}
    """

    for attempt in range(retries):
        try:
            time.sleep(delay)
            model = genai.GenerativeModel(model_name="gemini-2.0-flash")
            response = model.generate_content(prompt)

            if response.candidates and response.candidates[0].content.parts:
                return response.candidates[0].content.parts[0].text.strip()

        except Exception as e:
            print(f"⚠️ Attempt {attempt + 1} failed: {e}")

    return "Feedback generation failed."


# Example Usage: Generate feedback for the first sample
feedback = generate_feedback_gemini(df['student_answer'][0], df['model_answer'][0])
print("Generated Feedback:\n", feedback)


Generated Feedback:
 ## Overall Assessment

Your answers demonstrate a good understanding of the concepts covered in the model answer. You've addressed the questions thoroughly and provided clear explanations. The use of examples and justifications strengthens your responses.

**Score:** 85/100
**Grade:** A

## Detailed Feedback

**Strengths:**

*   **Comprehensive Understanding:** You display a solid grasp of biodiversity, nomenclature, the biological species concept, and dichotomous keys.
*   **Clear Explanations:** Your explanations are articulated well and easy to follow. You break down complex concepts into simpler terms.
*   **Relevant Examples:** You effectively use the example of *Mangifera indica* and create your own hypothetical insect species to illustrate the concepts.
*   **Justification:** You provide strong justifications for your choices, such as the characteristics used in the dichotomous key.
*   **Addresses Limitations:** You acknowledge the limitations of Mayr's def

In [5]:
import os
import json
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification

# === Disable Hugging Face Symlink Warning ===
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# === Step 1: Load the Dataset ===
TRAINING_DATA_FOLDER = "training_data"

# Load JSON samples into a DataFrame
data = []

# Iterate over JSON files and load them
for file_name in os.listdir(TRAINING_DATA_FOLDER):
    if file_name.endswith(".json"):
        with open(os.path.join(TRAINING_DATA_FOLDER, file_name), "r", encoding="utf-8") as f:
            sample = json.load(f)
            data.append({
                "student_answer": sample["input"]["student_answer"],
                "model_answer": sample["input"]["model_answer"],
                "score": sample["output"]["score"],
                "grade": sample["output"]["grade"],
                "feedback": sample["output"]["feedback"]
            })

# Convert to DataFrame
df = pd.DataFrame(data)
print("✅ Dataset loaded successfully:")
print(df.head())

# === Step 2: Define the Dataset Class ===
class GradingDataset(Dataset):
    """
    Custom Dataset for BERT fine-tuning.
    """
    def __init__(self, df, tokenizer, max_length=512):
        self.df = df
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        student_answer = str(self.df.loc[index, "student_answer"])
        model_answer = str(self.df.loc[index, "model_answer"])
        label = self.df.loc[index, "score"]  # Score as label

        inputs = self.tokenizer(
            student_answer + " [SEP] " + model_answer,
            padding="max_length",
            max_length=self.max_length,
            truncation=True,
            return_tensors="pt"
        )

        return {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "labels": torch.tensor(label, dtype=torch.long)
        }

# === Step 3: Model Training ===
def train_model(df, batch_size=8, epochs=3, learning_rate=2e-5):
    """
    Fine-tunes BERT for grading.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Tokenizer and dataset
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    dataset = GradingDataset(df, tokenizer)
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Load pre-trained BERT
    model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=101)
    model.to(device)

    # Use PyTorch's AdamW to avoid deprecation warning
    from torch.optim import AdamW
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Training loop
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs}")
        total_loss = 0

        for batch in dataloader:
            optimizer.zero_grad()

            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            loss.backward()
            optimizer.step()

            total_loss += loss.item()

        avg_loss = total_loss / len(dataloader)
        print(f"Loss: {avg_loss:.4f}")

    # Save model and tokenizer
    model.save_pretrained("edu_feedback_bert_model")
    tokenizer.save_pretrained("edu_feedback_bert_model")

    print("✅ Model training complete!")

# === Train the model ===
train_model(df)


✅ Dataset loaded successfully:
                                      student_answer  \
0  1.  Discuss how the concept of biodiversity ne...   
1  1.  A scientist discovers a new chemical that ...   
2  1.  **Comparative Analysis:** C3 and C4 plants...   
3  1.  A scientist is studying two plant cells: C...   
4  1.  A scientist is studying a plant species th...   

                                        model_answer  score  grade  \
0  ### Summary ###\nThe living world is rich in v...     92    A**   
1  ### Summary ###\nAccording to the cell theory,...     95    A**   
2  ### Summary ###\ncould be drawn?\n11.2 EARLY E...     88  ** B+   
3  ### Summary ###\nequation for this\n\n### Defi...     95   ** A   
4  ### Summary ###\nGrowth is one of the most con...     88    A**   

                                            feedback  
0  **\n\nThis is an exceptionally well-written an...  
1  **\n\n**Strengths:**\n\n*   Excellent understa...  
2  No feedback provided.\nOkay, let's evaluate

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Loss: 4.4491
Epoch 2/3
Loss: 4.0156
Epoch 3/3
Loss: 3.6650
✅ Model training complete!
