In [72]:
import pandas as pd
from datasets import load_dataset
import json
from sentence_transformers import SentenceTransformer, util
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import prepare_model_for_kbit_training, LoraConfig, TaskType, PeftModel
from trl import SFTTrainer
from transformers.utils import is_flash_attn_2_available
from transformers import BitsAndBytesConfig
import torch
from datasets import Dataset
import re
from rouge_score import rouge_scorer

In [2]:
#===========================
#1. Import Test Data
#===========================
with open("test_data.json") as json_data:
    df = json.load(json_data)

In [3]:
df[0]

{'input_text': 'In quantum mechanics, wave-particle duality describes how particles exhibit both wave-like and particle-like properties. This principle is fundamental to understanding phenomena like electron diffraction.',
 'question': 'Which experiment first demonstrated the wave nature of electrons?',
 'options': {'A': 'Michelson-Morley experiment',
  'B': 'Davisson-Germer experiment',
  'C': "Young's double-slit experiment with light",
  'D': "Rutherford's gold foil experiment"},
 'correct_option': 'B',
 'explanation': 'The Davisson-Germer experiment (1927) confirmed the wave nature of electrons by observing diffraction patterns when electrons were scattered by a nickel crystal.'}

In [4]:
#===============================
# 2. Import fine-tuned model
#===============================

## Merging the model 

In [5]:
#device_map = {"": 0}
model_id = "./nairs-2e"
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)
use_quantization_config = True 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")
print(f"[INFO] Using model_id: {model_id}")
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)
nairs = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                             torch_dtype = torch.float16,
                                              quantization_config=quantization_config if use_quantization_config else None,
                                               low_cpu_mem_usage=True,
                                                 device_map = "auto",
                                                attn_implementation=attn_implementation
                                           )
if not use_quantization_config:
    nairs.to("cuda")

[INFO] Using attention implementation: flash_attention_2
[INFO] Using model_id: ./nairs-2e


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Zero Shot

In [6]:
#================================
# 3. Zero Shot Prompt Test
#================================
prompt = "Input_text: I have a problem understanding Temperature in Physics"
input_ids = tokenizer(prompt, return_tensors = "pt").to("cuda")

In [8]:
outputs = nairs.generate(
    **input_ids, 
    max_length=300,
    temperature=0.7,
    do_sample = True
)
output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Results:", output)

Results: Input_text: I have a problem understanding Temperature in Physics.
Question: Which of these is NOT a measure of temperature?
Options:
A: Mass
B: Density
C: Volume
D: Energy

Question: Options:

Answer: D: Energy

Explanation:

Options A, B, and C are all related to physical properties that are not directly related to temperature. Energy, on the other hand, is not a measure of temperature.

Options:

A: Mass: The heavier the object, the higher its temperature.
B: Density: The density of a substance does not affect its temperature.
C: Volume: The volume of a substance does not affect its temperature.
D: Energy: Energy is not directly related to temperature.

Explanation:

The SI unit of temperature is Kelvin (K), which is based on the absolute scale. The temperature of absolute zero is 0 K.

Options:

A: Mass: The heavier the object, the higher its temperature.
B: Density: The density of a substance does not affect its temperature.
C: Volume: The volume of a substance does not a

## Few shot Prompt

In [None]:
#================================
# 4. Few Shot Prompt Test
#================================

In [9]:
def generate_physics_assessment(nairs, tokenizer, context, max_new_tokens=300, temperature=0.7):
    """
    Generates properly formatted physics assessments with guaranteed structure.
    Implements multiple fallback mechanisms for reliable output.
    """
    # 1. Create an explicit few-shot prompt with clear formatting examples
    prompt = f"""Generate an assessment question with options and provide a detailed explanation using EXACTLY this format:

Example 1:
Context: When soldiers march across a suspension bridge...
Question: Why are marching soldiers advised to break step on bridges?
Options:
A: To reduce air resistance
B: To prevent resonance
C: To minimize friction
D: To decrease bridge weight
Answer: B
Explanation: Marching soldiers are advised to break step on bridges to prevent resonance. When soldiers march in unison, their rhythmic footsteps can match the bridge's natural frequency. This matching of frequencies can cause the bridge to oscillate with increasing amplitude, potentially leading to structural damage. Breaking step ensures that the periodic force isn't applied at the bridge's natural frequency, preventing dangerous resonance effects.

Now generate for:
Context: {context}
Question:"""

    # 2. Generate the output with conservative parameters
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    outputs = nairs.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # 3. Extract and clean the generated text
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    generated_part = full_output.split("Question:")[-1].strip()
    return generated_part

In [10]:
context = "I have a problem understanding Temperature in Physics"
assessment = generate_physics_assessment(nairs, tokenizer, context)
print(assessment)

What is the SI unit of temperature?
Options:
A: Celsius
B: Kelvin
C: Fahrenheit
D: Joule
Answer: B
Explanation: The SI unit of temperature is the Kelvin, denoted by the symbol K. One degree Celsius is equal to one degree Kelvin, and 0°C = 273.15°K.

Please answer in the format provided.


## Evaluating fine-tuned models

In [None]:
#=========================
# 5. Zero shot
#=========================

In [50]:
def evaluate_accuracy(nairs, tokenizer, df):
    correct = 0
    total = 0

    for item in df:
        # Format the question and options
        input_text = (
            f"Context: {item['input_text']}\n"
            f"Question: {item['question']}\n"
            "Options:\n"
            f"A) {item['options']['A']}\n"
            f"B) {item['options']['B']}\n"
            f"C) {item['options']['C']}\n"
            f"D) {item['options']['D']}\n"
            "Answer:"
        )

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to("cuda")

        outputs = nairs.generate(
            **inputs,
            max_new_tokens=2,
            temperature=0.8, 
        )

        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()[-1]  # Takes last character

        if predicted_answer == item["correct_option"]:
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0.0
    return accuracy

In [51]:
"""def evaluate_accuracy(nairs, tokenizer, df):
    correct = 0
    total = 0

    for item in df:
        # Format the question and options
        input_text = (
            f"Context: {item['input_text']}\n"
            f"Question: {item['question']}\n"
            "Options:\n"
            f"A) {item['options']['A']}\n"
            f"B) {item['options']['B']}\n"
            f"C) {item['options']['C']}\n"
            f"D) {item['options']['D']}\n"
            "Answer:"
        )

        
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to("cuda")

        
        outputs = nairs.generate(
            **inputs,
            max_new_tokens=2,
            temperature=0.7,  
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

        
        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()[-1]  # Takes last character

        # Check if correct
        if predicted_answer == item["correct_option"]:
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0.0
    return accuracy"""

'def evaluate_accuracy(nairs, tokenizer, df):\n    correct = 0\n    total = 0\n\n    for item in df:\n        # Format the question and options\n        input_text = (\n            f"Context: {item[\'input_text\']}\n"\n            f"Question: {item[\'question\']}\n"\n            "Options:\n"\n            f"A) {item[\'options\'][\'A\']}\n"\n            f"B) {item[\'options\'][\'B\']}\n"\n            f"C) {item[\'options\'][\'C\']}\n"\n            f"D) {item[\'options\'][\'D\']}\n"\n            "Answer:"\n        )\n\n        \n        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to("cuda")\n\n        \n        outputs = nairs.generate(\n            **inputs,\n            max_new_tokens=2,\n            temperature=0.7,  \n            do_sample=False,\n            pad_token_id=tokenizer.eos_token_id,\n        )\n\n        \n        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()[-1]  # Takes last character\n\n        # Check if cor

In [57]:
# accuracy = evaluate_accuracy(nairs, tokenizer, df)
# print(f"Accuracy: {accuracy * 100:.2f}")

In [53]:
def evaluate_accuracy(model, tokenizer, test_data, verbose=True):
    correct = 0
    total = 0

    for idx, item in enumerate(test_data, 1):
        # Format the input
        input_text = (
            f"Context: {item['input_text']}\n"
            f"Question: {item['question']}\n"
            "Options:\n"
            f"A) {item['options']['A']}\n"
            f"B) {item['options']['B']}\n"
            f"C) {item['options']['C']}\n"
            f"D) {item['options']['D']}\n"
            "Answer:"
        )

        # Tokenize and generate
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=2,
            temperature=0.1,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Decode and extract the predicted answer
        full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
        predicted_answer = full_output.strip()[-1]  # Takes last character (A/B/C/D)

        # Check correctness
        is_correct = predicted_answer == item["correct_option"]
        if is_correct:
            correct += 1
        total += 1

        # Print detailed results if verbose=True
        if verbose:
            print(f"\n--- Question {idx} ---")
            print(f"Context: {item['input_text']}")
            print(f"Question: {item['question']}")
            print("Options:")
            for opt, desc in item['options'].items():
                print(f"{opt}) {desc}")
            print(f"Model's Answer: {predicted_answer}")
            print(f"Correct Answer: {item['correct_option']}")
            print(f"Result: {'✅ Correct' if is_correct else '❌ Incorrect'}")

    accuracy = correct / total if total > 0 else 0.0
    print(f"\nFinal Accuracy: {accuracy * 100:.2f}% ({correct}/{total})")
    return accuracy

# Example Usage:
with open("test_data.json") as json_data:
    test_data = json.load(json_data)

accuracy = evaluate_accuracy(nairs, tokenizer, test_data, verbose=True)


--- Question 1 ---
Context: In quantum mechanics, wave-particle duality describes how particles exhibit both wave-like and particle-like properties. This principle is fundamental to understanding phenomena like electron diffraction.
Question: Which experiment first demonstrated the wave nature of electrons?
Options:
A) Michelson-Morley experiment
B) Davisson-Germer experiment
C) Young's double-slit experiment with light
D) Rutherford's gold foil experiment
Model's Answer: :
Correct Answer: B
Result: ❌ Incorrect

--- Question 2 ---
Context: Special relativity introduces the concept of time dilation, where time intervals measured in a moving frame appear longer to a stationary observer.
Question: A clock traveling at 0.8c relative to Earth will appear to run slower by what factor compared to Earth's clock?
Options:
A) 0.6
B) 0.8
C) 1.25
D) 1.67
Model's Answer: :
Correct Answer: A
Result: ❌ Incorrect

--- Question 3 ---
Context: Thermodynamics states that entropy in an isolated system ne

In [None]:
#=========================
# 6. Few shot Evaluation
#=========================

In [54]:
few_shot_examples = [
    {
        "input_text": "When soldiers march across a suspension bridge...",
        "question": "Why are marching soldiers advised to break step on bridges?",
        "options": {
            "A": "To reduce air resistance",
            "B": "To prevent resonance",
            "C": "To minimize friction",
            "D": "To decrease bridge weight"
        },
        "correct_option": "B",
        "explanation": "Marching soldiers are advised to break step to prevent resonance, which could amplify vibrations and damage the bridge."
    },
    {
        "input_text": "In an electric circuit, resistance opposes current flow...",
        "question": "What happens to current if resistance increases while voltage stays constant?",
        "options": {
            "A": "Current increases",
            "B": "Current decreases",
            "C": "Current remains the same",
            "D": "Voltage must change"
        },
        "correct_option": "B",
        "explanation": "According to Ohm's Law (V=IR), if resistance increases and voltage is constant, current must decrease."
    }
]

In [65]:
def evaluate_accuracy_with_few_shot(nairs, tokenizer, df, few_shot_examples):
    correct = 0
    total = 0

    # Ensure test_data is a list
    if isinstance(df, dict):
        df = [df]

    for item in df:
        few_shot_prompt = ""
        for example in few_shot_examples:
            few_shot_prompt += (
                f"Context: {example['input_text']}\n"
                f"Question: {example['question']}\n"
                "Options:\n"
                f"A) {example['options']['A']}\n"
                f"B) {example['options']['B']}\n"
                f"C) {example['options']['C']}\n"
                f"D) {example['options']['D']}\n"
                f"Answer: {example['correct_option']}\n"
                f"Explanation: {example['explanation']}\n\n"
            )

        # Final prompt = Few-shot examples + Current question
        input_text = (
            few_shot_prompt +
            f"Context: {item['input_text']}\n"
            f"Question: {item['question']}\n"
            "Options:\n"
            f"A) {item['options']['A']}\n"
            f"B) {item['options']['B']}\n"
            f"C) {item['options']['C']}\n"
            f"D) {item['options']['D']}\n"
            "Answer:"
        )

        # Tokenize and generate
        inputs = tokenizer(input_text, return_tensors="pt", truncation=True).to("cuda")
        outputs = nairs.generate(
            **inputs,
            max_new_tokens=2,
            temperature=0.8,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
        )

        # Extract predicted answer (last character)
        predicted_answer = tokenizer.decode(outputs[0], skip_special_tokens=True).strip()[-1]

        # Check correctness
        if predicted_answer == item["correct_option"]:
            correct += 1
        total += 1

    accuracy = correct / total if total > 0 else 0.0
    return accuracy

In [66]:
accuracy = evaluate_accuracy_with_few_shot(nairs, tokenizer, df, few_shot_examples)
print(f"Few-shot Accuracy: {accuracy * 100:.2f}%")



Few-shot Accuracy: 80.00%


## ROUGE

In [None]:
!pip install rouge-score nltk

In [67]:
def build_prompt_with_few_shot(context, question, options, few_shot_examples=True):
    prompt = ""

    if few_shot_examples:
        for example in few_shot_examples:
            prompt += (
                f"Context: {example['input_text']}\n"
                f"Question: {example['question']}\n"
                "Options:\n"
                f"A) {example['options']['A']}\n"
                f"B) {example['options']['B']}\n"
                f"C) {example['options']['C']}\n"
                f"D) {example['options']['D']}\n"
                f"Answer: {example['correct_option']}\n"
                f"Explanation: {example['explanation']}\n\n"
            )
    
    # Add the current question
    prompt += (
        f"Context: {context}\n"
        f"Question: {question}\n"
        "Options:\n"
        f"A) {options['A']}\n"
        f"B) {options['B']}\n"
        f"C) {options['C']}\n"
        f"D) {options['D']}\n"
        "Answer:"
    )
    
    return prompt

In [68]:
def extract_answer(full_output):
    match = re.search(r"Answer:\s*([A-D])", full_output, re.IGNORECASE)
    return match.group(1).strip().upper() if match else None
def extract_explanation(full_output):
    match = re.search(r"Explanation:\s*(.+)", full_output, re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else "No explanation generated."

In [73]:
#================================
# 7. Rouge Evaluation -- Few-shot
#================================

In [74]:
def generate_answer_and_explanation(nairs, tokenizer, context, question, options, few_shot_examples=True):
    prompt = build_prompt_with_few_shot(context, question, options, few_shot_examples)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True).to("cuda")
    outputs = nairs.generate(
        **inputs,
        max_new_tokens=200,
        temperature=0.8,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id
    )
    
    # Extract full generated text
    full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    # Parse answer (e.g., "Answer: B") and explanation
    answer = extract_answer(full_output)  # e.g., "B"
    explanation = extract_explanation(full_output)  # e.g., "The Davisson-Germer experiment..."
    
    return answer, explanation

In [75]:
def compute_rouge_scores(generated_explanations, reference_explanations):
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = {
        'rouge1': {'precision': [], 'recall': [], 'f1': []},
        'rouge2': {'precision': [], 'recall': [], 'f1': []},
        'rougeL': {'precision': [], 'recall': [], 'f1': []}
    }
    
    for gen, ref in zip(generated_explanations, reference_explanations):
        scores = scorer.score(ref, gen)
        for key in rouge_scores:
            rouge_scores[key]['precision'].append(scores[key].precision)
            rouge_scores[key]['recall'].append(scores[key].recall)
            rouge_scores[key]['f1'].append(scores[key].fmeasure)
    
    # Compute averages
    avg_scores = {
        'rouge1': {k: sum(v) / len(v) for k, v in rouge_scores['rouge1'].items()},
        'rouge2': {k: sum(v) / len(v) for k, v in rouge_scores['rouge2'].items()},
        'rougeL': {k: sum(v) / len(v) for k, v in rouge_scores['rougeL'].items()}
    }
    return avg_scores

In [76]:
generated_explanations = []
reference_explanations = []

for item in df:
    # Generate model output
    _, explanation = generate_answer_and_explanation(
        nairs, tokenizer,
        context=item["input_text"],
        question=item["question"],
        options=item["options"],
        few_shot_examples=few_shot_examples
    )
    
    generated_explanations.append(explanation)
    reference_explanations.append(item["explanation"])

# Compute ROUGE scores
rouge_scores = compute_rouge_scores(generated_explanations, reference_explanations)
print("ROUGE Scores:", rouge_scores)

ROUGE Scores: {'rouge1': {'precision': 0.05752416448819112, 'recall': 0.699220823798627, 'f1': 0.10610336873518533}, 'rouge2': {'precision': 0.023715994130131472, 'recall': 0.3089960332065595, 'f1': 0.04395255851652705}, 'rougeL': {'precision': 0.041959639348942424, 'recall': 0.5208607388035307, 'f1': 0.07750564963905676}}
