**Environment Setup**


In [None]:
!pip install datasets transformers sentence-transformers faiss-cpu scikit-learn spacy openai
!python -m spacy download en_core_web_sm


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.7 kB)
Downloading faiss_cpu-1.13.0-cp39-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.6/23.6 MB[0m [31m108.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.0
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m145.6 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the packag

**API Configuration**

Sets the API key used by the model interface.

In [None]:
import os
from getpass import getpass

# Keep the key out of the notebook source: reuse a key that is already present
# in the environment, otherwise ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


**Knowledge Base Construction**

Defines concept lists, prompts, and generation routines used to create the synthetic science knowledge base. Topics align with common categories in the SciEntsBank dataset.

In [None]:
import random
import json
from openai import OpenAI
# Shared OpenAI client. It is built with no arguments, so it presumably picks up
# the key exported as OPENAI_API_KEY in the API Configuration cell.
client = OpenAI()

# Core physical-science concepts; sampled for factual and process passages.
CORE_CONCEPTS = [
    "particle motion", "heat transfer", "conduction", "convection", "radiation",
    "melting", "freezing", "evaporation", "condensation", "boiling",
    "expansion", "contraction", "density", "mass", "volume", "pressure",
    "force", "gravity", "friction", "push", "pull", "motion", "speed",
    "light reflection", "light absorption", "shadows", "refraction",
    "solubility", "dissolving", "solutions", "mixtures", "chemical change",
    "physical change"
]

# Everyday "why ..." phenomena; pooled with CORE_CONCEPTS when sampling.
PHENOMENA_CONCEPTS = [
    "why objects float or sink",
    "why shadows change size",
    "why steam forms",
    "why ice floats",
    "why metal feels colder",
    "why balloons expand or shrink",
    "why condensation appears",
    "why sugar dissolves faster in hot water",
    "why objects heat up",
    "why smells spread out",
    "why seasons change",
    "why the moon changes shape",
]

# Topics used only for the misconception-correcting passages.
MISCONCEPTION_CONCEPTS = [
    "heat vs temperature misconception",
    "evaporation misconceptions",
    "shadow misconceptions",
    "gravity misconceptions",
    "floating and sinking misconceptions",
    "dissolving misconceptions",
    "moon phase misconceptions",
    "season misconceptions",
    "particle motion misconceptions",
    "force misconceptions"
]

# Prompt templates. Each is filled with str.format(concept=..., n=...), where
# `n` is the number of sentences requested from the model.
FACT_PROMPT = """
Explain the concept "{concept}" in exactly {n} factual sentences.
Write for middle-school science. No lists or bullets.
"""

PROCESS_PROMPT = """
Explain the scientific process underlying "{concept}" in exactly {n} short sentences.
Include cause and effect. No lists.
"""

MISCONCEPTION_PROMPT = """
Write {n} sentences correcting misconceptions about "{concept}".
Each sentence must state the misconception and then correct it.
"""


In [None]:
# File the shuffled passages are written to (relative to the working directory).
OUTPUT = "science_kb_FINAL.json"

# Total number of passages to generate, and the number of sentences requested
# per passage for each passage type.
TOTAL = 10000
FACT_N = 2
PROC_N = 3
MIS_N = 2

# Split TOTAL roughly 34 / 33 / 33 across the three passage types. The last
# share is the remainder, so the three counts always add up to exactly TOTAL
# even when int() truncates the first two.
FACT_COUNT = int(TOTAL * 0.34)
PROC_COUNT = int(TOTAL * 0.33)
MIS_COUNT = TOTAL - FACT_COUNT - PROC_COUNT

def generate(prompt):
    """Send one user prompt to gpt-4o-mini and return the reply text.

    Args:
        prompt: Fully formatted prompt string.

    Returns:
        The reply with surrounding whitespace removed, or "" when the API
        returns a message that carries no text content.
    """
    out = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=200
    )
    # `content` may be None; calling .strip() on it would abort a long
    # generation run part-way through.
    content = out.choices[0].message.content
    return (content or "").strip()

# Seed the sampler so the sequence of sampled concepts and the final shuffle
# are repeatable; the model's completions themselves can still vary.
random.seed(42)

# Concept pool shared by factual and process passages, built once instead of
# being re-concatenated on every loop iteration.
GENERAL_CONCEPTS = CORE_CONCEPTS + PHENOMENA_CONCEPTS

# One row per passage type:
# (prompt template, concept pool, sentences per passage, number of passages).
GENERATION_PLAN = [
    (FACT_PROMPT, GENERAL_CONCEPTS, FACT_N, FACT_COUNT),               # factual
    (PROCESS_PROMPT, GENERAL_CONCEPTS, PROC_N, PROC_COUNT),            # process
    (MISCONCEPTION_PROMPT, MISCONCEPTION_CONCEPTS, MIS_N, MIS_COUNT),  # misconceptions
]

passages = []

for template, pool, n_sentences, count in GENERATION_PLAN:
    for _ in range(count):
        concept = random.choice(pool)
        passages.append(generate(template.format(concept=concept, n=n_sentences)))

# Mix the three passage types so they are not stored in contiguous runs.
random.shuffle(passages)

with open(OUTPUT, "w", encoding="utf-8") as f:
    json.dump(passages, f, indent=2)

len(passages)


10000

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write the
# project's artifacts under this path.
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create the project folder on the mounted Drive (-p: no error if it already exists).
!mkdir -p /content/drive/MyDrive/final_project


In [None]:
# Root folder on Drive for the project's artifacts (note the trailing slash:
# paths are built from it by plain string concatenation).
BASE = "/content/drive/MyDrive/final_project/"


In [None]:
!pip install datasets transformers scikit-learn




**Dataset Preparation**

Loads the SciEntsBank dataset, applies label mapping, and formats examples for grading.

In [None]:
from datasets import load_dataset, ClassLabel

# Load SciEntsBank from the Hugging Face Hub; later cells index the result by
# split name ("train", "test_uq").
ds = load_dataset("nkazi/SciEntsBank")

# Original label name -> class id. The last three original labels all collapse
# into class 2.
# NOTE: the RAG grading cell later rebinds `label_map` to the opposite
# direction (class id -> label name).
label_map = {
    "correct": 0,
    "contradictory": 1,
    "partially_correct_incomplete": 2,
    "irrelevant": 2,
    "non_domain": 2,
}

# Re-encode the "label" column with the mapping above, then give the three
# resulting ids their final names (id 2 is exposed as "incorrect").
ds = ds.align_labels_with_mapping(label_map, "label")
ds = ds.cast_column("label", ClassLabel(names=["correct","contradictory","incorrect"]))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
def format_example(ex):
    """Flatten one dataset row into the single string the grader reads.

    Args:
        ex: Mapping with "question", "reference_answer" and "student_answer".

    Returns:
        A dict with one key, "text", holding the three fields in that order,
        each introduced by its bracketed tag and separated by single spaces.
    """
    question = ex['question']
    reference = ex['reference_answer']
    answer = ex['student_answer']
    tagged_parts = [
        f"[QUESTION] {question}",
        f"[REFERENCE] {reference}",
        f"[ANSWER] {answer}",
    ]
    return {"text": " ".join(tagged_parts)}

# Add the "text" field built by format_example to the dataset.
ds2 = ds.map(format_example)


**Baseline Grader Setup**

Tokenizes the formatted dataset and initializes the classification model used as the baseline grader. Includes training configuration and metric computation.

In [None]:
from transformers import AutoTokenizer

# Tokenizer of the roberta-large checkpoint, the same checkpoint the baseline
# grader is fine-tuned from.
tok = AutoTokenizer.from_pretrained("roberta-large")

def tokenize(batch):
    """Tokenize the "text" field of a batch with the global tokenizer `tok`.

    Every sequence is truncated and padded to a fixed length of 512 tokens.

    Args:
        batch: Mapping whose "text" entry holds the strings to encode.

    Returns:
        Whatever `tok` returns for those strings.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 512,
    }
    return tok(batch["text"], **encode_options)

# Tokenize the formatted dataset; batched=True hands `tokenize` several rows at a time.
toks = ds2.map(tokenize, batched=True)
# Return torch tensors and expose only the columns the grader consumes.
toks.set_format(type="torch", columns=["input_ids","attention_mask","label"])


In [None]:
# Output directory on Drive for the fine-tuned grader and its tokenizer.
BASE = "/content/drive/MyDrive/final_project/"
OUT = BASE + "baseline_grader"

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# roberta-large with a 3-way classification head, one output per grading label.
model = AutoModelForSequenceClassification.from_pretrained("roberta-large", num_labels=3)

def metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 for the Trainer.

    Args:
        eval_pred: A (logits, labels) pair; the predicted class of each row is
            the argmax over the last axis of `logits`.

    Returns:
        Dict with the keys "accuracy" and "macro_f1".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    # Only the F1 component of the macro precision/recall/F1 tuple is reported.
    _, _, macro_f1, _ = precision_recall_fscore_support(
        labels, predicted, average="macro", zero_division=0
    )
    return {
        "accuracy": accuracy_score(labels, predicted),
        "macro_f1": macro_f1,
    }

# Fine-tuning configuration for the baseline grader.
# NOTE(review): the saved output shows an interactive wandb login prompt during
# training; consider setting `report_to` explicitly if that logging is not wanted.
args = TrainingArguments(
    output_dir=OUT,
    learning_rate=1e-5,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    logging_steps=50,
    do_train=True,
    do_eval=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=toks["train"],
    eval_dataset=toks["test_uq"],
    compute_metrics=metrics,
)

trainer.train()

# A notebook displays only the last expression of a cell, so a bare
# `trainer.evaluate()` in the middle of the cell loses its result. Keep the
# metrics in a variable and print them.
eval_metrics = trainer.evaluate()
print(eval_metrics)

# Persist the fine-tuned weights and the tokenizer side by side so the grader
# can be reloaded from OUT later.
model.save_pretrained(OUT)
tok.save_pretrained(OUT)


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33majgaonkarshreyas01[0m ([33majgaonkarshreyas01-george-mason-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
50,0.944
100,1.1082
150,1.0117
200,0.9736
250,0.9691
300,0.9453
350,0.9018
400,0.9638
450,0.9337
500,1.0349


('/content/drive/MyDrive/final_project/baseline_grader/tokenizer_config.json',
 '/content/drive/MyDrive/final_project/baseline_grader/special_tokens_map.json',
 '/content/drive/MyDrive/final_project/baseline_grader/vocab.json',
 '/content/drive/MyDrive/final_project/baseline_grader/merges.txt',
 '/content/drive/MyDrive/final_project/baseline_grader/added_tokens.json',
 '/content/drive/MyDrive/final_project/baseline_grader/tokenizer.json')

In [None]:
!pip install faiss-cpu sentence-transformers openai


Collecting faiss-cpu
  Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (7.6 kB)
Downloading faiss_cpu-1.13.1-cp310-abi3-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (23.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m109.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.13.1


**Retriever Index Construction**

Loads the generated knowledge base and builds the vector index used for retrieval.

In [None]:
import json

# Knowledge base file, read from the project folder on Drive.
# NOTE(review): the generation cell writes "science_kb_FINAL.json" relative to
# the working directory, not to this Drive path; confirm the file is copied
# here before this cell runs.
KB_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.json"

with open(KB_PATH, "r") as f:
    kb_texts = json.load(f)

# Number of passages loaded.
len(kb_texts)


10000

In [None]:
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

# Locations on Drive for the embedding matrix and the FAISS index.
EMB_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.npy"
INDEX_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.faiss"

# Sentence-embedding model used to encode every knowledge-base passage.
encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

print("Encoding KB…")
emb = encoder.encode(kb_texts, convert_to_numpy=True, show_progress_bar=True)
# Scale every row to unit length; with unit vectors, ranking by L2 distance
# gives the same order as ranking by cosine similarity.
emb = emb / norm(emb, axis=1, keepdims=True)

np.save(EMB_PATH, emb)

# L2 index sized to the embedding width; row i of the index corresponds to kb_texts[i].
index = faiss.IndexFlatL2(emb.shape[1])
index.add(emb)

faiss.write_index(index, INDEX_PATH)

print("FAISS index built.")


**Retriever Loading and Query Function**

Loads the saved FAISS index and embeddings, restores the knowledge base, and defines the function that retrieves the top-k passages for any question.

In [None]:
import numpy as np
import faiss
from numpy.linalg import norm
from sentence_transformers import SentenceTransformer
import json

# Artifacts produced by the knowledge-base and index-construction cells.
KB_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.json"
EMB_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.npy"
INDEX_PATH = "/content/drive/MyDrive/final_project/science_kb_FINAL.faiss"

# Queries are encoded with the same checkpoint that encoded the knowledge base.
encoder = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# `emb` is reloaded here but is not read by retrieve_passages below.
emb = np.load(EMB_PATH)
index = faiss.read_index(INDEX_PATH)

# Passage texts; index results are positions into this list.
with open(KB_PATH, "r") as f:
    kb_texts = json.load(f)

def retrieve_passages(question, k=5):
    """Return the knowledge-base passages closest to `question`.

    The question is embedded with the global `encoder`, scaled to unit length
    (matching how the indexed embeddings were normalised), and searched
    against the global FAISS `index`.

    Args:
        question: Query text.
        k: Number of neighbours requested from the index.

    Returns:
        A list of passage strings from `kb_texts`, nearest first. It holds
        fewer than `k` items when the index cannot supply `k` neighbours.
    """
    q = encoder.encode([question], convert_to_numpy=True)
    q = q / norm(q, axis=1, keepdims=True)
    _, idx = index.search(q, k)
    # FAISS fills missing neighbours with -1. Used as a Python list index that
    # would silently return the *last* passage, so those slots are dropped.
    return [kb_texts[i] for i in idx[0] if i >= 0]


In [None]:
import os
from getpass import getpass

# Keep the key out of the notebook source: reuse a key that is already present
# in the environment, otherwise ask for it interactively without echoing it.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


**RAG Answer Generator**

Implements the language-model interface and the controlled prompting template used to generate grounded answers from retrieved passages.

In [None]:
from openai import OpenAI
# OpenAI client used by generate_rag_answer; built with no arguments, so it
# presumably reads the key from the OPENAI_API_KEY environment variable.
client = OpenAI()

# Prompt template for grounded answers. Filled with str.format(q=..., p=...):
# {q} is the question and {p} is the retrieved passages rendered as text.
RAG_PROMPT = """
Use these passages to answer the science question in 2–4 clear sentences.
Do not mention retrieval.

QUESTION:
{q}

PASSAGES:
{p}
"""

def generate_rag_answer(question):
    """Answer `question` with gpt-4o-mini, grounded in retrieved passages.

    The five nearest knowledge-base passages are fetched with
    `retrieve_passages`, rendered as a "- " bullet list, and substituted into
    `RAG_PROMPT` together with the question.

    Args:
        question: The science question as plain text.

    Returns:
        The model's reply with surrounding whitespace removed, or "" when the
        API returns a message that carries no text content.
    """
    ctx = retrieve_passages(question, k=5)
    ctx_text = "\n".join(f"- {c}" for c in ctx)

    out = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role":"user","content":RAG_PROMPT.format(q=question, p=ctx_text)}],
        temperature=0.2,
        max_tokens=200,
    )

    # `content` may be None; calling .strip() on it would abort a whole
    # evaluation loop on a single empty reply.
    content = out.choices[0].message.content
    return (content or "").strip()


**Load Trained Grading Model**

Loads the fine-tuned RoBERTa classifier from disk for later use in RAG-augmented grading.

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Directory the baseline training cell saved the fine-tuned model and tokenizer to.
GRADER_PATH = "/content/drive/MyDrive/final_project/baseline_grader"

# Rebind the global `tok` and `model` to the versions loaded from disk.
tok = AutoTokenizer.from_pretrained(GRADER_PATH)
model = AutoModelForSequenceClassification.from_pretrained(GRADER_PATH)


**RAG-Based Grading Pipeline**

Evaluates a student answer by first generating a retrieved-context reference answer and then classifying correctness using the trained baseline model.

In [None]:
import torch

# Class id -> label name, in the same order as the ClassLabel names used when
# the dataset labels were re-encoded.
# NOTE: this rebinds `label_map`, which the dataset-preparation cell used for
# the opposite direction (original label name -> class id).
label_map = {
    0: "correct",
    1: "contradictory",
    2: "incorrect"
}

def grade_with_rag(question, student_answer):
    """Grade a student answer against a RAG-generated reference answer.

    A reference answer is generated for `question` with `generate_rag_answer`,
    combined with the question and the student answer in the same tagged
    format the grader was trained on, and classified by the global `model`.

    Args:
        question: The science question.
        student_answer: The answer to grade.

    Returns:
        Dict with "question", "reference_answer" (the generated reference),
        "student_answer" and "label" (the predicted label name from
        `label_map`).
    """
    rag_ref = generate_rag_answer(question)

    text = f"[QUESTION] {question} [REFERENCE] {rag_ref} [ANSWER] {student_answer}"

    inputs = tok(
        text,
        return_tensors="pt",
        truncation=True,
        padding="max_length",
        max_length=512
    )
    # The tokenizer produces CPU tensors. Move them to wherever the model
    # lives, so the function also works when `model` is on a GPU (for example
    # the in-memory model left behind by the Trainer).
    inputs = {name: tensor.to(model.device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits

    pred = torch.argmax(logits, dim=-1).item()

    return {
        "question": question,
        "reference_answer": rag_ref,
        "student_answer": student_answer,
        "label": label_map[pred]
    }


**RAG Grading Example**

One demo RAG-assisted grading run that outputs the predicted label.

In [None]:
# Smoke test: grade one hand-written answer end to end (reference generation
# followed by classification) and display the returned dict.
grade_with_rag(
    "why does condensation appear on cold surfaces",
    "because warm moist air cools down and forms liquid droplets"
)


{'question': 'why does condensation appear on cold surfaces',
 'reference_answer': 'Condensation appears on cold surfaces because water vapor in the air cools down and loses energy, causing the molecules to slow down and cluster together. When the temperature of the air drops, its ability to hold moisture decreases, leading to the formation of tiny water droplets on surfaces like cold glass or metal. These surfaces provide a substrate for the droplets to accumulate, making the condensation visible.',
 'student_answer': 'because warm moist air cools down and forms liquid droplets',
 'label': 'incorrect'}

**Baseline Evaluation**

Computes accuracy and macro-metrics for the baseline grader on the SciEntsBank test split.

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch

def evaluate_baseline(split_name="test_uq"):
    """Score the grader on one tokenized split, one example at a time.

    Args:
        split_name: Key of the split inside the global `toks` dataset.

    Returns:
        Dict with "accuracy", "macro_precision", "macro_recall" and
        "macro_f1" computed over every example of the split.
    """
    ds_eval = toks[split_name]  # toks = tokenized SciEntsBank dataset
    # Run on whichever device the model currently lives on; feeding CPU
    # tensors to a GPU-resident model would raise a device-mismatch error.
    device = model.device
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in ds_eval:
            # Each row is a single example; add the batch dimension the model expects.
            input_ids = batch["input_ids"].unsqueeze(0).to(device)
            att = batch["attention_mask"].unsqueeze(0).to(device)

            logits = model(input_ids=input_ids, attention_mask=att).logits

            all_preds.append(torch.argmax(logits, dim=-1).item())
            all_labels.append(batch["label"].item())

    acc = accuracy_score(all_labels, all_preds)
    p, r, f, _ = precision_recall_fscore_support(all_labels, all_preds, average="macro", zero_division=0)

    return {
        "accuracy": acc,
        "macro_precision": p,
        "macro_recall": r,
        "macro_f1": f
    }

# Score the grader on the "test_uq" split and display the metrics.
baseline_metrics = evaluate_baseline("test_uq")
baseline_metrics


{'accuracy': 0.6016371077762619,
 'macro_precision': 0.5423583150855878,
 'macro_recall': 0.47298783643892334,
 'macro_f1': 0.458423081830397}

**RAG Pipeline Evaluation**

Evaluates the full RAG-based grading system on the SciEntsBank test split and reports accuracy and macro-metrics.

In [None]:
def evaluate_rag_pipeline(split_name="test_uq"):
    """Score the RAG grading pipeline on one split of the raw dataset.

    Every example's question and student answer are graded with
    `grade_with_rag`; the predicted label name is mapped back to its class id
    and compared with the example's integer label.

    Args:
        split_name: Key of the split inside the global `ds` dataset.

    Returns:
        Dict with "accuracy", "macro_precision", "macro_recall" and
        "macro_f1" computed over every example of the split.
    """
    ds_eval = ds[split_name]   # original SciEntsBank samples

    # Label name -> class id, built once instead of materialising two lists
    # per example. setdefault keeps the first id for a repeated name, matching
    # a left-to-right search through the values.
    name_to_id = {}
    for label_id, label_name in label_map.items():
        name_to_id.setdefault(label_name, label_id)

    true_labels = []
    pred_labels = []

    for ex in ds_eval:
        # grade_with_rag generates its own reference answer for the question.
        out = grade_with_rag(ex["question"], ex["student_answer"])

        true_labels.append(ex["label"])  # integer label 0/1/2
        pred_labels.append(name_to_id[out["label"]])

    acc = accuracy_score(true_labels, pred_labels)
    p, r, f, _ = precision_recall_fscore_support(true_labels, pred_labels, average="macro", zero_division=0)

    return {
        "accuracy": acc,
        "macro_precision": p,
        "macro_recall": r,
        "macro_f1": f
    }

# Score the RAG grading pipeline on the "test_uq" split and display the metrics.
rag_metrics = evaluate_rag_pipeline("test_uq")
rag_metrics


{'accuracy': 0.6125511596180082,
 'macro_precision': 0.5198522622345337,
 'macro_recall': 0.45173320212817175,
 'macro_f1': 0.43589796144244447}

In [None]:
!pip install nltk rouge-score




In [None]:
!pip install bert-score


Collecting bert-score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.1/61.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bert-score
Successfully installed bert-score-0.3.13


**RAG Answer Quality Evaluation**

Computes cosine, BLEU, ROUGE-L, retrieval relevance, and BERTScore averages between RAG-generated answers and gold references.

In [None]:
from sentence_transformers import SentenceTransformer
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
import numpy as np
from numpy.linalg import norm
from bert_score import score as bert_score

# Sentence encoder for the similarity metrics (same checkpoint as the retriever's encoder).
encoder_eval = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
# ROUGE-L scorer with stemming enabled.
rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)

def cos(a, b):
    """Cosine similarity of two vectors.

    Both vectors are scaled to unit length first; the result is the dot
    product of the two unit vectors.
    """
    unit_a = a / norm(a)
    unit_b = b / norm(b)
    return np.dot(unit_a, unit_b)

def evaluate_rag_answer_quality(split_name="test_uq"):
    """Compare RAG-generated answers with the gold reference answers.

    For every (question, gold reference) row of the split a fresh RAG answer
    is generated and scored against the gold reference with embedding cosine
    similarity, BLEU and ROUGE-L. Retrieval relevance is the mean cosine
    similarity between the question and its top-3 retrieved passages.
    BERTScore is computed once over all answers at the end.

    NOTE(review): retrieval relevance uses 3 passages, while
    generate_rag_answer retrieves 5 for the answer itself.
    NOTE(review): BLEU is computed on whitespace tokens with no smoothing
    function passed; confirm this is intended for short answers.

    Args:
        split_name: Key of the split inside the global `ds` dataset.

    Returns:
        Dict of float averages: "cosine_similarity_mean", "bleu_mean",
        "rougeL_mean", "retrieval_relevance_mean", "bertscore_precision",
        "bertscore_recall" and "bertscore_f1".
    """
    split = ds[split_name]
    questions = split["question"]
    gold_refs = split["reference_answer"]

    # Per-example scores, keyed by metric.
    per_example = {"cosine": [], "bleu": [], "rougeL": [], "retrieval": []}

    # Parallel lists consumed by the batched BERTScore call.
    generated = []
    references = []

    for question, reference in zip(questions, gold_refs):
        answer = generate_rag_answer(question)
        generated.append(answer)
        references.append(reference)

        # Embedding cosine similarity between gold reference and RAG answer.
        reference_vec = encoder_eval.encode(reference, convert_to_numpy=True)
        answer_vec = encoder_eval.encode(answer, convert_to_numpy=True)
        per_example["cosine"].append(cos(reference_vec, answer_vec))

        # BLEU on whitespace tokens, gold reference as the single reference.
        per_example["bleu"].append(sentence_bleu([reference.split()], answer.split()))

        # ROUGE-L F-measure.
        per_example["rougeL"].append(rouge.score(reference, answer)['rougeL'].fmeasure)

        # Retrieval relevance: question vs. each of its top-3 passages.
        passages = retrieve_passages(question, 3)
        passage_vecs = encoder_eval.encode(passages, convert_to_numpy=True)
        question_vec = encoder_eval.encode(question, convert_to_numpy=True)
        per_example["retrieval"].append(
            np.mean([cos(question_vec, vec) for vec in passage_vecs])
        )

    # BERTScore over the whole split in one call.
    precision, recall, f1 = bert_score(generated, references, lang="en", verbose=False)

    summary = {
        "cosine_similarity_mean": float(np.mean(per_example["cosine"])),
        "bleu_mean": float(np.mean(per_example["bleu"])),
        "rougeL_mean": float(np.mean(per_example["rougeL"])),
        "retrieval_relevance_mean": float(np.mean(per_example["retrieval"])),
        "bertscore_precision": float(precision.mean()),
        "bertscore_recall": float(recall.mean()),
        "bertscore_f1": float(f1.mean())
    }
    return summary

# Compute the answer-quality metrics on the "test_uq" split and display them.
rag_quality = evaluate_rag_answer_quality("test_uq")
rag_quality


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'cosine_similarity_mean': 0.6704363226890564,
 'bleu_mean': 0.012226617988213439,
 'rougeL_mean': 0.20907519038480699,
 'retrieval_relevance_mean': 0.36437445878982544,
 'bertscore_precision': 0.8571872115135193,
 'bertscore_recall': 0.9010624885559082,
 'bertscore_f1': 0.8784171342849731}

**Full RAG Grading Demo**

Three end-to-end examples printing the retrieved passages, the generated RAG reference, and the model’s final predicted label.

In [None]:
# (question, student answer) pairs used for the walkthrough.
examples = [
    ("why do metal objects feel colder than wood",
     "because metal takes away the heat faster"),
    ("why does ice float on water",
     "because ice weighs less"),
    ("why do shadows change size",
     "because the light moves")
]

for q, s in examples:
    print("="*80)
    print("QUESTION:", q)
    print("STUDENT:", s)
    print()

    # retrieved passages (the three closest; the reference answer itself is
    # generated from the top five inside generate_rag_answer)
    ctx = retrieve_passages(q, k=3)
    print("RETRIEVED PASSAGES:")
    for c in ctx:
        print("-", c)
    print()

    # Grade once and print the reference that was actually given to the
    # classifier. Calling generate_rag_answer separately would produce a
    # second, possibly different, reference and double the API calls.
    out = grade_with_rag(q, s)

    # rag reference
    print("RAG REFERENCE:", out["reference_answer"])
    print()

    # grading
    print("MODEL OUTPUT:", out["label"])
    print()


QUESTION: why do metal objects feel colder than wood
STUDENT: because metal takes away the heat faster

RETRIEVED PASSAGES:
- Metal feels colder than other materials, like wood or plastic, because it is a better conductor of heat. When you touch metal, it quickly pulls heat away from your skin, making it feel colder than materials that do not conduct heat as well.
- Metal feels colder than other materials because it conducts heat away from your skin more efficiently. When you touch metal, it quickly absorbs heat from your body, making it feel colder than materials like wood or fabric, which do not conduct heat as well.
- Metal feels colder than other materials because it is a good conductor of heat, meaning it can quickly absorb heat from your skin. When you touch metal, it draws heat away from your body faster than materials like wood or fabric, making it feel colder even if they are at the same temperature.

RAG REFERENCE: Metal objects feel colder than wood because metal is a better