In [1]:
# Step 2: Data Processing – Clean and Prepare for LLaMA-2 Fine-Tuning

import pandas as pd
import json
import os

In [2]:
# Load the CBC dataset created in Step 1
input_path = "../backend/data/cbc_questions.csv"
df = pd.read_csv(input_path)

print("✅ Loaded dataset with", len(df), "records")
df.head()


✅ Loaded dataset with 13 records


Unnamed: 0,subject,grade,cbc_code,bloom_level,question,options,answer
0,Mathematics,4,M4.1.1,1,What is 5 + 7?,"['10', '11', '12', '13']",12
1,Mathematics,4,M4.2.3,2,Which shape has 4 equal sides?,"['Rectangle', 'Square', 'Triangle', 'Circle']",Square
2,Mathematics,5,M5.3.2,3,"A triangle has sides of 3 cm, 4 cm, and 5 cm. ...","['Equilateral', 'Isosceles', 'Right-angled', '...",Right-angled
3,Mathematics,6,M6.4.1,4,"If the radius of a circle is 7 cm, find its ar...","['144 cm²', '154 cm²', '132 cm²', '160 cm²']",154 cm²
4,Science,4,S4.1.1,1,Which organ helps humans to breathe?,"['Lungs', 'Heart', 'Brain', 'Kidney']",Lungs


In [3]:
# Clean nulls or empty fields
df = df.dropna(subset=["subject", "grade", "question", "answer"])

# Standardize text casing (for consistency)
df["subject"] = df["subject"].str.strip().str.title()
df["cbc_code"] = df["cbc_code"].str.upper()

# Ensure Bloom’s levels are integers (1–6)
df["bloom_level"] = df["bloom_level"].astype(int)
df = df[df["bloom_level"].between(1, 6)]

print("✅ Cleaned and normalized data")
df.sample(5)


✅ Cleaned and normalized data


Unnamed: 0,subject,grade,cbc_code,bloom_level,question,options,answer
4,Science,4,S4.1.1,1,Which organ helps humans to breathe?,"['Lungs', 'Heart', 'Brain', 'Kidney']",Lungs
7,English,4,E4.1.1,1,Choose the correct plural form of 'child'.,"['childs', 'childes', 'children', 'child']",children
0,Mathematics,4,M4.1.1,1,What is 5 + 7?,"['10', '11', '12', '13']",12
6,Science,6,S6.3.2,3,Which process describes how plants make their ...,"['Transpiration', 'Photosynthesis', 'Respirati...",Photosynthesis
12,English,6,E6.3.4,5,Choose the sentence that uses the correct form...,"['They was going to school.', 'They were going...",They were going to school.


In [4]:
# Map Bloom’s levels to human-readable labels
bloom_map = {
    1: "Remember",
    2: "Understand",
    3: "Apply",
    4: "Analyze",
    5: "Evaluate",
    6: "Create"
}

df["bloom_category"] = df["bloom_level"].map(bloom_map)
print("✅ Added Bloom’s taxonomy categories")
df[["subject", "grade", "cbc_code", "bloom_category"]].head()


✅ Added Bloom’s taxonomy categories


Unnamed: 0,subject,grade,cbc_code,bloom_category
0,Mathematics,4,M4.1.1,Remember
1,Mathematics,4,M4.2.3,Understand
2,Mathematics,5,M5.3.2,Apply
3,Mathematics,6,M6.4.1,Analyze
4,Science,4,S4.1.1,Remember


In [5]:
# Function to build prompt-completion pairs
def make_prompt(row):
    return (
        f"Generate a CBC-aligned question for Grade {row['grade']} {row['subject']} "
        f"based on code {row['cbc_code']} at the '{row['bloom_category']}' Bloom level."
    )

def make_completion(row):
    opts = ", ".join(row['options'].split(',')) if isinstance(row['options'], str) else ", ".join(row['options'])
    return (
        f"Question: {row['question']} Options: {opts}. Answer: {row['answer']}"
    )

# Apply functions
df["prompt"] = df.apply(make_prompt, axis=1)
df["completion"] = df.apply(make_completion, axis=1)

print("✅ Created prompt–completion pairs")
df[["prompt", "completion"]].head(3)


✅ Created prompt–completion pairs


Unnamed: 0,prompt,completion
0,Generate a CBC-aligned question for Grade 4 Ma...,"Question: What is 5 + 7? Options: ['10', '11'..."
1,Generate a CBC-aligned question for Grade 4 Ma...,Question: Which shape has 4 equal sides? Optio...
2,Generate a CBC-aligned question for Grade 5 Ma...,"Question: A triangle has sides of 3 cm, 4 cm, ..."


In [6]:
output_path = "../backend/data/cbc_finetune_dataset.jsonl"

# Convert to JSON Lines format
with open(output_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        record = {"prompt": row["prompt"], "completion": row["completion"]}
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f"✅ Fine-tuning dataset saved to {output_path}")


✅ Fine-tuning dataset saved to ../backend/data/cbc_finetune_dataset.jsonl


In [7]:
print("\n--- Summary ---")
print(df.groupby("subject")["grade"].count())
print("\nBloom’s Level Distribution:")
print(df["bloom_category"].value_counts())

df.head()



--- Summary ---
subject
English        6
Mathematics    4
Science        3
Name: grade, dtype: int64

Bloom’s Level Distribution:
bloom_category
Understand    4
Remember      3
Apply         3
Analyze       2
Evaluate      1
Name: count, dtype: int64


Unnamed: 0,subject,grade,cbc_code,bloom_level,question,options,answer,bloom_category,prompt,completion
0,Mathematics,4,M4.1.1,1,What is 5 + 7?,"['10', '11', '12', '13']",12,Remember,Generate a CBC-aligned question for Grade 4 Ma...,"Question: What is 5 + 7? Options: ['10', '11'..."
1,Mathematics,4,M4.2.3,2,Which shape has 4 equal sides?,"['Rectangle', 'Square', 'Triangle', 'Circle']",Square,Understand,Generate a CBC-aligned question for Grade 4 Ma...,Question: Which shape has 4 equal sides? Optio...
2,Mathematics,5,M5.3.2,3,"A triangle has sides of 3 cm, 4 cm, and 5 cm. ...","['Equilateral', 'Isosceles', 'Right-angled', '...",Right-angled,Apply,Generate a CBC-aligned question for Grade 5 Ma...,"Question: A triangle has sides of 3 cm, 4 cm, ..."
3,Mathematics,6,M6.4.1,4,"If the radius of a circle is 7 cm, find its ar...","['144 cm²', '154 cm²', '132 cm²', '160 cm²']",154 cm²,Analyze,Generate a CBC-aligned question for Grade 6 Ma...,"Question: If the radius of a circle is 7 cm, f..."
4,Science,4,S4.1.1,1,Which organ helps humans to breathe?,"['Lungs', 'Heart', 'Brain', 'Kidney']",Lungs,Remember,Generate a CBC-aligned question for Grade 4 Sc...,Question: Which organ helps humans to breathe?...
