In [2]:
import json
import os
from pathlib import Path

import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [4]:
# ----- Cell 3: Paths and config -----
# Everything is resolved relative to the notebook's working directory:
# inputs and outputs both live under ../backend.
BASE_DIR = Path("..", "backend")
MODELS_DIR = BASE_DIR.joinpath("models")
DATA_JSONL = BASE_DIR.joinpath("data", "cbc_finetune_dataset.jsonl")

# Create the output folder up front so later cells can write into it.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

# Hardware-dependent knobs: adjust these to match the machine you run on.
USE_SMALL_MODEL = False
SMALL_MODEL_NAME = "gpt2"
# Hugging Face id of a LLaMA-2 checkpoint (requires HF access and compatibility).
# NOTE(review): the tokenizer/model cell assigns LLAMA_MODEL_NAME again, so the
# value set here is not the one that ends up being loaded.
LLAMA_MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"
# Folder name (under MODELS_DIR) used for the fine-tuned model's outputs.
OUTPUT_NAME = "cbc_llama_finetuned"

# Echo the resolved locations so a wrong working directory is obvious.
print("DATA:", DATA_JSONL)
print("MODEL OUTPUTS:", MODELS_DIR)

DATA: ..\backend\data\cbc_finetune_dataset.jsonl
MODEL OUTPUTS: ..\backend\models


In [5]:
# ----- Cell 4: Load dataset (jsonl) -----
# Fail early, with a pointer to the step that produces the file, if it is missing.
assert DATA_JSONL.exists(), f"Dataset not found at {DATA_JSONL}. Run Step 2 first."

# The 'json' builder of `datasets` reads JSON-lines files; a single data file
# is exposed as the 'train' split.
jsonl_path = str(DATA_JSONL)
loaded_rows = load_dataset('json', data_files=jsonl_path, split='train')
print("Loaded dataset rows:", len(loaded_rows))

# Fixed seed so the row order (and the later train/test split) is reproducible.
raw_ds = loaded_rows.shuffle(seed=42)

# Show the first record as the cell output for a quick sanity check.
raw_ds[0]

Loaded dataset rows: 13


{'prompt': "Generate a CBC-aligned question for Grade 4 Mathematics based on code M4.1.1 at the 'Remember' Bloom level.",
 'completion': "Question: What is 5 + 7? Options: ['10',  '11',  '12',  '13']. Answer: 12"}

In [6]:
# ----- Cell 5: Prepare text column (concatenate prompt+completion) -----
# For causal LM training each record becomes a single text document:
# prompt, then a separator, then the completion.
SEP = "\n--\n"

def to_text(example):
    """Join one record's prompt and completion into a single training string.

    Args:
        example: Mapping with optional 'prompt' and 'completion' entries.
            A missing key or a null (None) value is treated as an empty string,
            so a partially filled row does not crash the whole `map` call.

    Returns:
        The stripped prompt, followed by SEP, followed by the stripped completion.
    """
    # `dict.get(key, '')` only covers a *missing* key; a key that is present
    # with a None value (JSON null) would still reach `.strip()` and raise.
    # `or ''` handles both cases.
    prompt = (example.get('prompt') or '').strip()
    completion = (example.get('completion') or '').strip()
    # The loss is computed over the whole string here; masking the prompt part
    # of the labels (to train on the completion only) would be a later refinement.
    return prompt + SEP + completion

def _as_text_record(row):
    """Wrap the concatenated prompt/completion string in a one-column record."""
    return {"text": to_text(row)}

# Replace every original column with the single 'text' column.
text_only = raw_ds.map(_as_text_record, remove_columns=raw_ds.column_names)

# Hold out 5% of the rows for evaluation; the seed keeps the split reproducible.
text_ds = text_only.train_test_split(test_size=0.05, seed=42)
print(text_ds)


DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 12
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1
    })
})


In [7]:
# ----- Cell 6: Tokenizer & Model (LLaMA-2, or the small fallback model) -----

# ✅ Step 1: Ensure dependencies are installed
# Run this once if not already installed
# !pip install sentencepiece tiktoken huggingface_hub[hf_xet] --quiet

import os
from transformers import AutoTokenizer, AutoModelForCausalLM

# ✅ Step 2: Silence Windows symlink warnings (optional)
# NOTE(review): huggingface_hub may read this variable at import time; if the
# warning still shows up, set it before the first `transformers` import instead.
os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# ✅ Step 3: Define model name — choose one you have access to
# Use 7B unless you have access to and resources for 70B.
# This assignment overrides the LLAMA_MODEL_NAME set in the config cell.
LLAMA_MODEL_NAME = "meta-llama/Llama-2-7b-hf"

# Honour the USE_SMALL_MODEL switch from the config cell so the notebook can be
# run on machines without a large GPU. With the flag left at False the LLaMA
# checkpoint is loaded exactly as before.
model_name = SMALL_MODEL_NAME if USE_SMALL_MODEL else LLAMA_MODEL_NAME
print("Using model:", model_name)

# ✅ Step 4: Load tokenizer — LLaMA tokenizers can have SentencePiece issues if 'use_fast=True'
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)

# Some tokenizers don't define a pad token — map pad to eos so batches can be padded.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

# ✅ Step 5: Load model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Keep the embedding matrix in sync with the tokenizer in case tokens were added.
if len(tokenizer) != model.get_input_embeddings().weight.shape[0]:
    model.resize_token_embeddings(len(tokenizer))

print("✅ Model and tokenizer loaded successfully.")


Using model: meta-llama/Llama-2-7b-hf


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Model and tokenizer loaded successfully.


In [8]:
# ----- Cell 7: Tokenize -----
# Upper bound on tokens per example; longer texts are cut off at this length.
MAX_LENGTH = 512

def tokenize_batch(batch):
    """Tokenize the 'text' entries of a batch, truncating to MAX_LENGTH tokens.

    No padding is requested here, so each example keeps its own length.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, max_length=MAX_LENGTH)
    return encoded

# Tokenize both splits in batches and drop the raw 'text' column afterwards.
tokenized = text_ds.map(tokenize_batch, batched=True, remove_columns=["text"])

# Preview the first training example.
print("✅ Tokenized dataset example:")
first_train_example = tokenized["train"][0]
print(first_train_example)


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

✅ Tokenized dataset example:
{'input_ids': [1, 3251, 403, 263, 315, 5371, 29899, 13671, 1139, 363, 4989, 311, 29871, 29946, 4223, 2729, 373, 775, 382, 29946, 29889, 29896, 29889, 29941, 472, 278, 525, 29177, 1689, 29915, 11447, 290, 3233, 29889, 13, 489, 13, 16492, 29901, 8449, 310, 278, 1494, 338, 263, 302, 1309, 29973, 25186, 29901, 6024, 3389, 742, 29871, 525, 29882, 14862, 742, 29871, 525, 27041, 742, 29871, 525, 24561, 368, 13359, 673, 29901, 3762], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [9]:
# ----- Cell 8: Data Collator -----
# mlm=False selects causal language modelling rather than masked-LM.
# NOTE(review): the pad token was mapped to eos in the model cell; this collator
# appears to exclude pad-token positions from the loss, which would also exclude
# genuine eos tokens — confirm if the model should learn where to stop.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [11]:
# ----- Cell 9: Training Arguments -----

from transformers import TrainingArguments

# Checkpoints and the final model are written under MODELS_DIR / OUTPUT_NAME.
output_dir = str(MODELS_DIR / OUTPUT_NAME)

# NOTE(review): with the 12 training rows shown in the split output, batch size 1
# and 4 accumulation steps give about 3 optimizer steps per epoch on one device,
# so logging_steps / eval_steps / save_steps below would never trigger on that
# dataset — lower them if you need intermediate logs, evals or checkpoints.
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,      # Use batch size 1 for large models
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,      # Simulate larger effective batch
    num_train_epochs=1,                 # Try 1 epoch first
    # Mixed precision only when a CUDA GPU is present; an unconditional
    # fp16=True raises on CPU-only machines.
    fp16=torch.cuda.is_available(),
    logging_steps=10,
    eval_strategy="steps",              # newer transformers name; older releases call it evaluation_strategy
    eval_steps=100,
    save_total_limit=2,                 # Keep max 2 checkpoints
    save_steps=200,
    remove_unused_columns=False,
    report_to="none"                    # Disable WandB/TensorBoard
)


In [12]:
# ----- Cell 10: Trainer -----
# Wire the model, training arguments, tokenized splits and collator together.
# `processing_class` is the current name for what used to be passed as
# `tokenizer=`; the old keyword is deprecated and triggers a warning when the
# Trainer is constructed.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized['train'],
    eval_dataset=tokenized['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [13]:
# ----- Cell 11: Start training -----
# Runs the fine-tuning loop with the Trainer built in the previous cell; the
# call's return value is displayed as the cell output.
# NOTE: This will run locally. If you have no GPU set USE_SMALL_MODEL=True and keep epochs small.
# NOTE(review): confirm the model-loading cell actually reads USE_SMALL_MODEL
# before relying on that switch.
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


: 