# Project: SFT for Child-facing Chatbot
- **Dataset**: Self-generated dataset using the Gemini API, synthesised by combining labels and children's speech texts from two Hugging Face children-focused datasets, and augmenting them with age-appropriate, label-consistent responses generated by Gemini. [Link: https://huggingface.co/datasets/yxpan/children_sft_dataset]

- **Goal**: Train a small language model on the self-generated dataset to produce age-appropriate, supportive, and instruction-aligned responses to children's questions.

- **Baseline Model**: TinyLlama (`TinyLlama/TinyLlama-1.1B-Chat-v1.0`)

# Install rapidfireai and start service

In [None]:
try:
    import rapidfireai
    print("✅ rapidfireai already installed")
except ImportError:
    %pip install rapidfireai
    !rapidfireai init

In [None]:
import subprocess
from time import sleep
# Launch the RapidFire AI service as a background process. Popen returns
# immediately and does not wait for the service to finish starting.
subprocess.Popen(["rapidfireai", "start"])
# NOTE(review): the wait below is disabled, so nothing here guarantees the
# service is ready before later cells run -- re-enable it if they fail to connect.
# sleep(30)

In [None]:
from rapidfireai import Experiment
from rapidfireai.automl import List, RFGridSearch, RFModelConfig, RFLoraConfig, RFSFTConfig

# Load and Train-Eval Split the Dataset

In [None]:
from datasets import load_dataset

ds = load_dataset("yxpan/children_sft_dataset")

In [None]:
# Training uses rows 0-499 of the 'train' split; evaluation uses rows 500-599.
# NOTE(review): rows are selected in stored order and only shuffled afterwards,
# so both subsets come from the first 600 rows -- if the dataset is ordered
# (e.g. by source or age), consider shuffling before selecting.
train_dataset = ds['train'].select(range(500))    # !! make sure it does not exceed GPU memory constraints
eval_dataset = ds['train'].select(range(500,600))
# Shuffle each subset with a fixed seed so the row order is reproducible.
train_dataset=train_dataset.shuffle(seed=42)
eval_dataset=eval_dataset.shuffle(seed=42)

In [None]:
train_dataset[0]

# Data processing function

In [None]:
def sample_formatting_function(row):
    """Convert one dataset row into a chat-style ``messages`` record.

    The system prompt embeds ``row['age']``; the user and assistant turns are
    taken from ``row['instruction']`` and ``row['response']`` respectively.
    Returns a dict with a single ``"messages"`` key holding the three turns
    in system / user / assistant order.
    """
    age_prompt = (
        f"You are talking to a child aged {row['age']}. "
        "Generate a friendly response with age-appropriate knowledge."
    )

    # Build each turn separately, then assemble the conversation.
    system_turn = {"role": "system", "content": age_prompt}
    user_turn = {"role": "user", "content": row['instruction']}
    assistant_turn = {"role": "assistant", "content": row['response']}

    return {"messages": [system_turn, user_turn, assistant_turn]}



In [None]:
sample_formatting_function(eval_dataset[0])


# Define metrics function

In [None]:
!pip install bert_score

In [None]:
def sample_compute_metrics(eval_preds):
    """Score predictions against references with ROUGE-L and BERTScore F1.

    ``eval_preds`` is unpacked as a ``(predictions, labels)`` pair, and both
    are passed unchanged to the ``evaluate`` metrics. Returns a dict with
    ``"rougeL"`` and ``"bert_f1"`` (the mean of the per-example F1 values),
    each rounded to 4 decimal places.
    """
    import evaluate
    import numpy as np

    predictions, labels = eval_preds

    # Load both metrics before computing either one.
    rouge_metric = evaluate.load("rouge")
    bertscore_metric = evaluate.load("bertscore")

    rouge_scores = rouge_metric.compute(predictions=predictions, references=labels)
    bert_scores = bertscore_metric.compute(predictions=predictions, references=labels, lang="en")

    rouge_l = round(rouge_scores["rougeL"], 4)
    mean_bert_f1 = round(np.mean(bert_scores["f1"]), 4)
    return {"rougeL": rouge_l, "bert_f1": mean_bert_f1}

# Initialize Experiment

In [None]:
my_experiment = 'sft-child-age'
experiment = Experiment(experiment_name=my_experiment, mode="fit")

# Create Tensorboard

In [None]:
import os

# Load TensorBoard extension
%load_ext tensorboard

# Configure RapidFire to use TensorBoard
# NOTE(review): this variable is set after the `rapidfireai start` subprocess was
# launched and after the Experiment object was created in earlier cells, so the
# already-running service process does not inherit it -- confirm the backend
# setting is still picked up when set at this point.
os.environ['RF_TRACKING_BACKEND'] = 'tensorboard'  # Options: 'mlflow', 'tensorboard', 'both'
# TensorBoard log directory will be auto-created in experiment path

In [None]:
# Get experiment path
from rapidfireai.fit.db.rf_db import RfDb

db = RfDb()
experiment_path = db.get_experiments_path(my_experiment)
tensorboard_log_dir = f"{experiment_path}/tensorboard_logs/{my_experiment}"

print(f"TensorBoard logs will be saved to: {tensorboard_log_dir}")

# Define experiment configurations

In [None]:
# 2 LoRA PEFT configs lite with different adapter capacities:
# a small adapter (r=8) on the q/v projections and a larger one (r=32) on q/k/v/o.
peft_configs_lite = List([
    RFLoraConfig(
        r=8,
        lora_alpha=4,
        lora_dropout=0.1,
        target_modules=["q_proj", "v_proj"],  # Standard transformer naming
        bias="none"
    ),
    RFLoraConfig(
        r=32,
        lora_alpha=16,
        lora_dropout=0.1,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # Standard naming
        bias="none"
    )
])

# 2 model configs (same base model, two learning rates) x 2 PEFT configs = 4 combinations in total
config_set_lite = List([
    RFModelConfig(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # 1.1B model
        peft_config=peft_configs_lite,
        training_args=RFSFTConfig(
            learning_rate=1e-5,  # Lower of the two learning rates being compared
            lr_scheduler_type="linear",
            per_device_train_batch_size=4,
            per_device_eval_batch_size=4,
            max_steps=256,
            gradient_accumulation_steps=1,   # No accumulation needed
            logging_steps=2,
            eval_strategy="steps",
            eval_steps=20,
            fp16=True,
            # report_to="tensorboard",
        ),
        model_type="causal_lm",
        model_kwargs={"device_map": "auto", "torch_dtype": "auto", "use_cache": False},
        formatting_func=sample_formatting_function,
        compute_metrics=sample_compute_metrics,
        generation_config={
            "max_new_tokens": 128,
            "temperature": 0.8,  # Higher temp for tiny model
            "top_p": 0.9,
            "top_k": 30,         # Reduced top_k
            "repetition_penalty": 1.05,
        }
    ),
    # Identical to the config above except for the learning rate.
    RFModelConfig(
        model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",  # 1.1B model
        peft_config=peft_configs_lite,
        training_args=RFSFTConfig(
            learning_rate=1e-4,  # Higher of the two learning rates being compared
            lr_scheduler_type="linear",
            per_device_train_batch_size=4,  # Same batch size as the first config
            per_device_eval_batch_size=4,
            max_steps=256,
            gradient_accumulation_steps=1,   # No accumulation needed
            logging_steps=2,
            eval_strategy="steps",
            eval_steps=20,
            fp16=True,
            # report_to="tensorboard",
        ),
        model_type="causal_lm",
        model_kwargs={"device_map": "auto", "torch_dtype": "auto", "use_cache": False},
        formatting_func=sample_formatting_function,
        compute_metrics=sample_compute_metrics,
        generation_config={
            "max_new_tokens": 128,
            "temperature": 0.8,  # Higher temp for tiny model
            "top_p": 0.9,
            "top_k": 30,         # Reduced top_k
            "repetition_penalty": 1.05,
        }
    ),

])

In [None]:
# create model
def sample_create_model(model_config):
    """Create the model and tokenizer for one run configuration.

    Args:
        model_config: Mapping with the keys ``"model_name"``, ``"model_type"``
            and ``"model_kwargs"``. ``model_type`` selects the ``transformers``
            auto class: ``"causal_lm"``, ``"seq2seq_lm"`` or ``"masked_lm"``.
            Any other value except ``"custom"`` falls back to the causal-LM
            class. ``model_kwargs`` is forwarded to ``from_pretrained``.

    Returns:
        Tuple of ``(model, tokenizer)``.

    Raises:
        NotImplementedError: If ``model_type`` is ``"custom"``. No custom
            loading logic is implemented here; previously this case reached
            the ``return`` with ``model`` unbound and failed with an
            ``UnboundLocalError``.
    """
    model_name = model_config["model_name"]
    model_type = model_config["model_type"]
    model_kwargs = model_config["model_kwargs"]

    # Fail fast, before the heavy transformers import and any download.
    if model_type == "custom":
        raise NotImplementedError(
            "model_type='custom' requires custom loading logic; "
            "implement it in sample_create_model before using this model type."
        )

    from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForMaskedLM

    auto_classes = {
        "causal_lm": AutoModelForCausalLM,
        "seq2seq_lm": AutoModelForSeq2SeqLM,
        "masked_lm": AutoModelForMaskedLM,
    }
    # Unknown model types default to a causal LM, as before.
    model_class = auto_classes.get(model_type, AutoModelForCausalLM)

    model = model_class.from_pretrained(model_name, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    return (model, tokenizer)

In [None]:
# Grid search across all 4 config combinations
config_group = RFGridSearch(
    configs=config_set_lite,
    trainer_type="SFT"
)

# Start Tensorboard

In [None]:
%tensorboard --logdir {tensorboard_log_dir}

# Run training

In [None]:
# Launch training of all configs in the config_group with swap granularity of 4 chunks
experiment.run_fit(config_group, sample_create_model, train_dataset, eval_dataset, num_chunks=4, seed=42)

# Launch Interactive Run Controller

In [None]:
# Create Interactive Controller
sleep(15)
from rapidfireai.fit.utils.interactive_controller import InteractiveController

controller = InteractiveController(dispatcher_url="http://127.0.0.1:8851")
controller.display()

# Press the Button to End Experiment

In [None]:
from google.colab import output
from IPython.display import display, HTML

display(HTML('''
<button id="continue-btn" style="padding: 10px 20px; font-size: 16px;">Click to End Experiment</button>
'''))

# eval_js blocks until the Promise resolves
output.eval_js('''
new Promise((resolve) => {
    document.getElementById("continue-btn").onclick = () => {
        document.getElementById("continue-btn").disabled = true;
        document.getElementById("continue-btn").innerText = "Continuing...";
        resolve("clicked");
    };
})
''')

# Actually end the experiment after the button is clicked
experiment.end()
print("Done!")

# View Tensorboard Plots and Logs

In [None]:
# View final logs
%tensorboard --logdir {tensorboard_log_dir}

# View Files

In [None]:
# Show the last 30 lines of the experiment-specific log file
from IPython.display import display, Pretty

log_file = experiment.get_log_file_path()
display(Pretty(f"📄 Experiment Log File: {log_file}"))

if not log_file.exists():
    display(Pretty(f"❌ Log file not found: {log_file}"))
else:
    banner = "=" * 80
    display(Pretty(banner))
    display(Pretty(f"Last 30 lines of {log_file.name}:"))
    display(Pretty(banner))
    # Read the whole file, then render only its tail.
    with open(log_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines[-30:]:
        display(Pretty(line.rstrip()))

In [None]:
# Show the last 30 lines of the training-specific log file
log_file = experiment.get_log_file_path("training")
display(Pretty(f"📄 Training Log File: {log_file}"))

if not log_file.exists():
    display(Pretty(f"❌ Log file not found: {log_file}"))
else:
    banner = "=" * 80
    display(Pretty(banner))
    display(Pretty(f"Last 30 lines of {log_file.name}:"))
    display(Pretty(banner))
    # Read the whole file, then render only its tail.
    with open(log_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    for line in lines[-30:]:
        display(Pretty(line.rstrip()))

#