In [1]:
#set up
!pip install transformers peft accelerate datasets bitsandbytes --quiet
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch, json


In [3]:
#loading the dataset
import json

# Source file: every non-blank line is parsed as a standalone JSON document.
jsonl_path = r"C:\Users\bbollo\Downloads\kenyan_finance_selfinstruct_v6_instructzero.jsonl"

base_data = []  # successfully parsed records
bad_lines = []  # (0-based line index, raw line) for each line that failed to parse

with open(jsonl_path, "r", encoding="utf-8", errors="ignore") as jsonl_file:
    for line_index, raw_line in enumerate(jsonl_file):
        stripped = raw_line.strip()
        if not stripped:
            continue  # skip blank lines
        try:
            record = json.loads(stripped)
        except json.JSONDecodeError:
            # Keep the untouched line so it can be inspected later.
            bad_lines.append((line_index, raw_line))
        else:
            base_data.append(record)

print(f"Loaded {len(base_data)} clean examples.")
print(f"Skipped {len(bad_lines)} bad lines.")
if len(base_data) > 0:
    print(base_data[0])


Loaded 90 clean examples.
Skipped 1 bad lines.
{'instruction': 'How do I start investing in a money market fund in Kenya?', 'input': '', 'output': 'To invest in a money market fund (MMF) in Kenya, choose a licensed fund manager such as CIC, Britam, or NCBA. Fill out their application form (usually online), submit a copy of your ID and KRA PIN, then deposit the minimum investment amount via bank or mobile money. Your money begins earning daily interest, and you can monitor performance via monthly statements or mobile apps.'}


In [5]:
#Defining prompt templates for the 3 modalities
def generate_modal_rationales(prompt, model, tokenizer, max_new_tokens=300):
    """Generate one rationale per reasoning modality for ``prompt``.

    Three prompts are built by appending a modality-specific request to
    ``prompt`` (plain language, Python function, truth-table comparison) and
    each is run through a ``text-generation`` pipeline.

    Args:
        prompt: The instruction/question to reason about.
        model: Model handed to ``pipeline(...)``.
        tokenizer: Tokenizer handed to ``pipeline(...)``.
        max_new_tokens: Generation budget per modality (default 300, the
            value previously hard-coded).

    Returns:
        dict with keys ``"nl"``, ``"code"`` and ``"table"`` mapping to the
        generated continuation for each modality.
    """
    # Request appended after the prompt for each modality; the keys are the
    # keys of the returned dict.
    mode_requests = {
        "nl": "Explain in plain language:",
        "code": "Write a Python function to reason this out:",
        "table": "Make a truth-table style comparison:",
    }
    pipe = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=max_new_tokens,
        # By default the pipeline returns prompt + continuation. Echoing the
        # prompt would put the instruction itself into every rationale and let
        # the prompt's own words satisfy downstream length/keyword filters, so
        # ask for the newly generated text only.
        return_full_text=False,
    )
    return {
        mode: pipe(f"{prompt}\n{request}")[0]["generated_text"]
        for mode, request in mode_requests.items()
    }

In [6]:
#filtering based on modality
def filter_rationales(raw, min_words=10):
    """Keep only rationales that look plausible for their modality.

    Args:
        raw: Mapping with (optional) ``"nl"``, ``"code"`` and ``"table"``
            entries holding generated text.
        min_words: A text must contain strictly more than this many
            whitespace-separated words to be kept (default 10).

    Returns:
        dict with keys ``"nl"``, ``"code"`` and ``"table"``. Each value is the
        stripped text, or ``None`` when the entry is missing, not a string,
        too short, or fails its modality check:

        * ``"code"`` must contain a Python function header (``def name(``).
        * ``"table"`` must contain at least one ``|`` character.
    """
    import re

    def clean(text):
        # Non-string entries (e.g. None) are rejected instead of raising.
        if not isinstance(text, str):
            return None
        return text.strip() if len(text.split()) > min_words else None

    def has_function_def(text):
        # A bare substring test for "def" also matches ordinary words such as
        # "default" or "define", so look for an actual function header.
        return isinstance(text, str) and re.search(r"\bdef\s+\w+\s*\(", text) is not None

    def has_table_bar(text):
        return isinstance(text, str) and "|" in text

    nl_text = raw.get("nl")
    code_text = raw.get("code")
    table_text = raw.get("table")
    return {
        "nl": clean(nl_text),
        "code": clean(code_text) if has_function_def(code_text) else None,
        "table": clean(table_text) if has_table_bar(table_text) else None,
    }


In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# Directory from which both the causal-LM weights and the tokenizer are loaded.
model_path = "./distilgpt2-wekeza-finetuned_v5_cot_lora"

model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Switch to evaluation mode, then move the model to the GPU when one is present.
model.eval()
use_cuda = torch.cuda.is_available()
if use_cuda:
    model.to("cuda")


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


In [None]:
# Build Alpaca-style records, tagging each instruction with its reasoning modality.
test_data = base_data[:3]  # only the first three loaded examples

multi_modal_data = []

for example in test_data:
    question = example["instruction"]
    candidates = filter_rationales(
        generate_modal_rationales(question, model, tokenizer)
    )

    for mode, text in candidates.items():
        # Skip modalities whose rationale was filtered out (None) or is blank.
        usable = bool(text) and isinstance(text, str) and bool(text.strip())
        if not usable:
            continue
        record = {
            "instruction": question.strip() + f" [{mode} reasoning]",
            "input": "",
            "output": text.strip()
        }
        multi_modal_data.append(record)

# Persist the records as a single pretty-printed JSON array.
with open("wekeza_multimodal_test.json", "w", encoding="utf-8") as out_file:
    json.dump(multi_modal_data, out_file, indent=2, ensure_ascii=False)

print(f"Saved {len(multi_modal_data)} test examples to 'wekeza_multimodal_test.json'")


Device set to use cpu
Device set to use cpu


In [None]:
#finetuning with LoRA
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from peft import get_peft_model, LoraConfig, TaskType
from datasets import load_dataset
from trl import SFTTrainer

import torch
# Directory the starting model and its tokenizer are loaded from.
model_path = "./distilgpt2-wekeza-finetuned_v5_cot_lora"
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.float32)
tokenizer = AutoTokenizer.from_pretrained(model_path)

if torch.cuda.is_available():
    model = model.to("cuda")

# LoRA hyperparameters gathered in one mapping before the config is built.
lora_settings = {
    "r": 8,
    "lora_alpha": 16,
    "target_modules": ["c_attn"],  # Specific to GPT2/DistilGPT2
    "lora_dropout": 0.05,
    "bias": "none",
    "task_type": TaskType.CAUSAL_LM,
}
peft_config = LoraConfig(**lora_settings)

# Wrap the loaded model with the LoRA adapters described above.
model = get_peft_model(model, peft_config)

# NOTE(review): this reads "wekeza_multimodal.json", while the generation cell
# writes "wekeza_multimodal_test.json" — confirm the intended training file.
dataset_splits = load_dataset("json", data_files="wekeza_multimodal.json")
dataset = dataset_splits["train"]

In [None]:
# Training: collect the hyperparameters, build the trainer, and run it.
use_fp16 = torch.cuda.is_available()  # Only enable fp16 if on GPU

training_options = dict(
    output_dir="./distilgpt2-wekeza-finetuned_v6_lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    save_strategy="epoch",
    logging_steps=10,
    learning_rate=2e-5,
    fp16=use_fp16,
)
training_args = TrainingArguments(**training_options)

# NOTE(review): no dataset text field or formatting function is passed here —
# confirm the installed trl version can consume the dataset's columns as-is.
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
)

trainer.train()
