In [1]:
#manually written instruction - output examples
{"instruction": "Explain what a Money Market Fund is.", "input": "", "output": "A Money Market Fund (MMF) is a type of investment that pools money from investors to purchase short-term, low-risk financial securities such as Treasury bills, commercial paper, and certificates of deposit. In Kenya, MMFs are popular because they offer higher returns than regular savings accounts while maintaining liquidity and low risk."}


{'instruction': 'Explain what a Money Market Fund is.',
 'input': '',
 'output': 'A Money Market Fund (MMF) is a type of investment that pools money from investors to purchase short-term, low-risk financial securities such as Treasury bills, commercial paper, and certificates of deposit. In Kenya, MMFs are popular because they offer higher returns than regular savings accounts while maintaining liquidity and low risk.'}

In [2]:
#loading my finetuned model
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch

model_path = "./distilgpt2-wekeza-finetuned_v1"

model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)


  from .autonotebook import tqdm as notebook_tqdm
Device set to use cpu


In [3]:
#seed instructionn for self generation
seed_instructions = [
    "Explain how interest is earned in a Money Market Fund in Kenya.",
    "Describe the risks of investing in the Nairobi Securities Exchange.",
    "Give steps to start investing in government bonds in Kenya.",
    "List advantages of investing in a Unit Trust compared to fixed deposits.",
    "Explain how a SACCO works and how it benefits its members.",
    "Describe how to calculate compound interest for a savings product.",
    "Explain what the yield curve tells us about Kenya’s economy.",
    "Give tips for new investors entering the Kenyan stock market.",
    "Describe the difference between a money market fund and a mutual fund.",
    "Explain how a Treasury bill differs from a Treasury bond."
]

In [5]:
import json

seed_data = [
    {
        "instruction": "What is a money market fund and how does it work in Kenya?",
        "input": "",
        "output": "A money market fund in Kenya is a type of unit trust that pools investors’ money and invests in low-risk, short-term financial instruments like treasury bills, fixed deposits, and commercial papers. It is popular due to its liquidity and interest returns higher than typical savings accounts."
    },
    {
        "instruction": "Compare CIC Money Market Fund and NCBA Money Market Fund in terms of returns and accessibility.",
        "input": "",
        "output": "CIC's Money Market Fund offers competitive rates with low entry barriers (minimum KES 500) and mobile top-up. NCBA's MMF offers similar accessibility via Loop or M-Pesa, though CIC often publishes slightly higher daily yields. Both are regulated by CMA."
    },
    {
        "instruction": "How do I calculate expected interest from a money market fund?",
        "input": "I have KES 100,000 and the average daily yield is 9.5% annualized.",
        "output": "To estimate monthly interest: (KES 100,000 x 0.095) ÷ 12 = approx. KES 791.67. Actual interest depends on daily compounding and fees, so always check with the fund manager."
    },
    {
        "instruction": "Explain the risk profile of money market funds to a beginner.",
        "input": "",
        "output": "Money market funds are considered low-risk because they invest in short-term, government-backed or highly-rated financial instruments. They’re suitable for conservative investors or those looking for stable returns."
    },
    {
        "instruction": "How can I withdraw funds from a money market fund in Kenya?",
        "input": "",
        "output": "Most MMFs in Kenya allow withdrawals via mobile money (like M-Pesa) or bank transfer. You usually get funds within 24–72 hours after submitting a withdrawal request."
    },
]

with open("v2.jsonl", "w") as f:
    for item in seed_data:
        f.write(json.dumps(item) + "\n")

print("v2.jsonl created with 5 custom instructions.")


v2.jsonl created with 5 custom instructions.


In [6]:
import json
from tqdm import tqdm

#loading the seeded instructions
with open("v2.jsonl", "r") as f:
    seed_data = [json.loads(l) for l in f.readlines()]

synthetic_data = []

generations_per_seed = 3

#getting new samples
for example in tqdm(seed_data):
    instruction = example["instruction"]
    input_text = example.get("input", "")
    prompt = f"""Below is an instruction. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input_text}

### Response:
"""

    #generating n completions
    outputs = generator(
        prompt,
        max_new_tokens=200,
        num_return_sequences=generations_per_seed,
        do_sample=True,
        temperature=0.8,
        top_k=50,
        top_p=0.95,
    )

    #formating and saving the new generated promps
    for out in outputs:
        response = out["generated_text"].split("### Response:")[-1].strip()
        synthetic_data.append({
            "instruction": instruction,
            "input": input_text,
            "output": response
        })

#this will be the new json file
with open("self_instruct_raw_v2.jsonl", "w") as f:
    for item in synthetic_data:
        f.write(json.dumps(item) + "\n")

print(f"Saved {len(synthetic_data)} synthetic samples to self_instruct_raw_v2.jsonl")


100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [01:03<00:00, 12.69s/it]

Saved 15 synthetic samples to self_instruct_raw_v2.jsonl





In [7]:
#Cleaning and filtering
import json

input_file = "self_instruct_raw_v2.jsonl"
output_file = "v2_cleaned.jsonl"

with open(input_file, "r") as infile:
    raw_data = [json.loads(line) for line in infile]

cleaned_data = []
seen = set()

for example in raw_data:
    instruction = example["instruction"].strip()
    input_text = example["input"].strip()
    output = example["output"].strip()

    if len(output) < 20:
        continue
    if (instruction, input_text, output) in seen:
        continue
    if not output:
        continue

    seen.add((instruction, input_text, output))
    cleaned_data.append({
        "instruction": instruction,
        "input": input_text,
        "output": output
    })

with open(output_file, "w") as f:
    for item in cleaned_data:
        f.write(json.dumps(item) + "\n")

print(f"Cleaned dataset saved to {output_file} with {len(cleaned_data)} samples.")


Cleaned dataset saved to v2_cleaned.jsonl with 14 samples.


In [10]:
#merging the dataset
import json

v1_file = r"C:\Users\bbollo\Downloads\WekezaLLM_dataset_v1.jsonl"
v2_file = "v2_cleaned.jsonl"
merged_file = "WekezaLLM_dataset_v2.jsonl"

with open(v1_file, "r") as f:
    v1_data = [json.loads(line.strip()) for line in f if line.strip()]
with open(v2_file, "r") as f:
    v2_data = [json.loads(line.strip()) for line in f if line.strip()]

combined = v1_data + v2_data

#saving the new merged file
with open(merged_file, "w") as f:
    for item in combined:
        f.write(json.dumps(item) + "\n")

print(f"Combined dataset saved to {merged_file} with {len(combined)} total samples.")


Combined dataset saved to WekezaLLM_dataset_v2.jsonl with 142 total samples.


In [11]:
#tokenizing the merged dataset
from transformers import AutoTokenizer

model_path = "./distilgpt2-wekeza-finetuned_v1"
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [12]:
#loading new merged dataset
import json
from datasets import Dataset

input_path = "WekezaLLM_dataset_v2.jsonl"

data = []
with open(input_path, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

#change to HuggingFace Dataset
dataset = Dataset.from_list(data)
dataset[0]


{'instruction': 'What is the minimum amount I need to start investing in a money market fund in Kenya?',
 'input': '',
 'output': 'Most money market funds in Kenya have a minimum investment of KES 1,000 to KES 5,000, with some like CIC Money Market Fund starting at KES 1,000. Popular funds from Britam, Old Mutual, and ICEA allow you to start with as little as KES 1,000 and make additional contributions of KES 500 or more.'}

In [15]:
#tokenizing the dataset
from datasets import load_dataset
dataset = load_dataset("json", data_files="WekezaLLM_dataset_v2.jsonl")["train"]

def format_prompt(example):
    instruction = example["instruction"]
    input_text = example["input"]
    output_text = example["output"]

    if input_text.strip():
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output_text}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output_text}"
    
    return {"text": prompt}

formatted_dataset = dataset.map(format_prompt)

#actual tokenization
def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )

tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)


Generating train split: 142 examples [00:00, 4258.54 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 2504.43 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 931.40 examples/s]


In [16]:
#finetuing setting up
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

output_dir = "./distilgpt2-wekeza-finetuned_v2"

#TrainingArguments
training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=1,
    save_steps=50,
    save_total_limit=2,
    logging_steps=10,
    fp16=False,
)

# Data collator for causal LM
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(


In [17]:
#fine tunin
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,3.4093
20,3.1399
30,2.3344
40,2.3466
50,3.0834
60,2.4725
70,2.9142
80,2.6742
90,2.5219
100,2.5808




TrainOutput(global_step=426, training_loss=2.3106516426158064, metrics={'train_runtime': 2309.2097, 'train_samples_per_second': 0.184, 'train_steps_per_second': 0.184, 'total_flos': 55656207876096.0, 'train_loss': 2.3106516426158064, 'epoch': 3.0})

In [18]:
#saving the finetuned model and its tokenizer
save_path = "./distilgpt2-wekeza-finetuned_v2"
model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)

('./distilgpt2-wekeza-finetuned_v2\\tokenizer_config.json',
 './distilgpt2-wekeza-finetuned_v2\\special_tokens_map.json',
 './distilgpt2-wekeza-finetuned_v2\\vocab.json',
 './distilgpt2-wekeza-finetuned_v2\\merges.txt',
 './distilgpt2-wekeza-finetuned_v2\\added_tokens.json',
 './distilgpt2-wekeza-finetuned_v2\\tokenizer.json')