In [3]:
#loadin the model
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_path = "./distilgpt2-wekeza-finetuned_v3_lora"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()


The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers and GPU quantization are unavailable.


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): lora.Linear(
            (base_layer): Conv1D(nf=2304, nx=768)
            (lora_dropout): ModuleDict(
              (default): Dropout(p=0.05, inplace=False)
            )
            (lora_A): ModuleDict(
              (default): Linear(in_features=768, out_features=8, bias=False)
            )
            (lora_B): ModuleDict(
              (default): Linear(in_features=8, out_features=2304, bias=False)
            )
            (lora_embedding_A): ParameterDict()
            (lora_embedding_B): ParameterDict()
            (lora_magnitude_vector): ModuleDict()
          )
          (c_proj): lora.Linear(
            (base_layer): Conv1D(nf=768, nx=768)
            (l

In [4]:
#raw investment text samples
raw_texts = [
    "The Cytonn Money Market Fund currently offers a 11.35% effective annual yield. It allows daily withdrawals and is regulated by the CMA.",
    "Investing in treasury bills in Kenya is done through the Central Bank, and they typically mature in 91, 182, or 364 days with interest paid upfront.",
    "Sanlam's Unit Trust Fund allows investors to start from as low as Ksh 2,500. The returns are market-linked, and funds are accessible after 2–3 working days.",
    "Harambee Sacco dividends for 2023 stood at 7.5% on shares and 10% on deposits, reflecting strong asset growth and prudent investment strategy.",
    "The CIC Money Market Fund saw a 10.2% yield in June 2024. It's ideal for short-term goals like school fees or emergency savings."
]


In [5]:
# Cell 4: Generate Instructions for Raw Texts (Backtranslation Step)
from transformers import TextStreamer

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
instruction_outputs = []
for text in raw_texts:
    
    prompt = f"### Answer:\n{text}\n### Instruction:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    output = model.generate(
        **inputs,
        max_new_tokens=60,
        temperature=0.7,
        do_sample=True,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
    )
    generated_text = tokenizer.decode(output[0][inputs['input_ids'].shape[-1]:], skip_special_tokens=True)

    instruction_outputs.append({
        "instruction": generated_text.strip(),
        "output": text.strip()
    })
for i, pair in enumerate(instruction_outputs):
    print(f"\n--- Example {i+1} ---")
    print("Instruction:", pair["instruction"])
    print("Answer:", pair["output"])



--- Example 1 ---
Instruction: What is the difference between a daily withdrawal and a daily deposit?
### Response:
Cytonn Money Market Fund provides instant liquidity and transparency with daily withdrawals, while maintaining transparency with daily withdrawals. Check with your bank to make sure you're aware of your withdrawals and are aware of your withdrawals.
Answer: The Cytonn Money Market Fund currently offers a 11.35% effective annual yield. It allows daily withdrawals and is regulated by the CMA.

--- Example 2 ---
Instruction: Compare Treasury bills with Treasury bills: 1) Some Treasury bills have slightly higher yields, and 2) Treasury bills typically yield higher yields, while others offer better yields. Some government securities may be rated lower, while some Treasury bills may be rated higher. Check with your bank about whether you're investing in
Answer: Investing in treasury bills in Kenya is done through the Central Bank, and they typically mature in 91, 182, or 364 d

In [6]:
#backtranslated instruction-output pairs to JSONL format
import json

output_path = "WekezaLLM_backtranslated_v3.jsonl"
with open(output_path, "w", encoding="utf-8") as f:
    for item in instruction_outputs:
        record = {
            "instruction": item["instruction"],
            "input": "",
            "output": item["output"]
        }
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print(f" Saved {len(instruction_outputs)} examples to {output_path}")


 Saved 5 examples to WekezaLLM_backtranslated_v3.jsonl


In [7]:
#20 diverse investment instructions for self-instruct expansion
diverse_prompt = (
    "Generate 20 diverse instructions you could give to a Kenyan investment assistant. "
    "Cover a wide range of topics such as money market funds, government bonds, SACCOs, risk, returns, comparisons, and portfolio strategy."
)
inputs = tokenizer(diverse_prompt, return_tensors="pt").to(device)
with torch.no_grad():
    generated_ids = model.generate(
        **inputs,
        max_length=512,
        temperature=0.9,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1
    )
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generate 20 diverse instructions you could give to a Kenyan investment assistant. Cover a wide range of topics such as money market funds, government bonds, SACCOs, risk, returns, comparisons, and portfolio strategy. Check with investment advisors and compare your investments to their portfolio manager for additional performance and risk management.


### Response:
Compare your investment strategy and compare the investments to their portfolio manager for additional performance and risk management. Check with investment advisors and compare your investments to their portfolio manager for additional performance and risk management. Check with investment advisors and compare your investments to their portfolio manager for additional performance and risk management. Check with investment advisors and compare your investments to their portfolio manager for additional performance and risk management. Check with investment advisers and compare your investments to their portfolio manager for 

In [9]:
from transformers import pipeline

# Load pipeline for text generation
generator = pipeline("text-generation", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# Craft a clearer and more controlled prompt
prompt = (
    "Generate 20 diverse instructions a user might give to a Kenyan investment assistant. "
    "Cover topics like: money market funds, SACCOs, government bonds, risk levels, returns, comparisons, and strategy.\n\n"
    "Instruction 1:"
)

# Generate the output
output = generator(
    prompt,
    max_new_tokens=300,
    do_sample=True,
    temperature=0.9,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2,
    pad_token_id=tokenizer.eos_token_id,
)[0]["generated_text"]

# Print the result
print(output)


Device set to use cpu


Generate 20 diverse instructions a user might give to a Kenyan investment assistant. Cover topics like: money market funds, SACCOs, government bonds, risk levels, returns, comparisons, and strategy.

Instruction 1: Check your fund's KSh 10K securities before opening them up for trading (e-mailed daily), or check if you're investing in stocks with higher yield than average yields at the start of next year; verify their historical performance through CMA verification via Money Market Fund portal). 2.: Verify all investments' interest rates based on how much they invest each month from one currency unit onward into another over several months versus contributions paid annually by other entities when making individual loans (<1% per annum) vs withdrawals made after regular monthly charges (~P&F 736+ pgs/month)). 3.: Submit MESSAI documents about quarterly gains <25%, minimum withdrawal rate >5%. 4: File bank statement including details regarding loan amounts above 50k bondholders (>$15M RE

In [10]:
#load new back translated datset
import json
from datasets import Dataset

file_path = "WekezaLLM_backtranslated_v3.jsonl"

data = []
with open(file_path, "r") as f:
    for line in f:
        line = line.strip()
        if line:
            data.append(json.loads(line))

dataset = Dataset.from_list(data)
dataset = dataset.train_test_split(test_size=0.1)
dataset


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 4
    })
    test: Dataset({
        features: ['instruction', 'input', 'output'],
        num_rows: 1
    })
})

In [11]:
#tokenizing 
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("distilgpt2")
tokenizer.pad_token = tokenizer.eos_token

def format_prompt(example):
    if example["input"]:
        return f"### Instruction:\n{example['instruction']}\n\n### Input:\n{example['input']}\n\n### Response:\n{example['output']}"
    else:
        return f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"

def tokenize(example):
    prompt = format_prompt(example)
    return tokenizer(prompt, truncation=True, padding="max_length", max_length=512)

tokenized_dataset = dataset.map(tokenize)
tokenized_dataset

Map: 100%|███████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 285.43 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.53 examples/s]


DatasetDict({
    train: Dataset({
        features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 4
    })
    test: Dataset({
        features: ['instruction', 'input', 'output', 'input_ids', 'attention_mask'],
        num_rows: 1
    })
})

In [None]:
#lora config
from peft import get_peft_model, LoraConfig, TaskType
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling, AutoModelForCausalLM

# Reload model (required to apply LoRA fresh)
model = AutoModelForCausalLM.from_pretrained("distilgpt2")
model.resize_token_embeddings(len(tokenizer))

# LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)


model = get_peft_model(model, lora_config)

training_args = TrainingArguments(
    output_dir="./distilgpt2-wekeza-finetuned_v3_lora_backtranslated",
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=5,
    logging_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=False, 
    fp16=torch.cuda.is_available(),  
    report_to="none"
)

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()
