In [4]:
#loading the model
from transformers import AutoTokenizer, AutoModelForCausalLM

model_path = "./distilgpt2-wekeza-finetuned_v2"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path)

#padding
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))



Embedding(50257, 768)

In [3]:
#recent dataset
dataset = load_dataset("json", data_files={"train": "WekezaLLM_dataset_v2.jsonl"})


In [5]:
#loading the finetuning dataset
import json
from datasets import Dataset

data_path = "WekezaLLM_dataset_v2.jsonl"
data = []

with open(data_path, "r") as infile:
    for line in infile:
        if line.strip():
            data.append(json.loads(line))


raw_dataset = Dataset.from_list(data)
raw_dataset[0]

{'instruction': 'What is the minimum amount I need to start investing in a money market fund in Kenya?',
 'input': '',
 'output': 'Most money market funds in Kenya have a minimum investment of KES 1,000 to KES 5,000, with some like CIC Money Market Fund starting at KES 1,000. Popular funds from Britam, Old Mutual, and ICEA allow you to start with as little as KES 1,000 and make additional contributions of KES 500 or more.'}

In [6]:
#formatting the dataset
def format_alpaca(example):
    instruction = example["instruction"]
    input_text = example["input"]
    output = example["output"]

    
    if input_text.strip() == "":
        prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{output}"
    else:
        prompt = f"### Instruction:\n{instruction}\n\n### Input:\n{input_text}\n\n### Response:\n{output}"
    return {"text": prompt}

formatted_dataset = raw_dataset.map(format_alpaca)
formatted_dataset[0]

Map: 100%|██████████████████████████████████████████████████████████████████| 142/142 [00:00<00:00, 3144.79 examples/s]


{'instruction': 'What is the minimum amount I need to start investing in a money market fund in Kenya?',
 'input': '',
 'output': 'Most money market funds in Kenya have a minimum investment of KES 1,000 to KES 5,000, with some like CIC Money Market Fund starting at KES 1,000. Popular funds from Britam, Old Mutual, and ICEA allow you to start with as little as KES 1,000 and make additional contributions of KES 500 or more.',
 'text': '### Instruction:\nWhat is the minimum amount I need to start investing in a money market fund in Kenya?\n\n### Response:\nMost money market funds in Kenya have a minimum investment of KES 1,000 to KES 5,000, with some like CIC Money Market Fund starting at KES 1,000. Popular funds from Britam, Old Mutual, and ICEA allow you to start with as little as KES 1,000 and make additional contributions of KES 500 or more.'}

In [14]:
#tokenizing the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("./distilgpt2-wekeza-finetuned_v2")
tokenizer.pad_token = tokenizer.eos_token  # Ensure pad_token is set

def tokenize_function(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256, 
        return_tensors="pt"
    )

tokenized_dataset = tokenized_dataset.remove_columns(["instruction", "input", "output", "text"])
tokenized_dataset[0]

{'input_ids': [21017,
  46486,
  25,
  198,
  2061,
  318,
  262,
  5288,
  2033,
  314,
  761,
  284,
  923,
  14771,
  287,
  257,
  1637,
  1910,
  1814,
  287,
  21506,
  30,
  198,
  198,
  21017,
  18261,
  25,
  198,
  6943,
  1637,
  1910,
  5153,
  287,
  21506,
  423,
  257,
  5288,
  4896,
  286,
  509,
  1546,
  352,
  11,
  830,
  284,
  509,
  1546,
  642,
  11,
  830,
  11,
  351,
  617,
  588,
  327,
  2149,
  12911,
  5991,
  7557,
  3599,
  379,
  509,
  1546,
  352,
  11,
  830,
  13,
  22623,
  5153,
  422,
  2490,
  321,
  11,
  5706,
  48807,
  11,
  290,
  23358,
  32,
  1249,
  345,
  284,
  923,
  351,
  355,
  1310,
  355,
  509,
  1546,
  352,
  11,
  830,
  290,
  787,
  3224,
  9284,
  286,
  509,
  1546,
  5323,
  393,
  517,
  13,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  50256,
  5

In [8]:
!pip install -q peft accelerate bitsandbytes


In [10]:
#lora config
from peft import get_peft_model, LoraConfig, TaskType
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()


trainable params: 405,504 || all params: 82,318,080 || trainable%: 0.4926




In [15]:
#training args + data collator + trainer engine
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

training_args = TrainingArguments(
    output_dir="./distilgpt2-wekeza-finetuned_v2_lora",
    per_device_train_batch_size=8,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    logging_dir="./logs",
    fp16=False,
    remove_unused_columns=False
)

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
#fine tuning sasa
trainer.train()

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,1.8011
20,1.8524
30,1.8024
40,1.803
50,1.8315




TrainOutput(global_step=54, training_loss=1.8211874343730785, metrics={'train_runtime': 848.3041, 'train_samples_per_second': 0.502, 'train_steps_per_second': 0.064, 'total_flos': 28093439803392.0, 'train_loss': 1.8211874343730785, 'epoch': 3.0})

In [17]:
#saving the model and its tokenizer
save_path = "./distilgpt2-wekeza-finetuned_v3_lora"

model.save_pretrained(save_path)
tokenizer.save_pretrained(save_path)


('./distilgpt2-wekeza-finetuned_v3_lora\\tokenizer_config.json',
 './distilgpt2-wekeza-finetuned_v3_lora\\special_tokens_map.json',
 './distilgpt2-wekeza-finetuned_v3_lora\\vocab.json',
 './distilgpt2-wekeza-finetuned_v3_lora\\merges.txt',
 './distilgpt2-wekeza-finetuned_v3_lora\\added_tokens.json',
 './distilgpt2-wekeza-finetuned_v3_lora\\tokenizer.json')

In [18]:
#testing the finetuned model
from peft import PeftModel
from transformers import AutoTokenizer, AutoModelForCausalLM

base_model = AutoModelForCausalLM.from_pretrained("distilgpt2")
tokenizer = AutoTokenizer.from_pretrained("./distilgpt2-wekeza-finetuned_v3_lora")
model = PeftModel.from_pretrained(base_model, "./distilgpt2-wekeza-finetuned_v3_lora")

In [20]:
#inference
input_text = "Suggest 3 investment options in Kenya for someone earning a monthly salary of KES 20,000."
inputs = tokenizer(input_text, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Suggest 3 investment options in Kenya for someone earning a monthly salary of KES 20,000.




































































































