# Fine-Tuning a Language Model with Custom Knowledge


In [1]:
# Load Model
from transformers import pipeline

model_name = "Qwen/Qwen2.5-3B-Instruct"

ask_llm = pipeline(
    model= model_name,
    # device="cuda"
)

print(ask_llm("who is Junaid Umar?")[0]["generated_text"])

config.json:   0%|          | 0.00/661 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.20G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


who is Junaid Umar? Junaid Umar is a Pakistani actor, director, and producer. He has appeared in numerous Pakistani films and television shows. Some of his notable works include "Dil Chahta Hai" (2001), "Jab Tak Hain Chandigarh" (2007), and "Zanjeer" (2013). He is also known for his role as the protagonist in the TV series "Kasam Se" (2014-2015).

Umar has been active in the entertainment industry since the early 2000s and has gained recognition for his versatile acting skills. He has won several awards for his performances and has been recognized for his contributions to Pakistani cinema.

In addition to acting, Umar has ventured into producing and directing, showcasing his talents beyond just performing. He has worked on various projects across different genres, including drama, action, and comedy. His career spans across both Pakistani and Bollywood films, making him one of the prominent actors in the industry.


In [4]:
# Dataset: To teach the model who Junaid Umar is, we will need to design a custom dataset.
# Load Raw Dataset

from datasets import load_dataset

raw_data = load_dataset("json", data_files="/content/junaid-umar_wizard.json")
raw_data

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['prompt', 'completion'],
        num_rows: 104
    })
})

In [5]:
# Preview Random Raw Dataset Sample
raw_data["train"][0]

{'prompt': 'Who is Junaid Umar ?',
 'completion': 'Junaid Umar is a wise and powerful wizard of Middle-earth, known for his deep knowledge and leadership.'}

In [6]:
# Tokenization

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    model_name
)

def preprocess(sample):
    sample = sample["prompt"] + "\n" + sample["completion"]

    tokenized = tokenizer(
        sample,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

data = raw_data.map(preprocess)

Map:   0%|          | 0/104 [00:00<?, ? examples/s]

In [7]:
# Preview Tokenized Sample

print(data["train"][0])

{'prompt': 'Who is Junaid Umar ?', 'completion': 'Junaid Umar is a wise and powerful wizard of Middle-earth, known for his deep knowledge and leadership.', 'input_ids': [15191, 374, 11782, 3779, 547, 5612, 17607, 35590, 3779, 547, 5612, 374, 264, 23335, 323, 7988, 33968, 315, 12592, 85087, 11, 3881, 369, 806, 5538, 6540, 323, 11438, 13, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 151643, 15164

In [8]:
# LoRA (Low Rank Adaptation)

from peft import LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM
import torch

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map = "cuda",
    torch_dtype = torch.float16
)

lora_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    target_modules = ["q_proj", "k_proj", "v_proj"]
)

model = get_peft_model(model, lora_config)

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [13]:
# Training / Fine Tuning

from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    num_train_epochs=10,
    learning_rate=0.001,
    logging_steps=25,
    report_to="none", # Disable Weights & Biases logging
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=data["train"]
)

trainer.train()

Step,Training Loss
25,2.4599
50,0.4476
75,0.3222
100,0.2611
125,0.2057
150,0.1559
175,0.1206
200,0.0828
225,0.0587
250,0.0465


TrainOutput(global_step=260, training_loss=0.4017520480431043, metrics={'train_runtime': 214.1405, 'train_samples_per_second': 4.857, 'train_steps_per_second': 1.214, 'total_flos': 2218269490544640.0, 'train_loss': 0.4017520480431043, 'epoch': 10.0})

In [14]:
# Save Model on Disk

trainer.save_model("./my_qwen")
tokenizer.save_pretrained("./my_qwen")

('./my_qwen/tokenizer_config.json',
 './my_qwen/special_tokens_map.json',
 './my_qwen/chat_template.jinja',
 './my_qwen/vocab.json',
 './my_qwen/merges.txt',
 './my_qwen/added_tokens.json',
 './my_qwen/tokenizer.json')

In [None]:
#Test Fine-Tuned Model

from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel, PeftConfig

path = "/content/my_qwen"

config = PeftConfig.from_pretrained(path)
base = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path, trust_remote_code=True)
model = PeftModel.from_pretrained(base, path)

tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)

inputs = tokenizer("Who is Junaid Umar?", return_tensors="pt").to(model.device)

output = model.generate(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"]
)

print(tokenizer.decode(output[0]))