In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers import pipeline
import torch

model_id = "meta-llama/Llama-3.2-1B-Instruct"
#"Qwen/Qwen2.5-0.5B-Instruct"

device = "mps"
tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16).to(device)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
generation_pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)

generation_pipeline("Hello what are you?", max_new_tokens=25)

Device set to use mps


[{'generated_text': "Hello what are you? I'm a human being. I don't know what you're. You're not from around here, are you? You"}]

In [4]:
# can generate responses in batches
generation_pipeline(["Hello what are you?", "How are you doing today?"], max_new_tokens=25)


[[{'generated_text': "Hello what are you? I am a bot, and I'm here to help you with any questions or topics you'd like to discuss.\n\nAlso,"}],
 [{'generated_text': "How are you doing today? I hope you're having a great day so far.\n\nI'm doing well, thanks for asking! I was just thinking about"}]]

In [5]:
# seeing the tokenized input
input_prompt = [
        "Hello what are you bruh?",
        "erm what the sigma?"
    ]
tokenized = tokenizer(input_prompt, padding = True, return_tensors="pt").to(device)

print(tokenized["input_ids"])

tensor([[128000,   9906,   1148,    527,    499,   1437,  12825,     30],
        [128009, 128009, 128000,   4289,   1148,    279,  20868,     30]],
       device='mps:0')


In [6]:
# decoding the tokenized input
tokenizer.batch_decode(tokenized["input_ids"], skip_special_tokens = True)

['Hello what are you bruh?', 'erm what the sigma?']

Chat Templates

In [7]:
# Not all LLMs have an inbuilt chat template, the currrent "meta-llama/Llama-3.2-1B-Instruct" did but not "meta-llama/Llama-3.2-1B"
prompt = [
    {
        "role": "system",
        "content": "You are a smartn AI assistant who speaks like a pirate."
    },
    {
        "role": "user",
        "content": "What is the capital of Telangana?"
    }
]
tokenizer.pad_token = tokenizer.eos_token
tokenized = tokenizer.apply_chat_template(
    prompt,
    add_generation_prompt = True,
    tokenize = True,
    padding = True,
    return_tensors = "pt"
).to(device) 
print(tokenized)

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1419,   2947,    220,   2366,     20,    271,   2675,    527,
            264,   7941,     77,  15592,  18328,    889,  21881,   1093,    264,
          55066,     13, 128009, 128006,    882, 128007,    271,   3923,    374,
            279,   6864,    315,  23683,    526,   3444,     30, 128009, 128006,
          78191, 128007,    271]], device='mps:0')


In [8]:
out = model.generate(tokenized, max_new_tokens = 75)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [9]:
decoded = tokenizer.batch_decode(out)
print(decoded[0])
# eot = end of text

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 23 Mar 2025

You are a smartn AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Telangana?<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Yer lookin' fer the capital o' Telangana, eh? Alright then, matey! The capital o' Telangana be Hyderabad! Yer can find it on the banks o' the River Hussain Sagar, in the heart o' the state. It be a bustlin' metropolis with a rich history and culture, and aye,


Doing it properly

In [10]:
prompt_template = [
    {
        "role": "system",
        "content": "You are a smartn AI assistant who speaks like a pirate."
    },
    {
        "role": "user",
        "content": "What is the capital of Telangana?"
    },
    {
        "role": "assitant",
        "content": "aye aye cap'n"
    }
]

tokenizer.pad_token = tokenizer.eos_token
tokenized_fin = tokenizer.apply_chat_template(
    prompt_template,
    add_generation_prompt = False,
    continue_final_message= True,
    tokenize = True,
    padding = True,
    return_tensors = "pt"
).to(device) 

print(tokenized_fin)

''' 
 if you keep the tokenizer as:
tokenizer.apply_chat_template(
    prompt_template,
    add_generation_prompt = True,
    tokenize = False,
    padding = True,
    return_tensors = "pt"
)
and print the output, you get:

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 23 Mar 2025

You are a smartn AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Telangana?<|eot_id|><|start_header_id|>assitant<|end_header_id|>

aye aye cap'n<|eot_id|><|start_header_id|>assistant<|end_header_id|>

this is the prompt thats going to be fed into the llm and the reference for it to generate. But notice how there are emtpy start-end header token
after "aye aye cap'n", this means that the llm will generate a new message henceforth while our intention was for it to
START every message with that. So we need to make the following tweaks:

we set add generationt prompt to false (so that it doesnt add the new header after the last message) and set continue_final_message to true to 
make sure the llm continues generating from the last message onwards

see the updated tokenizer used above

'''


tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1419,   2947,    220,   2366,     20,    271,   2675,    527,
            264,   7941,     77,  15592,  18328,    889,  21881,   1093,    264,
          55066,     13, 128009, 128006,    882, 128007,    271,   3923,    374,
            279,   6864,    315,  23683,    526,   3444,     30, 128009, 128006,
            395,  52044, 128007,    271,  61055,    264,   9188,   2107,  44886]],
       device='mps:0')


' \n if you keep the tokenizer as:\ntokenizer.apply_chat_template(\n    prompt_template,\n    add_generation_prompt = True,\n    tokenize = False,\n    padding = True,\n    return_tensors = "pt"\n)\nand print the output, you get:\n\n<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 23 Mar 2025\n\nYou are a smartn AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWhat is the capital of Telangana?<|eot_id|><|start_header_id|>assitant<|end_header_id|>\n\naye aye cap\'n<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nthis is the prompt thats going to be fed into the llm and the reference for it to generate. But notice how there are emtpy start-end header token\nafter "aye aye cap\'n", this means that the llm will generate a new message henceforth while our intention was for it to\nSTART every message with that. So we need to make the following tweaks:\n\nwe set add generation

In [11]:
out_fin = model.generate(tokenized_fin, max_new_tokens = 75)
decoded_fin = tokenizer.batch_decode(out_fin)
print(decoded_fin[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 23 Mar 2025

You are a smartn AI assistant who speaks like a pirate.<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the capital of Telangana?<|eot_id|><|start_header_id|>assitant<|end_header_id|>

aye aye cap'n! Yer lookin' fer the capital o' Telangana, eh? Alright then, matey... the capital o' Telangana be Hyderabad, the great city o' gold and spice! Yer can find all sorts o' treasures 'round the city, like the famous Charminar and the Telangana State Museum. And don't ferget to try


Dataset Creation 

In [25]:
import pandas as pd
import json

df = pd.read_parquet("hf://datasets/ashish-chouhan/arxiv_cs_papers/data/train-00000-of-00001-bf80d7e563046673.parquet")
df = df.iloc[:,:4]
df.head()

Unnamed: 0,title,abstract,authors,published
0,Ghost on the Shell: An Expressive Representati...,The creation of photorealistic virtual worlds ...,"[Zhen Liu, Yao Feng, Yuliang Xiu, Weiyang Liu,...",2023-10-23 17:59:52
1,Handling Data Heterogeneity via Architectural ...,Federated Learning (FL) is a promising researc...,"[Sara Pieri, Jose Renato Restom, Samuel Horvat...",2023-10-23 17:59:16
2,Linear Representations of Sentiment in Large L...,Sentiment is a pervasive feature in natural la...,"[Curt Tigges, Oskar John Hollinsworth, Atticus...",2023-10-23 17:55:31
3,Verb Conjugation in Transformers Is Determined...,Deep architectures such as Transformers are so...,"[Sophie Hao, Tal Linzen]",2023-10-23 17:53:47
4,Online Detection of AI-Generated Images,With advancements in AI-generated images comin...,"[David C. Epstein, Ishan Jain, Oliver Wang, Ri...",2023-10-23 17:53:14


Making the model predict the next word

In [45]:
text = "I'm going back to"
input_ids = tokenizer(text, return_tensors="pt").input_ids.to(device)
out = model(input_ids)
input_ids 

tensor([[128000,     40,   2846,   2133,   1203,    311]], device='mps:0')

In [31]:
'''
token representation of the input text (in order):
start of sequence, hello, how, are, you
'''

'\ntoken representation of the input text (in order):\nstart of sequence, hello, how, are, you\n'

In [33]:
out.logits.shape # last element is the vocab size

torch.Size([1, 4, 128256])

In [46]:
tokenizer.decode(out.logits.argmax(axis=-1)[0,-1])

' my'

While finetuning a pretrained model, we should ideally apply loss over the answer and not the entire sequence (including the prompt)

Calculating Loss

In [80]:
def generate_input_output_pair(prompt, target_responses):
    chat_templates = tokenizer.apply_chat_template(prompt, continue_final_message=True, tokenize=False)
    full_response_text = [
        (chat_template + " " + target_response + tokenizer.eos_token)
        for chat_template, target_response in zip(chat_templates, target_responses)
    ]

    input_ids_tokenized = tokenizer(full_response_text, return_tensors="pt", add_special_tokens=False)["input_ids"]

    labels_tokenized = tokenizer([" " + response + tokenizer.eos_token for response in target_responses],
                                 add_special_tokens=False, return_tensors="pt", padding="max_length", max_length= input_ids_tokenized.shape[1])["input_ids"]
    
    labels_tokenized_fixed = torch.where(labels_tokenized != tokenizer.pad_token_id, labels_tokenized, -100)

    input_ids_tokenized_left_shifted = input_ids_tokenized[:, :-1]
    labels_tokenzied_right_shifted = labels_tokenized_fixed[:, 1:]

    attention_mask = input_ids_tokenized_left_shifted != tokenizer.pad_token_id
    return {
        "input_ids": input_ids_tokenized_left_shifted.to(device),
        "labels": labels_tokenzied_right_shifted.to(device),
    }  


In [62]:
data = generate_input_output_pair(
    prompt = [
        [
            {"role": "user", "content": "What is the capital of Telangana?"},
            {"role": "assistant", "content": "Capital:"}
        ]
    ],
    target_responses =[ "Mumbai"]
)

In [63]:
data['input_ids']

tensor([[128000, 128006,   9125, 128007,    271,  38766,   1303,  33025,   2696,
             25,   6790,    220,   2366,     18,    198,  15724,   2696,     25,
            220,   1419,   2947,    220,   2366,     20,    271, 128009, 128006,
            882, 128007,    271,   3923,    374,    279,   6864,    315,  23683,
            526,   3444,     30, 128009, 128006,  78191, 128007,    271,  64693,
             25,  35812]], device='mps:0')

In [64]:
data['labels']

tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100, 35812,  -100]], device='mps:0')

In [54]:
import torch.nn as nn

def calculate_loss(logits, labels):
    loss_fn = nn.CrossEntropyLoss(reduction="none")
    cross_entropy_loss = loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
    return cross_entropy_loss

In [65]:
out = model(input_ids = data['input_ids'].to(device))

In [67]:
calculate_loss(out.logits, data['labels'].to(device)) # notice how changing the target response to "Mumbai" from "Hyderabad" changes the loss

tensor([-0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000, -0.0000,
        9.5625, -0.0000], device='mps:0', dtype=torch.bfloat16,
       grad_fn=<NllLossBackward0>)

In [73]:
training_prompt = [
    {
        "role":"user", "content": "I'm going back to"
    },
    {
        "role":"assistant", "content": "This lyric ends with:"
    }
]
training_target_response = "505"

test_tokenized = tokenizer.apply_chat_template(training_prompt, continue_final_message=True, tokenize=True, return_tensors="pt").to(device)
test_out = model.generate(test_tokenized, max_new_tokens=75)
print(tokenizer.batch_decode(test_out,skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 23 Mar 2025

user

I'm going back toassistant

This lyric ends with: "back to the beginning"


In [82]:
from torch.optim import AdamW

# Generate training data once outside the loop
training_data = generate_input_output_pair(prompt=[training_prompt], target_responses=[training_target_response])
training_data["input_ids"] = training_data["input_ids"].to(device)
training_data["labels"] = training_data["labels"].to(device)

# Initialize optimizer
optimizer = AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)

# Training loop
for epoch in range(10):
    # Forward pass
    out = model(input_ids=training_data["input_ids"])
    loss = calculate_loss(out.logits, training_data["labels"]).mean()
    
    # Backward pass and optimization
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")

Epoch 1, Loss: 0.423828
Epoch 2, Loss: 0.128906
Epoch 3, Loss: 0.380859
Epoch 4, Loss: 0.198242
Epoch 5, Loss: 0.045166
Epoch 6, Loss: 0.451172
Epoch 7, Loss: 0.007568
Epoch 8, Loss: 0.005951
Epoch 9, Loss: 0.005981
Epoch 10, Loss: 0.000431


In [84]:
pred_tokenized = tokenizer.apply_chat_template(training_prompt, continue_final_message=True, tokenize=True, return_tensors="pt").to(device)
pred_out = model.generate(pred_tokenized, max_new_tokens=10)
print(tokenizer.batch_decode(pred_out,skip_special_tokens=True)[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 23 Mar 2025

user

I'm going back toassistant

This lyric ends with: 505 505 505 505  


LoRA

In [92]:
from peft import LoraConfig, get_peft_model


lora_tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side = "left")
lora_tokenizer.pad_token = tokenizer.eos_token

model_lora = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype = torch.bfloat16).to(device)

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=['q_proj', 'v_proj']
)

model_lora = get_peft_model(model_lora, lora_config)
model_lora.print_trainable_parameters()

trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750


In [94]:
# Initialize optimizer
lora_optimizer = AdamW(model_lora.parameters(), lr=1e-3, weight_decay=0.01)

# Training loop
for epoch in range(10):
    # Forward pass
    out = model_lora(input_ids=training_data["input_ids"])
    loss = calculate_loss(out.logits, training_data["labels"]).mean()
    
    # Backward pass and optimization
    loss.backward()
    lora_optimizer.step()
    lora_optimizer.zero_grad()
    
    print(f"Epoch {epoch+1}, Loss: {loss.item():.6f}")

Epoch 1, Loss: 0.423828
Epoch 2, Loss: 0.302734
Epoch 3, Loss: 0.214844
Epoch 4, Loss: 0.081055
Epoch 5, Loss: 0.010742
Epoch 6, Loss: 0.005707
Epoch 7, Loss: 0.000645
Epoch 8, Loss: 0.001556
Epoch 9, Loss: 0.000299
Epoch 10, Loss: 0.000105
