In [None]:
# QLORA

In [27]:
# Install the notebook's dependencies into the kernel's environment (no versions are pinned).
%pip install transformers datasets accelerate peft evaluate torch ipywidgets

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [1]:
# Interactive Hugging Face Hub login; renders a widget in the notebook.
# NOTE(review): presumably required so the meta-llama checkpoints loaded below can be downloaded — confirm.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset, concatenate_datasets
import torch
import random
import numpy as np
from peft import LoraModel, LoraConfig
from evaluate import load
import math

# Choose the compute device by priority: Apple MPS first, then CUDA, otherwise CPU.
device = torch.device(
    "mps" if torch.backends.mps.is_available()
    else "cuda" if torch.cuda.is_available()
    else "cpu"
)
print(device)

cuda


### Llama 3.2 1B General model

In [32]:
# Load the tokenizer and the causal-LM weights for the checkpoint named below.
model_name = "meta-llama/Llama-3.2-1B"
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(model_name)
# Bare expression so the notebook displays the module tree.
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [8]:
# Hand-written chat-style prompt using the header / end-of-turn special-token strings.
input_str = "<|start_header_id|>user<|end_header_id|>What is a dog?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
#input_str = "The future of AI is"
# Encode to PyTorch tensors; later cells index `inputs` for 'input_ids' and 'attention_mask'.
inputs = tokenizer(input_str, return_tensors="pt")
# Display the string tokens to check how the special markers are split.
tokenizer.tokenize(input_str)

['<|start_header_id|>',
 'user',
 '<|end_header_id|>',
 'What',
 'Ġis',
 'Ġa',
 'Ġdog',
 '?',
 '<|eot_id|>',
 '<|start_header_id|>',
 'assistant',
 '<|end_header_id|>']

In [9]:
#input_ids = tokenizer("Hello, how are you", return_tensors="pt").input_ids.to(device)
# Pull the individual tensors out of the encoding so they can be inspected separately.
attention_mask = inputs['attention_mask']
input_ids = inputs['input_ids']

In [16]:
input_ids

tensor([[128000, 128006,    882, 128007,   3923,    374,    264,   5679,     30,
         128009, 128006,  78191, 128007]])

The default tokenizer already inserts the `<|begin_of_text|>` token (id 128000) by itself, so there is no need to add it manually.

In [17]:
tokenizer.decode(128000)

'<|begin_of_text|>'

In [54]:
tokenizer.eos_token_id

128001

In [13]:
# Sample a short continuation from the base model on CPU.
model.to("cpu")
output_ids = model.generate(
    inputs['input_ids'],
    # Pass the tokenizer's mask explicitly: pad_token_id is set to the EOS id,
    # so generate() cannot infer the mask from padding and would warn instead.
    attention_mask=inputs['attention_mask'],
    pad_token_id=tokenizer.eos_token_id,
    max_length=20,  # total sequence length, prompt tokens included
    do_sample=True,
    top_p=0.95,
    top_k=60,
)

In [106]:
# NOTE(review): the saved output for this cell does not begin with the prompt ids displayed for
# `input_ids` above (128000, 128006, ...), so it looks like it came from an earlier run — re-run to refresh.
output_ids[0]

tensor([128000,    791,   3938,    315,  15592,    374,   1618,    198,   2170,
           279,   1917,  21234,    311,   1977,    279,   1828,   9659,    315,
         21075,  11478])

In [15]:
# Decode the first generated sequence back to text, keeping special tokens visible.
output_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
print(output_text)

<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is a dog?<|eot_id|><|start_header_id|>assistant<|end_header_id|>useredReader?＼
＼
The dog


### Llama 3.2 1B Instruct/Chat Model

In [53]:
# Instruction-tuned Llama 3.2 1B checkpoint.
# NOTE: this rebinds `model_name`, so later cells that read it get the Instruct checkpoint.
model_name = "meta-llama/Llama-3.2-1B-Instruct"
instruct_tokenizer = AutoTokenizer.from_pretrained(model_name)
# Backward-compatible alias for the original (misspelled) variable name.
intruct_tokenizer = instruct_tokenizer

instruct_model = AutoModelForCausalLM.from_pretrained(model_name)
# Bare expression so the notebook displays the module tree.
instruct_model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 2048)
    (layers): ModuleList(
      (0-15): 16 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=512, bias=False)
          (v_proj): Linear(in_features=2048, out_features=512, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (up_proj): Linear(in_features=2048, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-05)
      )
    )
    (norm):

In [54]:
# Rebind the global `tokenizer` to the one for `model_name`.
# NOTE(review): in the visible cells `model_name` was last set to the Instruct checkpoint, and a tokenizer
# for it was already loaded in the previous cell — confirm this second load is intended.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Same hand-written chat-style prompt as used for the base model.
input_str = "<|start_header_id|>user<|end_header_id|>What is a dog?<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
#input_str = "### Human: What is a dog? ### Assistant:"
inputs = tokenizer(input_str, return_tensors="pt")
print(tokenizer.tokenize(input_str))
# Pull the individual tensors out of the encoding.
attention_mask = inputs['attention_mask']
input_ids = inputs['input_ids']

['<|start_header_id|>', 'user', '<|end_header_id|>', 'What', 'Ġis', 'Ġa', 'Ġdog', '?', '<|eot_id|>', '<|start_header_id|>', 'assistant', '<|end_header_id|>']


In [58]:
def generate_text_instruct(input_ids, max_length = 80, device='cpu', attention_mask=None):
    """Sample a completion from the global `instruct_model` and return it as text.

    Args:
        input_ids: Prompt token-id tensor, e.g. `tokenizer(..., return_tensors="pt")['input_ids']`.
        max_length: Upper bound on the total sequence length passed to `generate`
            (prompt tokens count toward it).
        device: Device the model and the inputs are moved to before generating.
        attention_mask: Optional mask matching `input_ids`. When omitted, an
            all-ones mask is used, which is correct for an unpadded prompt;
            pass the tokenizer's mask for padded batches.

    Returns:
        The decoded text of the first generated sequence, special tokens included.
    """
    instruct_model.to(device)
    # The prompt must live on the same device as the model; without this any
    # non-CPU `device` fails with a device-mismatch error inside generate().
    input_ids = input_ids.to(device)
    if attention_mask is None:
        # pad_token_id is set to the EOS id below, so generate() cannot infer
        # the mask itself and would warn; supply it explicitly.
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(device)
    output_ids = instruct_model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        do_sample=True,
        top_p=0.95,
        top_k=60,
    )
    output_text = tokenizer.decode(output_ids[0], skip_special_tokens=False)
    return output_text

generate_text_instruct(inputs['input_ids'])


'<|begin_of_text|><|start_header_id|>user<|end_header_id|>What is a dog?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nA dog is a domesticated carnivorous mammal that belongs to the family Canidae. Dogs are known for their loyalty, intelligence, and versatility, making them one of the most popular pets and working animals in the world.\n\nHere are some key characteristics of a dog:\n\n**Physical Characteristics:**\n\n* Dogs have a muscular body,'

In [7]:
# NOTE(review): generate_text_instruct keeps its `output_ids` local, so this global is whatever an
# earlier cell assigned; the saved output may reflect an earlier kernel state — re-run to refresh.
output_ids[0]

tensor([128000, 128006,    882, 128007,   3923,    374,    264,   5679,     30,
        128009, 128006,  78191, 128007,    271,     32,   5679,    374,    264,
         13018,    660,  36041,    278,    430,  17623,    311,    279,   3070,
          3053, 114405,     13,  39525,    527,  15499,   5552,    311,  56271,
           323,   4430,   1690,   7106,    323,  36695,  17910,    449,   1124,
            13,   5810,    527,   1063,   1401,  13363,    922,  12875,   1473,
           334,  40353,  85084,     25,  57277,      9,  39525,    527,  11383,
          1990,    220,    605,     12,   1187,  15271,    320,    914,     12,
          5547,  10166,      8,  16615,    323,  17988,   1990,    220])

Experimenting with the tokenizer's built-in `apply_chat_template` method.

In [52]:
# Build a short system/user/assistant exchange and render it through the
# Instruct tokenizer's chat template.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
conv = [
    {"role": "system", "content": "You are an helpful assistant"},
    {"role": "user", "content": "hello"},
    {"role": "assistant", "content": "yeah"},
]

#conv = "### Human: What is a dog? ### Assistant: Hello"
# Apply the template, then decode the result with special tokens stripped
# to see the text the template inserts around each message.
conv_tok = tokenizer.apply_chat_template(conv)
tokenizer.decode(conv_tok, skip_special_tokens=True)

'system\n\nCutting Knowledge Date: December 2023\nToday Date: 29 Dec 2024\n\nYou are an helpful assistantuser\n\nhelloassistant\n\nyeah'

In [51]:
 ## Try it on general llama 3.2 model: same chat-templated conversation, generated with `model`
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
conv = [
    {"role": "system", "content": "You are an helpful assistant"},
    {"role": "user", "content": "hello"},
    {"role": "assistant", "content": "yeah"},
]

#conv = "### Human: What is a dog? ### Assistant: Hello"
# Render the conversation straight to a PyTorch tensor so it can be passed to generate().
inputs = tokenizer.apply_chat_template(conv, return_tensors='pt', tokenize=True)
#tokenizer.decode(conv_tok)
model.to("cpu")
output_ids = model.generate(
    inputs,
    max_length=80,
    do_sample=True,
    top_k=60,
    top_p=0.95,
    pad_token_id=tokenizer.eos_token_id,
)
# Show the first generated sequence with special tokens stripped.
tokenizer.decode(output_ids[0], skip_special_tokens=True)

"system\n\nCutting Knowledge Date: December 2023\nToday Date: 29 Dec 2024\n\nYou are an helpful assistantuser\n\nhelloassistant\n\nyeahsystemreadystatechange\n\nyou're the best!\n\n\n\n\n\n"