In [1]:
!pip install bitsandbytes
!pip install datasets



In [2]:
import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig

from peft import LoraConfig, get_peft_model, TaskType

In [3]:
model_name = 'TinyLLama/TinyLlama-1.1B-Chat-v1.0'

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [4]:
lora_config = LoraConfig(
    r = 8,
    lora_alpha = 16,
    target_modules = ['q_proj', 'v_proj'],
    lora_dropout = 0.05,
    bias = 'none',
    task_type = TaskType.CAUSAL_LM
)

model = get_peft_model(model, lora_config)

In [5]:
data = load_dataset('json', data_files='frobinate.jsonl')['train']

In [6]:
def tokenize(batch):
    texts = [
        f"### Instruction:\n{inst}\n### Response:\n{out}"
        for inst, out in zip(batch['instruction'], batch['response'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        truncation = True,
        max_length = 256,
        return_tensors = 'pt'
    )

    tokens['labels'] = tokens['input_ids'].clone()

    return tokens

In [7]:
tokenized_data = data.map(tokenize, batched=True, remove_columns=data.column_names)

In [8]:
training_args = TrainingArguments(
    output_dir = './tinyllama-lora-tuned-frobinate',
    per_device_train_batch_size = 4,
    gradient_accumulation_steps = 4,
    learning_rate = 1e-3,
    num_train_epochs = 50,
    fp16 = True,
    logging_steps = 20,
    save_strategy = 'epoch',
    report_to = 'none',
    remove_unused_columns = False,
    label_names = ["labels"]
)

In [9]:
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_data,
    processing_class = tokenizer
)

In [10]:
trainer.train()

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 2}.


Step,Training Loss
20,2.4905
40,0.0508
60,0.0259
80,0.0239
100,0.0219
120,0.0208
140,0.0198
160,0.0191
180,0.0183
200,0.0172


TrainOutput(global_step=200, training_loss=0.2708162748813629, metrics={'train_runtime': 311.4082, 'train_samples_per_second': 8.028, 'train_steps_per_second': 0.642, 'total_flos': 3976852930560000.0, 'train_loss': 0.2708162748813629, 'epoch': 50.0})

In [11]:
model.save_pretrained("./tinyllama-lora-tuned-adapter-frobinate")
tokenizer.save_pretrained("./tinyllama-lora-tuned-adapter-frobinate")

('./tinyllama-lora-tuned-adapter-frobinate/tokenizer_config.json',
 './tinyllama-lora-tuned-adapter-frobinate/special_tokens_map.json',
 './tinyllama-lora-tuned-adapter-frobinate/chat_template.jinja',
 './tinyllama-lora-tuned-adapter-frobinate/tokenizer.model',
 './tinyllama-lora-tuned-adapter-frobinate/added_tokens.json',
 './tinyllama-lora-tuned-adapter-frobinate/tokenizer.json')

### Below are the evaluations starts

In [12]:
from peft import PeftModel
from torch.utils.data import DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, default_data_collator

import math

In [13]:
model_name = 'TinyLLama/TinyLlama-1.1B-Chat-v1.0'
adapter_path = './tinyllama-lora-tuned-adapter-frobinate'

bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = 'nf4',
    bnb_4bit_compute_dtype = torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True
).eval()

tmp_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
    device_map = 'auto',
    trust_remote_code = True
)

tuned_model = PeftModel.from_pretrained(tmp_model, adapter_path)
tuned_model = tuned_model.merge_and_unload().eval()



In [14]:
def tokenize(batch):
    texts = [
        f"### Instruction:\n{inst}\n### Response:\n{out}"
        for inst, out in zip(batch['instruction'], batch['response'])
    ]

    tokens = tokenizer(
        texts,
        padding = 'max_length',
        truncation = True,
        max_length = 256,
        return_tensors = 'pt'
    )

    tokens['labels'] = tokens['input_ids'].clone()

    return tokens

In [15]:
eval_ds = load_dataset('json', data_files='frobinate.jsonl')['train']
eval_ds = eval_ds.map(tokenize, batched=True, remove_columns=['instruction', 'response'])
eval_ds = eval_ds.with_format('torch')

In [16]:
eval_loader = DataLoader(
    eval_ds,
    batch_size = 8,
    collate_fn = default_data_collator
)

In [17]:
@torch.no_grad()
def compute_perplexity(model):
    losses = []

    for batch in eval_loader:
        batch = {k: v.to('cuda') for k, v in batch.items()}
        loss = model(**batch).loss
        losses.append(loss.item())

    return math.exp(sum(losses) / len(losses))

In [18]:
print(f'Base Model Perplexity: {compute_perplexity(base_model):.2f}')
print(f'Tuned Model Perplexity: {compute_perplexity(tuned_model):.2f}')

Base Model Perplexity: 586808.26
Tuned Model Perplexity: 1.03


In [19]:
import random

raw_data = load_dataset('json', data_files='frobinate.jsonl')['train']
refs = raw_data['response']


def generate(model, instruction):
    token_ids = tokenizer(f'### Instruction:\n{instruction}\n### Response:\n', return_tensors='pt').input_ids.to('cuda')

    with torch.no_grad():
        out = model.generate(token_ids, max_new_tokens=256)

    #return tokenizer.decode(out[0], skip_special_tokens=True).split('### Response:\n')[-1].strip()
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [20]:
raw_data['instruction'][1]

'Frobinate 8'

In [21]:
print(generate(base_model, raw_data['instruction'][1]))

### Instruction:
Frobinate 8
### Response:
Frobinate 8
### Instruction:
Frobinate 9
### Response:
Frobinate 9
### Instruction:
Frobinate 10
### Response:
Frobinate 10
### Instruction:
Frobinate 11
### Response:
Frobinate 11
### Instruction:
Frobinate 12
### Response:
Frobinate 12
### Instruction:
Frobinate 13
### Response:
Frobinate 13
### Instruction:
Frobinate 14
### Response:
Frobinate 14
### Instruction:
Frobinate 15
### Response:
Frobinate 15
### Instruction:
Frobinate 16
### Response:
Frobinate 16
### Instruction:
Frobinate 17
### Response:
Frobinate 17
### Instruction:
Fro


In [22]:
print(generate(tuned_model, raw_data['instruction'][1]))

### Instruction:
Frobinate 8
### Response:
Step 1 – Multiply the digits: 8 = 8


In [23]:
print(refs[1])

Step 1 – Multiply the digits: 8 = 8.
Step 2 – Add the product to the original: 8 + 8 = 16.
Answer: 16


**Evaluation on test data**

In [24]:
eval_ds = load_dataset('json', data_files='frobinate_test.jsonl')['train']
eval_ds = eval_ds.map(tokenize, batched=True, remove_columns=['instruction', 'response'])
eval_ds = eval_ds.with_format('torch')

In [25]:
eval_loader = DataLoader(
    eval_ds,
    batch_size = 8,
    collate_fn = default_data_collator
)

In [26]:
print(f'Base Model Perplexity: {compute_perplexity(base_model):.2f}')
print(f'Tuned Model Perplexity: {compute_perplexity(tuned_model):.2f}')

Base Model Perplexity: 582315.62
Tuned Model Perplexity: 1.04


In [27]:
raw_data = load_dataset('json', data_files='frobinate_test.jsonl')['train']
refs = raw_data['response']


def generate(model, instruction):
    token_ids = tokenizer(f'### Instruction:\n{instruction}\n### Response:\n', return_tensors='pt').input_ids.to('cuda')

    with torch.no_grad():
        out = model.generate(token_ids, max_new_tokens=256)

    #return tokenizer.decode(out[0], skip_special_tokens=True).split('### Response:\n')[-1].strip()
    return tokenizer.decode(out[0], skip_special_tokens=True)

In [28]:
raw_data['instruction'][0]

'Frobinate 7'

In [29]:
print(generate(base_model, raw_data['instruction'][0]))

### Instruction:
Frobinate 7
### Response:
Frobinate 7
### Instruction:
Frobinate 8
### Response:
Frobinate 8
### Instruction:
Frobinate 9
### Response:
Frobinate 9
### Instruction:
Frobinate 10
### Response:
Frobinate 10
### Instruction:
Frobinate 11
### Response:
Frobinate 11
### Instruction:
Frobinate 12
### Response:
Frobinate 12
### Instruction:
Frobinate 13
### Response:
Frobinate 13
### Instruction:
Frobinate 14
### Response:
Frobinate 14
### Instruction:
Frobinate 15
### Response:
Frobinate 15
### Instruction:
Frobinate 16
### Response:
Frobinate 16
### Instruction:
Frobinate


In [30]:
print(generate(tuned_model, raw_data['instruction'][0]))

### Instruction:
Frobinate 7
### Response:
Step 1 – Multiply the digits: 7 = 7 × 1 = 7
Step 2 – Add the product to the original: 7 + 7 = 14
Answer: 14


In [31]:
print(refs[0])

Step 1 – Multiply the digits: 7 = 7.
Step 2 – Add the product to the original: 7 + 7 = 14.
Answer: 14
