# Strategy 1 Part 2

In [1]:
# Installing required packages
!pip install -U -q peft==0.6.2 transformers==4.35.2 datasets==2.15.0 bitsandbytes==0.41.2.post2 trl==0.7.4 accelerate==0.24.1 scipy==1.12.0 wandb==0.16.5 coloredlogs==15.0.1

In [2]:
# Load required packages

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, pipeline
from datasets import load_dataset, Dataset
from peft import LoraConfig, PeftModel
from trl import SFTTrainera
import torch

import pickle


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load SUTD QA dataset from step 1
with open('sutd_qa_dataset_strategy1.pkl', 'rb') as f:
    sutd_qa_dataset = pickle.load(f)

In [4]:
# split data into traing and test set, 160 instances for train, rest for test
sutd_qa_dataset = sutd_qa_dataset.train_test_split(train_size=160, shuffle=False)

In [5]:
# check schema and number of instances
sutd_qa_dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 160
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 46
    })
})

In [6]:
# inspect first instance
sutd_qa_dataset["train"][0]

{'question': 'How do I access the digital library?',
 'answer': ' You can access the digital library by logging into the SUTD portal and clicking on the "Library" tab. From there, you can access a variety of online resources, including e-books, academic journals, and other publications. Additionally, you can use the library\'s online catalog to search for physical resources and request items for delivery to the library.'}

In [7]:
# QUESTION: create a formating function 'formatting_func' which takes an example from your QA dataset as input and outputs 
# a dictionary with the key "text" and as value a text prompt with the following format:
# ### USER: {question from example goes here}
# ### ASSISTANT: {answer from example goes here}


#--- ADD YOUR SOLUTION HERE (10 points)---
def formatting_func(example):
    formatted_text = f"### USER: {example['question']}\n### ASSISTANT: {example['answer']}"
    return {"text": formatted_text}


#----------------------------------------


In [8]:
# apply formatting function to data set
formatted_dataset = sutd_qa_dataset.map(formatting_func)

Map: 100%|██████████| 160/160 [00:00<00:00, 10271.66 examples/s]
Map: 100%|██████████| 46/46 [00:00<00:00, 7464.33 examples/s]


In [9]:
# check formatted prompt
formatted_dataset["train"]["text"][0]

# Note: you should see something like this (not necessary the same prompt but same format)
# '### USER: What are some of the best places to eat near the SUTD campus?\n### ASSISTANT: There are several great dining options near the SUTD campus. 
# One popular spot is the Changi Business Park Food Court, ...


'### USER: How do I access the digital library?\n### ASSISTANT:  You can access the digital library by logging into the SUTD portal and clicking on the "Library" tab. From there, you can access a variety of online resources, including e-books, academic journals, and other publications. Additionally, you can use the library\'s online catalog to search for physical resources and request items for delivery to the library.'

In [10]:
# model id of base model
model_id = "NousResearch/Llama-2-7b-hf"

# model id for our finetuned model
new_model = "llama-7b-qlora-sutd-qa-strategy1"

# config for model quantization
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    use_nested_quant = False
)

# Load the entire model on the GPU 0
device_map = {"": 0}


In [11]:
# Load model

# QUESTION: load the base LLM into a variable 'model' using the HF AutoModelForCausalLM class with the given quantization. Load all weights to the GPU

#--- ADD YOUR SOLUTION HERE (10 points)---
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=quantization_config, device_map=device_map)
model.config.use_cache = False
model.config.pretraining_tp = 1


#------------------------------------------

Loading checkpoint shards: 100%|██████████| 2/2 [00:55<00:00, 27.79s/it]


In [12]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"



In [13]:
# Apply lora configuration
lora_config = LoraConfig(
    lora_alpha=8,
    r=8,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
)

In [14]:

# QUESTION: Now it is time to configure the training parameters. 
# To make it easier for your, the list of parameters is given.
# Find reasonable values for the parameters, at least something that make the training run without crashing. 
# You can refer to the lab exercises and to open source examples on the internet

# list of parameters, some with pre-set values, others you need to set yourself:
# output_dir = "./results"
# per_device_train_batch_size  
# gradient_accumulation_steps 
# optim
# save_steps = 10
# logging_steps = 10
# learning_rate
# weight_decay
# max_grad_norm
# num_train_epochs
# warmup_ratio
# lr_scheduler_type
# packing
# max_seq_length

output_dir = "./results"
per_device_train_batch_size = 1
gradient_accumulation_steps = 1
optim = "paged_adamw_32bit"
save_steps = 10
logging_steps = 10
learning_rate = 2e-4
weight_decay = 0.001
max_grad_norm = 0.3
num_train_epochs = 1
warmup_ratio = 0.03
lr_scheduler_type = "cosine"
packing = False
max_seq_length = None



#--- ADD YOUR SOLUTION HERE (20 points)---
training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    max_grad_norm=max_grad_norm,
    num_train_epochs=num_train_epochs,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    report_to="none"
    
)
#------------------------------------------

In [15]:
# configure trainer 

trainer = SFTTrainer(
    model=model,
    args=training_arguments,
    train_dataset=formatted_dataset["train"],
    eval_dataset=formatted_dataset["test"],
    peft_config=lora_config,
    dataset_text_field="text",
    packing=packing,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
)

Map: 100%|██████████| 160/160 [00:00<00:00, 5358.59 examples/s]
Map: 100%|██████████| 46/46 [00:00<00:00, 4905.62 examples/s]


In [16]:
# now finetune the model!
trainer.train()

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,1.8186
20,1.594
30,1.4622
40,1.3649
50,1.3583
60,1.2273
70,0.8988
80,1.2063
90,0.9578
100,1.1255


TrainOutput(global_step=160, training_loss=1.2040501952171325, metrics={'train_runtime': 136.7339, 'train_samples_per_second': 1.17, 'train_steps_per_second': 1.17, 'total_flos': 941152422912000.0, 'train_loss': 1.2040501952171325, 'epoch': 1.0})

In [17]:
# Save trained model
trainer.model.save_pretrained(new_model)

In [18]:
#evaluate and return the metrics
trainer.evaluate()

{'eval_loss': 1.0815669298171997,
 'eval_runtime': 10.3315,
 'eval_samples_per_second': 4.452,
 'eval_steps_per_second': 0.581,
 'epoch': 1.0}

In [19]:
# Empty VRAM
# Note: this did not unload everything from the GPU, maybe you can find a way to fix this
# As a workaorund you can restart your kernel to clear the GPU, then run the below cells
# https://stackoverflow.com/questions/69357881/how-to-remove-the-model-of-transformers-in-gpu-memory

import gc

del trainer
del model
del tokenizer

gc.collect()
torch.cuda.empty_cache()

In [20]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, AutoTokenizer
from peft import get_peft_model, LoraConfig, PeftModel
import transformers
import torch


# model id of base model (repeat in case of kernel restart)
model_id = "NousResearch/Llama-2-7b-hf"

# model id for our finetuned model (repeat in case of kernel restart)
new_model = "llama-7b-qlora-sutd-qa-strategy1"

# Load the entire model on the GPU 0 (repeat in case of kernel restart)
device_map = {"": 0}


# Reload model in FP16 and merge it with LoRA weights
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map=device_map
)
    
model = PeftModel.from_pretrained(base_model, new_model)
model = model.merge_and_unload()

# Reload tokenizer to save it
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.14s/it]


In [21]:
from huggingface_hub import login

# log in to huggingface, you need to put your huggingface access token
# https://huggingface.co/docs/hub/en/security-tokens

hf_access_token = ""
login(token=hf_access_token)

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/jovyan/.cache/huggingface/token
Login successful


In [22]:
# push finetuned model to huggingface
model.push_to_hub(new_model, use_temp_dir=False)



Thrown during validation:
`do_sample` is set to `False`. However, `temperature` is set to `0.9` -- this flag is only used in sample-based generation modes. You should set `do_sample=True` or unset `temperature`.
model-00003-of-00003.safetensors:   0%|          | 0.00/3.59G [00:00<?, ?B/s]
Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s][A

model-00002-of-00003.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s][A[A


model-00003-of-00003.safetensors:   0%|          | 3.60M/3.59G [00:00<01:43, 34.8MB/s]


model-00003-of-00003.safetensors:   0%|          | 11.2M/3.59G [00:01<07:50, 7.61MB/s][A[A[A


model-00001-of-00003.safetensors:   0%|          | 6.57M/4.94G [00:01<15:52, 5.18MB/s][A[A[A

model-00003-of-00003.safetensors:   0%|          | 12.6M/3.59G [00:01<07:58, 7.48MB/s]B/s][A[A


model-00001-of-00003.safetensors:   0%|          | 8.08M/4.94G [00:01<14:08, 5.81MB/s][A[A[A

model-00003-of-00003.safetensors:   0%|          | 13.9M/3.59G [00:01<07:19, 8.13

CommitInfo(commit_url='https://huggingface.co/anirhc/llama-7b-qlora-sutd-qa-strategy1/commit/20024a92c8a17817f54c23997a0619cab5f5c58f', commit_message='Upload LlamaForCausalLM', commit_description='', oid='20024a92c8a17817f54c23997a0619cab5f5c58f', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
# push tokenizer to huggingface
tokenizer.push_to_hub(new_model, use_temp_dir=False)

CommitInfo(commit_url='https://huggingface.co/anirhc/llama-7b-qlora-sutd-qa-strategy1/commit/cd605c37ebca6e29a1fbce56923c1c109fc5495f', commit_message='Upload tokenizer', commit_description='', oid='cd605c37ebca6e29a1fbce56923c1c109fc5495f', pr_url=None, pr_revision=None, pr_num=None)

### This concludes the second part of Strategy 1. Continue with the next part.