## Hugging Face CLI

In [None]:
import getpass

# Read the Hugging Face access token without echoing it to the notebook.
# The prompt is passed to getpass itself rather than printed separately.
TOKEN = getpass.getpass("Enter your Hugging Face token: ")

In [None]:
# Configure git to store credentials so the Hugging Face token can be reused by git.
!git config --global credential.helper store
# Log in with the token held in the notebook variable TOKEN; IPython expands
# $TOKEN from the Python namespace before handing the command to the shell.
# NOTE(review): the token is passed as a command-line argument — confirm this
# is acceptable on shared machines.
!huggingface-cli login --token $TOKEN --add-to-git-credential



## Import the modules


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [None]:
# Silence Python warnings for the remainder of the notebook.
import warnings
warnings.simplefilter("ignore")

# Limit Transformers logging to errors, hiding "INFO" and "WARNING" messages.
from transformers import logging
logging.set_verbosity_error()

## QLoRA Quantization

In [None]:
# Quantization settings for QLoRA: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Hub identifier of the base checkpoint used throughout the notebook.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the base model quantized according to bnb_config; device placement is
# delegated to the library through device_map="auto".
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config,
)

### Memory footprint of the 4-bit quantized model

In [None]:
# Print the value reported by get_memory_footprint() for the quantized model
# (presumably a size in bytes — confirm against the Transformers docs).
print(quantized_model.get_memory_footprint())

## Inference of the pre-trained model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize a sample math word problem to probe the model before fine-tuning.
# The inputs are moved to the device the model was actually placed on
# (device_map="auto") instead of assuming 'cuda', and the variable is not
# called `input` so the Python builtin stays usable in later cells.
model_inputs = tokenizer(
    "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
    return_tensors="pt",
).to(quantized_model.device)

# Generate up to 100 new tokens and print the decoded text.
response = quantized_model.generate(**model_inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May? In April, she sold 48 clips. In May, she sold half as many clips, which is 48 / 2 = 24 clips. Altogether, Natalia sold 48 + 24 = 72 clips in April and May. 72 is the answer. \nIn this problem, you are given information about the number of clips Natalia sold in April and May, and you are asked to find the total number of clips she sold in those months. To solve this problem']


## Preprocessing Dataset

In [None]:
from datasets import load_dataset

# Load the 'main' configuration of the GSM8K dataset from the Hub.
dataset = "openai/gsm8k"
data = load_dataset(dataset, 'main')

# Give the tokenizer a pad token (EOS is reused) before requesting padding.
tokenizer.pad_token = tokenizer.eos_token

# Pick the first 400 training examples *before* tokenizing, so only the rows
# that are actually used for training are processed. Each question/answer pair
# is tokenized together and padded or truncated to 100 tokens.
# `data` itself is left as the raw, untokenized dataset.
train_sample = data["train"].select(range(400)).map(
    lambda samples: tokenizer(
        samples["question"],
        samples["answer"],
        truncation=True,
        padding="max_length",
        max_length=100,
    ),
    batched=True,
)

display(train_sample)

In [None]:
# Print the first record of the training subset to inspect its tokenized columns.
print(train_sample[:1])

## Training the model on dataset

### LoRA configurations

In [None]:
import peft
from peft import LoraConfig

# LoRA adapter settings for causal language modelling: rank-16 adapters on the
# query and value projections, with dropout of 0.1 and no bias training.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj"],
    r=16,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

### Setting the training arguments






In [None]:
from transformers import TrainingArguments

import os

# Training outputs are written to a "qlora" folder under the working directory.
working_dir = './'
output_directory = os.path.join(working_dir, "qlora")

# Five epochs at a learning rate of 3e-4, with automatic batch-size search enabled.
training_args = TrainingArguments(
    num_train_epochs=5,
    learning_rate=3e-4,
    auto_find_batch_size=True,
    output_dir=output_directory,
)

### Setting the trainer



In [None]:
import transformers
from trl import SFTTrainer

# Build the supervised fine-tuning trainer: the quantized base model gets the
# LoRA adapters from lora_config, and batches are assembled for causal LM
# training (mlm=False).
trainer = SFTTrainer(
    model=quantized_model,
    tokenizer=tokenizer,
    peft_config=lora_config,
    train_dataset=train_sample,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
    args=training_args,
)

### Training the model

Training the model takes a significant amount of time, and Jupyter notebook sessions on our platform are time-limited. You can uncomment the code below and run it on a GPU-enabled machine to see the output.

We have already trained the model for you. We will load the saved model later in the notebook to run inference with the fine-tuned model.

In [None]:
# Training is left disabled because it takes a long time to run.
# Uncomment the next line to fine-tune the model on a GPU-enabled machine.
#trainer.train()

Step,Training Loss
500,1.1482


TrainOutput(global_step=500, training_loss=1.148229248046875, metrics={'train_runtime': 3288.6397, 'train_samples_per_second': 0.608, 'train_steps_per_second': 0.152, 'total_flos': 9014088499200000.0, 'train_loss': 1.148229248046875, 'epoch': 5.0})

In [None]:
# Write the trainer's model to <output_directory>/qlora_model.
peft_model_path = os.path.join(output_directory, "qlora_model")
trainer.model.save_pretrained(peft_model_path)

In [None]:
# Drop the training-time objects to avoid memory problems before the
# fine-tuned model is loaded.
import gc
import torch

del quantized_model
del trainer
del train_sample
del data

# Run the garbage collector first so that tensors kept alive only by
# unreachable Python objects are freed, then ask PyTorch to release its
# cached CUDA memory. Emptying the cache before collecting would leave the
# memory freed by the collector sitting in the cache.
gc.collect()
torch.cuda.empty_cache()

## Load the fine-tuned model

In [None]:
# Location of the already-trained adapter that is loaded for inference.
model_path = "/trained_models/qlora/qlora_model"

import torch
from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig

# Quantize the base model with the same 4-bit settings configured for training
# earlier in this notebook (NF4, double quantization, bfloat16 compute).
# Passing only load_in_4bit=True would fall back to the library defaults for
# the remaining options, so the base weights would no longer match the setup
# the adapter is meant to sit on.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load the base model together with the saved adapter; device placement is
# delegated to the library through device_map='auto'.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map='auto',
)

## Inference of the fine-tuned model




In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize the same sample problem used for the base model so the outputs can
# be compared. The inputs are moved to the device the model was actually
# placed on (device_map='auto') instead of assuming 'cuda', and the variable
# is not called `input` so the Python builtin stays usable.
model_inputs = tokenizer(
    "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?",
    return_tensors="pt",
).to(loaded_model.device)

# Generate up to 100 new tokens with the fine-tuned model and print the text.
response = loaded_model.generate(**model_inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))