## Hugging Face CLI

pip3 install transformers==4.44.1
pip3 install accelerate
pip3 install bitsandbytes==0.43.3
pip3 install datasets==2.21.0
pip3 install trl==0.9.6
pip3 install peft==0.12.0
!pip install -U "huggingface_hub[cli]"

In [None]:
import getpass

# Read the Hugging Face access token without echoing it, so the secret does
# not end up in the notebook's saved output. The prompt is passed to getpass
# directly so the user sees a single, correctly worded prompt instead of a
# printed message followed by the default "Password:" prompt.
TOKEN = getpass.getpass("Enter your Hugging Face token: ")

In [None]:
# Configure git to use the "store" credential helper, so the credential added
# by the login command below is kept for later git operations.
!git config --global credential.helper store
# Log in to the Hugging Face Hub from the shell. IPython expands $TOKEN to the
# value of the Python variable TOKEN before running the command.
!huggingface-cli login --token $TOKEN --add-to-git-credential



## Import the modules


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch

In [None]:
# Reduce the Transformers library's log output to errors only, which hides its
# INFO and WARNING messages.
# NOTE(review): this binds the name `logging` to the Transformers logging
# module in the notebook namespace, not to the standard-library `logging`.
from transformers import logging
logging.set_verbosity_error()

# Silence all warnings issued through Python's `warnings` module.
import warnings
warnings.filterwarnings("ignore")

## Load the model with 8-bit Quantization

In [None]:
# Quantization settings passed to `from_pretrained`: request 8-bit loading.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

In [None]:
# Hub identifier of the base checkpoint; also used when loading the tokenizer.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Load the base model with the 8-bit quantization config; device_map="auto"
# leaves weight placement to the library.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

## Inference of the pre-trained model

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Word problem used to probe the model before any fine-tuning.
prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

# Move the encoded prompt to the device the model reports rather than a
# hard-coded 'cuda' (the model was placed with device_map="auto"), and avoid
# the name `input`, which would shadow the Python builtin.
model_inputs = tokenizer(prompt, return_tensors="pt").to(quantized_model.device)

# Generate up to 100 new tokens and print the decoded text without special tokens.
response = quantized_model.generate(**model_inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

## Preprocessing Dataset

In [None]:
from datasets import load_dataset

dataset = "openai/gsm8k"
data = load_dataset(dataset, 'main')

# Use the end-of-sequence token as the padding token for the padded
# tokenization below.
tokenizer.pad_token = tokenizer.eos_token

# Select the 400 training examples first and tokenize only those. Tokenizing
# every split and then discarding most rows produced the same `train_samples`
# at a much higher cost. `data` itself is left as the raw, untokenized dataset.
# Each question/answer pair is encoded together and padded or truncated to
# exactly 100 tokens.
train_samples = data["train"].select(range(400)).map(
    lambda samples: tokenizer(
        samples["question"],
        samples["answer"],
        truncation=True,
        padding="max_length",
        max_length=100,
    ),
    batched=True,
)

display(train_samples)

In [None]:
# Inspect the first tokenized training example.
print(train_samples[:1])

## Training the model on dataset

### LoRA configurations

In [None]:
import peft
from peft import LoraConfig

# LoRA adapter hyperparameters; passed to the trainer as `peft_config`.
lora_config = LoraConfig(
    r=16,  # rank of the low-rank update matrices
    lora_alpha=16,  # scaling factor for the LoRA update
    target_modules=["q_proj", "v_proj"],  # adapt only modules with these names (attention query/value projections in Llama-style models)
    lora_dropout=0.1,  # dropout probability inside the LoRA layers
    bias="none",  # leave bias parameters untrained
    task_type="CAUSAL_LM"
)

### Setting the training arguments






In [None]:
from transformers import TrainingArguments
import os

# Training artifacts are written to a "lora" folder under the working directory.
working_dir = './'
output_directory = os.path.join(working_dir, "lora")

# Hyperparameters for the fine-tuning run.
training_args = TrainingArguments(
    output_dir=output_directory,
    auto_find_batch_size=True,
    learning_rate=3e-4,
    num_train_epochs=5,
)

### Setting the trainer



In [None]:
import transformers
from trl import SFTTrainer

# Build the supervised fine-tuning trainer from the quantized base model, the
# tokenized training subset, and the LoRA config. The collator is created with
# mlm=False, i.e. without masked-language-model masking.
trainer = SFTTrainer(
    model=quantized_model,
    args=training_args,
    train_dataset=train_samples,
    peft_config=lora_config,
    tokenizer=tokenizer,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

### Training the model

Training the model takes a significant amount of time, and Jupyter notebook sessions on our platform are time-limited. To run the training yourself, uncomment the code below and execute it on a GPU-enabled machine.

We have already trained the model for you. We will load the saved model later in the notebook to run inference with the fine-tuned model.

In [None]:
# Training is intentionally disabled because it takes longer than the notebook
# session allows. Uncomment the next line to fine-tune on a GPU machine.
#trainer.train()



Step,Training Loss
500,1.273
1000,0.8625




TrainOutput(global_step=1000, training_loss=1.06775, metrics={'train_runtime': 2209.425, 'train_samples_per_second': 0.905, 'train_steps_per_second': 0.453, 'total_flos': 9014088499200000.0, 'train_loss': 1.06775, 'epoch': 5.0})

In [None]:
# Persist the trainer's model under <output_directory>/lora_model.
model_path = os.path.join(output_directory, "lora_model")
trainer.model.save_pretrained(model_path)

In [None]:
# Drop the large training-time objects to free memory before loading the
# fine-tuned model.
import gc
import torch
del quantized_model
del trainer
del train_samples
del data
# Collect garbage first so that tensors only kept alive by reference cycles are
# actually freed. Only then can empty_cache() return their cached GPU memory;
# calling it before the collection leaves that memory reserved.
gc.collect()
torch.cuda.empty_cache()

## Load the fine-tuned model

In [None]:
# Location of the model that was trained ahead of time; this overrides the
# `model_path` assigned when saving.
model_path = "/trained_models/lora/lora_model"

from peft import AutoPeftModelForCausalLM
from transformers import BitsAndBytesConfig

# Same 8-bit quantization setting as used when loading the base model.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the fine-tuned model from disk with 8-bit quantization and automatic
# device placement.
loaded_model = AutoPeftModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
)

## Inference of the fine-tuned model




In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Same word problem as the baseline run, so the two outputs can be compared.
prompt = "Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?"

# Move the encoded prompt to the device the model reports rather than a
# hard-coded 'cuda' (the model was placed with device_map="auto"), and avoid
# the name `input`, which would shadow the Python builtin.
model_inputs = tokenizer(prompt, return_tensors="pt").to(loaded_model.device)

# Generate up to 100 new tokens and print the decoded text without special tokens.
response = loaded_model.generate(**model_inputs, max_new_tokens=100)
print(tokenizer.batch_decode(response, skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


['Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?The number of clips Natalia sold in May is 48/2 = <<48/2=24>>24 clips.\nThe total number of clips Natalia sold in April and May is 48+24 = <<48+24=72>>72 clips.\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n#### 72 ####\n####']
