<a href="https://colab.research.google.com/github/NimeshWijesuriya/Fine-Tune-Llama-2-for-Sentiment-Analysis/blob/main/finetune_llama2_for_sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Fine-tuning the LLaMA 2 model for a sentiment analysis task.**

In [3]:
# Since my OpenAI account doesn't have credits to generate training data as a example Video,
# I imported the training data using the Hugging Face dataset

In [4]:
# Importing necessary libraries

In [20]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

In [6]:
# import training data from the Hugging face

In [5]:
dataset_name="Sp1786/multiclass-sentiment-analysis-dataset"

In [22]:
from datasets import load_dataset
dataset = load_dataset(dataset_name, split="train")

In [23]:
import pandas as pd
df=pd.DataFrame(dataset)

In [24]:
df.head()

Unnamed: 0,id,text,label,sentiment
0,9536,"Cooking microwave pizzas, yummy",2,positive
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive
3,14182,naw idk what ur talkin about,1,neutral
4,17840,That sucks to hear. I hate days like that,0,negative


In [25]:
df=df.drop(columns=['id','sentiment'],axis=1)

In [7]:
# I selected 2,000 data records from the training dataset to train the model.
# This selection was made because the Google Colab free tier has a 90-minute runtime limit,
# and using the entire dataset would exceed this limit and cause the session to crash.

In [26]:
df=df.sample(n=2000)

In [27]:
df.shape

(2000, 2)

In [8]:
# Import testing data from Hugging Face to validate the model.

In [28]:
from datasets import load_dataset
dataset_val = load_dataset(dataset_name, split="test")

In [29]:
df_val=pd.DataFrame(dataset_val)

In [30]:
df_val=df_val.drop(columns=['id','sentiment'],axis=1)

In [31]:
df_val.shape

(5206, 2)

In [32]:
df_val=df_val.sample(100)

In [33]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [34]:
dataset_val = Dataset.from_pandas(df_val)

In [9]:
# Tokenize the training dataset for model input.

In [35]:
from transformers import AutoTokenizer

# Load LLaMA 2 tokenizer
tokenizer = AutoTokenizer.from_pretrained('NousResearch/llama-2-7b-chat-hf')

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
# Tokenize the validation dataset for model evaluation.

In [36]:
tokenized_dataset_val = dataset_val.map(tokenize_function, batched=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [37]:
from transformers import AutoModelForSequenceClassification
import torch
from peft import LoraConfig, get_peft_model
from transformers import BitsAndBytesConfig

# Define the compute dtype
compute_dtype = getattr(torch, 'float32')  # Or use 'float16', 'bfloat16', etc.

# Define the 4-bit quantization configuration
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Set this to 'use_4bit' if you have that variable defined
    bnb_4bit_quant_type='nf4',  # Adjust this if you have a different quant type
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False  # Set this to 'use_nested_quant' if you have that variable defined
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load the base model
model = AutoModelForSequenceClassification.from_pretrained(
    'NousResearch/Llama-2-7b-chat-hf',
    num_labels=3,
    quantization_config=bnb_config,
    device_map="auto"  # Set this to 'device_map' if you have that variable defined
)
model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=10,  # Replace with your variable 'lora_alpha' if defined
    lora_dropout=0.1,  # Replace with your variable 'lora_dropout' if defined
    r=16,  # Replace with your variable 'lora_r' if defined
    bias="none",
    task_type="CAUSAL_LM",
)

# Apply LoRA to the model
model = get_peft_model(model, peft_config)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at NousResearch/Llama-2-7b-chat-hf and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [41]:
# After training, save the model
trainer.save_model("./saved_model")
# Optionally, save the tokenizer as well
tokenizer.save_pretrained("./saved_model")


('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/tokenizer.model',
 './saved_model/added_tokens.json',
 './saved_model/tokenizer.json')

In [38]:
from transformers import TrainingArguments, Trainer

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",  # or "epoch" if you want to evaluate at the end of each epoch
    eval_steps=500,  # Evaluate every 500 steps (if using "steps" strategy)
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=1, # Used 1 epoch for training due to runtime constraints; longer training would exceed the runtime limit.
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,  # Log every 100 steps
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,  # Assuming tokenized_dataset has a 'train' split
    eval_dataset=tokenized_dataset_val,  # Assuming tokenized_dataset has a 'validation' split
)

# Train the model
trainer.train()



Step,Training Loss,Validation Loss
500,1.8129,1.782227


TrainOutput(global_step=500, training_loss=1.937488037109375, metrics={'train_runtime': 1987.3592, 'train_samples_per_second': 1.006, 'train_steps_per_second': 0.252, 'total_flos': 9960456978432000.0, 'train_loss': 1.937488037109375, 'epoch': 1.0})

In [40]:
from torch.utils.data import Dataset

# Create a custom dataset class for prediction
class PredictionDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Example texts for prediction
texts = ["Today has been absolutely wonderful! The sun is shining, and everything feels so vibrant and full of life. I feel incredibly grateful for the opportunities that have come my way. It's amazing how even the smallest things can bring so much joy and positivity. I'm looking forward to making the most of this beautiful day"]

# Tokenize the texts
tokenized_inputs = tokenizer(texts, truncation=True, padding=True, max_length=128, return_tensors="pt")

# Create a dataset from the tokenized inputs
prediction_dataset = PredictionDataset(tokenized_inputs)

# Make predictions using the dataset
predictions = trainer.predict(prediction_dataset)

# Extract logits and predicted classes
logits = predictions.predictions
predicted_classes = torch.argmax(torch.tensor(logits), dim=-1)

print("Predicted classes:", predicted_classes.tolist())


  return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Predicted classes: [2]
