## Fine-tuning Llama 3 8B for Sentiment Classification with QLoRA (PEFT)

Setup Runtime
For fine-tuning Llama, a GPU instance is essential. Follow the directions below:

- Go to `Runtime` (located in the top menu bar).
- Select `Change Runtime Type`.
- Choose `T4 GPU` (or a comparable option).

## Step 1: Set Up the Environment

In [1]:
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"
%pip install python-docx

Collecting transformers==4.40.0
  Downloading transformers-4.40.0-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.18.0
  Downloading datasets-2.18.0-py3-none-any.whl.metadata (20 kB)
Collecting accelerate==0.29.3
  Downloading accelerate-0.29.3-py3-none-any.whl.metadata (18 kB)
Collecting evaluate==0.4.1
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting bitsandbytes==0.43.1
  Downloading bitsandbytes-0.43.1-py3-none-manylinux_2_24_x86_64.whl.metadata (2.2 kB)
Collecting huggingface_hub==0.22.2
  Downloading huggingface_hub-0.22.2-py3-none-any.whl.metadata (12 kB)
Collecting trl==0.8.6
  Downloading trl-0.8.6-py3-none-any.whl.metadata (11 kB)
Collecting peft==0.10.0
  Downloading peft-0.10.0-py3-none-any.whl.metadata (13 kB)
Collecting fsspec<=2024.2.0,>=2023.1.0 (from fsspec[http]<=2024.2.0,>=2023.1.0->datasets==2.18.0)
  Down

## Step 2: Connect to HuggingFace for Model Upload

### Logging in to Hugging Face
To make sure the model can be uploaded to be used for Inference, it's necessary to log in to the Hugging Face hub.

### Getting a Hugging Face token
Steps:

1. Navigate to this URL: https://huggingface.co/settings/tokens
2. Create a write `token` and copy it to your clipboard
3. Run the code below and enter your `token`

In [2]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## Step 3: Import necessary libraries

In [3]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import os
from docx import Document
import pandas as pd
import re

2024-07-20 02:15:42.700429: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-20 02:15:42.700533: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-20 02:15:42.824384: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


## Step 4: Cleaning the data 

In [4]:
def remove_newline(df):
    """
    Replace every newline character in the 'comment' column with a single space.

    The column is overwritten on the DataFrame that was passed in, and that
    same DataFrame object is handed back to the caller.

    Parameters:
        df (DataFrame): Frame holding a 'comment' column of strings.

    Returns:
        DataFrame: The input frame, with newlines in 'comment' turned into spaces.
    """
    flattened = df['comment'].str.replace('\n', ' ')
    df['comment'] = flattened
    return df

def remove_consecutive_duplicates(comment_list):
    """
    Collapse runs of the same character in each comment down to one character.

    Parameters:
        comment_list: Iterable of strings.

    Returns:
        list: One string per input comment, in the same order, in which no two
        adjacent characters are equal.
    """
    collapsed_comments = []

    for comment in comment_list:
        kept = []
        for symbol in comment:
            # Keep a character only when it differs from the one just before it.
            if not kept or kept[-1] != symbol:
                kept.append(symbol)
        collapsed_comments.append("".join(kept))

    return collapsed_comments

def modify_sentences(sentences):
    """
    Rewrite the last word of each sentence when it ends with the letter 'ه'.

    Each sentence is split on whitespace. If the final word ends with 'ه',
    that letter is dropped and the separate word 'است' is appended. The words
    are then re-joined with single spaces, so runs of whitespace are collapsed
    as a side effect.

    Parameters:
        sentences: Iterable of strings.

    Returns:
        list: One rewritten string per input sentence, in the same order.
        Empty or whitespace-only sentences come back as ''.
    """
    modified_sentences = []
    for sentence in sentences:
        words = sentence.split()
        # An empty or whitespace-only sentence has no last word to inspect;
        # indexing words[-1] there would raise IndexError.
        if words and words[-1].endswith('ه'):
            words[-1] = words[-1][:-1] + ' ' + 'است'
        modified_sentences.append(' '.join(words))
    return modified_sentences

def fix_half_space(text):
    """
    Replace the zero-width non-joiner (U+200C, the "half space") with a normal space.

    Parameters:
        text: Either a single string or a pandas Series of strings.

    Returns:
        The same kind of object as the input, with every U+200C replaced by ' '.
    """
    half_space = "\u200C"
    normal_space = " "

    if isinstance(text, pd.Series):
        # Series.replace() only matches whole cell values, so it would leave
        # half-spaces embedded inside a comment untouched. The .str accessor
        # performs the substring replacement that is intended here.
        return text.str.replace(half_space, normal_space, regex=False)

    fixed_text = text.replace(half_space, normal_space)
    return fixed_text

def open_file(file_path):
    """
    Load a file, choosing the reader from its extension (case-insensitive).

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        str: Paragraph text joined with newlines for '.doc'/'.docx', or the
            raw contents for '.txt' (decoded as UTF-8).
        DataFrame: For '.csv' (tab-delimited, bad lines skipped) and '.dat'
            (tab-delimited, no header row).
        None: For any other extension, after printing a message.
    """
    # Normalise the extension once so that every branch uses the same check.
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext in ('.doc', '.docx'):
        # NOTE(review): python-docx targets .docx; a legacy binary .doc file
        # will presumably fail to open here - confirm before relying on it.
        document = Document(file_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)

    if file_ext == '.csv':
        # quoting=1 is the numeric value of csv.QUOTE_ALL.
        return pd.read_csv(file_path, delimiter='\t', on_bad_lines='skip', encoding="utf-8", quoting=1)

    if file_ext == '.dat':
        return pd.read_csv(file_path, delimiter='\t', header=None)

    if file_ext == '.txt':
        # Decode explicitly: the platform default encoding is not always
        # UTF-8, and the text handled in this notebook is non-ASCII.
        with open(file_path, 'r', encoding='utf-8') as file:
            return file.read()

    print(f"Unsupported file format for: {file_path}")
    return None

def remove_stop_words(comment_column, stop_words=None):
    """
    Drop stop words from every comment in a Series.

    Parameters:
        comment_column (Series): Comments as whitespace-separated strings.
        stop_words (DataFrame, optional): Frame whose first column lists the
            stop words. Defaults to the notebook-level `final_stop_words`.

    Returns:
        Series: The comments with stop words removed and the remaining words
        re-joined with single spaces.
    """
    if stop_words is None:
        stop_words = final_stop_words

    # Build the lookup once. The previous version rebuilt the full list for
    # every single word of every comment.
    stop_set = set(stop_words.iloc[:, 0].tolist())

    words = comment_column.str.split()
    return words.apply(lambda word_list: ' '.join(word for word in word_list if word not in stop_set))

In [5]:
# Candidate stop words: tab-delimited .dat file read without a header row.
stop_words = open_file('/kaggle/input/stop-words/stopwords.dat')

# Labelled comments: tab-delimited .csv file.
df = open_file('/kaggle/input/snappfood-sentiment/train.csv')

df

Unnamed: 0.1,Unnamed: 0,comment,label,label_id
0,0,واقعا حیف وقت که بنویسم سرویس دهیتون شده افتضاح,SAD,1
1,1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر از مو...,HAPPY,0
2,2,قیمت این مدل اصلا با کیفیتش سازگاری نداره، فقط...,SAD,1
3,3,عالللی بود همه چه درست و به اندازه و کیفیت خوب...,HAPPY,0
4,4,شیرینی وانیلی فقط یک مدل بود.,HAPPY,0
...,...,...,...,...
56695,56695,یک تیکه کم فرستاده بودن و با تماس من در کمترین...,HAPPY,0
56696,56696,عالی بود همه چیز ممنونم پیک هم خیلی مرتب و به ...,HAPPY,0
56697,56697,مثل همیشه عالی، من چندمین باره سفارش میدم و هر...,HAPPY,0
56698,56698,دلستر استوایی خواسته بودم اما لیمویی فرستادند,HAPPY,0


In [6]:
# Row positions in `stop_words` that are kept as the stop-word list.
indicies = [
    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
    20, 23, 31, 33, 46, 49, 68, 72, 77, 90, 102, 109, 388,
]

# Select those rows and renumber them from 0.
final_stop_words = stop_words.loc[indicies].reset_index(drop=True)

In [7]:
# Drop the leftover index column. errors='ignore' keeps the cell re-runnable
# once the column is already gone.
df = df.drop(columns='Unnamed: 0', errors='ignore')

# Work on an explicit copy so the column assignments below do not write into
# a view of the original frame.
df = df.dropna().copy()

df['comment'] = modify_sentences(df['comment'])

# Apply the half-space fix to each string. Passing the whole Series to
# str-style .replace() only matches entire cell values, so embedded
# half-spaces were previously left in place.
df['comment'] = df['comment'].map(fix_half_space)

df['comment'] = remove_stop_words(df['comment'])
df['comment'] = remove_consecutive_duplicates(df['comment'])

In [8]:
df

Unnamed: 0,comment,label,label_id
0,واقعا حیف وقت بنویسم سرویس دهیتون شده افتضاح,SAD,1
1,قرار بود ۱ ساعته برسه ولی نیم ساعت زودتر موقع ...,HAPPY,0
2,قیمت مدل اصلا کیفیتش سازگاری نداره، فقط ظاهر ف...,SAD,1
3,عالی بود همه چه درست اندازه کیفیت خوب، امیداور...,HAPPY,0
4,شیرینی وانیلی فقط مدل بود.,HAPPY,0
...,...,...,...
56695,تیکه کم فرستاده بودن تماس کمترین زمان برام ارس...,HAPPY,0
56696,عالی بود همه چیز منونم پیک خیلی مرتب موقع آورد.,HAPPY,0
56697,مثل همیشه عالی، چندمین باره سفارش میدم هربارم ...,HAPPY,0
56698,دلستر استوای خواسته بودم اما لیموی فرستادند,HAPPY,0


In [9]:
# Positional 60% / 20% / 20% split into train / validation / test.
# NOTE(review): rows are taken in file order with no shuffling - confirm the
# source file is not ordered in a way that biases the splits.
train_end_point = int(len(df) * 0.6)
val_end_point = int(len(df) * 0.8)

df_train = df.iloc[:train_end_point]
df_val = df.iloc[train_end_point:val_end_point]
df_test = df.iloc[val_end_point:]

# Printed in the order: train, test, val.
print(df_train.shape, df_test.shape, df_val.shape)


(34020, 3) (11340, 3) (11340, 3)


## Step 5: Converting pandas DataFrames into Hugging Face Dataset objects


In [10]:
# Convert each split to a Hugging Face Dataset, keeping only the text and the
# numeric label. preserve_index=False stops a non-range pandas index (for
# example after dropna() removes rows) from being stored as an extra
# `__index_level_0__` column.
dataset_train = Dataset.from_pandas(df_train.drop('label', axis=1), preserve_index=False)
dataset_val = Dataset.from_pandas(df_val.drop('label', axis=1), preserve_index=False)
dataset_test = Dataset.from_pandas(df_test.drop('label', axis=1), preserve_index=False)

In [11]:
# Bundle the three splits under the keys 'train', 'val' and 'test'.
dataset = DatasetDict(
    train=dataset_train,
    val=dataset_val,
    test=dataset_test,
)
dataset

DatasetDict({
    train: Dataset({
        features: ['comment', 'label_id'],
        num_rows: 34020
    })
    val: Dataset({
        features: ['comment', 'label_id'],
        num_rows: 11340
    })
    test: Dataset({
        features: ['comment', 'label_id'],
        num_rows: 11340
    })
})

In [12]:
df_train.label_id.value_counts(normalize=True)

label_id
0    0.500235
1    0.499765
Name: proportion, dtype: float64

## Step 6: Calling the model and preparing the PEFT model

In [13]:
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import transformers
model_name = "meta-llama/Meta-Llama-3-8B"

In [14]:
# 4-bit loading configuration for the base model.
# NOTE(review): the setup text recommends a T4 GPU, which has no native
# bfloat16 support - confirm bfloat16 compute is acceptable there or switch
# the compute dtype to torch.float16.
quantization_config = BitsAndBytesConfig(
    # Enable 4-bit quantization.
    load_in_4bit=True,
    # NF4 data type, intended for normally distributed weights.
    bnb_4bit_quant_type='nf4',
    # Also quantize the quantization constants (double quantization).
    bnb_4bit_use_double_quant=True,
    # Data type used for computation.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [15]:
# LoRA adapter settings for sequence classification.
lora_config = LoraConfig(
    task_type='SEQ_CLS',
    # Dimension (rank) of the low-rank matrices.
    r=16,
    # Scaling factor for LoRA activations vs pre-trained weight activations.
    lora_alpha=8,
    # Adapters are attached to the attention projection layers.
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    # Dropout probability of the LoRA layers.
    lora_dropout=0.05,
    # Whether to train bias weights; 'none' is used here.
    bias='none',
)

In [16]:
# Load the base checkpoint with a 2-label sequence-classification head,
# quantized according to `quantization_config`.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)

# Show the loaded module tree.
model

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

`low_cpu_mem_usage` was None, now set to True since model is quantized.


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [17]:
# Run PEFT's k-bit training preparation on the quantized model.
# Note: this rebinds `model`, so re-running the cell applies it again.
model = prepare_model_for_kbit_training(model)
model

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear4bit(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear4bit(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear4bit(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear4bit(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )


In [18]:
# Wrap the model with the LoRA adapters described by `lora_config`.
# Note: this rebinds `model`, so re-running the cell wraps it again.
model = get_peft_model(model, lora_config)
model

PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): LlamaForSequenceClassification(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
              )
              (k_proj): lora.Linear4bit(
        

In [19]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Reuse the end-of-sequence token as the padding token
# (presumably because the checkpoint defines no pad token - confirm).
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Keep the model's padding id in sync with the tokenizer's.
model.config.pad_token_id = tokenizer.pad_token_id
# Disable the generation cache (presumably not needed while training - confirm).
model.config.use_cache = False
# NOTE(review): pretraining_tp = 1 presumably disables tensor-parallel
# emulation in the Llama layers - confirm against the model documentation.
model.config.pretraining_tp = 1

In [21]:
# Maximum number of tokens kept per comment.
MAX_LEN = 512
# Raw text column removed once it has been tokenized.
col_to_delete = ['comment']

def llama_preprocessing_function(examples):
    """Tokenize the 'comment' field of a batch, truncating to MAX_LEN tokens."""
    return tokenizer(examples['comment'], truncation=True, max_length=MAX_LEN)

# Tokenize every split in batches and drop the raw text column.
tokenized_datasets = dataset.map(llama_preprocessing_function, batched=True, remove_columns=col_to_delete)
# Rename the target column to 'label' (presumably the name the Trainer expects - confirm).
tokenized_datasets = tokenized_datasets.rename_column("label_id", "label")
# Return torch tensors when examples are accessed.
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/34020 [00:00<?, ? examples/s]

Map:   0%|          | 0/11340 [00:00<?, ? examples/s]

Map:   0%|          | 0/11340 [00:00<?, ? examples/s]

In [22]:
# Collator built from the tokenizer; used to pad the examples of each batch.
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [23]:
from sklearn.metrics import f1_score

def compute_metrics(eval_pred):
    """
    Turn the model's evaluation output into classification metrics.

    Parameters:
        eval_pred: Pair of (logits, labels). The class with the highest logit
            along axis 1 is taken as the prediction.

    Returns:
        dict: 'balanced_accuracy', 'accuracy' and 'f1' scores.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    # scikit-learn metrics take (y_true, y_pred). Passing them the other way
    # round changes balanced accuracy, which is not symmetric.
    return {
        'balanced_accuracy': balanced_accuracy_score(labels, predicted),
        # Previously this key reported the F1 score; it now reports accuracy
        # and F1 gets its own key.
        'accuracy': accuracy_score(labels, predicted),
        'f1': f1_score(labels, predicted),
    }


In [24]:
# Training hyper-parameters for the LoRA fine-tuning run.
# NOTE(review): the evaluation/save strategy lines are commented out, so the
# TrainingArguments defaults apply - confirm whether evaluation on the
# validation split is meant to run during training.
peft_training_args = TrainingArguments(
    output_dir='sentiment_classification',
    #auto_find_batch_size=True,
    per_device_train_batch_size=25,  # Reduce batch size if encountering out of memory error during training
    per_device_eval_batch_size=25,  # Reduce batch size if encountering out of memory error during training
    learning_rate=6e-5,
    num_train_epochs=2,
    logging_steps=1,
    #max_steps=1
    #evaluation_strategy = 'steps',
    #save_strategy = 'steps',
    #load_best_model_at_end = True
)

In [25]:
# Trainer wiring: LoRA-wrapped model, tokenized train/validation splits,
# padding collator and the metric function.
peft_trainer = Trainer(
    model=model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets['val'],
    tokenizer=tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics
)

In [None]:
# Run the fine-tuning loop and keep the returned training summary.
train_result = peft_trainer.train()

In [33]:
# Local directory that receives the fine-tuned model and tokenizer files.
peft_model_path = "./peft-sentiment-analysis-checkpoint-local"
# Save the trainer's model.
peft_trainer.model.save_pretrained(peft_model_path)
# Save the tokenizer next to it so the checkpoint directory is self-contained.
tokenizer.save_pretrained(peft_model_path)

('./peft-sentiment-analysis-checkpoint-local/tokenizer_config.json',
 './peft-sentiment-analysis-checkpoint-local/special_tokens_map.json',
 './peft-sentiment-analysis-checkpoint-local/tokenizer.json')

## Step 7: Evaluation

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from peft import PeftModel
import torch

# No earlier cell defines `peft_model`, so on a fresh kernel the original
# lines raised NameError. Evaluate the model held by the trainer.
peft_model = peft_trainer.model

# Reuse the device the model already lives on instead of hard-coding CUDA.
# NOTE(review): the explicit .to(device) move was dropped because the model
# is already placed by the quantized loader / Trainer - confirm.
device = next(peft_model.parameters()).device

# Function to get predictions
def get_predictions(model, dataset, device=None):
    """
    Run the model over a dataset one example at a time and collect predictions.

    Parameters:
        model: Module whose forward pass accepts `input_ids` and
            `attention_mask` and returns an object with a `.logits` tensor.
        dataset: Iterable of examples with 'input_ids', 'attention_mask' and
            'label' tensors.
        device (optional): Device the inputs are moved to. Defaults to the
            device of the model's first parameter, or CPU when the model has
            no parameters.

    Returns:
        tuple: (predictions, true_labels). Predictions are the arg-max class
        ids; true labels are Python scalars.
    """
    if device is None:
        # Infer the device from the model rather than relying on a
        # notebook-level global.
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    model.eval()
    predictions, true_labels = [], []
    for example in dataset:
        # Add a batch dimension of size 1 and move the tensors to the device.
        inputs = {
            'input_ids': example['input_ids'].unsqueeze(0).to(device),
            'attention_mask': example['attention_mask'].unsqueeze(0).to(device),
        }
        with torch.no_grad():
            outputs = model(**inputs)
        preds = torch.argmax(outputs.logits, dim=-1).cpu().numpy()
        predictions.extend(preds)
        true_labels.append(example['label'].item())  # Convert tensor to scalar
    return predictions, true_labels

# Get predictions on the test set
peft_predictions, test_labels = get_predictions(peft_model, tokenized_datasets['test'])

# Calculate metrics (precision, recall and F1 are averaged across classes,
# weighted by class support)
peft_accuracy = accuracy_score(test_labels, peft_predictions)
peft_precision = precision_score(test_labels, peft_predictions, average='weighted')
peft_recall = recall_score(test_labels, peft_predictions, average='weighted')
peft_f1 = f1_score(test_labels, peft_predictions, average='weighted')
peft_report = classification_report(test_labels, peft_predictions)

# Print metrics
print('PEFT MODEL:')
print(f'Accuracy: {peft_accuracy:.4f}')
print(f'Precision: {peft_precision:.4f}')
print(f'Recall: {peft_recall:.4f}')
print(f'F1 Score: {peft_f1:.4f}')
print('Classification Report:')
print(peft_report)
