## Installing PyTorch and Hugging Face libraries

In [None]:
# Install PyTorch at a pinned version, plus TensorBoard (left unpinned here)
%pip install "torch==2.2.2" tensorboard

# Install the Hugging Face libraries, each pinned to an exact version so a
# fresh kernel resolves the same packages
%pip install  --upgrade "transformers==4.40.0" "datasets==2.18.0" "accelerate==0.29.3" "evaluate==0.4.1" "bitsandbytes==0.43.1" "huggingface_hub==0.22.2" "trl==0.8.6" "peft==0.10.0"


## Logging in to Hugging Face

In [None]:
# Log in to the Hugging Face Hub from the shell (interactive prompt).
# NOTE(review): presumably required to download the model loaded later in this notebook — confirm.
!huggingface-cli login

## Reading in the data

In [None]:
import os
import random
import functools
import csv
import pandas as pd
import numpy as np
import torch
import torch.nn.functional as F
import evaluate

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report, balanced_accuracy_score, accuracy_score

from datasets import Dataset, DatasetDict
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)


In [None]:
# Load the IMDB train/test parquet shards from the Hugging Face Hub, then hold
# out 20% of the training rows (fixed seed) as the validation split
splits = {
    'train': 'plain_text/train-00000-of-00001.parquet',
    'test': 'plain_text/test-00000-of-00001.parquet',
    'unsupervised': 'plain_text/unsupervised-00000-of-00001.parquet',
}
df_train, df_test = (
    pd.read_parquet("hf://datasets/stanfordnlp/imdb/" + splits[part])
    for part in ("train", "test")
)
df_val = df_train.sample(frac=0.2, random_state=21)
# Remove the validation rows from the training frame so the two do not overlap
df_train = df_train.drop(index=df_val.index)


In [None]:
# Rename the "text" column to "Text" in every split
df_train, df_val, df_test = (
    frame.rename(columns={"text": "Text"})
    for frame in (df_train, df_val, df_test)
)

In [None]:
# Sanity check: (rows, columns) of the train, test and validation frames, in that order
print(df_train.shape, df_test.shape, df_val.shape)

(20000, 2) (25000, 2) (5000, 2)


## Convert from Pandas DataFrame to Hugging Face Dataset


In [None]:
# Convert each pandas DataFrame into a Hugging Face Dataset.
# preserve_index=False: df_train and df_val no longer have a contiguous
# RangeIndex after the sample/drop split, and from_pandas would otherwise
# store that index as an extra "__index_level_0__" column.
dataset_train = Dataset.from_pandas(df_train, preserve_index=False)
dataset_val = Dataset.from_pandas(df_val, preserve_index=False)
dataset_test = Dataset.from_pandas(df_test, preserve_index=False)


In [None]:
# Shuffle the training split; the result is bound to a new name, so dataset_train itself is left as-is
dataset_train_shuffled = dataset_train.shuffle(seed=21)  # fixed seed for a reproducible row order

In [None]:
# Bundle the three splits into one DatasetDict keyed by split name
dataset = DatasetDict(
    train=dataset_train_shuffled,
    val=dataset_val,
    test=dataset_test,
)
# Display the split names, columns and row counts
dataset

## Calculating class weights

In [None]:
# Relative frequency of each label in the training frame
df_train.label.value_counts(normalize=True)

In [None]:
# Inverse label frequency, ordered by label id, normalised so the weights sum to 1
label_share = df_train.label.value_counts(normalize=True).sort_index()
inverse_share = torch.tensor((1 / label_share).tolist())
class_weights = inverse_share / inverse_share.sum()
class_weights


## Importing the pre-trained Llama 3 8B model with 4-bit quantization

In [None]:
# Hub id of the base checkpoint; used below to load both the model and its tokenizer
model_name = "meta-llama/Meta-Llama-3-8B"

In [None]:
# bitsandbytes settings applied when the base model is loaded
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True, # enable 4-bit quantization: weights are represented with 4 bits instead of the usual 16 or 32, reducing the memory footprint
    bnb_4bit_quant_type = 'nf4', # NormalFloat 4-bit quantization type
    bnb_4bit_use_double_quant = True, # double quantization
    bnb_4bit_compute_dtype = torch.bfloat16 # compute in 16-bit bfloat16
)


In [None]:
## LoRA config: adapter hyper-parameters for the attention projection modules
lora_config = LoraConfig(
    r = 16, # the dimension (rank) of the low-rank matrices
    lora_alpha = 8, # scaling factor for LoRA activations vs pre-trained weight activations
    target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj'],
    lora_dropout = 0.05, # dropout probability of the LoRA layers
    bias = 'none', # whether to train bias weights, set to 'none' for attention layers
    task_type = 'SEQ_CLS'
)

In [None]:
# Load the base checkpoint with a 2-label sequence-classification head,
# applying the quantization settings from quantization_config
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    num_labels=2
)

# Display the loaded module structure
model

In [None]:
# Run peft's k-bit training preparation helper on the quantized model, then display the result
model = prepare_model_for_kbit_training(model)
model

In [None]:
# Wrap the model with the adapters described by lora_config, then display the result
model = get_peft_model(model, lora_config)
model

## Loading the tokenizer



In [None]:
# Load the tokenizer that belongs to the same checkpoint as the model
tokenizer = AutoTokenizer.from_pretrained(model_name, add_prefix_space=True)

# Reuse the end-of-sequence token as the padding token
# NOTE(review): presumably the tokenizer ships without a pad token — confirm
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

## Updating some model configs


In [None]:
# Keep the model's padding id in sync with the tokenizer's pad token
model.config.pad_token_id = tokenizer.pad_token_id
# Turn off the config's use_cache flag
model.config.use_cache = False
# NOTE(review): pretraining_tp = 1 presumably selects the non-tensor-parallel code path — confirm
model.config.pretraining_tp = 1

## Measuring model performance before fine-tuning

In [None]:
# Start a wall-clock timer for the pre-fine-tuning evaluation that follows
import time
start_time = time.time()

In [None]:
# Convert reviews to a list
sentences = df_test.Text.tolist()

# Setting the batch size
batch_size = 32

# Empty list to collect the per-batch logits
all_outputs = []

# Send inputs to the device that actually holds the model's weights instead of
# guessing from torch.cuda.is_available()
inference_device = next(model.parameters()).device

# Inference only: switch to eval mode so dropout (e.g. the LoRA dropout
# configured earlier) is inactive
model.eval()

# Process the sentences in batches
for i in range(0, len(sentences), batch_size):
    # Get the batch of sentences
    batch_sentences = sentences[i:i + batch_size]

    # Tokenize the batch, padding to the longest item and truncating at 512 tokens
    inputs = tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=512)

    # Move tensors to the model's device
    inputs = {k: v.to(inference_device) for k, v in inputs.items()}

    # Perform inference without tracking gradients
    with torch.no_grad():
        outputs = model(**inputs)

    # Keep only the logits, moved to the CPU so accelerator memory is not
    # held for every batch until the loop finishes
    all_outputs.append(outputs['logits'].cpu())



In [None]:
# Stack the per-batch logits along dimension 0 into a single tensor
final_outputs = torch.cat(all_outputs, dim=0)

In [None]:
# Predicted class = index of the largest logit in each row; stored as a new column on df_test
df_test['predictions']=final_outputs.argmax(axis=1).cpu().numpy()

In [None]:
def get_performance_metrics(df_test):
  """Print the confusion matrix, classification report and accuracy for a scored frame.

  Ground truth is read from the `label` column and model output from the
  `predictions` column of `df_test`. Nothing is returned; results are printed.
  """
  truth, predicted = df_test.label, df_test.predictions

  # Each section prints its heading first, then the report built from (truth, predicted)
  report_sections = (
    ("Confusion Matrix:", confusion_matrix),
    ("\nClassification Report:", classification_report),
  )
  for heading, build_report in report_sections:
    print(heading)
    print(build_report(truth, predicted))

  print("Accuracy Score:", accuracy_score(truth, predicted))

In [None]:
# Report test-set metrics for the predictions currently stored in df_test (before fine-tuning)
get_performance_metrics(df_test)

Confusion Matrix:
[[6893 5607]
 [6841 5659]]

Classification Report:
              precision    recall  f1-score   support

           0       0.50      0.55      0.53     12500
           1       0.50      0.45      0.48     12500

    accuracy                           0.50     25000
   macro avg       0.50      0.50      0.50     25000
weighted avg       0.50      0.50      0.50     25000

Accuracy Score: 0.50208


In [None]:
# Wall-clock seconds elapsed since start_time was last set
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 10082.26824259758 seconds


## Creating LLAMA tokenized dataset

In [None]:
MAX_LEN = 512
col_to_delete = ['Text']

def llama_preprocessing_function(examples):
    """Tokenize the 'Text' field of a batch, truncating each entry to MAX_LEN tokens."""
    texts = examples['Text']
    return tokenizer(texts, truncation=True, max_length=MAX_LEN)

# Tokenize every split in batches and drop the raw text column afterwards
tokenized_datasets = dataset.map(
    llama_preprocessing_function,
    batched=True,
    remove_columns=col_to_delete,
)
# Have the datasets return torch tensors
tokenized_datasets.set_format("torch")

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

## Data Collator
A **data collator** prepares batches of data for training or inference in machine learning, ensuring uniform formatting and adherence to model input requirements. This is especially crucial for variable-sized inputs like text sequences.

### Functions of Data Collator

1. **Padding:** Uniformly pads sequences to the length of the longest sequence using a special token, allowing simultaneous batch processing.
2. **Batching:** Groups individual data points into batches for efficient processing.
3. **Handling Special Tokens:** Adds necessary special tokens to sequences.
4. **Converting to Tensor:** Transforms data into tensors, the required format for machine learning frameworks.

### `DataCollatorWithPadding`

The `DataCollatorWithPadding` specifically manages padding, using a tokenizer to ensure that all sequences are padded to the same length for consistent model input.

- **Syntax:** `collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)`
- **Purpose:** Automatically pads text data to the longest sequence in a batch, crucial for models like BERT or GPT.
- **Tokenizer:** Uses the provided `tokenizer` for sequence processing, respecting model-specific vocabulary and formatting rules.

This collator is commonly used with libraries like Hugging Face's Transformers, facilitating data preprocessing for various NLP models.


In [None]:
# Data collator with padding, built around the notebook's tokenizer; passed to the trainer as data_collator
collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)


## Defining metrics for evaluation


In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation prediction pair.

    Args:
        eval_pred: a pair ``(predictions, labels)``. ``predictions`` holds
            per-class scores; the predicted class is the argmax over axis 1.

    Returns:
        dict with the single key ``'accuracy'``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=1)
    # Pass ground truth and predictions by keyword so the argument order
    # matches accuracy_score's (y_true, y_pred) signature.
    return {'accuracy': accuracy_score(y_true=labels, y_pred=predicted)}



## Defining a custom trainer with class weights


In [None]:
class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy, optionally weighted per class.

    Args:
        *args, **kwargs: forwarded unchanged to ``Trainer.__init__``.
        class_weights: optional per-class weights, given as a tensor or a
            sequence of numbers. They are stored as a float32 tensor on
            ``self.args.device`` and passed as ``weight`` to
            ``F.cross_entropy``. ``None`` means an unweighted loss.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        if class_weights is None:
            self.class_weights = None
        else:
            # Calling torch.tensor() on an existing tensor emits a copy-construct
            # UserWarning, so tensors are copied with detach().clone() instead.
            if isinstance(class_weights, torch.Tensor):
                weights = class_weights.detach().clone().to(dtype=torch.float32)
            else:
                weights = torch.tensor(class_weights, dtype=torch.float32)
            self.class_weights = weights.to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the (optionally class-weighted) cross-entropy loss for one batch.

        Args:
            model: the model to run; called as ``model(**inputs)``.
            inputs: batch mapping. Its ``"labels"`` entry is popped (the
                mapping is mutated) and cast to long before the forward pass.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: accepted and ignored, so the override also
                works with transformers releases whose Trainer passes this
                keyword to ``compute_loss``.
        """
        # Labels are removed from the inputs so the model does not compute its own loss
        labels = inputs.pop("labels").long()

        outputs = model(**inputs)
        logits = outputs.get('logits')

        # weight=None is F.cross_entropy's unweighted default, so one call covers both cases
        loss = F.cross_entropy(logits, labels, weight=self.class_weights)

        return (loss, outputs) if return_outputs else loss


## Defining training args

In [None]:
# Hyper-parameters and evaluation/checkpoint schedule handed to the trainer
training_args = TrainingArguments(
    output_dir = 'IMDB_sentiment_classification', # output directory for the run
    learning_rate = 1e-4,
    per_device_train_batch_size = 8,
    per_device_eval_batch_size = 8,
    num_train_epochs = 1, # a single pass over the training split
    weight_decay = 0.01,
    evaluation_strategy = 'epoch', # evaluate once per epoch
    save_strategy = 'epoch', # save once per epoch (same schedule as evaluation)
    load_best_model_at_end = True # reload the best checkpoint when training ends
)

## Defining custom trainer

In [None]:
# Wire the model, training arguments, tokenized train/val splits, tokenizer,
# collator, metric function and class weights into the custom trainer
trainer = CustomTrainer(
    model = model,
    args = training_args,
    train_dataset = tokenized_datasets['train'],
    eval_dataset = tokenized_datasets['val'],
    tokenizer = tokenizer,
    data_collator = collate_fn,
    compute_metrics = compute_metrics,
    class_weights=class_weights,
)

  self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.args.device)


## Running the Trainer

In [None]:
# Run fine-tuning; the returned result object is kept because its metrics are saved later
train_result = trainer.train()

## Prediction - Evaluating the model after fine-tuning


In [None]:
# Restart the wall-clock timer for the post-fine-tuning evaluation
start_time = time.time()

In [None]:
def make_predictions(model, df_test, batch_size=32, max_length=512, text_tokenizer=None):
  """Run batched inference over ``df_test.Text`` and store class ids in ``df_test['predictions']``.

  Args:
    model: torch module called as ``model(**inputs)``; its output must expose ``['logits']``.
    df_test: DataFrame with a ``Text`` column. Modified in place.
    batch_size: number of texts per forward pass.
    max_length: truncation length passed to the tokenizer.
    text_tokenizer: tokenizer to use; when omitted, the notebook-level
      ``tokenizer`` is used.

  Returns:
    None. The predicted class (argmax of the logits) is written to the
    ``predictions`` column of ``df_test``.
  """
  active_tokenizer = tokenizer if text_tokenizer is None else text_tokenizer

  # Convert reviews to a list
  sentences = df_test.Text.tolist()

  # Send inputs to the device that holds the model's weights; fall back to the
  # CUDA-if-available guess only for a model without parameters.
  first_param = next(model.parameters(), None)
  if first_param is not None:
    device = first_param.device
  else:
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  all_outputs = []

  # Predict in eval mode so dropout is inactive, then restore the caller's mode.
  was_training = model.training
  model.eval()
  try:
    with torch.no_grad():
      for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = active_tokenizer(batch_sentences, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        outputs = model(**inputs)
        # Move logits to the CPU right away so accelerator memory is not held per batch
        all_outputs.append(outputs['logits'].cpu())
  finally:
    model.train(was_training)

  # torch.cat raises on an empty list, so an empty frame gets an empty prediction column
  if all_outputs:
    predictions = torch.cat(all_outputs, dim=0).argmax(dim=1)
  else:
    predictions = torch.empty(0, dtype=torch.long)
  df_test['predictions'] = predictions.numpy()


# Score the test frame with the fine-tuned model; writes df_test['predictions'] in place
make_predictions(model,df_test)

### Saving the training metrics and trainer state

In [None]:
# Attach the training-set size to the metrics returned by training, then log
# and persist the metrics and the trainer state
max_train_samples = len(dataset_train)
metrics = train_result.metrics
metrics.update(train_samples=min(max_train_samples, len(dataset_train)))
for record in (trainer.log_metrics, trainer.save_metrics):
    record("train", metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =         1.0
  total_flos               = 372512570GF
  train_loss               =      0.2121
  train_runtime            =  7:11:12.28
  train_samples            =       20000
  train_samples_per_second =       0.773
  train_steps_per_second   =       0.097


In [None]:
# Report test-set metrics for the predictions currently stored in df_test (after fine-tuning)
get_performance_metrics(df_test)

Confusion Matrix:
[[12169   331]
 [  396 12104]]

Classification Report:
              precision    recall  f1-score   support

           0       0.97      0.97      0.97     12500
           1       0.97      0.97      0.97     12500

    accuracy                           0.97     25000
   macro avg       0.97      0.97      0.97     25000
weighted avg       0.97      0.97      0.97     25000

Accuracy Score: 0.97092


In [None]:
# Wall-clock seconds elapsed since start_time was last set
end_time = time.time()
execution_time = end_time - start_time
print(f"Execution time: {execution_time} seconds")

Execution time: 10309.91181397438 seconds
