In [1]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

from datasets import load_dataset
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer, AutoModelForSequenceClassification
import torch
import time
import evaluate
import pandas as pd
import numpy as np

In [2]:
# Check if CUDA is available
if torch.cuda.is_available():
    device = torch.device("cuda")
    print(f"Using {torch.cuda.get_device_name(0)}")
else:
    device = torch.device("cpu")
    print("CUDA is not available. Using CPU.")

# Set the default device for tensors
torch.cuda.set_device(0)  # Set the GPU you want to use if you have multiple GPUs

Using NVIDIA GeForce GTX 1650


## 1 - Dataset and LLM

## 1.1 - Dataset

In [3]:
dataset = load_dataset("imdb")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [5]:
model_name='google/flan-t5-base'

original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16)
original_tokenizer = AutoTokenizer.from_pretrained(model_name)

In [6]:
def tokenize_function(examples):
    start_prompt = 'Give sentiment positive or negative for the following movie review.\n\n'
    end_prompt = '\n\nSentiment:: '

    # Convert labels to text
    label_map = {0: "negative", 1: "positive"}
    labels = []
    for label in examples["label"]:
        if label in label_map:
            labels.append(label_map[label])
        else:
            labels.append("unknown")

    # Construct prompts and tokenize
    prompts = [start_prompt + text + end_prompt for text in examples["text"]]
    tokenized_inputs = original_tokenizer(prompts, padding="max_length", truncation=True, return_tensors="pt")
    tokenized_labels = original_tokenizer(labels, padding="max_length", truncation=True, return_tensors="pt")

    # Return the processed batch
    return {"input_ids": tokenized_inputs.input_ids, "labels": tokenized_labels.input_ids, "labeled": labels}

In [7]:
# The dataset actually contains 3 diff splits: train, validation, test.
# The tokenize_function code is handling all data across all splits in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [8]:
# Filter every 100th example
filtered_datasets = tokenized_datasets.filter(lambda example, index: index % 100 == 0, with_indices=True)

Filter:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [9]:
print(f"Shapes of the datasets:")
print(f"Training: {filtered_datasets['train'].shape}")
print(f"Test: {filtered_datasets['test'].shape}")
print(f"Unsupervised: {filtered_datasets['unsupervised'].shape}")

print(filtered_datasets)

Shapes of the datasets:
Training: (250, 5)
Test: (250, 5)
Unsupervised: (500, 5)
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 250
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 250
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 500
    })
})


In [10]:
non_filtered_datasets = dataset.filter(lambda example, index: index % 100 == 0, with_indices=True)

In [11]:
print(f"Shapes of the datasets:")
print(f"Training: {filtered_datasets['train'].shape}")
print(f"Test: {filtered_datasets['test'].shape}")
print(f"Unsupervised: {filtered_datasets['unsupervised'].shape}")

print(filtered_datasets)

Shapes of the datasets:
Training: (250, 5)
Test: (250, 5)
Unsupervised: (500, 5)
DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 250
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 250
    })
    unsupervised: Dataset({
        features: ['text', 'label', 'input_ids', 'labels', 'labeled'],
        num_rows: 500
    })
})


## 1.2 LLM

In [12]:
index = 200

text = dataset['test']['text'][index]
label = dataset['test']['label'][index]

prompt = f"""
Give sentiment positive or negative for the following movie review.

{text}

Sentiment:
"""

inputs = original_tokenizer(prompt, return_tensors='pt', max_length=512, truncation=True)
output = original_tokenizer.decode(
    original_model.generate(
        inputs["input_ids"], 
        max_new_tokens=200,
    )[0], 
    skip_special_tokens=True
)

# Convert the label to "negative" if 0, and "positive" if 1
sentiment_label = "positive" if label == 1 else "negative"

dash_line = '-'.join('' for x in range(100))
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE IMDB SENTIMENT:\n{sentiment_label}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output}')

---------------------------------------------------------------------------------------------------
INPUT PROMPT:

Give sentiment positive or negative for the following movie review.

He who fights with monsters might take care lest he thereby become a monster. And if you gaze for long into an abyss, the abyss gazes also into you.<br /><br />Yes, this is from Nietzsche's Aphorism 146 from "Beyond Good and Evil". And that's what you find at the start of this movie.<br /><br />If you watch the whole movie, you will doubt if it was the message that the Ram Gopal Varma Production wanted to pass on. As the scenes crop up one by one, quite violent and at times puke-raking, the viewer is expected to forget the Nietzsche quote and think otherwise. That to deal with few people you need dedicated people like Sadhu Agashe who will have the licence to kill anyone, not just writing FIRs (something unworthy of the police to do, as we are made to believe).<br /><br />When TADA was repealed and the go

In [13]:
# Loop over the first 5 examples in the dataset and generate predictions
for i in range(5):
    review = dataset['test']['text'][i]
    inputs = original_tokenizer(review, padding='max_length', truncation=True, max_length=512, return_tensors='pt')
    outputs = original_model.generate(inputs['input_ids'], attention_mask=inputs['attention_mask'], max_length=150, num_beams=4, early_stopping=True)
    prediction = original_tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Post-process the prediction to extract sentiment (negative or positive)
    sentiment = 'positive' if 'good' in prediction else 'negative'
    print(f"Review: {review}\nSentiment Prediction: {sentiment}\n")

Review: I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn't match the background, and painfully one-dimensional characters cannot be overcome with a 'sci-fi' setting. (I'm sure there are those of you out there who think Babylon 5 is good sci-fi TV. It's not. It's clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It's really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it's rubbish as they have 

# 2 - Perform Parameter Efficient Fine-Tuning (PEFT)

## 2.1 Setup the PEFT/LoRA model for Fine-Tuning

In [14]:
from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM # FLAN-T5
)

In [15]:
def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [16]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 3538944
all model parameters: 251116800
percentage of trainable model parameters: 1.41%


## 2.2 Train PEFT Adapter

In [17]:
output_dir = f'./peft-review-sentiment-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    auto_find_batch_size=True,
    learning_rate=1e-3,
    num_train_epochs=5,
    logging_steps=10,
    max_steps=150    
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

max_steps is given, it will override any value given in num_train_epochs


In [18]:
peft_trainer.train()

peft_model_path="./peft-review-sentiment-checkpoint-local"

peft_trainer.model.save_pretrained(peft_model_path)
original_tokenizer.save_pretrained(peft_model_path)

Step,Training Loss
10,37.775
20,11.7406
30,4.5281
40,3.8656
50,2.8844
60,1.4914
70,0.8109
80,0.4797
90,0.3551
100,0.278




('./peft-review-sentiment-checkpoint-local\\tokenizer_config.json',
 './peft-review-sentiment-checkpoint-local\\special_tokens_map.json',
 './peft-review-sentiment-checkpoint-local\\tokenizer.json')

In [24]:
from peft import PeftModel, PeftConfig
peft_model = PeftModel.from_pretrained(original_model, 
                                       './peft-dialogue-summary-checkpoint-local/',                                       
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [25]:
print(print_number_of_trainable_model_parameters(peft_model))

trainable model parameters: 0
all model parameters: 251116800
percentage of trainable model parameters: 0.00%


## Load Trained Model

In [26]:
model_name='google/flan-t5-base'

base_model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, 
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

peft_model_try = PeftModel.from_pretrained(base_model, './peft-dialogue-summary-checkpoint-local/')
peft_model_try = peft_model_try.merge_and_unload()

tokenizer_peft = AutoTokenizer.from_pretrained("./peft-dialogue-summary-checkpoint-local/", trust_remote_code=True)

tokenizer_peft.pad_token, tokenizer_peft.pad_token_id, tokenizer_peft.padding_side

('<pad>', 0, 'right')

In [27]:
peft_model_try.pad_token_id = tokenizer_peft.pad_token_id
peft_model_try.config.pad_token_id = tokenizer_peft.pad_token_id

## 2.4 Model Comparation to the original FLAN T5

In [41]:
# Check if CUDA is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

index = 120

text = dataset['test']['text'][index]
label = dataset['test']['label'][index]

prompt = f"""
classify sentiment: 
{text}

Sentiment:
"""

# Move models to the appropriate device
original_model.to(device)
peft_model.to(device)

# Tokenize the input and move to the same device
input_ids = original_tokenizer(prompt, return_tensors='pt').input_ids.to(device)

# Adjust generation parameters
generation_config = GenerationConfig(
    max_new_tokens=1,
    num_beams=4,  # Increase the number of beams for better exploration
    temperature=0.7,  # Add temperature to control randomness
    top_k=50,  # Add top_k sampling
    top_p=0.95  # Add top_p sampling
)

# Generate output with the original model
original_output = original_model.generate(input_ids=input_ids, generation_config=generation_config)
original_model_sentiment_output = original_tokenizer.decode(original_output[0], skip_special_tokens=True)

# Generate output with the PEFT model
peft_output = peft_model.generate(input_ids=input_ids, generation_config=generation_config)
peft_model_sentiment_output = original_tokenizer.decode(peft_output[0], skip_special_tokens=True)

# Convert the label to "negative" if 0, and "positive" if 1
sentiment_label = "positive" if label == 1 else "negative"

dash_line = '-' * 100
print(dash_line)
print(f'INPUT PROMPT:\n{prompt}')
print(dash_line)
print(f'BASELINE IMDB SENTIMENT:\n{sentiment_label}\n')
print(dash_line)
print(f'ORIGINAL MODEL GENERATION - ZERO SHOT:\n{original_model_sentiment_output}')
print(dash_line)
print(f'PEFT MODEL GENERATION - ZERO SHOT:\n{peft_model_sentiment_output}')

----------------------------------------------------------------------------------------------------
INPUT PROMPT:

classify sentiment: 
Blake Edwards' legendary fiasco, begins to seem pointless after just 10 minutes. A combination of The Eagle Has Landed, Star!, Oh! What a Lovely War!, and Edwards' Pink Panther films, Darling Lili never engages the viewer; the aerial sequences, the musical numbers, the romance, the comedy, and the espionage are all ho hum. At what point is the viewer supposed to give a damn? This disaster wavers in tone, never decides what it wants to be, and apparently thinks it's a spoof, but it's pathetically and grindingly square. Old fashioned in the worst sense, audiences understandably stayed away in droves. It's awful. James Garner would have been a vast improvement over Hudson who is just cardboard, and he doesn't connect with Andrews and vice versa. And both Andrews and Hudson don't seem to have been let in on the joke and perform with a miscalculated earnes

## 2.4 Evaluate the Model

In [42]:
from transformers import pipeline

pipe = pipeline(task="text2text-generation", model=peft_model_try, tokenizer=tokenizer_peft, device=device)

In [43]:
def calculate_sentiment(input_text):
    prompt = "sentiment analysis: " + input_text + "\n\nSentiment:"
    output = pipe(prompt, max_length=16)
    return output[0]['generated_text']

In [44]:
from tqdm import tqdm

# Iterate over the test dataset and calculate predictions
predictions = []
for example in tqdm(filtered_datasets['test'], desc="Evaluating"):
    input_text = original_tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    prediction = calculate_sentiment(input_text)
    predictions.append(prediction)

Evaluating:   2%|█▋                                                                    | 6/250 [00:01<00:57,  4.26it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (513 > 512). Running this sequence through the model will result in indexing errors
Evaluating:   4%|██▊                                                                  | 10/250 [00:02<00:54,  4.43it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Evaluating: 100%|████████████████████████████████████████████████████████████████████| 250/250 [01:05<00:00,  3.83it/s]


In [45]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Assuming the labels are also available in the test dataset, calculate accuracy
correct_predictions = sum(1 for i, example in enumerate(filtered_datasets['test']) if predictions[i] == example['labeled'])
total_predictions = len(predictions)

accuracy = correct_predictions / total_predictions
y_true = [example['labeled'] for example in filtered_datasets['test']]
precision = precision_score(y_true, predictions, average='weighted')
recall = recall_score(y_true, predictions, average='weighted')
f1 = f1_score(y_true, predictions, average='weighted')


print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)

Accuracy: 0.94
Precision: 0.9402535860655737
Recall: 0.94
F1 Score: 0.9399913587556609


In [46]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_true, predictions)
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[119   6]
 [  9 116]]


In [47]:
from sklearn.metrics import classification_report

report = classification_report(y_true, predictions)
print("Classification Report:")
print(report)

Classification Report:
              precision    recall  f1-score   support

    negative       0.93      0.95      0.94       125
    positive       0.95      0.93      0.94       125

    accuracy                           0.94       250
   macro avg       0.94      0.94      0.94       250
weighted avg       0.94      0.94      0.94       250



In [48]:
from sklearn.metrics import roc_auc_score

# Convert labels to binary (1 for positive, 0 for negative)
predictions_numeric = [1 if label == 'positive' else 0 for label in predictions]

y_true_binary = filtered_datasets['test']['label']

roc_auc = roc_auc_score(y_true_binary, predictions_numeric)
print("ROC-AUC Score:", roc_auc)

ROC-AUC Score: 0.9399999999999998
