In [None]:
!pip install datasets evaluate



In [None]:
# Import libraries
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


In [None]:
# Load dataset
dataset = load_dataset("imdb", split="train[:1000]")  # Sample a subset to keep it lightweight
dataset = dataset.train_test_split(test_size=0.2)  # Split into train and test
train_dataset, test_dataset = dataset['train'], dataset['test']


In [None]:
# EDA - Check some samples
print("Sample Data: ", train_dataset[0])
print("Training Data Size:", len(train_dataset))
print("Testing Data Size:", len(test_dataset))

# Distribution of labels
print("Label Distribution: ", train_dataset.features['label'].names)


Sample Data:  {'text': "Holy freaking God all-freaking-mighty. This movie was so bad, I thought I was on drugs. In a bad way... The character acting is the poorest thing I've seen in quite some time. This movie was more akin to Lord of the G-Strings, IMHO(it's a real movie). Most of the movie appeared to be done on a horrible green screen. My favorite part was when they are in the carriage, and you can tell there's no horse. They're fleeing from alien monsters, and going about the same speed as a swift jog. Then it switches to a far-shot with a ridiculous CG horse. And the CG in general seems to be sub-par to 1992's Beyond the Mind's Eye. I mean, Come on, really. It felt like a horrible episode of Hercules, only without Kevin Sorbo there to save the day. Worst. Movie. Ever.", 'label': 0}
Training Data Size: 800
Testing Data Size: 200
Label Distribution:  ['neg', 'pos']


In [None]:
# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenize the datasets
def tokenize_data(example):
    return tokenizer(example['text'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_data, batched=True)
test_dataset = test_dataset.map(tokenize_data, batched=True)

# Set format for PyTorch
train_dataset = train_dataset.rename_column("label", "labels").with_format("torch")
test_dataset = test_dataset.rename_column("label", "labels").with_format("torch")




Map:   0%|          | 0/800 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

In [None]:
# Load the pretrained model
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Evaluation function
def compute_metrics(pred):
    logits, labels = pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="weighted")
    return {"accuracy": accuracy, "f1": f1}

# Trainer for evaluation
training_args = TrainingArguments(
    output_dir="./results",
    per_device_eval_batch_size=8,
    report_to='none')                # Disable logging to W&B)
trainer = Trainer(model=model, args=training_args, eval_dataset=test_dataset, compute_metrics=compute_metrics)

# Baseline evaluation
baseline_results = trainer.evaluate()
print("Baseline Results:", baseline_results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Baseline Results: {'eval_loss': 0.6827018857002258, 'eval_model_preparation_time': 0.0025, 'eval_accuracy': 0.765, 'eval_f1': 0.8668555240793201, 'eval_runtime': 3.0755, 'eval_samples_per_second': 65.03, 'eval_steps_per_second': 8.129}


In [None]:
# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    report_to='none'
)

# Trainer for fine-tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tuning
trainer.train()




Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.001113,1.0,1.0
2,No log,0.00052,1.0,1.0
3,No log,0.000415,1.0,1.0


TrainOutput(global_step=300, training_loss=0.01779682795206706, metrics={'train_runtime': 134.3684, 'train_samples_per_second': 17.861, 'train_steps_per_second': 2.233, 'total_flos': 317921756774400.0, 'train_loss': 0.01779682795206706, 'epoch': 3.0})

In [None]:
# Evaluation after fine-tuning
fine_tuned_results = trainer.evaluate()
print("Results after Fine-Tuning:", fine_tuned_results)

# Compare baseline and fine-tuned results
print("Improvement in Accuracy:", fine_tuned_results['eval_accuracy'] - baseline_results['eval_accuracy'])
print("Improvement in F1 Score:", fine_tuned_results['eval_f1'] - baseline_results['eval_f1'])


Results after Fine-Tuning: {'eval_loss': 0.00041467114351689816, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_runtime': 3.3424, 'eval_samples_per_second': 59.837, 'eval_steps_per_second': 7.48, 'epoch': 3.0}
Improvement in Accuracy: 0.235
Improvement in F1 Score: 0.1331444759206799


In [None]:
# Calculate percentage improvement for accuracy and F1 score
accuracy_improvement = ((fine_tuned_results['eval_accuracy'] - baseline_results['eval_accuracy']) / baseline_results['eval_accuracy']) * 100
f1_improvement = ((fine_tuned_results['eval_f1'] - baseline_results['eval_f1']) / baseline_results['eval_f1']) * 100

print(f"Percentage Improvement in Accuracy: {accuracy_improvement:.2f}%")
print(f"Percentage Improvement in F1 Score: {f1_improvement:.2f}%")


Percentage Improvement in Accuracy: 30.72%
Percentage Improvement in F1 Score: 15.36%
