In [1]:
import os
import random
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset, DatasetDict

2025-08-13 05:16:37.450998: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1755062197.639591      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1755062197.690306      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Kaggle input CSV and the working directory that receives checkpoints and saved models.
data_path = "/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv"
output_path = "/kaggle/working/outputs_imdb"
os.makedirs(output_path, exist_ok=True)

# Seed PyTorch, NumPy and the stdlib RNG (in that order) with the same value.
for _seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    _seed_fn(42)

In [3]:
# Hugging Face Hub ids of the candidate checkpoints, in the order they are trained.
models_to_test = [
    "bert-base-uncased",
    "roberta-base",
    "microsoft/deberta-base",
    "google/electra-base-discriminator",
    "distilbert-base-uncased",
]

In [4]:
# Load the raw reviews and encode the sentiment string as a binary label.
df = pd.read_csv(data_path)
df['label'] = df['sentiment'].map({'negative': 0, 'positive': 1})

# `.map` yields NaN for any sentiment value outside the mapping, which would
# silently turn the label column into floats. Fail loudly instead.
if df['label'].isna().any():
    unexpected = sorted(df.loc[df['label'].isna(), 'sentiment'].astype(str).unique())
    raise ValueError(f"Unmapped sentiment values in {data_path}: {unexpected}")
df['label'] = df['label'].astype('int64')

df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 20% is held out for test, then 12.5% of the remaining 80% becomes validation,
# i.e. a 70/10/20 train/validation/test split, stratified by label at both steps.
train_data, test_data = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.125, stratify=train_data['label'], random_state=42)

# preserve_index=False keeps the (non-contiguous) pandas index of each split
# from being stored as an extra `__index_level_0__` column in the Dataset.
datasets = DatasetDict({
    "train": Dataset.from_pandas(train_data[['review', 'label']], preserve_index=False),
    "validation": Dataset.from_pandas(val_data[['review', 'label']], preserve_index=False),
    "test": Dataset.from_pandas(test_data[['review', 'label']], preserve_index=False)
})

In [5]:
def tokenize_data(examples, tokenizer, max_length=256):
    """Tokenize the ``review`` field of a batch to a fixed length.

    Args:
        examples: Mapping with a ``"review"`` entry holding the text(s) to encode.
        tokenizer: Callable tokenizer; it is invoked with the reviews plus
            ``padding``, ``truncation`` and ``max_length`` keyword arguments.
        max_length: Length every sequence is padded or truncated to
            (defaults to 256, the value previously hard-coded here).

    Returns:
        Whatever ``tokenizer`` returns for the batch.
    """
    return tokenizer(
        examples["review"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

def calculate_metrics(pred):
    """Compute binary F1, precision and recall from a Trainer evaluation result.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores, or a tuple whose first element holds them).

    Returns:
        Dict with ``"f1"``, ``"precision"`` and ``"recall"`` for the positive
        class. Each metric is reported as 0 (without a warning) when it is
        undefined, e.g. when no positive class is predicted.
    """
    labels = pred.label_ids
    # When a model returns more than the logits, the predictions arrive as a
    # tuple; the class scores are then the first element.
    logits = pred.predictions[0] if isinstance(pred.predictions, tuple) else pred.predictions
    predictions = np.argmax(logits, axis=1)
    # All three metrics share the same averaging and zero-division policy so a
    # degenerate epoch is reported consistently.
    return {
        "f1": f1_score(labels, predictions, average="binary", zero_division=0),
        "precision": precision_score(labels, predictions, average="binary", zero_division=0),
        "recall": recall_score(labels, predictions, average="binary", zero_division=0)
    }

In [6]:
def train_model(model_name, train_ds, val_ds, save_dir, use_subset=True):
    """Fine-tune ``model_name`` as a 2-label sequence classifier and evaluate it.

    Both splits are tokenized with the checkpoint's own tokenizer through
    ``tokenize_data``, the raw ``review`` column is dropped, and a ``Trainer``
    is run with evaluation, checkpointing and logging once per epoch. Because
    ``load_best_model_at_end`` is set with ``metric_for_best_model="f1"``, the
    model evaluated and saved at the end is the epoch with the highest
    validation F1.

    Args:
        model_name: Hub id or path given to ``from_pretrained`` for both the
            tokenizer and the model.
        train_ds: Training dataset; must contain a ``review`` text column.
        val_ds: Validation dataset; must contain a ``review`` text column.
        save_dir: Directory used for checkpoints and the final saved model.
        use_subset: Only selects the epoch budget (2 epochs when True, 3 when
            False). Subsetting the data itself is the caller's responsibility.

    Returns:
        Tuple ``(results, trainer)``: the metrics dict from
        ``trainer.evaluate()`` and the ``Trainer`` instance.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

    def prepare(split):
        # Tokenize in batches, drop the raw text and expose torch tensors.
        tokenized = split.map(lambda batch: tokenize_data(batch, tokenizer), batched=True)
        return tokenized.remove_columns(["review"]).with_format("torch")

    train_tokenized = prepare(train_ds)
    val_tokenized = prepare(val_ds)

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

    epochs = 2 if use_subset else 3

    training_args = TrainingArguments(
        output_dir=save_dir,
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_strategy="epoch",
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=epochs,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        greater_is_better=True,
        fp16=False,
        save_total_limit=1,
        seed=42,
        report_to="none"
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_tokenized,
        eval_dataset=val_tokenized,
        # `tokenizer=` is deprecated on Trainer; `processing_class=` is its
        # replacement and avoids the FutureWarning on every construction.
        processing_class=tokenizer,
        compute_metrics=calculate_metrics,
    )

    trainer.train()
    results = trainer.evaluate()
    trainer.save_model(save_dir)
    return results, trainer

In [7]:
# Screening run: every candidate is fine-tuned on the same seeded 10,000-row
# training subset and scored on the same 2,000-row validation subset.
subset_train = datasets["train"].shuffle(seed=42).select(range(10000))
subset_val = datasets["validation"].shuffle(seed=42).select(range(2000))

model_results = {}
comparison_root = os.path.join(output_path, "model_comparison")
for model in models_to_test:
    print(f"\nTraining {model}...")
    # Hub ids may contain "/", which must not create nested directories.
    model_dir = os.path.join(comparison_root, model.replace("/", "_"))
    os.makedirs(model_dir, exist_ok=True)

    # Only the metrics are kept; the returned trainer is not needed here.
    metrics = train_model(model, subset_train, subset_val, model_dir, use_subset=True)[0]
    model_results[model] = metrics
    print(f"{model}: F1 = {metrics['eval_f1']:.4f}")


Training bert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3289,0.273378,0.884495,0.914962,0.855992
2,0.1498,0.383501,0.895699,0.879612,0.912387


bert-base-uncased: F1 = 0.8957

Training roberta-base...


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3172,0.248726,0.906875,0.879017,0.936556
2,0.1685,0.338992,0.912718,0.90415,0.92145


roberta-base: F1 = 0.9127

Training microsoft/deberta-base...


tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3687,0.26668,0.904693,0.870577,0.941591
2,0.1903,0.287585,0.911011,0.909639,0.912387


model.safetensors:   0%|          | 0.00/559M [00:00<?, ?B/s]

microsoft/deberta-base: F1 = 0.9110

Training google/electra-base-discriminator...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2871,0.244662,0.909715,0.887081,0.933535
2,0.1349,0.279646,0.921774,0.912229,0.931521


google/electra-base-discriminator: F1 = 0.9218

Training distilbert-base-uncased...


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.3333,0.273609,0.888432,0.907563,0.870091
2,0.152,0.353873,0.893194,0.881373,0.905337


distilbert-base-uncased: F1 = 0.8932


In [8]:
# Winner of the screening run: the model name with the highest validation F1.
best_model = max(model_results, key=lambda name: model_results[name]['eval_f1'])
print(f"\nBest performing model: {best_model}")


Best performing model: google/electra-base-discriminator


In [9]:
# Retrain the winning checkpoint on the full train split; use_subset=False
# selects the 3-epoch budget inside train_model.
final_model_dir = os.path.join(output_path, "best_model", best_model.replace("/", "_"))
final_metrics, final_trainer = train_model(
    model_name=best_model,
    train_ds=datasets["train"],
    val_ds=datasets["validation"],
    save_dir=final_model_dir,
    use_subset=False,
)
print(f"Final validation metrics: {final_metrics}")

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,F1,Precision,Recall
1,0.2424,0.291829,0.912156,0.857696,0.974
2,0.1301,0.262525,0.935178,0.924219,0.9464
3,0.0571,0.304657,0.935075,0.925549,0.9448


Final validation metrics: {'eval_loss': 0.2625252902507782, 'eval_f1': 0.9351778656126483, 'eval_precision': 0.92421875, 'eval_recall': 0.9464, 'eval_runtime': 38.375, 'eval_samples_per_second': 130.293, 'eval_steps_per_second': 8.156, 'epoch': 3.0}


In [10]:
# Encode the held-out test split the same way as the training data, then score
# it with the trainer that holds the fine-tuned model.
tokenizer = AutoTokenizer.from_pretrained(best_model, use_fast=True)
test_tokenized = (
    datasets["test"]
    .map(lambda batch: tokenize_data(batch, tokenizer), batched=True)
    .remove_columns(["review"])
    .with_format("torch")
)

test_results = final_trainer.evaluate(eval_dataset=test_tokenized)
print(f"Test set performance: {test_results}")

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Test set performance: {'eval_loss': 0.2449023425579071, 'eval_f1': 0.9387471458354015, 'eval_precision': 0.9319929036073329, 'eval_recall': 0.9456, 'eval_runtime': 76.8002, 'eval_samples_per_second': 130.208, 'eval_steps_per_second': 8.138, 'epoch': 3.0}


In [11]:
# Spot check: classify five randomly drawn test reviews with the model
# reloaded from the saved directory.
sample_indices = random.sample(range(len(datasets["test"])), 5)
test_samples = [datasets["test"][i] for i in sample_indices]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(final_model_dir).to(device)

# Tokenize the batch (padded to its longest member, capped at 256 tokens) and
# move every tensor to the model's device.
encoded = tokenizer(
    [sample["review"] for sample in test_samples],
    truncation=True,
    padding=True,
    max_length=256,
    return_tensors="pt",
)
inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits.argmax(dim=-1).cpu().numpy()


def label_text(label_id):
    """Map a class id to its sentiment name (1 is positive, anything else negative)."""
    return "positive" if label_id == 1 else "negative"


print("\nSample predictions:")
for sample, predicted_id in zip(test_samples, predictions):
    print(f"Review: {sample['review'][:150]}...")
    print(f"Predicted: {label_text(predicted_id)} | Actual: {label_text(sample['label'])}\n")


Sample predictions:
Review: This made-for-TV film is a brilliant one. This is probably the best and favourite role by BAFTA winning John Thaw (Kavanagh Q.C. and Inspector Morse)....
Predicted: positive | Actual: positive

Review: !!! Spoiler alert!!!<br /><br />The point is, though, that I didn't think this film had an ending TO spoil... I only started watching it in the middle...
Predicted: negative | Actual: negative

Review: First off, let me start with a quote a friend of mine said while watching this movie: "This entire movie had to have been a dare. You know, like, 'DUD...
Predicted: negative | Actual: negative

Review: This is a candidate for worst films I've ever seen. It wanted to be as shocking as "Silence of the Lambs," but has neither the style nor the wit of th...
Predicted: negative | Actual: negative

Review: One of the most popular rentals at my local video store is not Borat or The Departed but a 2005 documentary about Jesus Christ called The God Who Wasn...
Predicted