In [1]:
import pandas as pd
from datasets import Dataset

In [2]:
data = pd.read_csv("IMDB Dataset.csv", index_col=False)
data["sentiment"] = data["sentiment"].map({"negative":0, "positive":1})
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [3]:
dataset = Dataset.from_pandas(data)

In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["review"], truncation=True, padding="max_length")

In [6]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
import evaluate

accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading builder script: 0.00B [00:00, ?B/s]

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return {
        "accuracy": accuracy.compute(predictions=predictions, references=labels)["accuracy"],
        "precision": precision.compute(predictions=predictions, references=labels)["precision"],
        "recall": recall.compute(predictions=predictions, references=labels)["recall"],
        "f1": f1.compute(predictions=predictions, references=labels)["f1"],
    }

In [10]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [11]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from sklearn.model_selection import train_test_split
data_train, data_test = train_test_split(data, test_size=0.2, random_state=42)

In [13]:
def tokenize_function(example):
    return tokenizer(example["review"], padding="max_length", truncation=True)

train_dataset = Dataset.from_pandas(data_train.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(data_test.reset_index(drop=True))

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

train_dataset = train_dataset.remove_columns(["review"])
eval_dataset = eval_dataset.remove_columns(["review"])

def add_labels(example):
    example["labels"] = example["sentiment"]
    return example

train_dataset = train_dataset.map(add_labels)
eval_dataset = eval_dataset.map(add_labels)

train_dataset = train_dataset.remove_columns(["sentiment"])
eval_dataset = eval_dataset.remove_columns(["sentiment"])


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./sentiment_output", 
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",   
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


  trainer = Trainer(


  0%|          | 0/5000 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
#EVALUATE MODEL ON TEST SET
eval_results = trainer.evaluate()
print("\n=== TRANSFORMER MODEL PERFORMANCE ===")
print(eval_results)

In [None]:
# GET PREDICTIONS
predictions = trainer.predict(eval_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)
true_labels = predictions.label_ids

In [None]:
#CONFUSION MATRIX
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

cm = confusion_matrix(true_labels, pred_labels)
disp = ConfusionMatrixDisplay(cm, display_labels=["NEGATIVE", "POSITIVE"])
disp.plot(cmap="Blues")
plt.title("Confusion Matrix (BERT)")
plt.show()

In [None]:
# BASELINE MODEL (TF-IDF + Logistic Regression)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Using the SAME train-test split as BERT
baseline_train = data_train
baseline_test = data_test

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(baseline_train["review"])
X_test = vectorizer.transform(baseline_test["review"])

y_train = baseline_train["sentiment"]
y_test = baseline_test["sentiment"]

baseline_model = LogisticRegression(max_iter=200)
baseline_model.fit(X_train, y_train)

baseline_preds = baseline_model.predict(X_test)

print("\n=== BASELINE MODEL (TF-IDF + Logistic Regression) ===")
print(classification_report(y_test, baseline_preds))