In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

import torch
import pandas as pd
from torch.utils.data import Dataset
from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Load the IMDB reviews, encode sentiment as integer class ids, subsample, and split.
LABEL_TO_ID = {"negative": 0, "positive": 1}
SAMPLE_SIZE = 12000  # Reduced size for CPU training (raise the cap if a GPU is available)
SEED = 42

df = pd.read_csv("IMDB Dataset.csv")

df["sentiment"] = df["sentiment"].map(LABEL_TO_ID)

# Series.map turns every value missing from the mapping into NaN. Fail loudly here
# instead of handing float/NaN labels to the stratified split and the model.
if df["sentiment"].isna().any():
    raise ValueError(
        "Unexpected values in 'sentiment' column; expected only "
        f"{sorted(LABEL_TO_ID)}"
    )

# Never request more rows than exist, so a smaller CSV does not make sample() raise.
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=SEED)

# 80/20 split, stratified on the label so both splits keep the same class balance.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["review"].tolist(),
    df["sentiment"].tolist(),
    test_size=0.2,
    stratify=df["sentiment"],
    random_state=SEED
)


In [4]:
MAX_LEN = 128

tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Both splits are encoded with exactly the same truncation/padding settings,
# so the options live in one mapping instead of being repeated per call.
tokenizer_kwargs = dict(truncation=True, padding=True, max_length=MAX_LEN)

train_encodings = tokenizer(train_texts, **tokenizer_kwargs)
test_encodings = tokenizer(test_texts, **tokenizer_kwargs)

In [5]:
class IMDBDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping whose values can be indexed by example position;
    ``labels`` is an indexable sequence (assumed to have one entry per example).
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The label sequence defines how many examples the dataset holds.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under ``"labels"``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

In [6]:
# Wrap each tokenized split together with its labels so the Trainer can index them.
train_dataset = IMDBDataset(encodings=train_encodings, labels=train_labels)
test_dataset = IMDBDataset(encodings=test_encodings, labels=test_labels)

In [7]:
# Pretrained DistilBERT with a two-class classification head; the head weights are
# not part of the checkpoint and start out newly initialised (see the warning below).
checkpoint = "distilbert-base-uncased"
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# All run hyperparameters collected in one mapping, then unpacked into TrainingArguments.
training_config = {
    "output_dir": "./results",
    "eval_strategy": "epoch",
    "save_strategy": "no",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,  # CPU safe
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 1,
    "logging_dir": "./logs",
    "report_to": "none",
}
training_args = TrainingArguments(**training_config)

In [11]:
def compute_metrics(pred):
    """Compute accuracy and F1 for one evaluation pass.

    ``pred`` must expose ``label_ids`` (true labels) and ``predictions``
    (per-class scores); the predicted class is the argmax over the last axis.
    Returns a dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    metrics = {}
    metrics["accuracy"] = accuracy_score(y_true, y_pred)
    metrics["f1"] = f1_score(y_true, y_pred)
    return metrics

In [12]:
# The held-out test split doubles as the evaluation set, and compute_metrics
# supplies the accuracy/F1 values reported at each evaluation.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [13]:
# Fine-tune the model. Binding the result keeps it available to later cells,
# and the bare name on the last line still renders it as the cell output.
train_output = trainer.train()
train_output



Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.3704,0.374668,0.84625,0.853977


TrainOutput(global_step=1200, training_loss=0.3978362528483073, metrics={'train_runtime': 422.3691, 'train_samples_per_second': 22.729, 'train_steps_per_second': 2.841, 'total_flos': 317921756774400.0, 'train_loss': 0.3978362528483073, 'epoch': 1.0})

In [14]:
# Score the fine-tuned model on the evaluation split; the metrics dict is the cell output.
eval_metrics = trainer.evaluate()
eval_metrics



{'eval_loss': 0.3746684193611145,
 'eval_accuracy': 0.84625,
 'eval_f1': 0.8539770478828651,
 'eval_runtime': 27.0107,
 'eval_samples_per_second': 88.854,
 'eval_steps_per_second': 11.107,
 'epoch': 1.0}

In [19]:
import torch

# Pick the best available accelerator: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print("Using device:", device)

Using device: mps


In [20]:
def predict_sentiment(text):
    """Classify a review as ``"Positive"`` or ``"Negative"`` with the global ``model``.

    Args:
        text: Review text passed to the global ``tokenizer``. Only the first
            sequence's prediction is returned.

    Returns:
        ``"Positive"`` if class 1 has strictly higher probability than class 0,
        otherwise ``"Negative"``.
    """
    # Use the device the model actually lives on rather than a separately computed
    # global, so inputs and weights can never end up on different devices.
    model_device = next(model.parameters()).device

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True
    )
    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    # Dropout must be disabled for deterministic predictions. Remember the current
    # mode and restore it afterwards so calling this never disturbs a training run.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        if was_training:
            model.train()

    probs = torch.softmax(outputs.logits, dim=1)
    return "Positive" if probs[0][1].item() > probs[0][0].item() else "Negative"

In [24]:
# Hand-written reviews for a quick qualitative check: clearly positive, clearly
# negative, and a few mixed or neutral cases.
test_sentences = [
    "This movie was absolutely fantastic, I loved every moment of it.",
    "The film was boring, slow, and a complete waste of time.",
    "Amazing performances and a very emotional storyline.",
    "I didn’t like the movie at all, the plot made no sense.",
    "It was okay, not great but not terrible either.",
    "The cinematography was beautiful, but the story was weak.",
    "Terrible acting ruined what could have been a good movie.",
    "One of the best movies I have seen in years!",
    "The movie started well but became disappointing towards the end.",
    "I would not recommend this movie to anyone."
]

In [25]:
# Print each sample review with the model's verdict, separated by a 60-dash rule.
separator = "-" * 60
for review in test_sentences:
    print(f"Review: {review}")
    sentiment = predict_sentiment(review)
    print("Prediction:", sentiment)
    print(separator)

Review: This movie was absolutely fantastic, I loved every moment of it.
Prediction: Positive
------------------------------------------------------------
Review: The film was boring, slow, and a complete waste of time.
Prediction: Negative
------------------------------------------------------------
Review: Amazing performances and a very emotional storyline.
Prediction: Positive
------------------------------------------------------------
Review: I didn’t like the movie at all, the plot made no sense.
Prediction: Negative
------------------------------------------------------------
Review: It was okay, not great but not terrible either.
Prediction: Negative
------------------------------------------------------------
Review: The cinematography was beautiful, but the story was weak.
Prediction: Negative
------------------------------------------------------------
Review: Terrible acting ruined what could have been a good movie.
Prediction: Negative
----------------------------------