### Import Statements

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

### Load the dataset

In [2]:
dataset = load_dataset("imdb")
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

### Explore dataset

In [3]:
dataset["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(names=['neg', 'pos'], id=None)}

In [4]:
dataset["train"][0]

{'text': 'I rented I AM CURIOUS-YELLOW from my video store because of all the controversy that surrounded it when it was first released in 1967. I also heard that at first it was seized by U.S. customs if it ever tried to enter this country, therefore being a fan of films considered "controversial" I really had to see this for myself.<br /><br />The plot is centered around a young Swedish drama student named Lena who wants to learn everything she can about life. In particular she wants to focus her attentions to making some sort of documentary on what the average Swede thought about certain political issues such as the Vietnam War and race issues in the United States. In between asking politicians and ordinary denizens of Stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men.<br /><br />What kills me about I AM CURIOUS-YELLOW is that 40 years ago, this was considered pornographic. Really, the sex and nudity scenes are few and far be

### Tokenize the dataset

In [5]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(example):
    return tokenizer(
        example["text"],
        padding="max_length",
        truncation=True,
        max_length=256
    )

tokenized_datasets = dataset.map(tokenize_fn, batched=True)

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [6]:
# 'input_ids' : (the token IDs),
# 'attention_mask' : (indicates which tokens are real vs. padding)
tokenized_datasets["train"][1]

{'text': '"I Am Curious: Yellow" is a risible and pretentious steaming pile. It doesn\'t matter what one\'s political views are because this film can hardly be taken seriously on any level. As for the claim that frontal male nudity is an automatic NC-17, that isn\'t true. I\'ve seen R-rated films with male nudity. Granted, they only offer some fleeting views, but where are the R-rated films with gaping vulvas and flapping labia? Nowhere, because they don\'t exist. The same goes for those crappy cable shows: schlongs swinging in the breeze but not a clitoris in sight. And those pretentious indie movies like The Brown Bunny, in which we\'re treated to the site of Vincent Gallo\'s throbbing johnson, but not a trace of pink visible on Chloe Sevigny. Before crying (or implying) "double-standard" in matters of nudity, the mentally obtuse should take into account one unavoidably obvious anatomical difference between men and women: there are no genitals on display when actresses appears nude, 

### Format for Model Training

In [7]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

### Load the model for classification

In [8]:
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

2025-06-13 17:20:40.947064: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1749815441.040245    3755 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1749815441.068518    3755 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1749815441.256823    3755 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749815441.256863    3755 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1749815441.256867    3755 computation_placer.cc:177] computation placer alr

### Define Trainer Components

In [9]:
# !pip install --upgrade accelerate

In [10]:
from transformers import TrainingArguments, Trainer

training_arguments = TrainingArguments(
    output_dir="./results",                  # Where to save checkpoints & logs
    eval_strategy="epoch",             # Evaluate the model at the end of each epoch
    save_strategy="epoch",                   # Save the model at the end of each epoch
    learning_rate=2e-5,                      # Learning rate for the optimizer (2e-5=0.00002) (A small learning rate avoids making large, destructive changes to these well-tuned weights.)
    per_device_train_batch_size=8,           # Batch size per device during training
    per_device_eval_batch_size=8,            # Batch size per device during evaluation
    num_train_epochs=2,                      # Number of Epochs for training. (Number of complete passes through the dataset) (2 is reasonable for fine-tuning, especially when using a pretrained model.)
    weight_decay=0.01                        # Regularization to prevent overfitting
)

trainer = Trainer(
    model=model,
    args=training_arguments,
    train_dataset=tokenized_datasets["train"].shuffle(seed=42).select(range(1000)), # Small subset to train faster
    eval_dataset=tokenized_datasets["test"].shuffle(seed=42).select(range(1000)),
    tokenizer=tokenizer
)

  trainer = Trainer(


### Train the Model

In [11]:
trainer.train()



Epoch,Training Loss,Validation Loss
1,No log,0.373747
2,No log,0.390764




TrainOutput(global_step=250, training_loss=0.39873056030273435, metrics={'train_runtime': 728.2935, 'train_samples_per_second': 2.746, 'train_steps_per_second': 0.343, 'total_flos': 132467398656000.0, 'train_loss': 0.39873056030273435, 'epoch': 2.0})

### Evaluate

In [12]:
trainer.evaluate()



{'eval_loss': 0.39076414704322815,
 'eval_runtime': 59.3442,
 'eval_samples_per_second': 16.851,
 'eval_steps_per_second': 2.106,
 'epoch': 2.0}

### Save the model

In [13]:
trainer.save_model("./model")

# Try with Own Reviews

In [56]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import torch.nn.functional as F

#### Load the saved model and Tokenizer

In [57]:
model_path = "./model"

tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)

model.eval()

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [58]:
review = "This movie was absolutely fantastic! Loved the acting and direction."

In [59]:
tokenized_review = tokenizer(review, return_tensors="pt", padding='max_length', truncation=True, max_length=256) # return_tensors="pt": Returns the tokenized output as PyTorch tensors (PyTorch backend is being used).

In [60]:
tokenized_review

{'input_ids': tensor([[  101,  2023,  3185,  2001,  7078, 10392,   999,  3866,  1996,  3772,
          1998,  3257,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  

In [61]:
"""
torch.no_grad(): Disables gradient computation, saving memory and computation during inference (not training).
"""
with torch.no_grad():
    output = model(**tokenized_review)

In [62]:
print(output)

SequenceClassifierOutput(loss=None, logits=tensor([[-1.7516,  1.3329]]), hidden_states=None, attentions=None)


In [63]:
import torch
import torch.nn.functional as F

"""
outputs.logits: Raw model output before applying activation. These are unnormalized scores for each class.

F.softmax(..., dim=1): Converts logits to probabilities across classes (e.g., class 0 = negative, class 1 = positive).
"""
probs = F.softmax(output.logits, dim=1)
print(probs)

tensor([[0.0437, 0.9563]])


In [64]:
"""
torch.argmax(..., dim=1): Picks the class (0 or 1) with the highest probability.

.item(): Converts the result from a tensor to a regular Python number.
"""
predicted_class = torch.argmax(probs, dim=1).item()
print(predicted_class)

1


In [65]:
"""
Gets the probability (confidence) of the predicted class. This tells you how sure the model is.
"""
print(probs[0][predicted_class].item())

0.9562507271766663


In [66]:
label = "Positive" if predicted_class == 1 else "Negative"
print(label)

Positive
