In [1]:
!pip install -q torch transformers datasets scikit-learn accelerate


In [2]:
import torch
# Prefer the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Using device:", device)

Using device: cuda


In [4]:
# ============================================================
# BERT SENTIMENT ANALYSIS – FULL IMDb DATASET
# ============================================================

import os
# Set ahead of the `transformers` import that follows — presumably so the
# Weights & Biases logging integration stays off for this run (TODO confirm
# that this variable is honoured by the installed transformers version).
os.environ["WANDB_DISABLED"] = "true"

import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    Trainer,
    TrainingArguments
)

# -----------------------------
# Device
# -----------------------------
# Pick the compute device once: CUDA if PyTorch reports one, else the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Using device:", device)

# -----------------------------
# Load IMDb CSV Dataset
# -----------------------------
# Read the raw reviews; the code below relies on the CSV providing a "review"
# column (the text) and a "sentiment" column ("positive" / "negative").
df = pd.read_csv("IMDB Dataset.csv")

# Encode the sentiment strings as integer class ids: positive -> 1, negative -> 0.
df["label"] = df["sentiment"].map({"positive": 1, "negative": 0})

# Series.map() yields NaN for any value missing from the mapping, which would
# silently turn the label column into floats. Fail early and show the culprits.
unmapped = df["label"].isna()
if unmapped.any():
    bad_values = df.loc[unmapped, "sentiment"].unique().tolist()
    raise ValueError(f"Unexpected sentiment values in CSV: {bad_values}")
df["label"] = df["label"].astype("int64")

# Keep only what the model needs, under the column names used downstream.
df = df[["review", "label"]].rename(columns={"review": "text"})

# preserve_index=False keeps the pandas index from ever being written as a column.
dataset = Dataset.from_pandas(df, preserve_index=False)

# 80/20 split with a fixed seed so every run (and every Restart & Run All)
# trains and evaluates on the same examples.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

train_data = dataset["train"]
test_data = dataset["test"]

print("Train size:", len(train_data))
print("Test size:", len(test_data))


Using device: cuda
Train size: 40000
Test size: 10000


In [5]:
model_name = "bert-base-uncased"

# Load the pretrained tokenizer and a BERT encoder with a two-class
# classification head, then place the model on the selected device.
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
model.to(device)

# --- Walk one training example through each tokenization stage ---
sample_text, sample_label = train_data[0]["text"], train_data[0]["label"]

print("\nOriginal Review:\n", sample_text[:500])
print("\nLabel:", ("Negative", "Positive")[sample_label == 1])

# Stage 1: split the raw string into tokens.
tokens = tokenizer.tokenize(sample_text)
print("\nTokens (first 40):", tokens[:40], sep="\n")

# Stage 2: look up the vocabulary id of every token.
token_ids = tokenizer.convert_tokens_to_ids(tokens)
print("\nToken IDs (first 40):", token_ids[:40], sep="\n")

# Stage 3: encode() goes straight from text to ids.
encoded_ids = tokenizer.encode(sample_text)
print("\nEncoded IDs with [CLS] and [SEP] (first 40):", encoded_ids[:40], sep="\n")

# Stage 4: the full call used for training — truncate/pad to 128 positions
# and return the attention mask alongside the ids.
encoding = tokenizer(sample_text, max_length=128, padding="max_length", truncation=True)

print("\nInput IDs (first 20):", encoding["input_ids"][:20], sep="\n")
print("\nAttention Mask (first 20):", encoding["attention_mask"][:20], sep="\n")
print(
    "\nDecoded Tokens (first 20):",
    tokenizer.convert_ids_to_tokens(encoding["input_ids"][:20]),
    sep="\n",
)



Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Original Review:
 After watching this movie once, it quickly became one of my favorites. As different events happen in the movie, you change your mind about Prot, back and forth, until the end and even after. The movie is very thought-provoking and a must-watch!!

Label: Positive

Tokens (first 40):
['after', 'watching', 'this', 'movie', 'once', ',', 'it', 'quickly', 'became', 'one', 'of', 'my', 'favorites', '.', 'as', 'different', 'events', 'happen', 'in', 'the', 'movie', ',', 'you', 'change', 'your', 'mind', 'about', 'pro', '##t', ',', 'back', 'and', 'forth', ',', 'until', 'the', 'end', 'and', 'even', 'after']

Token IDs (first 40):
[2044, 3666, 2023, 3185, 2320, 1010, 2009, 2855, 2150, 2028, 1997, 2026, 20672, 1012, 2004, 2367, 2824, 4148, 1999, 1996, 3185, 1010, 2017, 2689, 2115, 2568, 2055, 4013, 2102, 1010, 2067, 1998, 5743, 1010, 2127, 1996, 2203, 1998, 2130, 2044]

Encoded IDs with [CLS] and [SEP] (first 40):
[101, 2044, 3666, 2023, 3185, 2320, 1010, 2009, 2855, 2150, 2028, 19

In [6]:
def tokenize_function(examples, max_length=128):
    """Tokenize a batch of reviews into fixed-length model inputs.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the review string(s)
            to tokenize.
        max_length: Length every sequence is truncated or padded to.
            Defaults to 128, the value used throughout this notebook.

    Returns:
        The result of calling the notebook-level ``tokenizer`` on the text
        with truncation and ``"max_length"`` padding enabled.
    """
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
    )

def _prepare_split(split):
    """Tokenize one dataset split and expose it as PyTorch tensors.

    Steps: run ``tokenize_function`` over the split in batches, rename the
    ``"label"`` column to ``"labels"``, and restrict the output format to the
    three tensor columns used for training.

    The rename is skipped when ``"label"`` is no longer present, so running
    this cell a second time does not fail on the already-renamed column.

    Args:
        split: A dataset split carrying a ``"text"`` column and a
            ``"label"`` (or already renamed ``"labels"``) column.

    Returns:
        The tokenized split with torch formatting applied.
    """
    split = split.map(tokenize_function, batched=True)
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    split.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return split


# Same preparation for both splits, expressed once instead of copy-pasted.
train_data = _prepare_split(train_data)
test_data = _prepare_split(test_data)


Map: 100%|███████████████████████████████████████████████████████████████| 40000/40000 [04:04<00:00, 163.72 examples/s]
Map: 100%|███████████████████████████████████████████████████████████████| 10000/10000 [01:01<00:00, 163.79 examples/s]


In [7]:
# All fine-tuning settings in one place, grouped by concern.
training_config = dict(
    # where checkpoints are written, and how many are kept
    output_dir="./bert_imdb_full",
    save_steps=2000,
    save_total_limit=2,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=2,
    # batching
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # logging
    logging_steps=500,
    report_to="none",
    # hardware: only forbid CUDA when none is available
    no_cuda=not torch.cuda.is_available(),
)
training_args = TrainingArguments(**training_config)

# Wire the model, the settings and both tokenized splits into the Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=train_data,
    eval_dataset=test_data,
)

# Left as the final expression so the notebook displays the returned TrainOutput.
trainer.train()


Step,Training Loss
500,0.4518
1000,0.3681
1500,0.3459
2000,0.3424
2500,0.322
3000,0.324
3500,0.3181
4000,0.3355
4500,0.3202
5000,0.3155


TrainOutput(global_step=10000, training_loss=0.28087916564941406, metrics={'train_runtime': 1616.7886, 'train_samples_per_second': 49.481, 'train_steps_per_second': 6.185, 'total_flos': 5262221107200000.0, 'train_loss': 0.28087916564941406, 'epoch': 2.0})

In [10]:
text = "The movie was garbage"

# The model was last used for fine-tuning; switch it to evaluation mode so
# dropout is disabled and the prediction is deterministic.
model.eval()

# Tokenize the single sentence into a batch of one, as PyTorch tensors.
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    padding=True,
    max_length=128
)
# Move the tensors to wherever the model's weights currently live.
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# No gradients are needed for inference.
with torch.no_grad():
    outputs = model(**inputs)

# Index of the larger logit: 1 is positive, 0 is negative (the encoding used for the labels).
prediction = outputs.logits.argmax(dim=1).item()
print("\nPredicted Sentiment:", "Positive" if prediction == 1 else "Negative")



Predicted Sentiment: Negative
