In [None]:
# BERT IMDb Sentiment Analysis - Assignment 3 Google Colab Notebook
# ---------------------------------------------------------------

# 1️⃣ Setup: Install and Import Dependencies
!pip install numpy pandas scikit-learn gensim nltk transformers datasets torch torchvision torchaudio # Ensure PyTorch is installed

import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

import torch
import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
)

from sklearn.metrics import accuracy_score, f1_score

import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# 2️⃣ Load IMDb Dataset
print("Loading IMDb dataset from local files using pandas...")
try:
    train_df = pd.read_parquet("/content/imdb/train.parquet")
    test_df = pd.read_parquet("/content/imdb/test.parquet")

    # Convert pandas DataFrames to Dataset objects
    dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df)
    })
    print("Dataset loaded and converted successfully.")
    print(f"Train samples: {len(dataset['train'])}, Test samples: {len(dataset['test'])}")

except Exception as e:
    print(f"Error loading and converting dataset: {e}")


# 3️⃣ Load Tokenizer and Model
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# 4️⃣ Tokenization Function
def tokenize_function(examples):
    result = tokenizer(examples["text"], truncation=True, padding=True, max_length=256)
    result["labels"] = examples["label"]
    return result

tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 5️⃣ Use Subsets for Fast Training
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(5000))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))

# 6️⃣ Define Metrics
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='weighted')
    return {"accuracy": accuracy, "f1": f1}

# 7️⃣ Trainer Setup
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    eval_strategy="epoch", # Corrected argument name
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8, # Corrected argument name
    num_train_epochs=2,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=False,
    report_to=None,
    fp16=torch.cuda.is_available(),
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8️⃣ Train the Model
print("\nFine-tuning BERT on IMDb... This will take a few minutes.")
trainer.train()

# 9️⃣ Evaluate the Model
print("\nEvaluating the fine-tuned model:")
eval_results = trainer.evaluate()
for k, v in eval_results.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# 1️⃣0️⃣ Save the Model
model_save_path = "./sentiment_finetuned_bert"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"\nModel saved to {model_save_path}")

# 1️⃣1️⃣ Load Model for Inference
print("\nLoading model for inference...")
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(model_save_path)
sentiment_pipeline = pipeline("sentiment-analysis", model=loaded_model, tokenizer=loaded_tokenizer, device=0 if torch.cuda.is_available() else -1)

# 1️⃣2️⃣ Inference Examples
sample_texts = [
    "The movie was surprisingly touching and beautifully acted!",
    "This film was absolutely terrible and boring.",
    "I loved every minute of this fantastic movie!",
    "The plot was confusing but the acting was decent.",
    "Best movie I've seen all year! Highly recommend!",
    "Waste of time. Poor acting and terrible storyline."
]

print("\nInference on sample texts:")
for i, text in enumerate(sample_texts, 1):
    result = sentiment_pipeline(text)[0]
    sentiment = "Positive" if result['label'] == 'LABEL_1' else "Negative"
    print(f"{i}. {text}\n   ➡️ Prediction: {sentiment} (Confidence: {result['score']:.4f})\n")

print("✅ Pipeline completed successfully.")

Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/base_command.py", line 179, in exc_logging_wrapper
    status = run_func(*args)
             ^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/cli/req_command.py", line 67, in wrapper
    return func(self, options, args)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/commands/install.py", line 377, in run
    requirement_set = resolver.resolve(
                      ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_internal/resolution/resolvelib/resolver.py", line 95, in resolve
    result = self._result = resolver.resolve(
                            ^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/pip/_vendor/resolvelib/resolvers.py", line 546, in resolve
    state = resolution.resolve(requirements, max_rounds=max_rounds)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Loading IMDb dataset from local files using pandas...


KeyboardInterrupt: 

In [None]:
import pandas as pd

try:
    train_df = pd.read_parquet("/content/imdb/train.parquet")
    print("Train data loaded successfully. First 5 rows:")
    display(train_df.head())
    print(f"Train data shape: {train_df.shape}")
except Exception as e:
    print(f"Error loading train.parquet: {e}")

try:
    test_df = pd.read_parquet("/content/imdb/test.parquet")
    print("\nTest data loaded successfully. First 5 rows:")
    display(test_df.head())
    print(f"Test data shape: {test_df.shape}")
except Exception as e:
    print(f"Error loading test.parquet: {e}")

Train data loaded successfully. First 5 rows:


Unnamed: 0,text,label
0,I rented I AM CURIOUS-YELLOW from my video sto...,0
1,"""I Am Curious: Yellow"" is a risible and preten...",0
2,If only to avoid making this type of film in t...,0
3,This film was probably inspired by Godard's Ma...,0
4,"Oh, brother...after hearing about this ridicul...",0


Train data shape: (25000, 2)

Test data loaded successfully. First 5 rows:


Unnamed: 0,text,label
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


Test data shape: (25000, 2)


In [7]:
# sentiment_pipeline.py

# 1️⃣ Install dependencies if needed (skip if using a prepared environment)
# !pip install transformers datasets torch scikit-learn pandas pyarrow

import os
import numpy as np
import pandas as pd
import torch

from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    Trainer, TrainingArguments, DataCollatorWithPadding, pipeline
)
from sklearn.metrics import accuracy_score, f1_score

# 2️⃣ Load local IMDb parquet dataset
try:
    print("Loading local IMDb parquet files...")
    train_df = pd.read_parquet("/content/imdb/train.parquet")
    test_df = pd.read_parquet("/content/imdb/test.parquet")

    # Ensure columns are correct
    if 'text' not in train_df.columns or 'label' not in train_df.columns:
        raise ValueError("Parquet files must have 'text' and 'label' columns.")

    # Convert to Hugging Face Datasets
    dataset = DatasetDict({
        "train": Dataset.from_pandas(train_df),
        "test": Dataset.from_pandas(test_df)
    })
    print(f"✅ Dataset loaded: Train={len(dataset['train'])}, Test={len(dataset['test'])}")

except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    exit()

# 3️⃣ Load tokenizer and model
model_checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=2)

# 4️⃣ Tokenization function
def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=256)

# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

# 5️⃣ Use small subset for fast training
train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(500))

# 6️⃣ Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": acc, "f1": f1}

# 7️⃣ Trainer setup
training_args = TrainingArguments(
    output_dir="./sentiment_model",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,   # 1 epoch for fast execution
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=torch.cuda.is_available(),
    push_to_hub=False
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# 8️⃣ Train
print("🚀 Fine-tuning BERT on IMDb sentiment classification (fast subset)...")
trainer.train()

# 9️⃣ Evaluate
print("\n📊 Evaluation Results:")
metrics = trainer.evaluate()
for k, v in metrics.items():
    print(f"{k}: {v:.4f}" if isinstance(v, float) else f"{k}: {v}")

# 🔟 Save Model
save_path = "./sentiment_finetuned_bert"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"\n💾 Model saved to {save_path}")

# 1️⃣1️⃣ Load for Inference
print("\n🔄 Loading model for inference...")
loaded_model = AutoModelForSequenceClassification.from_pretrained(save_path)
loaded_tokenizer = AutoTokenizer.from_pretrained(save_path)

sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=loaded_model,
    tokenizer=loaded_tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

# 1️⃣2️⃣ Sample Inference
sample_texts = [
    "The movie was surprisingly touching and beautifully acted!",
    "This film was absolutely terrible and boring.",
    "I loved every minute of this fantastic movie!",
    "The plot was confusing but the acting is decent.",
    "Best movie I've seen all year! Highly recommend!",
    "Waste of time. Poor acting and terrible storyline."
]

print("\n🔍 Sample Inference Results:")
for i, text in enumerate(sample_texts, 1):
    result = sentiment_pipeline(text)[0]
    sentiment = "Positive" if result['label'] == 'LABEL_1' else "Negative"
    print(f"{i}. {text}\n   ➡️ Prediction: {sentiment} (Confidence: {result['score']:.4f})\n")

print("✅ Sentiment analysis pipeline completed successfully.")

Loading local IMDb parquet files...
✅ Dataset loaded: Train=25000, Test=25000


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

  trainer = Trainer(


🚀 Fine-tuning BERT on IMDb sentiment classification (fast subset)...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.463785,0.846,0.84598



📊 Evaluation Results:


eval_loss: 0.4638
eval_accuracy: 0.8460
eval_f1: 0.8460
eval_runtime: 404.2880
eval_samples_per_second: 1.2370
eval_steps_per_second: 0.1560
epoch: 1.0000

💾 Model saved to ./sentiment_finetuned_bert

🔄 Loading model for inference...


Device set to use cpu



🔍 Sample Inference Results:
1. The movie was surprisingly touching and beautifully acted!
   ➡️ Prediction: Positive (Confidence: 0.7253)

2. This film was absolutely terrible and boring.
   ➡️ Prediction: Negative (Confidence: 0.6667)

3. I loved every minute of this fantastic movie!
   ➡️ Prediction: Positive (Confidence: 0.6846)

4. The plot was confusing but the acting is decent.
   ➡️ Prediction: Negative (Confidence: 0.5112)

5. Best movie I've seen all year! Highly recommend!
   ➡️ Prediction: Positive (Confidence: 0.6037)

6. Waste of time. Poor acting and terrible storyline.
   ➡️ Prediction: Negative (Confidence: 0.6412)

✅ Sentiment analysis pipeline completed successfully.
