# Sentiment Analysis: ML vs. DL Models

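This notebook compares classical machine learning models (SVM, k-nearest neighbors, decision tree, random forest) with deep learning models (CNN, LSTM, GRU, BERT) for sentiment analysis on the IMDB movie review dataset, and closes with one-shot and few-shot LLM prompting examples.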

## 0. Setup and Imports

In [None]:
# --------------------------------------------------------------------------
# 0. SETUP AND IMPORTS
# --------------------------------------------------------------------------
import pandas as pd
import spacy
import time
import numpy as np
from collections import Counter

# Scikit-learn imports for ML
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# PyTorch imports for DL
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Transformers import for BERT with PyTorch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW


## 1. Load Data

In [None]:
# --------------------------------------------------------------------------
# 1. LOAD DATA
# --------------------------------------------------------------------------
# Path is resolved relative to the notebook's working directory.
DATA_PATH = "IMDB Dataset.csv"
# Columns the later cells index directly: the raw text and its sentiment label.
REQUIRED_COLUMNS = ("review", "sentiment")

print("--- 1. Loading Dataset ---")
# on_bad_lines='skip' drops malformed CSV rows instead of aborting the load.
df = pd.read_csv(DATA_PATH, on_bad_lines='skip')

# Fail fast with a readable message instead of a KeyError deep inside a
# long-running preprocessing cell.
missing_columns = [col for col in REQUIRED_COLUMNS if col not in df.columns]
if missing_columns:
    raise KeyError(f"{DATA_PATH!r} is missing required column(s): {missing_columns}")

print("Dataset loaded. Shape:", df.shape)
print(df.head())

# Part 1: Classical Machine Learning Analysis

### 2a. Preprocessing for ML

In [None]:
print("--- 2. Starting Part 1: Classical Machine Learning Analysis ---")
print("Loading spaCy model for ML preprocessing...")
# Only lemmas and token flags are read below, so the parser and NER components
# are disabled at load time.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])


def _lemmatize_doc(doc):
    """Join the lowercased lemmas of a spaCy Doc's content tokens.

    Stop words, punctuation and non-alphabetic tokens are dropped.
    """
    tokens = [token.lemma_.lower() for token in doc if not token.is_stop and not token.is_punct and token.is_alpha]
    return ' '.join(tokens)


def preprocess_text_spacy(text):
    """Clean a single review for the TF-IDF models.

    Args:
        text: Raw review string.

    Returns:
        A space-separated string of lowercased lemmas with stop words,
        punctuation and non-alphabetic tokens removed.
    """
    return _lemmatize_doc(nlp(text))


print("Preprocessing text for ML models (this will take a while)...")
# nlp.pipe streams the reviews through the pipeline in batches and yields the
# Docs in input order; this avoids one full pipeline call per row.
df['processed_review_ml'] = [
    _lemmatize_doc(doc) for doc in nlp.pipe(df['review'], batch_size=256)
]
print("ML preprocessing complete.")

### 2b. Feature Extraction and Data Splitting

In [None]:
label_encoder = LabelEncoder()
y_ml = label_encoder.fit_transform(df['sentiment'])

# Split the documents BEFORE fitting TF-IDF so the vocabulary and IDF weights
# are learned from training reviews only; fitting on the full corpus leaks
# test-set statistics into the features.
docs_train_ml, docs_test_ml, y_train_ml, y_test_ml = train_test_split(
    df['processed_review_ml'], y_ml, test_size=0.2, random_state=42, stratify=y_ml)

tfidf = TfidfVectorizer(max_features=5000)
X_train_ml = tfidf.fit_transform(docs_train_ml)
X_test_ml = tfidf.transform(docs_test_ml)

# Full-corpus matrix kept under its original name for any cell that still
# reads it; it is produced by the train-fitted vectorizer and is not used for
# training or evaluation here.
X_ml = tfidf.transform(df['processed_review_ml'])
print("Data split for ML complete.")

### 2c. Train and Evaluate ML Models

In [None]:
# Classical classifiers under comparison, keyed by display name. SVM and KNN
# use scikit-learn defaults; the tree-based models pin random_state so their
# fits are repeatable.
models_ml = {}
models_ml["SVM"] = SVC()
models_ml["K-Nearest Neighbors"] = KNeighborsClassifier()
models_ml["Decision Tree"] = DecisionTreeClassifier(random_state=42)
models_ml["Random Forest"] = RandomForestClassifier(random_state=42)

results_ml = []
print("\nTraining classical ML models...")
for name in models_ml:
    model = models_ml[name]
    print(f"--- Training {name} ---")

    # Time the fit only; prediction on the test split is not included.
    start_time = time.time()
    model.fit(X_train_ml, y_train_ml)
    training_time = time.time() - start_time

    y_pred = model.predict(X_test_ml)
    accuracy = accuracy_score(y_test_ml, y_pred)

    # One record per model; turned into a DataFrame in the results cell.
    record = {"Model": name}
    record["Accuracy"] = accuracy
    record["Training Time (s)"] = training_time
    results_ml.append(record)

    print(f"{name} - Accuracy: {accuracy:.4f}, Time: {training_time:.2f}s")

### 2d. ML Results

In [None]:
# Rank the classical models from most to least accurate and renumber the rows.
results_df_ml = (
    pd.DataFrame(results_ml)
    .sort_values(by="Accuracy", ascending=False)
    .reset_index(drop=True)
)
print("\n--- ML Models Comparative Analysis ---")
print(results_df_ml)

# Part 2: Deep Learning Analysis (PyTorch)

### 3a. Setup and Data Preparation for DL

In [None]:
print("\n--- 3. Starting Part 2: Deep Learning Analysis with PyTorch (Requires GPU) ---")

# Seed PyTorch so weight initialisation, dropout and DataLoader shuffling start
# from the same state on every Restart & Run All. 42 matches the random_state
# used for the train/test splits. NOTE(review): some GPU kernels may still be
# non-deterministic; this only fixes the RNG streams.
SEED = 42
torch.manual_seed(SEED)

# Setup device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# --- Data Preparation for DL models (CNN, LSTM, GRU) ---
VOCAB_SIZE = 10000     # number of embedding rows, including <PAD> and <UNK>
MAX_LENGTH = 200       # every review is clipped / padded to this many token ids
EMBEDDING_DIM = 128    # width of the learned word embeddings
BATCH_SIZE = 128       # mini-batch size for the CNN / LSTM / GRU loaders

# Load spaCy model for tokenization
print("Loading spaCy model for DL preprocessing...")
# Only tokenization and the is_alpha flag are needed, so the parser and NER
# components are disabled at load time.
nlp_dl = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Tokenizer and vocab builder using spaCy
def spacy_tokenizer(text):
    """Split `text` into lowercased alphabetic tokens using the `nlp_dl` pipeline.

    Tokens for which spaCy's `is_alpha` flag is false are discarded.
    """
    lowered_tokens = []
    for tok in nlp_dl(text):
        if tok.is_alpha:
            lowered_tokens.append(tok.text.lower())
    return lowered_tokens

print("Tokenizing all reviews with spaCy (this may take a moment)...")
# Tokenize every review exactly once and keep the per-review token lists: the
# same tokens are needed both to count the vocabulary and to build the id
# sequences, and a second spaCy pass over the corpus would double the cost.
tokenized_reviews = [spacy_tokenizer(review) for review in df['review']]

# Flat token stream, kept under its original name.
all_text_tokens = [token for tokens in tokenized_reviews for token in tokens]

# NOTE(review): the vocabulary is counted over every review in df, including
# the ones later assigned to the test split.
token_counts = Counter(all_text_tokens)
# Ids 0 and 1 are reserved for padding and unknown words, so the most frequent
# VOCAB_SIZE - 2 words are numbered starting at 2.
vocab = {word: i + 2 for i, (word, _) in enumerate(token_counts.most_common(VOCAB_SIZE - 2))}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1

def text_to_sequence_spacy(text, vocab):
    """Convert raw text to a list of vocabulary ids.

    Args:
        text: Raw review string; tokenized with `spacy_tokenizer`.
        vocab: Mapping from token to integer id; must contain '<UNK>'.

    Returns:
        One id per token, with out-of-vocabulary tokens mapped to the
        '<UNK>' id.
    """
    unk_id = vocab['<UNK>']
    return [vocab.get(word, unk_id) for word in spacy_tokenizer(text)]

# Reuse the cached tokens instead of re-tokenizing each review; the result is
# the same as calling text_to_sequence_spacy on every review.
unk_id = vocab['<UNK>']
sequences = [[vocab.get(word, unk_id) for word in tokens] for tokens in tokenized_reviews]

# Padding sequences
def pad_sequences_pytorch(sequences, maxlen):
    """Build a fixed-width int64 matrix from variable-length id lists.

    Args:
        sequences: Sized collection of integer id sequences.
        maxlen: Number of columns in the result.

    Returns:
        A NumPy array of shape (len(sequences), maxlen). Each row holds the
        first `maxlen` ids of the corresponding sequence, followed by zeros
        when the sequence is shorter.
    """
    out = np.zeros((len(sequences), maxlen), dtype=np.int64)
    # Iterating a 2-D array yields row views, so writing into `row` fills `out`.
    for row, ids in zip(out, sequences):
        clipped = ids[:maxlen]
        row[:len(clipped)] = clipped
    return out

# Clip or zero-pad (on the right) every review to exactly MAX_LENGTH token ids.
padded_sequences = pad_sequences_pytorch(sequences, MAX_LENGTH)

# PyTorch Dataset Class
class IMDbDataset(Dataset):
    """Map-style dataset that pairs padded id sequences with sentiment labels.

    Both inputs are converted to tensors once, at construction time:
    sequences as int64 and labels as float32.
    """

    def __init__(self, sequences, labels):
        """Store `sequences` and `labels` as tensors.

        Args:
            sequences: Array-like of integer token ids, one row per review.
            labels: Array-like of numeric labels, one per review.
        """
        self.sequences = torch.tensor(sequences, dtype=torch.long)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the (sequence, label) pair at position `idx`."""
        sample = self.sequences[idx]
        target = self.labels[idx]
        return sample, target

# 80/20 stratified split with the same settings (test_size, random_state,
# stratify) as the split used for the classical ML models.
X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    padded_sequences,
    y_ml,
    test_size=0.2,
    random_state=42,
    stratify=y_ml,
)

# Wrap each split as a tensor dataset; only the training loader reshuffles.
train_dataset, test_dataset = (
    IMDbDataset(X_train_dl, y_train_dl),
    IMDbDataset(X_test_dl, y_test_dl),
)
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

### 3b. PyTorch Model Definitions (CNN, LSTM, GRU)

In [None]:
class CNNModel(nn.Module):
    """1-D convolutional sentiment classifier that emits one raw logit per review.

    Pipeline: embedding -> Conv1d + ReLU -> global max pool over time ->
    Linear + ReLU -> dropout -> Linear(1).

    Args:
        vocab_size: Number of embedding rows. Defaults to the module-level
            VOCAB_SIZE when None.
        embedding_dim: Embedding width. Defaults to the module-level
            EMBEDDING_DIM when None.
        num_filters: Number of convolution output channels.
        kernel_size: Convolution window, in tokens. Inputs must be at least
            this long.
        hidden_dim: Width of the hidden fully connected layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, num_filters=128,
                 kernel_size=5, hidden_dim=64, dropout=0.5):
        super().__init__()
        # Fall back to the notebook-level constants so CNNModel() keeps working.
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        if embedding_dim is None:
            embedding_dim = EMBEDDING_DIM
        # padding_idx=0: id 0 (<PAD>) maps to a zero vector that is not trained.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1d = nn.Conv1d(embedding_dim, num_filters, kernel_size=kernel_size)
        self.relu = nn.ReLU()
        self.global_max_pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(num_filters, hidden_dim)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to (batch, 1) logits."""
        # Conv1d expects channels first: (B, Len, Emb) -> (B, Emb, Len)
        x = self.embedding(x).permute(0, 2, 1)
        x = self.relu(self.conv1d(x))
        # Keep the strongest activation of each filter across all positions.
        x = self.global_max_pool(x).squeeze(2)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        return self.fc2(x)

class LSTMModel(nn.Module):
    """Bidirectional LSTM sentiment classifier that emits one raw logit per review.

    Pipeline: embedding -> bi-LSTM -> concatenated final hidden states ->
    Linear + ReLU -> dropout -> Linear(1).

    Args:
        vocab_size: Number of embedding rows. Defaults to the module-level
            VOCAB_SIZE when None.
        embedding_dim: Embedding width. Defaults to the module-level
            EMBEDDING_DIM when None.
        hidden_dim: LSTM hidden size per direction.
        fc_dim: Width of the hidden fully connected layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, hidden_dim=64,
                 fc_dim=64, dropout=0.5):
        super().__init__()
        # Fall back to the notebook-level constants so LSTMModel() keeps working.
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        if embedding_dim is None:
            embedding_dim = EMBEDDING_DIM
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, fc_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(fc_dim, 1)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to (batch, 1) logits.

        Assumes padding (id 0) appears only at the end of each row, which is
        how pad_sequences_pytorch lays sequences out.
        """
        # Count the real tokens per row. Packing makes the LSTM stop at the
        # last real token, so the final hidden states do not depend on how much
        # padding follows. All-padding rows are treated as length 1 because
        # packing rejects zero-length sequences.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1)
        embedded = self.embedding(x)
        # pack_padded_sequence requires the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (hidden, _) = self.lstm(packed)
        # Last two slices are the forward and backward final states.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        x = self.relu(self.fc1(hidden))
        x = self.dropout(x)
        return self.fc2(x)

class GRUModel(nn.Module):
    """Bidirectional GRU sentiment classifier that emits one raw logit per review.

    Pipeline: embedding -> bi-GRU -> concatenated final hidden states ->
    Linear + ReLU -> dropout -> Linear(1).

    Args:
        vocab_size: Number of embedding rows. Defaults to the module-level
            VOCAB_SIZE when None.
        embedding_dim: Embedding width. Defaults to the module-level
            EMBEDDING_DIM when None.
        hidden_dim: GRU hidden size per direction.
        fc_dim: Width of the hidden fully connected layer.
        dropout: Dropout probability applied before the output layer.
    """

    def __init__(self, vocab_size=None, embedding_dim=None, hidden_dim=64,
                 fc_dim=64, dropout=0.5):
        super().__init__()
        # Fall back to the notebook-level constants so GRUModel() keeps working.
        if vocab_size is None:
            vocab_size = VOCAB_SIZE
        if embedding_dim is None:
            embedding_dim = EMBEDDING_DIM
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.fc1 = nn.Linear(hidden_dim * 2, fc_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(fc_dim, 1)

    def forward(self, x):
        """Map a (batch, length) tensor of token ids to (batch, 1) logits.

        Assumes padding (id 0) appears only at the end of each row, which is
        how pad_sequences_pytorch lays sequences out.
        """
        # Count the real tokens per row. Packing makes the GRU stop at the last
        # real token, so the final hidden states do not depend on how much
        # padding follows. All-padding rows are treated as length 1 because
        # packing rejects zero-length sequences.
        lengths = (x != self.embedding.padding_idx).sum(dim=1).clamp(min=1)
        embedded = self.embedding(x)
        # pack_padded_sequence requires the lengths on the CPU.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, hidden = self.gru(packed)
        # Last two slices are the forward and backward final states.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        x = self.relu(self.fc1(hidden))
        x = self.dropout(x)
        return self.fc2(x)

### 3c. Training and Evaluation Loop for DL Models

In [None]:
def train_and_evaluate_model(model, train_loader, test_loader, epochs=5, device=None):
    """Train a binary classifier with Adam + BCE-with-logits and return test accuracy.

    Args:
        model: nn.Module mapping a batch of id sequences to (batch, 1) logits.
        train_loader: Iterable of (sequences, float labels) training batches.
        test_loader: Iterable of (sequences, float labels) evaluation batches.
        epochs: Number of passes over `train_loader`.
        device: torch.device to run on. When None, CUDA is used if available,
            otherwise the CPU.

    Returns:
        Fraction of test samples classified correctly, as a float. NaN when
        `test_loader` yields no samples (accuracy is undefined).
    """
    # Resolve the device here instead of relying on a variable left behind by
    # an earlier cell.
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters())
    model.to(device)

    for epoch in range(epochs):
        model.train()
        print(f"Epoch {epoch+1}/{epochs}")
        for sequences, labels in train_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            optimizer.zero_grad()
            # (batch, 1) -> (batch,) to line up with the label vector.
            outputs = model(sequences).squeeze(1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for sequences, labels in test_loader:
            sequences, labels = sequences.to(device), labels.to(device)
            outputs = model(sequences).squeeze(1)
            # The model emits logits; sigmoid + round thresholds at p = 0.5.
            predicted = torch.round(torch.sigmoid(outputs))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty test loader would otherwise raise ZeroDivisionError.
    if total == 0:
        return float("nan")
    return correct / total

# Freshly initialised networks, keyed by the name shown in the results table.
models_dl = {"CNN": CNNModel(), "LSTM": LSTMModel(), "GRU": GRUModel()}
results_dl = []

print("\nTraining Deep Learning models (CNN, LSTM, GRU) with PyTorch...")
for name in models_dl:
    model = models_dl[name]
    print(f"--- Training {name} ---")

    # The timed span covers both training and the test-set evaluation, because
    # train_and_evaluate_model does both before returning.
    start_time = time.time()
    accuracy = train_and_evaluate_model(model, train_loader, test_loader)
    training_time = time.time() - start_time

    record = {"Model": name}
    record["Accuracy"] = accuracy
    record["Training Time (s)"] = training_time
    results_dl.append(record)

    print(f"{name} - Accuracy: {accuracy:.4f}, Time: {training_time:.2f}s")

### 3d. BERT Fine-Tuning

In [None]:
print("\n--- Fine-tuning BERT with PyTorch (this will take the most time) ---")
# Tokenizer and classifier are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
bert_tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
# num_labels=2 requests a two-class classification head.
bert_model = BertForSequenceClassification.from_pretrained(BERT_CHECKPOINT, num_labels=2)
bert_model = bert_model.to(device)

class BERTDataset(Dataset):
    """Map-style dataset that tokenizes one review per access for BERT.

    Args:
        reviews: Indexable collection of review texts.
        labels: Indexable collection of integer class labels, aligned with
            `reviews`.
        tokenizer: Callable tokenizer (e.g. a Hugging Face tokenizer) that
            returns 'input_ids' and 'attention_mask' tensors.
        max_len: Length every encoded review is truncated or padded to.
    """

    def __init__(self, reviews, labels, tokenizer, max_len):
        self.reviews = reviews
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Encode the review at index `item`.

        Returns:
            A dict with 1-D 'input_ids' and 'attention_mask' tensors and a
            scalar int64 'labels' tensor.
        """
        review = str(self.reviews[item])
        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__, which accepts the same keyword arguments.
        encoding = self.tokenizer(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # return_tensors='pt' adds a leading batch dimension; flatten drops
            # it so the DataLoader can stack samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(self.labels[item], dtype=torch.long)
        }

# Split the raw review texts with the same settings (test_size, random_state,
# stratify) as the earlier splits.
train_reviews, test_reviews, train_sentiments, test_sentiments = train_test_split(
    df['review'].to_numpy(),
    y_ml,
    test_size=0.2,
    random_state=42,
    stratify=y_ml,
)

# Reviews are truncated / padded to 128 tokens inside BERTDataset.
bert_train_dataset = BERTDataset(train_reviews, train_sentiments, bert_tokenizer, 128)
bert_test_dataset = BERTDataset(test_reviews, test_sentiments, bert_tokenizer, 128)
bert_train_loader = DataLoader(dataset=bert_train_dataset, batch_size=16, shuffle=True)
bert_test_loader = DataLoader(dataset=bert_test_dataset, batch_size=16)

optimizer = AdamW(bert_model.parameters(), lr=3e-5)
start_time_bert = time.time()

bert_model.train()
for epoch in range(2): # 2 epochs for BERT is usually enough
    print(f"BERT Epoch {epoch+1}/2")
    for batch in bert_train_loader:
        optimizer.zero_grad()
        # Each batch is a dict with input_ids, attention_mask and labels; move
        # every tensor to the target device and pass them by keyword.
        batch = {key: tensor.to(device) for key, tensor in batch.items()}
        outputs = bert_model(**batch)
        # Passing labels makes the model return the training loss.
        outputs.loss.backward()
        optimizer.step()
training_time_bert = time.time() - start_time_bert

### 3e. Evaluate BERT and Display Final DL Results

In [None]:
bert_model.eval()
# Plain Python counters, matching the CNN/LSTM/GRU evaluation loop. This also
# keeps the accuracy computation valid when the loader yields no batches.
correct = 0
total = 0
with torch.no_grad():
    for batch in bert_test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)
        # Predicted class = index of the larger of the two logits.
        preds = outputs.logits.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

# Accuracy is undefined (NaN) if nothing was evaluated.
accuracy_bert = correct / total if total else float("nan")

# Drop any BERT row left by a previous run of this cell so re-running it does
# not add duplicates to the table.
results_dl = [row for row in results_dl if row["Model"] != "BERT"]
results_dl.append({"Model": "BERT", "Accuracy": accuracy_bert, "Training Time (s)": training_time_bert})
print(f"BERT - Accuracy: {accuracy_bert:.4f}, Time: {training_time_bert:.2f}s")

# Display DL Results
results_df_dl = pd.DataFrame(results_dl).sort_values(by="Accuracy", ascending=False).reset_index(drop=True)
print("\n--- Deep Learning Models Comparative Analysis (PyTorch) ---")
print(results_df_dl)
print("-" * 30 + "\n")
print("--- Analysis Complete ---")

## 4. LLM Prompting: One-shot and Few-shot Examples

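In this section, we use an LLM for sentiment analysis through prompting rather than training: the model is shown one labeled example (one-shot) or several labeled examples (few-shot) in the prompt and asked to classify a new review.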

In [None]:
# One-shot and Few-shot prompting using OpenAI's GPT model
import os

from openai import OpenAI

# Never hard-code credentials in a notebook: read the key from the environment
# and stop with a clear message when it is missing.
_api_key = os.environ.get("OPENAI_API_KEY")
if not _api_key:
    raise RuntimeError("Set the OPENAI_API_KEY environment variable before running this cell.")
client = OpenAI(api_key=_api_key)

LLM_MODEL = "gpt-5"


def _ask_llm(prompt, model=LLM_MODEL):
    """Send `prompt` as a single user message and return the raw API response."""
    return client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )


def _reply_text(response):
    """Return the first choice's text, stripped; empty string if it has no content."""
    content = response.choices[0].message.content
    return (content or "").strip()


# One-shot example
one_shot_prompt = '''Classify the sentiment of the given review.
Example: Review: "The movie was fantastic!" → Sentiment: Positive
Review: "The plot was boring and predictable." → Sentiment: '''

one_shot_response = _ask_llm(one_shot_prompt)
print("One-shot Result:", _reply_text(one_shot_response))

# Few-shot example
few_shot_prompt = '''Classify the sentiment of the following reviews.
Review: "The movie was fantastic!" → Sentiment: Positive
Review: "Terrible acting and weak story." → Sentiment: Negative
Review: "It was okay, not great but not bad either." → Sentiment: Neutral
Now classify:
Review: "The visuals were stunning, but the dialogue was flat." → Sentiment: '''

few_shot_response = _ask_llm(few_shot_prompt)
print("Few-shot Result:", _reply_text(few_shot_response))