In [None]:
# Cell 1: Import libraries
import os
import pickle
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

print("✅ Libraries imported successfully")

In [None]:
# Cell 2: Load the IMDb reviews and keep a small working subset
print("Loading IMDb dataset...")
dataset = load_dataset("imdb")

# Shuffle each split with a fixed seed, then keep only the leading rows so
# training stays fast (drop the .select(...) calls to use the full dataset).
shuffled_train = dataset['train'].shuffle(seed=42)
shuffled_test = dataset['test'].shuffle(seed=42)
train_data = shuffled_train.select(range(5000))
test_data = shuffled_test.select(range(1000))

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

In [None]:
# Cell 3: Materialise both splits as pandas DataFrames
train_df, test_df = (pd.DataFrame(split) for split in (train_data, test_data))

# Peek at the first two training rows
print("\nSample reviews:")
print(train_df.head(2))

In [None]:
# Cell 4: Text preprocessing function
import re
from html import unescape

def preprocess_text(text):
    """Clean a raw review string before vectorization.

    Steps, in order:
      1. Decode HTML entities (``&amp;`` -> ``&``). Because this runs before
         tag stripping, escaped tags such as ``&lt;br&gt;`` are stripped too.
      2. Replace HTML tags with a single space, so words separated only by a
         tag (``great<br />movie``) are not glued together.
      3. Replace URLs with a space. Only tokens that start at a word boundary
         with ``http://``, ``https://`` or ``www.`` (any letter case) count as
         URLs, so ordinary words like ``awww`` are left alone.
      4. Lowercase the text.
      5. Collapse every run of whitespace into one space and trim the ends.

    Args:
        text: Raw review text (``str``).

    Returns:
        The cleaned, lower-cased string.
    """
    # Decode HTML entities
    text = unescape(text)
    # Swap tags for a space rather than deleting them outright; the extra
    # whitespace is collapsed at the end.
    text = re.sub(r'<.*?>', ' ', text)
    # Anchored, case-insensitive URL match (see docstring, step 3)
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', ' ', text, flags=re.IGNORECASE)
    # Convert to lowercase
    text = text.lower()
    # Collapse whitespace runs and strip leading/trailing whitespace
    return ' '.join(text.split())

# Run the cleaner over every review in both splits
print("\nPreprocessing text...")
train_df['clean_text'] = train_df['text'].map(preprocess_text)
test_df['clean_text'] = test_df['text'].map(preprocess_text)

print("✅ Preprocessing complete")

# Show the first 200 characters of the first training review, raw vs. cleaned
print("\nBefore preprocessing:")
print(train_df['text'].iat[0][:200])
print("\nAfter preprocessing:")
print(train_df['clean_text'].iat[0][:200])

In [None]:
# Cell 5: Separate model inputs (cleaned text) from targets (labels)
X_train, y_train = train_df['clean_text'], train_df['label']
X_test, y_test = test_df['clean_text'], test_df['label']

print("\n✅ Data split complete")
print(f"Training set: {len(X_train)} samples")
print(f"Test set: {len(X_test)} samples")

In [None]:
# Cell 6: Turn the cleaned text into TF-IDF features
print(f"\n{'=' * 50}")
print("BASELINE MODEL: Logistic Regression with TF-IDF")
print('=' * 50)

# Unigrams + bigrams, capped at 5000 features, ignoring terms seen in fewer
# than 2 training documents. The vocabulary is learned on the training split
# only; the test split is merely transformed with it.
print("\nCreating TF-IDF features...")
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_features=5000,
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

print(f"✅ TF-IDF matrix shape: {X_train_tfidf.shape}")


In [None]:
# Cell 7: Train Logistic Regression
print("\nTraining Logistic Regression model...")
# Measure elapsed time with perf_counter(): it is a monotonic,
# high-resolution clock meant for timing short durations, whereas
# time.time() follows the (adjustable, possibly coarse) system wall clock.
start_time = time.perf_counter()

lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)

training_time = time.perf_counter() - start_time
print(f"✅ Training completed in {training_time:.2f} seconds")

In [None]:
# Cell 8: Evaluate Logistic Regression
print("\n--- Logistic Regression Evaluation ---")

# Make predictions and time the whole batch. perf_counter() is used instead
# of time.time() because a single predict() call can finish in well under a
# millisecond, which a coarse wall clock may report as zero.
start_time = time.perf_counter()
y_pred_lr = lr_model.predict(X_test_tfidf)
# Average per-sample latency in seconds (batch time / number of test rows)
prediction_time = (time.perf_counter() - start_time) / len(X_test)

# Calculate metrics (precision/recall/F1 use scikit-learn's defaults)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
precision_lr = precision_score(y_test, y_pred_lr)
recall_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

print(f"\nAccuracy:  {accuracy_lr:.4f} ({accuracy_lr*100:.2f}%)")
print(f"Precision: {precision_lr:.4f}")
print(f"Recall:    {recall_lr:.4f}")
print(f"F1 Score:  {f1_lr:.4f}")
print(f"Avg prediction time: {prediction_time*1000:.2f}ms")

# Per-class breakdown
print("\n" + classification_report(y_test, y_pred_lr, target_names=['Negative', 'Positive']))


In [None]:
# Cell 9: Confusion Matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)

# Draw on an explicit figure/axes pair so the saved figure is unambiguous
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Negative', 'Positive'],
            yticklabels=['Negative', 'Positive'],
            ax=ax)
ax.set_title('Logistic Regression - Confusion Matrix')
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.tight_layout()

# savefig() does not create missing directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails with FileNotFoundError.
cm_path = '../data/lr_confusion_matrix.png'
os.makedirs(os.path.dirname(cm_path), exist_ok=True)
fig.savefig(cm_path, dpi=300, bbox_inches='tight')
plt.show()
print("✅ Confusion matrix saved")

In [None]:
# Cell 10: Train Naive Bayes
print("\n" + "="*50)
print("COMPARISON MODEL: Naive Bayes")
print("="*50)

print("\nTraining Naive Bayes model...")
# Measure elapsed time with perf_counter(): it is a monotonic,
# high-resolution clock meant for timing short durations, whereas
# time.time() follows the (adjustable, possibly coarse) system wall clock.
start_time = time.perf_counter()

# Fit on the same TF-IDF features used for the logistic regression baseline
nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

training_time = time.perf_counter() - start_time
print(f"✅ Training completed in {training_time:.2f} seconds")

In [None]:
# Cell 11: Score the Naive Bayes model on the held-out test split
print("\n--- Naive Bayes Evaluation ---")

y_pred_nb = nb_model.predict(X_test_tfidf)

# Compute the four headline metrics in one pass over the scoring functions
accuracy_nb, precision_nb, recall_nb, f1_nb = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

print(f"\nAccuracy:  {accuracy_nb:.4f} ({accuracy_nb*100:.2f}%)")
print(f"Precision: {precision_nb:.4f}")
print(f"Recall:    {recall_nb:.4f}")
print(f"F1 Score:  {f1_nb:.4f}")

# Per-class breakdown
print(
    "\n"
    + classification_report(
        y_test,
        y_pred_nb,
        target_names=['Negative', 'Positive'],
    )
)