In [None]:
# Cell 1: Import libraries
from datasets import load_dataset
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import time

# Reaching this line means every import above resolved without raising ImportError.
print("✅ Libraries imported successfully")

In [None]:
# Cell 2: Load IMDb dataset
# Tunable knobs for this cell. Set a sample count to None to keep the full split.
N_TRAIN_SAMPLES = 5000
N_TEST_SAMPLES = 1000
SHUFFLE_SEED = 42  # fixed seed so the same subset is drawn on every run

print("Loading IMDb dataset...")
dataset = load_dataset("imdb")

# Shuffle first so the subset is a random sample rather than the first N rows.
train_data = dataset['train'].shuffle(seed=SHUFFLE_SEED)
test_data = dataset['test'].shuffle(seed=SHUFFLE_SEED)

# Use smaller subset for faster training; the count is clamped to the split
# size so an oversized request cannot index past the end of the split.
if N_TRAIN_SAMPLES is not None:
    train_data = train_data.select(range(min(N_TRAIN_SAMPLES, len(train_data))))
if N_TEST_SAMPLES is not None:
    test_data = test_data.select(range(min(N_TEST_SAMPLES, len(test_data))))

print(f"Training samples: {len(train_data)}")
print(f"Test samples: {len(test_data)}")

In [None]:
# Cell 3: Convert to pandas DataFrames
# Build both frames in one pass, training split first and test split second.
train_df, test_df = (pd.DataFrame(split) for split in (train_data, test_data))

# Peek at the first two training rows
print("\nSample reviews:")
print(train_df.iloc[:2])

In [None]:
# Cell 4: Text preprocessing function
import re
from html import unescape

def preprocess_text(text):
    """Clean a raw review string for vectorization.

    Steps, in order:
      1. Decode HTML entities (e.g. ``&amp;`` becomes ``&``).
      2. Replace HTML tags with a single space.
      3. Remove URLs that start with ``http://``, ``https://`` or ``www.``.
      4. Lowercase the text.
      5. Collapse every run of whitespace into one space and trim the ends.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # Decode HTML entities
    text = unescape(text)
    # Replace tags with a space, not the empty string: reviews separate
    # sentences with "<br /><br />" and deleting the tag outright would fuse
    # the neighbouring words ("great.<br />Loved" -> "great.loved").
    text = re.sub(r'<.*?>', ' ', text)
    # Anchor the URL pattern at a word boundary and require "://" or "www."
    # so ordinary words that merely contain "www" or "http" (e.g. "awwww")
    # are left intact.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Collapse whitespace (including the spaces introduced above)
    return ' '.join(text.split())

# Run the cleaner over the raw review text of both splits
print("\nPreprocessing text...")
train_df['clean_text'] = train_df['text'].map(preprocess_text)
test_df['clean_text'] = test_df['text'].map(preprocess_text)

print("✅ Preprocessing complete")
# Show the first 200 characters of the first training review, raw then cleaned
print("\nBefore preprocessing:")
print(train_df['text'].iat[0][:200])
print("\nAfter preprocessing:")
print(train_df['clean_text'].iat[0][:200])

In [None]:
# Cell 5: Prepare features and labels
# Features are the cleaned review text; targets are the `label` column.
X_train, y_train = train_df['clean_text'], train_df['label']
X_test, y_test = test_df['clean_text'], test_df['label']

print("\n✅ Data split complete")
print(f"Training set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")