In [24]:

# Part 1: Load & Inspect Data

import numpy as np
import pandas as pd
import string
import nltk
from tensorflow.keras.datasets import imdb
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV

# Make sure the NLTK stop word corpus is available locally.
nltk.download('stopwords')

# Keep only the top N most frequent words
TOP_WORDS = 10000
train_split, test_split = imdb.load_data(num_words=TOP_WORDS)
X_train_raw, y_train_raw = train_split
X_test_raw, y_test_raw = test_split

# Quick look at the split sizes and the set of labels present.
print("Train size:", len(X_train_raw))
print("Test size:", len(X_test_raw))
print("Labels:", np.unique(y_train_raw))

# Reverse dictionary for decoding: every word rank is shifted up by 3,
# which leaves ids 0-2 free for the marker tokens registered below.
word_to_index = imdb.get_word_index()
index_to_word = dict(
    (rank + 3, word) for word, rank in word_to_index.items()
)
index_to_word.update({0: "<PAD>", 1: "<START>", 2: "<UNK>"})

def decode_review(encoded):
    """Turn a sequence of integer word ids into a space-separated string.

    Each id is looked up in the module-level ``index_to_word`` mapping; ids
    that are missing from it are rendered as "?".
    """
    lookup = index_to_word.get
    return " ".join(lookup(token_id, "?") for token_id in encoded)

# Show an example: decode the first training review and print it with its label
sample_text = decode_review(X_train_raw[0])
sample_label = y_train_raw[0]
print("\nSample review:\n", sample_text)
print("Label:", sample_label)


# Part 1 (continued): Preprocessing Function


# English stop words from NLTK, held in a set for fast membership tests.
stop_words_set = set(stopwords.words("english"))

# Deletes every ASCII punctuation character. Built once at import time
# instead of on every clean_text() call.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)

# Lower-cased forms of the marker tokens that decode_review() emits. They
# are not words of the review and must not reach the vectorizer.
_MARKER_TOKENS = frozenset({"<pad>", "<start>", "<unk>"})


def clean_text(text):
    """Normalize one review for bag-of-words feature extraction.

    The text is lower-cased and split on whitespace. A token is dropped when

    * it is one of the ``<PAD>``/``<START>``/``<UNK>`` markers. Stripping
      punctuation from them would otherwise leave the ordinary-looking
      words "pad", "start" and "unk" in every document;
    * it is a stop word, either as written or once its punctuation has been
      removed. Checking the raw form first matters because stop-word
      entries that contain an apostrophe (e.g. "don't") no longer match
      after the apostrophe is stripped;
    * nothing is left of it after punctuation removal.

    Args:
        text: The review as a single string.

    Returns:
        The surviving tokens, punctuation-free, joined by single spaces.
        An empty string if no token survives.
    """
    kept = []
    for raw in text.lower().split():
        if raw in _MARKER_TOKENS or raw in stop_words_set:
            continue
        word = raw.translate(_PUNCT_TABLE)
        # ``word`` is empty when the token consisted only of punctuation.
        if word and word not in stop_words_set:
            kept.append(word)
    return " ".join(kept)

# Decode and clean reviews: each split goes id sequence -> raw text -> cleaned text
X_train_text, X_test_text = (
    [clean_text(decode_review(review)) for review in split]
    for split in (X_train_raw, X_test_raw)
)


# Part 2: Feature Extraction


# TF-IDF features with unigrams & bigrams, capped at 5000 features.
# The vectorizer is fitted on the training text only; the test text is
# transformed with that fitted vocabulary.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_features = tfidf.fit_transform(X_train_text)
X_test_features = tfidf.transform(X_test_text)

for caption, matrix in (
    ("TF-IDF shape (train):", X_train_features),
    ("TF-IDF shape (test):", X_test_features),
):
    print(caption, matrix.shape)


# Part 3: Model Training & Eval


# Candidate classifiers, all trained on the same TF-IDF matrix.
models = dict([
    ("Logistic Regression", LogisticRegression(max_iter=200)),
    ("Naive Bayes", MultinomialNB()),
    ("SVM", LinearSVC()),
])

# The order here fixes the layout of every results tuple:
# (accuracy, precision, recall, f1).
scorers = (accuracy_score, precision_score, recall_score, f1_score)

results = {}
for name, clf in models.items():
    clf.fit(X_train_features, y_train_raw)
    preds = clf.predict(X_test_features)
    results[name] = tuple(score(y_test_raw, preds) for score in scorers)

    # Per-model detail report.
    print(f"\n=== {name} ===")
    print(classification_report(y_test_raw, preds))
    print("Confusion Matrix:\n", confusion_matrix(y_test_raw, preds))

# Show comparison: one summary line per model
print("\nModel Performance Summary:")
for model_name, metrics in results.items():
    print(f"{model_name}: Acc={metrics[0]:.3f}, Prec={metrics[1]:.3f}, Rec={metrics[2]:.3f}, F1={metrics[3]:.3f}")


# Part 4: Pipeline + Tuning


# Two-step pipeline: TF-IDF vectorization followed by logistic regression.
# It works on the cleaned text directly, so the vectorizer is re-fitted
# inside every cross-validation fold.
text_vectorizer = TfidfVectorizer(stop_words="english")
text_classifier = LogisticRegression(max_iter=300)
pipe = Pipeline(steps=[
    ("vectorizer", text_vectorizer),
    ("clf", text_classifier),
])

# Keys follow the "<step name>__<parameter>" convention of the pipeline.
param_grid = {
    "vectorizer__max_features": [3000, 5000],
    "vectorizer__ngram_range": [(1, 1), (1, 2)],
    "clf__C": [0.5, 1, 2]
}

# Exhaustive search over the grid, 3-fold CV, scored by accuracy.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring="accuracy",
    cv=3,
    n_jobs=-1,
)
grid.fit(X_train_text, y_train_raw)

print("\nBest Pipeline Parameters:", grid.best_params_)
print("Best CV Accuracy:", grid.best_score_)


# Part 5: Inference


# Best pipeline found by the grid search.
final_model = grid.best_estimator_


def _sentiment(label):
    """Return the display name of a binary label: 1 is Positive, else Negative."""
    return "Positive" if label == 1 else "Negative"


# Spot-check the first five test reviews.
test_examples = X_test_text[:5]
pred_labels = final_model.predict(test_examples)

for i, (review, predicted) in enumerate(zip(test_examples, pred_labels)):
    print(f"\nReview {i+1}: {review[:150]}...")
    print("Predicted:", _sentiment(predicted))
    print("Actual:", _sentiment(y_test_raw[i]))

# Final accuracy on test set
test_predictions = final_model.predict(X_test_text)
final_acc = accuracy_score(y_test_raw, test_predictions)
print("\nFinal Test Accuracy:", final_acc)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Train size: 25000
Test size: 25000
Labels: [0 1]

Sample review:
 <START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up are such a big profile for the whole