In [5]:
!pip install optuna
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
import optuna
from collections import Counter
import warnings



In [8]:
# 1. Load dataset
df = pd.read_csv("/content/IMDB Dataset.csv")

In [9]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [10]:
# 2. Basic cleaning and stopword filtering
def basic_clean(text):
    text = re.sub(r"<.*?>", "", text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text.lower()

df['clean_text'] = df['review'].apply(basic_clean)

# Top 50 most common words as custom stopwords
all_words = ' '.join(df['clean_text']).split()
custom_stopwords = set([word for word, freq in Counter(all_words).most_common(50)])

def preprocess(text):
    text = basic_clean(text)
    tokens = [w for w in text.split() if w not in custom_stopwords]
    return ' '.join(tokens)

df['final_text'] = df['review'].apply(preprocess)

In [11]:

# 3. Filter noisy reviews
def is_noisy(review):
    words = review.split()
    if len(words) < 5:
        return True
    freq = Counter(words)
    return any(v / len(words) > 0.3 for v in freq.values())

df['is_noisy'] = df['final_text'].apply(is_noisy)
df = df[~df['is_noisy']].copy()

In [12]:
# 4. TF-IDF with unigrams + bigrams
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
X = tfidf_vectorizer.fit_transform(df['final_text'])
y = np.where(df['sentiment'] == 'positive', 1, 0)

In [13]:
# 5. Feature selection using L1 Logistic Regression
lr_l1 = LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000)
lr_l1.fit(X, y)
coefs = np.abs(lr_l1.coef_[0])
top_1000_idx = coefs.argsort()[::-1][:1000]
X_top = X[:, top_1000_idx]

In [16]:
# 6. Bayesian optimization for XGBoost
import warnings
def objective(trial):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'eval_metric': 'logloss'
    }
    xgb = XGBClassifier(**params)
    return cross_val_score(xgb, X_top, y, scoring='f1', cv=3).mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
best_params = study.best_trial.params

[I 2025-06-01 08:21:58,722] A new study created in memory with name: no-name-840b02d0-a995-489b-9578-6639a7a0faaa
[I 2025-06-01 08:22:49,487] Trial 0 finished with value: 0.8193090506432651 and parameters: {'learning_rate': 0.05667826440273822, 'max_depth': 3, 'n_estimators': 192, 'subsample': 0.7854389474270187, 'colsample_bytree': 0.9722356036229217}. Best is trial 0 with value: 0.8193090506432651.
[I 2025-06-01 08:23:00,381] Trial 1 finished with value: 0.825885166303714 and parameters: {'learning_rate': 0.24283095846919997, 'max_depth': 3, 'n_estimators': 58, 'subsample': 0.7832572141949695, 'colsample_bytree': 0.5091783678739623}. Best is trial 1 with value: 0.825885166303714.
[I 2025-06-01 08:24:03,721] Trial 2 finished with value: 0.8617979181886004 and parameters: {'learning_rate': 0.22063446183633392, 'max_depth': 6, 'n_estimators': 132, 'subsample': 0.79693568411685, 'colsample_bytree': 0.5239436126233898}. Best is trial 2 with value: 0.8617979181886004.
[I 2025-06-01 08:28:1

In [17]:
# 7. Create final models
xgb = XGBClassifier(**best_params, use_label_encoder=False, eval_metric='logloss')
lr = LogisticRegression(max_iter=1000)
svm = LinearSVC()

In [18]:
# 8. Ensemble with hard voting
voting = VotingClassifier(estimators=[('lr', lr), ('xgb', xgb), ('svm', svm)], voting='hard')

In [19]:
# 9. Stratified 5-Fold CV
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
f1s = []

for train_idx, test_idx in skf.split(X_top, y):
    X_train, X_test = X_top[train_idx], X_top[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
    voting.fit(X_train, y_train)
    preds = voting.predict(X_test)
    f1s.append(f1_score(y_test, preds))

print(f"Mean F1 Score (5-fold): {np.mean(f1s):.4f}")

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Mean F1 Score (5-fold): 0.8986


In [20]:
# 10. Visualize top 5 misclassified reviews with top contributing words
y_pred_final = voting.predict(X_top)
misclassified_idx = np.where(y_pred_final != y)[0][:5]

feature_names = np.array(tfidf_vectorizer.get_feature_names_out())
coefs_lr = lr_l1.coef_[0]

print("\n--- Top 5 Misclassified Reviews with Contributing Words ---")
for idx in misclassified_idx:
    original_review = df.iloc[idx]['review']
    actual = 'positive' if y[idx] == 1 else 'negative'
    predicted = 'positive' if y_pred_final[idx] == 1 else 'negative'

    print(f"\nReview #{idx}: Actual: {actual} | Predicted: {predicted}")
    print(original_review[:500] + ("..." if len(original_review) > 500 else ""))

    vec = X_top[idx].toarray().flatten()
    word_scores = vec * coefs[top_1000_idx]
    top_indices = word_scores.argsort()[::-1][:5]
    for i in top_indices:
        print(f"  {feature_names[top_1000_idx[i]]} → weight: {word_scores[i]:.4f}")


--- Top 5 Misclassified Reviews with Contributing Words ---

Review #11: Actual: negative | Predicted: positive
I saw this movie when I was about 12 when it came out. I recall the scariest scene was the big bird eating men dangling helplessly from parachutes right out of the air. The horror. The horror.<br /><br />As a young kid going to these cheesy B films on Saturday afternoons, I still was tired of the formula for these monster type movies that usually included the hero, a beautiful woman who might be the daughter of a professor and a happy resolution when the monster died in the end. I didn't care mu...
  loved → weight: 0.6007
  predictable → weight: 0.5225
  unintentional → weight: 0.4472
  beautiful → weight: 0.2937
  still → weight: 0.2614

Review #13: Actual: negative | Predicted: positive
The cast played Shakespeare.<br /><br />Shakespeare lost.<br /><br />I appreciate that this is trying to bring Shakespeare to the masses, but why ruin something so good.<br /><br />Is it b