In [2]:
%pip install nltk
%pip install scikeras
%pip install tensorflow

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2024.11.6-cp312-cp312-win_amd64.whl.metadata (41 kB)
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/41.5 kB ? eta -:--:--
     ---------------------------------------- 41.5/41.5 kB ? eta 0:00:00
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.5 MB 1.6 MB/s eta 0:00:01
   -- ------------------------------------- 0.1/1.5 MB 1.1 MB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.5 MB 930.9 kB/s eta 0:00:02
   --- ------------------------------------ 0.1/1.5 MB 654.9 kB/s eta 0:00:03
   ---- -------------------


[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Collecting keras>=3.2.0 (from scikeras)
  Downloading keras-3.8.0-py3-none-any.whl.metadata (5.8 kB)
Collecting absl-py (from keras>=3.2.0->scikeras)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting namex (from keras>=3.2.0->scikeras)
  Downloading namex-0.0.8-py3-none-any.whl.metadata (246 bytes)
Collecting h5py (from keras>=3.2.0->scikeras)
  Downloading h5py-3.12.1-cp312-cp312-win_amd64.whl.metadata (2.5 kB)
Collecting optree (from keras>=3.2.0->scikeras)
  Downloading optree-0.14.0-cp312-cp312-win_amd64.whl.metadata (48 kB)
     ---------------------------------------- 0.0/48.6 kB ? eta -:--:--
     -------- ------------------------------- 10.2/48.6 kB ? eta -:--:--
     ------------------------ ------------- 30.7/48.6 kB 435.7 kB/s eta 0:00:01
     -------------------------------------- 48.6/48.6 kB 4


[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting tensorflow
  Downloading tensorflow-2.18.0-cp312-cp312-win_amd64.whl.metadata (3.3 kB)
Collecting tensorflow-intel==2.18.0 (from tensorflow)
  Downloading tensorflow_intel-2.18.0-cp312-cp312-win_amd64.whl.metadata (4.9 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading flatbuffers-25.1.24-py2.py3-none-any.whl.metadata (875 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow-intel==2.18.0->tensorflow)
  Downloading libclang-18.1.1


[notice] A new release of pip is available: 24.0 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
# ===============================
# Step 1: Setup and Library Imports
# ===============================
import nltk
import string, re, random
import numpy as np
import pandas as pd

# Download required NLTK resources
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from nltk.corpus import movie_reviews, stopwords
from nltk.stem import WordNetLemmatizer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# For the neural network
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# ===============================
# Step 2: Data Collection and Cleaning
# ===============================
# Load the movie reviews dataset from NLTK.
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

# Combine word tokens back to a string and create labels (1 for positive, 0 for negative)
texts = [' '.join(words) for words, label in documents]
labels = [1 if label == 'pos' else 0 for words, label in documents]

# ===============================
# Step 3: Advanced Preprocessing
# ===============================
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """
    Cleans input text by:
      - Converting to lowercase.
      - Removing digits.
      - Removing punctuation.
      - Removing extra whitespace.
      - Tokenizing, removing stop words, and lemmatizing.
    """
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Clean all texts
clean_texts = [clean_text(text) for text in texts]

# ===============================
# Step 4: Train/Test Split
# ===============================
X_train, X_test, y_train, y_test = train_test_split(clean_texts, labels, test_size=0.2, random_state=42)

# ===============================
# Step 5: Define and Evaluate Multiple Models
# ===============================

# We will store evaluation results here.
results = []

##############################
# Model 1: Logistic Regression
##############################
pipeline_lr = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)),
    ('clf', LogisticRegression(max_iter=1000, random_state=42))
])
param_grid_lr = {
    'tfidf__max_features': [5000, 10000],
    'clf__C': [0.1, 1, 10]
}
grid_lr = GridSearchCV(pipeline_lr, param_grid_lr, cv=5, scoring='accuracy', n_jobs=-1)
grid_lr.fit(X_train, y_train)
y_pred_lr = grid_lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
prec_lr = precision_score(y_test, y_pred_lr)
rec_lr = recall_score(y_test, y_pred_lr)
f1_lr = f1_score(y_test, y_pred_lr)

results.append({
    'Model': 'Logistic Regression',
    'Accuracy': acc_lr,
    'Precision': prec_lr,
    'Recall': rec_lr,
    'F1': f1_lr,
    'Best Params': grid_lr.best_params_
})
print("Logistic Regression Best Params:", grid_lr.best_params_)

##############################
# Model 2: Support Vector Machine
##############################
pipeline_svm = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)),
    ('clf', SVC(probability=True, random_state=42))
])
param_grid_svm = {
    'tfidf__max_features': [5000, 10000],
    'clf__C': [0.1, 1, 10],
    'clf__kernel': ['linear', 'rbf']
}
grid_svm = GridSearchCV(pipeline_svm, param_grid_svm, cv=5, scoring='accuracy', n_jobs=-1)
grid_svm.fit(X_train, y_train)
y_pred_svm = grid_svm.predict(X_test)

acc_svm = accuracy_score(y_test, y_pred_svm)
prec_svm = precision_score(y_test, y_pred_svm)
rec_svm = recall_score(y_test, y_pred_svm)
f1_svm = f1_score(y_test, y_pred_svm)

results.append({
    'Model': 'SVM',
    'Accuracy': acc_svm,
    'Precision': prec_svm,
    'Recall': rec_svm,
    'F1': f1_svm,
    'Best Params': grid_svm.best_params_
})
print("SVM Best Params:", grid_svm.best_params_)

##############################
# Model 3: Multinomial Naive Bayes
##############################
pipeline_nb = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9)),
    ('clf', MultinomialNB())
])
param_grid_nb = {
    'tfidf__max_features': [5000, 10000],
    'clf__alpha': [0.5, 1.0, 1.5]
}
grid_nb = GridSearchCV(pipeline_nb, param_grid_nb, cv=5, scoring='accuracy', n_jobs=-1)
grid_nb.fit(X_train, y_train)
y_pred_nb = grid_nb.predict(X_test)

acc_nb = accuracy_score(y_test, y_pred_nb)
prec_nb = precision_score(y_test, y_pred_nb)
rec_nb = recall_score(y_test, y_pred_nb)
f1_nb = f1_score(y_test, y_pred_nb)

results.append({
    'Model': 'Multinomial NB',
    'Accuracy': acc_nb,
    'Precision': prec_nb,
    'Recall': rec_nb,
    'F1': f1_nb,
    'Best Params': grid_nb.best_params_
})
print("Multinomial NB Best Params:", grid_nb.best_params_)

##############################
# Model 4: Voting Ensemble
##############################
# Build individual classifiers with fixed (reasonably chosen) hyperparameters.
lr_fixed = LogisticRegression(C=1, max_iter=1000, random_state=42)
svm_fixed = SVC(C=1, kernel='linear', probability=True, random_state=42)
nb_fixed = MultinomialNB(alpha=1.0)

ensemble = VotingClassifier(estimators=[
    ('lr', lr_fixed),
    ('svm', svm_fixed),
    ('nb', nb_fixed)
], voting='soft')

pipeline_ensemble = Pipeline([
    ('tfidf', TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, max_features=10000)),
    ('clf', ensemble)
])
pipeline_ensemble.fit(X_train, y_train)
y_pred_ens = pipeline_ensemble.predict(X_test)

acc_ens = accuracy_score(y_test, y_pred_ens)
prec_ens = precision_score(y_test, y_pred_ens)
rec_ens = recall_score(y_test, y_pred_ens)
f1_ens = f1_score(y_test, y_pred_ens)

results.append({
    'Model': 'Voting Ensemble',
    'Accuracy': acc_ens,
    'Precision': prec_ens,
    'Recall': rec_ens,
    'F1': f1_ens,
    'Best Params': 'Fixed ensemble parameters'
})

##############################
# Model 5: Neural Network (Deep Learning)
##############################
# For the NN, we use a fixed TfidfVectorizer so we know the input dimension.
nn_vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=5, max_df=0.9, max_features=5000)
X_train_tfidf_nn = nn_vectorizer.fit_transform(X_train)
X_test_tfidf_nn = nn_vectorizer.transform(X_test)
input_dim = X_train_tfidf_nn.shape[1]

# Convert the sparse matrix to a dense array.
X_train_nn = X_train_tfidf_nn.toarray()
X_test_nn = X_test_tfidf_nn.toarray()

def build_nn_model():
    model = Sequential()
    model.add(Dense(128, activation='relu', input_dim=input_dim))
    model.add(Dropout(0.5))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model for use with scikit-learn.
nn_classifier = KerasClassifier(build_fn=build_nn_model, epochs=10, batch_size=32, verbose=0)
nn_classifier.fit(X_train_nn, np.array(y_train))
y_pred_nn_prob = nn_classifier.predict(X_test_nn)
# KerasClassifier returns predictions as floats; convert to int labels (0 or 1)
y_pred_nn = (y_pred_nn_prob > 0.5).astype(int)

acc_nn = accuracy_score(y_test, y_pred_nn)
prec_nn = precision_score(y_test, y_pred_nn)
rec_nn = recall_score(y_test, y_pred_nn)
f1_nn = f1_score(y_test, y_pred_nn)

results.append({
    'Model': 'Neural Network',
    'Accuracy': acc_nn,
    'Precision': prec_nn,
    'Recall': rec_nn,
    'F1': f1_nn,
    'Best Params': 'Fixed NN architecture, epochs=10, batch_size=32'
})

# ===============================
# Step 6: Results Table and Best Model Selection
# ===============================
results_df = pd.DataFrame(results)
results_df = results_df[['Model', 'Accuracy', 'Precision', 'Recall', 'F1', 'Best Params']]
results_df = results_df.sort_values(by='Accuracy', ascending=False)
print("\n\n=== Comparison of Models ===")
print(results_df.to_string(index=False))

# Determine the best model by highest accuracy.
best_model_name = results_df.iloc[0]['Model']
print("\nBest performing model:", best_model_name)

# Print the full classification report for the best model
print("\n=== Detailed Classification Report ===")
if best_model_name == 'Logistic Regression':
    best_preds = y_pred_lr
elif best_model_name == 'SVM':
    best_preds = y_pred_svm
elif best_model_name == 'Multinomial NB':
    best_preds = y_pred_nb
elif best_model_name == 'Voting Ensemble':
    best_preds = y_pred_ens
elif best_model_name == 'Neural Network':
    best_preds = y_pred_nn
else:
    best_preds = None

print(classification_report(y_test, best_preds))


[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sayan\AppData\Roaming\nltk_data...


Logistic Regression Best Params: {'clf__C': 10, 'tfidf__max_features': 10000}
SVM Best Params: {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__max_features': 5000}
Multinomial NB Best Params: {'clf__alpha': 0.5, 'tfidf__max_features': 10000}


  X, y = self._initialize(X, y)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)




=== Comparison of Models ===
              Model  Accuracy  Precision   Recall       F1                                                         Best Params
Logistic Regression    0.8625   0.846512 0.892157 0.868735                        {'clf__C': 10, 'tfidf__max_features': 10000}
    Voting Ensemble    0.8550   0.847619 0.872549 0.859903                                           Fixed ensemble parameters
                SVM    0.8475   0.832558 0.877451 0.854415 {'clf__C': 1, 'clf__kernel': 'linear', 'tfidf__max_features': 5000}
     Neural Network    0.8375   0.826291 0.862745 0.844125                     Fixed NN architecture, epochs=10, batch_size=32
     Multinomial NB    0.8350   0.855670 0.813725 0.834171                   {'clf__alpha': 0.5, 'tfidf__max_features': 10000}

Best performing model: Logistic Regression

=== Detailed Classification Report ===
              precision    recall  f1-score   support

           0       0.88      0.83      0.86       196
           1  