In [86]:
import nltk
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
from urllib import request
from nltk import FreqDist
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import (
    precision_score, recall_score, f1_score,
    accuracy_score, confusion_matrix
)
from gensim.models import Word2Vec


## 1. Introduction

The IMDB Dataset is a widely recognized benchmark in natural language processing, primarily used for document classification and sentiment analysis. It comprises a large collection of movie reviews from the Internet Movie Database (IMDB), each labeled as either positive or negative, providing a balanced and well-structured corpus for analyzing text-based sentiment.

This dataset enables the development and evaluation of machine learning models that classify textual content based on emotional tone or opinion. Its diversity in writing style, vocabulary, and sentiment intensity makes it suitable for testing both traditional algorithms, such as Logistic Regression and Naïve Bayes, and advanced deep learning architectures like Recurrent Neural Networks and Transformers.

## 2. Data Preparation

Each movie review was cleaned to remove noise and ensure consistency. The text was converted to lowercase, HTML tags and punctuation were removed, and extra spaces were collapsed.  
The cleaned text was stored in a new column called **clean_review**. Sentiment labels were also converted from **“positive”** and **“negative”** to binary values (**1** and **0**, respectively) to prepare the data for machine learning classification.


In [87]:
movies_df = pd.read_csv("IMDB Dataset.csv")


In [88]:
movies_df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [89]:

def clean_text(text):
    text = text.lower()
    text = re.sub(r"<.*?>", " ", text)       # remove HTML tags
    text = re.sub(r"[^a-z\s]", " ", text)    # remove punctuation/numbers
    text = re.sub(r"\s+", " ", text).strip() # collapse spaces
    return text

movies_df["clean_review"] = movies_df["review"].apply(clean_text)
movies_df["label"] = movies_df["sentiment"].map({"positive": 1, "negative": 0})


In [90]:
movies_df.head()

Unnamed: 0,review,sentiment,clean_review,label
0,One of the other reviewers has mentioned that ...,positive,one of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,positive,a wonderful little production the filming tech...,1
2,I thought this was a wonderful way to spend ti...,positive,i thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,negative,basically there s a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,petter mattei s love in the time of money is a...,1


In [91]:
x_train,x_test, y_train,y_test = train_test_split(
    movies_df["clean_review"],
    movies_df["label"],
    train_size=0.8,
    test_size=0.2, 
    random_state=456,
    stratify=movies_df["label"]
    )

In [92]:
# vectorizer = TfidfVectorizer(max_features=5000,stop_words="english",ngram_range=(1,4))
# v_train_set = vectorizer.fit_transform(x_train)
# v_test_set =  vectorizer.fit_transform(x_test)
x_train_tokens = [text.split() for text in x_train]
x_test_tokens  = [text.split() for text in x_test]

w2v_model = Word2Vec(
    sentences=x_train_tokens,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
    sg=1
)

def document_vector(words):
    words = [w for w in words if w in w2v_model.wv]
    if len(words) == 0:
        return np.zeros(w2v_model.vector_size)
    return np.mean(w2v_model.wv[words], axis=0)

v_train_set = np.vstack([document_vector(words) for words in x_train_tokens])
v_test_set  = np.vstack([document_vector(words) for words in x_test_tokens])

array([[-0.12770022,  0.14132889,  0.00443909, ..., -0.13852431,
         0.02057929, -0.14643483],
       [-0.14634496,  0.16588333,  0.03190178, ..., -0.157649  ,
        -0.02352808, -0.13709491],
       [-0.16994557,  0.15071285,  0.05862762, ..., -0.1477762 ,
         0.01954959, -0.08624883],
       ...,
       [-0.11269318,  0.23411827, -0.02945689, ..., -0.12321492,
         0.08264293, -0.09171139],
       [-0.13498987,  0.1477674 ,  0.02775964, ..., -0.16479829,
        -0.02493664, -0.11860866],
       [-0.13356303,  0.14362587,  0.04662751, ..., -0.14539264,
         0.00798832, -0.08886444]], shape=(10000, 100), dtype=float32)

## 3. Model Development

In [93]:
model_metrics = [
        "Set",
        "Accuracy",
        "Precision",
        "Recall",
        "Sensitivity",
        "Specificity",
        "F1"
        ]

def evaluate_model(y_true, y_pred):
   

    acc = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    cm = confusion_matrix(y_true, y_pred)
    TP, FN, FP, TN = cm[0, 0], cm[0, 1], cm[1, 0], cm[1, 1]
    sensitivity = TP / (TP + FN) if (TP + FN) else 0
    specificity = TN / (TN + FP) if (TN + FP) else 0

    return {
        "Accuracy": acc,
        "Precision": prec,
        "Recall": rec,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "F1": f1
    }
    
def generate_report(model_instance,trainX,trainY,testX,testY):
    y_train_pred = model_instance.predict(trainX)
    y_test_pred = model_instance.predict(testX)
    train_set_metrics = evaluate_model(trainY,y_train_pred)
    test_set_metrics = evaluate_model(testY,y_test_pred)
    train_set_metrics["Set"] = "Training"
    test_set_metrics["Set"] = "Test"
    model_metrics_df = pd.DataFrame(columns=model_metrics,data= [train_set_metrics,test_set_metrics])
    styled_report = model_metrics_df.style.hide(axis="index")
    return model_metrics_df,styled_report
    

### SVM

In [83]:
param_grid = {'C': np.arange(0.001, 1, 0.009)}


svm_model = LinearSVC(random_state=500)

grid = GridSearchCV(svm_model, param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=0)
grid.fit(v_train_set, y_train)
print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", round(grid.best_score_, 3))
svm_model = grid.best_estimator_

svm_df, svm_df_styled = generate_report(svm_model,v_train_set,y_train,v_test_set,y_test)
svm_df_styled

Best Parameters: {'C': np.float64(0.15399999999999997)}
Best CV Accuracy: 0.883


Set,Accuracy,Precision,Recall,Sensitivity,Specificity,F1
Training,0.91735,0.909608,0.9268,0.9079,0.9268,0.918124
Test,0.5759,0.574136,0.5878,0.564,0.5878,0.580887


### Logistic Regression

In [51]:
log_reg = LogisticRegression(max_iter=1000)

param_grid = {'C': np.arange(0.001, 1, 0.005), 'solver': ['liblinear', 'lbfgs']}

grid = GridSearchCV(estimator=log_reg, param_grid=param_grid,cv=5, scoring='accuracy', n_jobs=-1, verbose=0)

grid.fit(v_train_set, y_train)

print("Best Parameters:", grid.best_params_)
print("Best CV Accuracy:", round(grid.best_score_, 3))

logistic_model = grid.best_estimator_

lgreg_df, lgreg_df_styled = generate_report(logistic_model,v_train_set,y_train,v_test_set,y_test)
lgreg_df_styled



Best Parameters: {'C': np.float64(0.986), 'solver': 'liblinear'}
Best CV Accuracy: 0.884


Set,Accuracy,Precision,Recall,Sensitivity,Specificity,F1
Training,0.9115,0.904065,0.9207,0.9023,0.9207,0.912307
Test,0.5764,0.598454,0.4644,0.6884,0.4644,0.522973


### Naive Bayes

In [119]:
naive_model = MultinomialNB()
naive_model.fit(v_train_set,y_train)
naive_df, naive_df_styled = generate_report(naive_model,v_train_set,y_train,v_test_set,y_test)
naive_df_styled

Set,Accuracy,Precision,Recall,Sensitivity,Specificity,F1
Training,0.8639,0.853541,0.87855,0.84925,0.87855,0.865865
Test,0.5804,0.579259,0.5876,0.5732,0.5876,0.5834


### xgboost

In [85]:

xgboost = GradientBoostingClassifier(random_state=500)
param_grid = {'n_estimators':[100,500],'learning_rate':[0.01,0.1,0.2],'max_depth':[2,3,4]}
grid = GridSearchCV(xgboost,param_grid,cv=3,scoring='accuracy',n_jobs=-1,verbose=0)
grid.fit(v_train_set,y_train)
print("Best Params:",grid.best_params_)
print("Best CV Accuracy:",round(grid.best_score_,3))
xgboost = grid.best_estimator_

naxgboost_df, xgboost_df_styled = generate_report(xgboost,v_train_set,y_train,v_test_set,y_test)
xgboost_df_styled

Best Params: {'learning_rate': 0.2, 'max_depth': 4, 'n_estimators': 500}
Best CV Accuracy: 0.867


Set,Accuracy,Precision,Recall,Sensitivity,Specificity,F1
Training,0.961775,0.953588,0.9708,0.95275,0.9708,0.962117
Test,0.5531,0.542096,0.6838,0.4224,0.6838,0.604758


### Random Forest

In [121]:
random_forest = RandomForestClassifier(random_state=500)
random_forest.fit(v_train_set,y_train)
random_forest_df, random_forest_df_styled = generate_report(random_forest,v_train_set,y_train,v_test_set,y_test)
random_forest_df_styled

Set,Accuracy,Precision,Recall,Sensitivity,Specificity,F1
Training,1.0,1.0,1.0,1.0,1.0,1.0
Test,0.5841,0.593299,0.5348,0.6334,0.5348,0.562533
