In [1]:
import pandas as pd
import matplotlib.pyplot as plt

# data prep
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# nltk.download("stopwords")
# nltk.download("punkt")
# nltk.download("wordnet")

# model training
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report, precision_score, recall_score

In [2]:
df = pd.read_csv('IMDB Dataset.csv')

In [3]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
# checking for missing values
df.isna().sum()

review       0
sentiment    0
dtype: int64

In [5]:
# checking the distribution of sentiment class
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

## Basic Text Statistics (Word Count & Sentence Length)

In [6]:
df["word_count"] = df["review"].apply(lambda x: len(str(x).split()))
df["char_count"] = df["review"].apply(lambda x: len(str(x)))
df["sentence_count"] = df["review"].apply(lambda x: len(str(x).split(".")))

print("Basic Text Statistics:\n")
df[["word_count", "char_count", "sentence_count"]].describe()

Basic Text Statistics:



Unnamed: 0,word_count,char_count,sentence_count
count,50000.0,50000.0,50000.0
mean,231.15694,1309.43102,14.0104
std,171.343997,989.728014,9.890968
min,4.0,32.0,1.0
25%,126.0,699.0,8.0
50%,173.0,970.0,11.0
75%,280.0,1590.25,17.0
max,2470.0,13704.0,176.0


In [7]:
total_words = df["word_count"].sum()
print(f"Total words in dataset: {total_words}")

Total words in dataset: 11557847


## "review" column preprocessing pipeline
this process will include:
* Converting to lowercase
* Removing special characters, punctuation, and numbers
* Tokenizing
* Removing stopwords
* Lemmatizing

P.S. Tokenizing means breaking text into smaller parts, usually words or sentences, to make it easier to analyze. <br>Lemmatizing reduces words to their base or dictionary form (e.g., running → run), helping models understand different variations of the same word. Both techniques improve text processing by making data cleaner and more structured for machine learning.

In [8]:
from bs4 import BeautifulSoup

def clean_text(text):
    # removing HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()
    return text

df["review"] = df["review"].apply(clean_text)

In [9]:
# defining stopwords, lemmatizer
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# cleaning function
def preprocess_text(text):
    # converting to lowercase
    text = text.lower()
    
    # removing special characters, numbers, and punctuation
    text = re.sub(r"[^a-z\s]", "", text)
    
    # tokenizing the text
    tokens = word_tokenize(text)
    
    # removing stopwords and apply lemmatization
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # joining tokens back into a string
    return " ".join(tokens)

In [10]:
df["cleaned_review"] = df["review"].apply(preprocess_text)
df[["review", "cleaned_review"]].head()

Unnamed: 0,review,cleaned_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode you...
1,A wonderful little production. The filming tec...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically there family little boy jake think t...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter matteis love time money visually stunni...


### Check the prep results

In [11]:
df["original_length"] = df["review"].apply(len)
df["cleaned_length"] = df["cleaned_review"].apply(len)

# length statistics comparison
df[["original_length", "cleaned_length"]].describe()

Unnamed: 0,original_length,cleaned_length
count,50000.0,50000.0
mean,1285.19024,819.54246
std,971.155366,632.153317
min,32.0,17.0
25%,689.0,429.0
50%,953.0,603.0
75%,1559.0,998.0
max,13584.0,9182.0


In [12]:
from collections import Counter

# getting most common words in original and cleaned text
original_words = Counter(" ".join(df["review"]).split()).most_common(20)
cleaned_words = Counter(" ".join(df["cleaned_review"]).split()).most_common(20)

print("Top 20 words in Original Reviews:\n", original_words)
print("\nTop 20 words in Cleaned Reviews:\n", cleaned_words)

Top 20 words in Original Reviews:
 [('the', 568758), ('a', 306961), ('and', 301931), ('of', 283626), ('to', 261851), ('is', 203056), ('in', 169983), ('I', 133367), ('that', 126818), ('this', 113733), ('it', 107920), ('was', 92658), ('as', 83132), ('with', 82569), ('for', 80920), ('The', 68906), ('but', 66286), ('on', 61197), ('movie', 60762), ('are', 56513)]

Top 20 words in Cleaned Reviews:
 [('movie', 99026), ('film', 89809), ('one', 52677), ('like', 39790), ('time', 29397), ('good', 28615), ('character', 27573), ('get', 24435), ('even', 24286), ('story', 24229), ('would', 24001), ('make', 23565), ('see', 23494), ('really', 22900), ('scene', 20706), ('much', 18897), ('well', 18629), ('people', 17979), ('great', 17803), ('bad', 17673)]


In [13]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# wordcloud_original = WordCloud(width=800, height=400, background_color="white").generate(" ".join(df["review"]))
# wordcloud_cleaned = WordCloud(width=800, height=400, background_color="white").generate(" ".join(df["cleaned_review"]))

# # plotting side-by-side comparison
# fig, ax = plt.subplots(1, 2, figsize=(15, 6))

# ax[0].imshow(wordcloud_original, interpolation="bilinear")
# ax[0].set_title("Original Reviews Word Cloud")
# ax[0].axis("off")

# ax[1].imshow(wordcloud_cleaned, interpolation="bilinear")
# ax[1].set_title("Cleaned Reviews Word Cloud")
# ax[1].axis("off")

# plt.show()

## Train a Model
Simple Naive Bayes / Logistic Regression Model

We use Naive Bayes because it's a simple, fast, and effective algorithm for text classification, especially when features (words) are independent, which is often assumed in NLP tasks. Logistic Regression is used because it's a strong baseline model for binary classification (positive vs. negative), performs well with high-dimensional text data, and outputs interpretable probabilities. Both models work well with small to medium-sized datasets and are computationally efficient compared to deep learning methods.

In [14]:
import time
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import pipeline

# ✅ Convert sentiment labels to binary (0: Negative, 1: Positive)
df["sentiment"] = df["sentiment"].map({"negative": 0, "positive": 1})

# ✅ Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(df["review"], df["sentiment"], test_size=0.2, random_state=42)

# ✅ TF-IDF vectorization (removes stopwords & keeps top 5,000 words)
vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)  # Learn vocabulary & transform train data
X_test_tfidf = vectorizer.transform(X_test)  # Transform test data using same vocabulary

In [15]:
# verifying that the vectorizer has learned something from the training data
# print(vectorizer.vocabulary_)

Even if we already removed stopwords using NLTK, TF-IDF is still useful for several reasons:

* **Weighting Important Words** – TF-IDF assigns higher importance to words that appear frequently in a document but not in all documents, making it better at distinguishing meaningful terms.

* **Handling Word Frequency Differences** – Some words (even after stopword removal) appear very frequently but might not be important. TF-IDF helps reduce their impact by giving lower weights to overly common words.

* **Feature Representation for ML Models** – Machine learning models can't work with raw text, so we need numerical features. TF-IDF converts text into a sparse matrix of weighted word frequencies, which is more informative than simple word counts.

## End-to-End Model Training, Evaluation & Timing

In [16]:
# function to evaluate model performance and measure runtime
def evaluate_model_with_time(model_name, y_true, y_pred, start_time):
    end_time = time.time()
    runtime_minutes = (end_time - start_time) / 60  # Convert to minutes
    
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred),
        "Runtime (min)": round(runtime_minutes, 2)
    }

# selecting a random 3,000-sample test subset for fair comparison
sample_indices = np.random.choice(len(X_test), 5000, replace=False)
X_test_sample = [X_test.iloc[i] for i in sample_indices]
y_test_sample = [y_test.iloc[i] for i in sample_indices]

# training & evaluating Naive Bayes ###
nb_model = MultinomialNB()
start_time = time.time()
nb_model.fit(X_train_tfidf, y_train)
nb_preds_sample = nb_model.predict(vectorizer.transform(X_test_sample))
nb_results_sample = evaluate_model_with_time("Naive Bayes (5K)", y_test_sample, nb_preds_sample, start_time)

# training & evaluating Logistic Regression ###
lr_model = LogisticRegression(max_iter=500)
start_time = time.time()
lr_model.fit(X_train_tfidf, y_train)
lr_preds_sample = lr_model.predict(vectorizer.transform(X_test_sample))
lr_results_sample = evaluate_model_with_time("Logistic Regression (5K)", y_test_sample, lr_preds_sample, start_time)

# loading & evaluating DistilBERT ###
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    tokenizer="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    truncation=True
)
start_time = time.time()
bert_preds_sample = classifier(X_test_sample)
bert_preds_binary_sample = [1 if pred['label'] == 'POSITIVE' else 0 for pred in bert_preds_sample]
bert_results_sample = evaluate_model_with_time("DistilBERT (5K)", y_test_sample, bert_preds_binary_sample, start_time)

df_results_sample = pd.DataFrame([nb_results_sample, lr_results_sample, bert_results_sample])
df_results_sample

Device set to use mps:0


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Runtime (min)
0,Naive Bayes (5K),0.8548,0.857654,0.854257,0.855952,0.01
1,Logistic Regression (5K),0.8894,0.876336,0.909307,0.892517,0.02
2,DistilBERT (5K),0.886,0.909167,0.860198,0.884005,49.0


1. Naive Bayes (5K)
* **Accuracy**: 0.8548: This is a solid performance, indicating that Naive Bayes is performing reasonably well with 5,000 reviews.
* **Precision**: 0.8577: Precision is slightly better than recall, meaning that when Naive Bayes predicts a review as positive, it's fairly accurate.
* **Recall**: 0.8543: Recall is very close to precision, which means the model is catching most of the positive reviews (though not all).
* **F1-Score**: 0.8560: This is quite a good F1 score, balancing precision and recall well.
* **Runtime**: 0.01 minutes: Fast! Naive Bayes is very efficient, even with 5,000 reviews.
2. Logistic Regression (5K)
* **Accuracy**: 0.8894: Logistic Regression continues to show strong performance, with an accuracy of 88.94%, a bit higher than Naive Bayes.
* **Precision**: 0.8763: Precision is slightly lower than Naive Bayes, meaning there are slightly more false positives.
* **Recall**: 0.9093: Recall is better than Naive Bayes, meaning Logistic Regression is good at identifying positive reviews.
* **F1-Score**: 0.8925: The higher F1 score suggests that Logistic Regression balances precision and recall better than Naive Bayes.
* **Runtime**: 0.02 minutes: Still quick and efficient for 5,000 samples, but it does take a bit longer than Naive Bayes.
3. DistilBERT (5K)
* **Accuracy**: 0.8860: DistilBERT's accuracy is competitive with Logistic Regression, but lower than its performance on the 1K sample.
* **Precision**: 0.9092: DistilBERT achieves the highest precision, meaning it's very good at predicting positive reviews when it does make that prediction.
* **Recall**: 0.8602: The recall is lower than its precision, meaning it's not catching as many of the actual positive reviews, which could lead to more false negatives.
* **F1-Score**: 0.8840: A decent F1 score, though not the best among the models, showing that its precision is pulling the overall performance up.
* **Runtime**: 49.00 minutes: As expected, the runtime is significantly longer. DistilBERT benefits from a powerful transformer model but is much slower compared to the traditional machine learning models.

**Comparison and Key Takeaways:**
Naive Bayes is still the fastest but performs slightly worse in accuracy and other metrics compared to Logistic Regression.
Logistic Regression shows a strong balance of precision, recall, and F1-score, making it a reliable choice for this task.
DistilBERT, while being competitive in terms of precision, still lags behind in recall and has a significantly longer runtime. It could be more effective with fine-tuning, but for now, it's slower than the other models.