In [1]:
import pandas as pd 

In [2]:
df=pd.read_csv("IMDB Dataset.csv")

In [3]:
print(df.head())
print(df.isnull().sum())
print(df.describe())

                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
review       0
sentiment    0
dtype: int64
                                                   review sentiment
count                                               50000     50000
unique                                              49582         2
top     Loved today's show!!! It was a variety and not...  positive
freq                                                    5     25000


In [4]:
!pip install nltk



In [5]:
import sys
print(sys.executable)

C:\Users\prana\anaconda3\envs\fresh_env\python.exe


In [6]:
!{sys.executable} -m pip install nltk



In [7]:
import re 
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [8]:
#We have to download the nltk resources
#We need stopwords + wordnet lemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\prana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Stopwords removal = remove noisy filler words
# Lemmatization = convert words to their meaningful base form
# Together they make the text cleaner, more meaningful, and easier for TF-IDF/Word2Vec models to learn from.

stop_words=set(stopwords.words('english'))
Lemmatizer=WordNetLemmatizer()

In [10]:
#removing html tag 
def step_remove_html(text):
    return BeautifulSoup(text,"html.parser").get_text()

# Convert to lowercase 
def step_lowercase(text):
    return text.lower()

#Remove punctuations and numbers
def step_remove_punc_num(text):
    return re.sub(r'[^a-zA-Z]',' ',text)

# Tokenize the corpus
def step_tokenize(text):
    return text.split()

#Remove stopwords
def step_remove_stopwords(words):
    return[word for word in words if word not in stop_words]

#Lemmatization
def step_lemmatize(words):
    return [Lemmatizer.lemmatize(word) for word in words]

#Joined back int the full cleaned sentence
def step_join(words):
    return " ".join(words)

In [11]:
#Now we create a clean function that calls all the smaller steps.
def clean_text(text):
    text=step_remove_html(text)
    text=step_lowercase(text)
    text=step_remove_punc_num(text)

    words=step_tokenize(text)
    words=step_remove_stopwords(words)
    words=step_lemmatize(words)

    return step_join(words)

In [12]:
# Apply to the imdb datset we already have

df['clean_review']=df['review'].apply(clean_text)


In [13]:
df[['review','clean_review']].head()


Unnamed: 0,review,clean_review
0,One of the other reviewers has mentioned that ...,one reviewer mentioned watching oz episode hoo...
1,A wonderful little production. <br /><br />The...,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",petter mattei love time money visually stunnin...


In [22]:
# Model A â€” TF-IDF + Logistic Regression
#Firsly we are encoding the labels (positive=1, negative=0)

df['sentiment_label']=df['sentiment'].map({'positive' : 1,'negative': 0})
df[['sentiment','sentiment_label']].head()

Unnamed: 0,sentiment,sentiment_label
0,positive,1
1,positive,1
2,positive,1
3,negative,0
4,positive,1


In [32]:
#train test splitiing
from sklearn.model_selection import train_test_split
x=df['clean_review']
y=df['sentiment_label']

x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [36]:
#TF-IDF Vectorizer
#This is where my cleaned text gets converted into numerical vectors that ML models can understand.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=10000)
x_train_tfidf=tfidf.fit_transform(x_train)
x_test_tfidf=tfidf.transform(x_test)

In [44]:
#TF-IDF vectors are ready, which means we have completed the feature engineering
#Train Logistic Regression on TF-IDF
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
#Create the model
model_tfidf = LogisticRegression(max_iter=200)
# Train the model
model_tfidf.fit(x_train_tfidf, y_train)
# Predict on test set
y_pred = model_tfidf.predict(x_test_tfidf)
#Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8945

Classification Report:
               precision    recall  f1-score   support

           0       0.91      0.88      0.89      4961
           1       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



TF-IDF + Logistic Regression gives a strong baseline accuracy of 89.4% on IMDB reviews.
The model performs consistently across both positive and negative classes, with balanced precision, recall, and F1-scores.
This establishes a reliable reference point for comparing Word2Vec and LSTM models next.

In [51]:
# Train our own Word2Vec model 
#converting our cleaned text into tokenized lists.
sentences = [review.split() for review in df['clean_review']]


In [55]:
#Train Word2Vec Model
!pip install gensim



In [59]:
!{sys.executable} -m pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp39-cp39-win_amd64.whl.metadata (8.6 kB)
Collecting smart_open>=1.8.1 (from gensim)
  Downloading smart_open-7.5.0-py3-none-any.whl.metadata (24 kB)
Downloading gensim-4.4.0-cp39-cp39-win_amd64.whl (24.4 MB)
   ---------------------------------------- 0.0/24.4 MB ? eta -:--:--
   -- ------------------------------------- 1.3/24.4 MB 9.6 MB/s eta 0:00:03
   ----- ---------------------------------- 3.4/24.4 MB 10.1 MB/s eta 0:00:03
   --------- ------------------------------ 5.5/24.4 MB 9.9 MB/s eta 0:00:02
   ------------ --------------------------- 7.9/24.4 MB 10.1 MB/s eta 0:00:02
   --------------- ------------------------ 9.7/24.4 MB 9.7 MB/s eta 0:00:02
   ----------------- ---------------------- 10.5/24.4 MB 9.8 MB/s eta 0:00:02
   ----------------- ---------------------- 10.5/24.4 MB 9.8 MB/s eta 0:00:02
   ----------------- ---------------------- 10.5/24.4 MB 9.8 MB/s eta 0:00:02
   ----------------- ---------------------- 10.5/24.4 M

In [63]:
from gensim.models import Word2Vec
w2v_model = Word2Vec(
    sentences,
    vector_size=100,   # embedding size
    window=5,          # context window
    min_count=2,       # ignore rare words
    workers=4,         # CPU cores
    sg=1               # 1 = skip-gram, 0 = CBOW
)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

In [65]:
#get vector for each word 
import numpy as np

def get_avg_w2v(words, model, vector_size=100):
    vec = np.zeros(vector_size)
    count = 0
    
    for word in words:
        if word in model.wv:
            vec += model.wv[word]
            count += 1
    
    if count != 0:
        vec = vec / count
    
    return vec

In [75]:
#Convert X_train and X_test to lists of tokens
x_train_tokens = [text.split() for text in x_train]
x_test_tokens = [text.split() for text in x_test]

x_train_w2v = np.array([get_avg_w2v(words, w2v_model, 100) for words in x_train_tokens])
x_test_w2v = np.array([get_avg_w2v(words, w2v_model, 100) for words in x_test_tokens])

In [81]:

from sklearn.linear_model import LogisticRegression

model_w2v = LogisticRegression(max_iter=200)
model_w2v.fit(x_train_w2v, y_train)

y_pred_w2v = model_w2v.predict(x_test_w2v)

In [83]:
from sklearn.metrics import accuracy_score, classification_report

print("Accuracy:", accuracy_score(y_test, y_pred_w2v))
print(classification_report(y_test, y_pred_w2v))

Accuracy: 0.8735
              precision    recall  f1-score   support

           0       0.87      0.87      0.87      4961
           1       0.87      0.88      0.87      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000

