In [19]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/prabhjeet1/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
df = pd.read_csv(
    "/Users/prabhjeet1/Desktop/PORTFOLIO/GenAi_For_Developers_FreeCodeCamp/data/IMDB Dataset.csv"
)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [21]:
df.shape

(50000, 2)

In [22]:
df = df.iloc[:15000]
df.shape

(15000, 2)

In [23]:
df.columns

Index(['review', 'sentiment'], dtype='object')

In [24]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [25]:
df.duplicated().sum()

39

In [26]:
df.drop_duplicates(inplace=True)
df.shape

(14961, 2)

# Basic PreProcessing
- Remove HTML Tags
- LowerCase
- Remove StopWords

In [27]:
def remove_html_tags(text):
    pattern = re.compile("<.*?>")
    return pattern.sub("", text)

In [28]:
df["review"] = df["review"].apply(remove_html_tags)

In [29]:
df["review"] = df["review"].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. the filming tec...,positive
2,i thought this was a wonderful way to spend ti...,positive
3,basically there's a family where a little boy ...,negative
4,"petter mattei's ""love in the time of money"" is...",positive


In [30]:
def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word not in stopwords.words("english"):
            new_text.append(word)
    return " ".join(new_text)

In [31]:
df["review"] = df["review"].apply(remove_stopwords)
df.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode ...,positive
1,wonderful little production. filming technique...,positive
2,thought wonderful way spend time hot summer we...,positive
3,basically there's family little boy (jake) thi...,negative
4,"petter mattei's ""love time money"" visually stu...",positive


In [32]:
X = df.iloc[:,0:1]
y = df['sentiment']

In [33]:
X.head()

Unnamed: 0,review
0,one reviewers mentioned watching 1 oz episode ...
1,wonderful little production. filming technique...
2,thought wonderful way spend time hot summer we...
3,basically there's family little boy (jake) thi...
4,"petter mattei's ""love time money"" visually stu..."


In [34]:
y

0        positive
1        positive
2        positive
3        negative
4        positive
           ...   
14995    negative
14996    positive
14997    negative
14998    negative
14999    positive
Name: sentiment, Length: 14961, dtype: object

In [37]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

y = encoder.fit_transform(y)
y

array([1, 1, 1, ..., 0, 0, 1])

In [36]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [38]:
X_train.shape

(11968, 1)

In [39]:
X_test.shape

(2993, 1)

# Applying Bow

In [41]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()  # Bag Of Word Technique

In [42]:
X_train_bow = cv.fit_transform(X_train["review"]).toarray()
X_test_bow = cv.transform(X_test["review"]).toarray()

In [43]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# GaussianNB

In [44]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()

gnb.fit(X_train_bow, y_train)

0,1,2
,"priors  priors: array-like of shape (n_classes,), default=None Prior probabilities of the classes. If specified, the priors are not adjusted according to the data.",
,"var_smoothing  var_smoothing: float, default=1e-9 Portion of the largest variance of all features that is added to variances for calculation stability. .. versionadded:: 0.20",1e-09


In [45]:
y_pred = gnb.predict(X_test_bow)

from sklearn.metrics import accuracy_score, confusion_matrix

accuracy_score(y_test, y_pred)

0.6625459405278984

In [46]:
confusion_matrix(y_test, y_pred)

array([[1144,  359],
       [ 651,  839]])

# N-Grams

In [49]:
cv = CountVectorizer(ngram_range=(1, 2), max_features=5000)

X_train_bow = cv.fit_transform(X_train["review"]).toarray()
X_test_bow = cv.transform(X_test["review"]).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test, y_pred)

0.8419645840294019

# RandomForestClassifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

rf.fit(X_train_bow, y_train)

y_pred = rf.predict(X_test_bow)

accuracy_score(y_test, y_pred) # Bag Of Words

0.8453057133311059

# Using TF-IDF

In [88]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfid = TfidfVectorizer()

In [91]:
X_train_tfidf = tfid.fit_transform(X_train["review"]).toarray()
X_test_tfidf = tfid.transform(X_test["review"]).toarray()
rf.fit(X_train_tfidf, y_train)

y_pred = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred)

0.8416304710992315

# Using Word2Vec

In [60]:
from nltk import sent_tokenize
from gensim.utils import simple_preprocess
import gensim
import nltk

nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/prabhjeet1/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [79]:
# Select the column first (this creates a Series), then apply
X_train_word2vec = X_train["review"].apply(gensim.utils.simple_preprocess)
X_test_word2vec = X_test["review"].apply(gensim.utils.simple_preprocess)

In [80]:
model = gensim.models.Word2Vec(
    sentences=X_train_word2vec,
    window=10,
    min_count=2,
    workers=4,  # Use parallel processing
)

Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'


In [83]:
def document_vector(doc_tokens, model):
    # Filter out words not in the vocabulary
    valid_words = [word for word in doc_tokens if word in model.wv]

    if len(valid_words) > 0:
        # Average the vectors of valid words
        return np.mean(model.wv[valid_words], axis=0)
    else:
        # If no words are found, return a vector of zeros
        return np.zeros(model.vector_size)


# 3. Apply the function to create X_train and X_test matrices
X_train_w2v = np.array([document_vector(doc, model) for doc in X_train_word2vec])
X_test_w2v = np.array([document_vector(doc, model) for doc in X_test_word2vec])

In [84]:
# 4. Train Random Forest
rf = RandomForestClassifier()
rf.fit(X_train_w2v, y_train)

# 5. Predict and Evaluate
y_pred = rf.predict(X_test_w2v)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.804209822920147
