## Importing all the necessary Libraries

In [63]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import emoji
from nltk.corpus import stopwords
nltk.download('stopwords')
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 255)

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\adity\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Importing the Dataset

In [13]:
df = pd.read_csv('D:\WorkSpace\GenAI\GenAI\datasets\Project1_dataset\IMDB Dataset.csv')
df.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of v...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen-...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well b...",positive
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br...",negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situ...",positive


## Basic EDA & Splitting the Data into smaller Batches

In [15]:
df.shape

(50000, 2)

In [16]:
df_new = df.iloc[:10000]
df_new.head()

Unnamed: 0,review,sentiment
0,"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of v...",positive
1,"A wonderful little production. <br /><br />The filming technique is very unassuming- very old-time-BBC fashion and gives a comforting, and sometimes discomforting, sense of realism to the entire piece. <br /><br />The actors are extremely well chosen-...",positive
2,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air conditioned theater and watching a light-hearted comedy. The plot is simplistic, but the dialogue is witty and the characters are likable (even the well b...",positive
3,"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.<br /><br />This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.<br...",negative
4,"Petter Mattei's ""Love in the Time of Money"" is a visually stunning film to watch. Mr. Mattei offers us a vivid portrait about human relations. This is a movie that seems to be telling us what money, power and success do to people in the different situ...",positive


In [18]:
print("\n",df['sentiment'].value_counts())
print("\n",df_new['sentiment'].value_counts())


 sentiment
positive    25000
negative    25000
Name: count, dtype: int64

 sentiment
positive    5028
negative    4972
Name: count, dtype: int64


In [20]:
df_new.duplicated().sum()

17

In [22]:
df_new.drop_duplicates(inplace=True)

In [23]:
print("\n",df['sentiment'].value_counts())
print("\n",df_new['sentiment'].value_counts())


 sentiment
positive    24884
negative    24698
Name: count, dtype: int64

 sentiment
positive    5023
negative    4960
Name: count, dtype: int64


In [24]:
df_new.duplicated().sum()

0

## Basic Preprocessing

In [67]:
# Define a translation table to remove punctuation
translator = str.maketrans('', '', string.punctuation)

# Dictionary for common chat words
chat_words = {
    'AFAIK': 'As Far As I Know',
    'AFK': 'Away From Keyboard',
    'ASAP': 'As Soon As Possible',
    'FYI': 'For Your Information',
    'BRB': 'Be Right Back',
    'BTW': 'By The Way',
    'OMG': 'Oh My God',
    'IMO': 'In My Opinion',
    'LOL': 'Laugh Out Loud',
    'TTYL': 'Talk To You Later',
    'GTG': 'Got To Go',
    'TTYT': 'Talk To You Tomorrow',
    'IDK': "I Don't Know",
    'TMI': 'Too Much Information',
    'IMHO': 'In My Humble Opinion',
    'ICYMI': 'In Case You Missed It',
    'FAQ': 'Frequently Asked Questions',
    'TGIF': "Thank God It's Friday",
    'FYA': 'For Your Action'
}

def preprocess_text(text):
    # Step 1: Remove HTML tags
    def remove_html_tags(text):
        clean = re.compile('<.*?>')
        cleaned_text = re.sub(clean, '', text)
        normalized_text = re.sub(r'\s+', ' ', cleaned_text).strip()
        return normalized_text

    # Step 2: Remove URLs
    def remove_url(text):
        text_without_url = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
        cleaned_text = ' '.join(text_without_url.split())
        return cleaned_text

    # Step 3: Convert to lowercase
    def to_lowercase(text):
        return text.lower()
    
    # Step 4: Remove stopwords
    def remove_stopwords(text):
        new_text = []
        for word in text.split():
            if word not in stopwords.words('english'):
                new_text.append(word)
        return " ".join(new_text)

    # Step 5: Replace chat words with full forms
    def chat_conversion(text):
        new_text = []
        for word in text.split():
            if word.upper() in chat_words:
                new_text.append(chat_words[word.upper()])
            else:
                new_text.append(word)
        return " ".join(new_text)
    
    # Step 6: Remove/Handle emojis
    def remove_emoji(text):
        # Demojize the text, converting emojis to their corresponding text descriptions
        result = emoji.demojize(text)
        # Replace colons and underscores with spaces
        result = result.replace(":", "").replace("_", " ")
        return result



    # Apply all preprocessing steps
    text = remove_html_tags(text)
    text = remove_url(text)
    text = to_lowercase(text)  # Convert to lowercase
    text = remove_stopwords(text)  # Remove stopwords
    text = chat_conversion(text)
    text = remove_emoji(text)  # Remove emojis

    return text

In [None]:
df_new['review'] = df_new['review'].apply(preprocess_text)

In [28]:
df_new['review'] = df_new['review'].str.translate(translator)

In [29]:
df_new.head()

Unnamed: 0,review,sentiment
0,one reviewers mentioned watching 1 oz episode hooked right exactly happened methe first thing struck oz brutality unflinching scenes violence set right word go trust me show faint hearted timid show pulls punches regards drugs sex violence hardcore cl...,positive
1,wonderful little production filming technique unassuming oldtimebbc fashion gives comforting sometimes discomforting sense realism entire piece actors extremely well chosen michael sheen has got polari voices pat too truly see seamless editing guided ...,positive
2,thought wonderful way spend time hot summer weekend sitting air conditioned theater watching lighthearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point 2 risk addi...,positive
3,basically theres family little boy jake thinks theres zombie closet parents fighting timethis movie slower soap opera suddenly jake decides become rambo kill zombieok first going make film must decide thriller drama drama movie watchable parents divo...,negative
4,petter matteis love time money visually stunning film watch mr mattei offers us vivid portrait human relations movie seems telling us money power success people different situations encounter variation arthur schnitzlers play theme director transfers ...,positive


In [30]:
# Remove any extra spaces in the 'review' column
df_new['review'] = df_new['review'].str.replace(r'\s+', ' ', regex=True).str.strip()


In [42]:
X = df_new.iloc[:,0:1]
y = df_new['sentiment']

In [43]:
encoder = LabelEncoder()

y = encoder.fit_transform(y)
y

array([1, 1, 1, ..., 0, 0, 1])

## Train-test Split

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=1)

In [45]:
X_train.shape

(7986, 1)

In [46]:
X_test.shape

(1997, 1)

## Modeling

### Using - Bag of Words Technique

In [47]:
cv = CountVectorizer()
X_train_bow = cv.fit_transform(X_train['review']).toarray()

X_test_bow = cv.transform(X_test['review']).toarray()

In [48]:
X_train_bow

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

### Using - GaussianNB

In [50]:
gnb = GaussianNB()
gnb.fit(X_train_bow,y_train)

In [52]:
y_pred = gnb.predict(X_test_bow)

accuracy_score(y_test, y_pred)

0.656484727090636

In [53]:
confusion_matrix(y_test, y_pred)

array([[697, 255],
       [431, 614]], dtype=int64)

### Using - Random Forest Classifier

In [55]:
rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)

y_pred_rf = rf.predict(X_test_bow)

accuracy_score(y_test, y_pred_rf)

0.8477716574862293

In [56]:
confusion_matrix(y_test, y_pred_rf)

array([[803, 149],
       [155, 890]], dtype=int64)

### Using - CountVectorizer with RFC

In [58]:
cv = CountVectorizer(max_features=3000)
X_train_cv = cv.fit_transform(X_train['review']).toarray()

X_test_cv = cv.transform(X_test['review']).toarray()

rf =  RandomForestClassifier()

rf.fit(X_train_cv,y_train)

y_pred_rf_cv = rf.predict(X_test_cv)

accuracy_score(y_test, y_pred_rf_cv)

0.8382573860791187

In [59]:
confusion_matrix(y_test, y_pred_rf_cv)

array([[802, 150],
       [173, 872]], dtype=int64)

### Using - CountVectorizer with RFC and Ngrams

In [62]:
cv = CountVectorizer(max_features=5000, ngram_range=(2,2))

X_train_cv_ngram = cv.fit_transform(X_train['review']).toarray()

X_test_cv_ngram = cv.transform(X_test['review']).toarray()

rf =  RandomForestClassifier()

rf.fit(X_train_cv_ngram,y_train)

y_pred_rf_cv_ngram = rf.predict(X_test_cv_ngram)
accuracy_score(y_test, y_pred_rf_cv_ngram)

0.7315973960941412

In [61]:
confusion_matrix(y_test, y_pred_rf_cv_ngram)

array([[799, 153],
       [183, 862]], dtype=int64)

### Using - TF-IDF Vectorizer with RFC

In [64]:
tfidf = TfidfVectorizer()

X_train_tfidf = tfidf.fit_transform(X_train['review']).toarray()
X_test_tfidf = tfidf.transform(X_test['review']).toarray()
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)

y_pred_rf_tfidf = rf.predict(X_test_tfidf)

accuracy_score(y_test, y_pred_rf_tfidf)

0.8357536304456685

In [65]:
confusion_matrix(y_test, y_pred_rf_tfidf)

array([[803, 149],
       [179, 866]], dtype=int64)