# Import the libraries

In [196]:

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import string
import re
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import accuracy_score, precision_score, recall_score

import os
import warnings
warnings.filterwarnings('ignore')

## Preprocess Text Data

In [64]:
df = pd.read_csv("/Users/sandaraung/Documents/AI/IMDB_Dataset/IMDB_dataset.csv")
df.head()

Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
3,"This show was an amazing, fresh & innovative i...",negative
4,Encouraged by the positive comments about this...,negative


In [65]:
df.shape

(25000, 2)

## Subseting by the first 5000 due to data size is too big.

In [66]:
df = df.iloc[0:5000]

In [67]:
df.shape

(5000, 2)

# Removal of Stopwords, Punctuation

In [68]:
stopwords = nltk.corpus.stopwords.words('english')

In [69]:
# Check punctuations
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [70]:
def clean_text(text):
    # Remove punctuation
    text = "".join([word for word in text if word not in string.punctuation])
    
    # Tokenization
    tokens = re.split('\W+', text)
    
    # Remove stopwords
    text = [word for word in tokens if word not in stopwords]
    return text

In [71]:
df['review_clean'] = df['review'].apply(lambda x: clean_text(x.lower()))

df.head()

Unnamed: 0,review,sentiment,review_clean
0,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su..."
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se..."
2,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ..."
3,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70s, ..."
4,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking..."


# Tokenization

In [202]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

df['review_clean']= df['review'].apply(lambda x: tokenize(x.lower()))

df.head()

Unnamed: 0,review,sentiment,review_clean,review_lemmatized,review_stemmed,review_len,punct%
0,I thought this was a wonderful way to spend ti...,positive,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...",761,0.053
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, my, all, time, favorite, movie, a, ...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle...",538,0.052
2,I sure would like to see a resurrection of a u...,positive,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah...",577,0.021
3,"This show was an amazing, fresh & innovative i...",negative,"[this, show, was, an, amazing, fresh, innovati...","[show, amazing, fresh, innovative, idea, 70, f...","[show, amaz, fresh, innov, idea, 70, first, ai...",761,0.043
4,Encouraged by the positive comments about this...,negative,"[encouraged, by, the, positive, comments, abou...","[encouraged, positive, comment, film, looking,...","[encourag, posit, comment, film, look, forward...",552,0.056


## Lemmatizing and Stemming

In [73]:
# WordNet Lemmatizer
wn = nltk.WordNetLemmatizer()

# Porter stemmer
ps = nltk.PorterStemmer()

In [74]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text


df['review_lemmatized'] = df['review_clean'].apply(lambda x: lemmatizing(x))
df.head()

Unnamed: 0,review,sentiment,review_clean,review_lemmatized
0,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su..."
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se..."
2,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ..."
3,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70s, ...","[show, amazing, fresh, innovative, idea, 70, f..."
4,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...","[encouraged, positive, comment, film, looking,..."


In [75]:
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text


df['review_stemmed'] = df['review_clean'].apply(lambda x: stemming(x))
df.head()

Unnamed: 0,review,sentiment,review_clean,review_lemmatized,review_stemmed
0,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle..."
2,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah..."
3,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70s, ...","[show, amazing, fresh, innovative, idea, 70, f...","[show, amaz, fresh, innov, idea, 70, first, ai..."
4,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...","[encouraged, positive, comment, film, looking,...","[encourag, posit, comment, film, look, forward..."


## Apply  TFIDF Vectorizer

In [124]:
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(df['review'])


In [77]:
print(X_tfidf.shape)
print(tfidf_vect.get_feature_names_out())

(5000, 59137)
['' '0' '00' ... 'ís' 'über' 'überwoman']


In [81]:
def count_punctuation(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3)


df['review_len'] = df['review'].apply(lambda x: len(x) - x.count(" "))
df['punct%'] = df['review'].apply(lambda x: count_punctuation(x))

In [82]:
df.head()

Unnamed: 0,review,sentiment,review_clean,review_lemmatized,review_stemmed,review_len,punct%
0,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...",761,0.053
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle...",538,0.052
2,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah...",577,0.021
3,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70s, ...","[show, amazing, fresh, innovative, idea, 70, f...","[show, amaz, fresh, innov, idea, 70, first, ai...",761,0.043
4,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...","[encouraged, positive, comment, film, looking,...","[encourag, posit, comment, film, look, forward...",552,0.056


## Preparing for features

In [133]:
X_features = pd.DataFrame(X_tfidf.toarray())

In [134]:
X_features['review_len'] = df['review_len']

In [135]:
X_features['punct%'] = df['punct%']

In [136]:
X_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,59129,59130,59131,59132,59133,59134,59135,59136,review_len,punct%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,761,0.053
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,538,0.052
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,577,0.021
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,761,0.043
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,552,0.056


In [137]:
X_features.shape

(5000, 59139)

## GridSearch CV

In [138]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

In [139]:
X_features.columns = X_features.columns.astype(str)

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X_features, df['sentiment'], test_size=0.3)

In [141]:
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, pos_label='positive', average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))

In [142]:
for n_est in [10, 40, 80]:
    for depth in [10, 15, 25, None]:
        train_RF(n_est, depth)

Est: 10 / Depth: 10 ---- Precision: 0.658 / Recall: 0.757 / Accuracy: 0.687
Est: 10 / Depth: 15 ---- Precision: 0.642 / Recall: 0.745 / Accuracy: 0.671
Est: 10 / Depth: 25 ---- Precision: 0.699 / Recall: 0.73 / Accuracy: 0.713
Est: 10 / Depth: None ---- Precision: 0.73 / Recall: 0.655 / Accuracy: 0.711
Est: 40 / Depth: 10 ---- Precision: 0.739 / Recall: 0.844 / Accuracy: 0.777
Est: 40 / Depth: 15 ---- Precision: 0.743 / Recall: 0.862 / Accuracy: 0.785
Est: 40 / Depth: 25 ---- Precision: 0.779 / Recall: 0.821 / Accuracy: 0.797
Est: 40 / Depth: None ---- Precision: 0.799 / Recall: 0.777 / Accuracy: 0.795
Est: 80 / Depth: 10 ---- Precision: 0.755 / Recall: 0.889 / Accuracy: 0.804
Est: 80 / Depth: 15 ---- Precision: 0.769 / Recall: 0.882 / Accuracy: 0.812
Est: 80 / Depth: 25 ---- Precision: 0.79 / Recall: 0.859 / Accuracy: 0.819
Est: 80 / Depth: None ---- Precision: 0.806 / Recall: 0.845 / Accuracy: 0.824


In [181]:
df.head()

Unnamed: 0,review,sentiment,review_clean,review_lemmatized,review_stemmed,review_len,punct%
0,I thought this was a wonderful way to spend ti...,positive,"[thought, wonderful, way, spend, time, hot, su...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe...",761,0.053
1,"Probably my all-time favorite movie, a story o...",positive,"[probably, alltime, favorite, movie, story, se...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle...",538,0.052
2,I sure would like to see a resurrection of a u...,positive,"[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah...",577,0.021
3,"This show was an amazing, fresh & innovative i...",negative,"[show, amazing, fresh, innovative, idea, 70s, ...","[show, amazing, fresh, innovative, idea, 70, f...","[show, amaz, fresh, innov, idea, 70, first, ai...",761,0.043
4,Encouraged by the positive comments about this...,negative,"[encouraged, positive, comments, film, looking...","[encouraged, positive, comment, film, looking,...","[encourag, posit, comment, film, look, forward...",552,0.056


In [182]:
features = X_features
labels = df[['sentiment']]

models = {}

In [183]:
def print_results(results):
    print('BEST PARAMS: {}\n'.format(results.best_params_))

    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print('{} (+/-{}) for {}'.format(round(mean, 3), round(std * 2, 3), params))

In [186]:
def evaluate_model(name, model, features, labels):
    start = time()
    pred = model.predict(features)
    end = time()
    accuracy = round(accuracy_score(labels, pred), 3)
    precision = round(precision_score(labels, pred, average='micro'), 3)
    recall = round(recall_score(labels, pred, average='micro'), 3)
    print('{} -- Accuracy: {} / Precision: {} / Recall: {} / Latency: {}ms'.format(name,
                                                                                   accuracy,
                                                                                   precision,
                                                                                   recall,
                                                                                   round((end - start)*1000, 1)))

## RandomForestClassifier

In [184]:
rf = RandomForestClassifier()
param = {
    'n_estimators': [10, 40, 80], 
    'max_depth': [10, 15, 25, None]
}

cv = GridSearchCV(rf, param, cv=2, n_jobs=-1)
cv.fit(X_train, y_train)

print_results(cv)
models["RF"] = cv.best_estimator_

BEST PARAMS: {'max_depth': 25, 'n_estimators': 80}

0.669 (+/-0.002) for {'max_depth': 10, 'n_estimators': 10}
0.738 (+/-0.031) for {'max_depth': 10, 'n_estimators': 40}
0.779 (+/-0.0) for {'max_depth': 10, 'n_estimators': 80}
0.679 (+/-0.001) for {'max_depth': 15, 'n_estimators': 10}
0.777 (+/-0.035) for {'max_depth': 15, 'n_estimators': 40}
0.797 (+/-0.003) for {'max_depth': 15, 'n_estimators': 80}
0.685 (+/-0.025) for {'max_depth': 25, 'n_estimators': 10}
0.777 (+/-0.026) for {'max_depth': 25, 'n_estimators': 40}
0.803 (+/-0.017) for {'max_depth': 25, 'n_estimators': 80}
0.711 (+/-0.022) for {'max_depth': None, 'n_estimators': 10}
0.776 (+/-0.002) for {'max_depth': None, 'n_estimators': 40}
0.798 (+/-0.017) for {'max_depth': None, 'n_estimators': 80}


# Gradient Boosting Classifier.

In [187]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

In [190]:
gb = GradientBoostingClassifier()
param = {
    'n_estimators': [20,40], 
    'max_depth': [3,5,8],
    'learning_rate': [0.1]
}

cv = GridSearchCV(gb, param, cv=2, n_jobs=-1)
cv_fit = cv.fit(X_train, y_train)
models["GB"] = cv.best_estimator_
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
5,80.597376,2.835463,5.127868,0.328629,0.1,8,40,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",0.766857,0.757714,0.762286,0.004571,1
3,84.854071,2.462857,8.802279,0.894983,0.1,5,40,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.768,0.752,0.76,0.008,2
4,67.710104,1.549586,7.421038,0.285976,0.1,8,20,"{'learning_rate': 0.1, 'max_depth': 8, 'n_esti...",0.743429,0.743429,0.743429,0.0,3
1,76.756475,0.157767,11.299987,0.161977,0.1,3,40,"{'learning_rate': 0.1, 'max_depth': 3, 'n_esti...",0.759429,0.725714,0.742571,0.016857,4
2,54.377445,0.197414,8.169328,0.286021,0.1,5,20,"{'learning_rate': 0.1, 'max_depth': 5, 'n_esti...",0.753143,0.716,0.734571,0.018571,5


## Final Model Evaluation

In [195]:
for name, model in models.items():
    evaluate_model(name, model, features, labels)

RF -- Accuracy: 0.934 / Precision: 0.934 / Recall: 0.934 / Latency: 13790.0ms
GB -- Accuracy: 0.907 / Precision: 0.907 / Recall: 0.907 / Latency: 24056.5ms


# Report for best performance model

***Analysis***

Accuracy measures the proportion of correct predictions among the total number of cases processed. The RF model has an accuracy of 0.934, outperforming the GB model which has an accuracy of 0.907.

Precision is the ratio of correctly predicted positive observations to the total predicted positives. The RF model shows a higher precision (0.934) compared to the GB model (0.907).

Recall is the ratio of correctly predicted positive observations to the all observations in actual class. The RF model again outperforms with a recall of 0.934, whereas the GB model has a recall of 0.907.

Latency refers to the time taken to make predictions. The RF model has a lower latency of 13790.0 ms, making it more efficient compared to the GB model which has a latency of 24056.5 ms.

***Best Performance Model***

Based on the evaluation metrics, the Random Forest (RF) model is the better-performing model compared to the Gradient Boosting (GB) model. It has higher accuracy, precision, and recall, and also operates with significantly lower latency. Therefore, the RF model is recommended for deployment.