# IMDb Movie Review - Project 3
> Name: Sharun Garg  
> Student ID: 200493338

# 1. Importing important libraries and packages

In [1]:
import numpy as np
import pandas as pd
import string
import re
import time
pd.set_option('display.max_colwidth', 100)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import precision_recall_fscore_support as score

import warnings
warnings.filterwarnings("ignore")

### Importing Natural Language Toolkit

In [2]:
!pip install nltk
import nltk



# 2. Reading in the data

In [3]:
file = "IMDB_dataset.xlsx"
data = pd.read_excel(file)
data.head()

Unnamed: 0,review,sentiment
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative


> We can see that there are two columns in the data set with **review** columns having the text data and **sentiment** column giving indication if it is a positive or negative review.

### More info on data

In [4]:
print("Input data has {} rows and {} columns".format(len(data), len(data.columns)))

Input data has 25000 rows and 2 columns


In [5]:
print("Out of {} reviews, {} are positive, {} are negative".format(len(data),
                                                       len(data[data['sentiment']=='positive']),
                                                       len(data[data['sentiment']=='negative'])))

Out of 25000 reviews, 12500 are positive, 12500 are negative


In [6]:
print("Number of null in label: {}".format(data['sentiment'].isnull().sum()))
print("Number of null in text: {}".format(data['review'].isnull().sum()))

Number of null in label: 0
Number of null in text: 0


# 3. Data Cleaning

## Removing the punctuations in the text data

In [7]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data['review_text_clean'] = data['review'].apply(lambda x: remove_punct(x))

data.head()

Unnamed: 0,review,sentiment,review_text_clean
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,I thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,Probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,This show was an amazing fresh innovative idea in the 70s when it first aired The first 7 or 8 ...
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,Encouraged by the positive comments about this film on here I was looking forward to watching th...


> All the punctuation marks are removed from the reviews as part of data cleaning and the text is stored in new column **review_text_clean**

## Tokenizing the text data into separate words

In [8]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

data['review_text_tokenized'] = data['review_text_clean'].apply(lambda x: tokenize(x.lower()))

data.head()

Unnamed: 0,review,sentiment,review_text_clean,review_text_tokenized
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,I thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,"[i, thought, this, was, a, wonderful, way, to, spend, time, on, a, too, hot, summer, weekend, si..."
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,Probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,"[probably, my, alltime, favorite, movie, a, story, of, selflessness, sacrifice, and, dedication,..."
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,"[i, sure, would, like, to, see, a, resurrection, of, a, up, dated, seahunt, series, with, the, t..."
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,This show was an amazing fresh innovative idea in the 70s when it first aired The first 7 or 8 ...,"[this, show, was, an, amazing, fresh, innovative, idea, in, the, 70s, when, it, first, aired, th..."
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,Encouraged by the positive comments about this film on here I was looking forward to watching th...,"[encouraged, by, the, positive, comments, about, this, film, on, here, i, was, looking, forward,..."


> Tokenization process is applied on **review_text_clean** (text without punctuations) and the text is converted into an array of separate tokens and stored in **review_text_tokenized**.

##  Removing the stop words

In [9]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

stopword = nltk.corpus.stopwords.words('english')
data['review_text_nostop'] = data['review_text_tokenized'].apply(lambda x: remove_stopwords(x))

data.head()

Unnamed: 0,review,sentiment,review_text_clean,review_text_tokenized,review_text_nostop
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,I thought this was a wonderful way to spend time on a too hot summer weekend sitting in the air ...,"[i, thought, this, was, a, wonderful, way, to, spend, time, on, a, too, hot, summer, weekend, si...","[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,..."
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,Probably my alltime favorite movie a story of selflessness sacrifice and dedication to a noble c...,"[probably, my, alltime, favorite, movie, a, story, of, selflessness, sacrifice, and, dedication,...","[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p..."
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,"[i, sure, would, like, to, see, a, resurrection, of, a, up, dated, seahunt, series, with, the, t...","[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ..."
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,This show was an amazing fresh innovative idea in the 70s when it first aired The first 7 or 8 ...,"[this, show, was, an, amazing, fresh, innovative, idea, in, the, 70s, when, it, first, aired, th...","[show, amazing, fresh, innovative, idea, 70s, first, aired, first, 7, 8, years, brilliant, thing..."
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,Encouraged by the positive comments about this film on here I was looking forward to watching th...,"[encouraged, by, the, positive, comments, about, this, film, on, here, i, was, looking, forward,...","[encouraged, positive, comments, film, looking, forward, watching, film, bad, mistake, ive, seen..."


> The stopwords are a list of words that are very very common but don’t provide useful information for most text analysis procedures. So they are removed before analysis to reduce the overhead.

## Cleaning process Before and After

In [10]:
data.drop(['review_text_clean', 'review_text_tokenized'], axis = 1, inplace=True)
data.head()

Unnamed: 0,review,sentiment,review_text_nostop
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,"[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,..."
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,"[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p..."
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,"[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ..."
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,"[show, amazing, fresh, innovative, idea, 70s, first, aired, first, 7, 8, years, brilliant, thing..."
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,"[encouraged, positive, comments, film, looking, forward, watching, film, bad, mistake, ive, seen..."


## Lemmatization 
> Lemmatisation is the process of grouping together the inflected forms of a word so they can be analysed as a single item, identified by the word's lemma, or dictionary form.

In [11]:
def lemmatizing(tokenized_text):
    text = [wn.lemmatize(word) for word in tokenized_text]
    return text

wn = nltk.WordNetLemmatizer()
data['review_text_lemmatized'] = data['review_text_nostop'].apply(lambda x: lemmatizing(x))

data.head(7)

Unnamed: 0,review,sentiment,review_text_nostop,review_text_lemmatized
0,"I thought this was a wonderful way to spend time on a too hot summer weekend, sitting in the air...",positive,"[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,...","[thought, wonderful, way, spend, time, hot, summer, weekend, sitting, air, conditioned, theater,..."
1,"Probably my all-time favorite movie, a story of selflessness, sacrifice and dedication to a nobl...",positive,"[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p...","[probably, alltime, favorite, movie, story, selflessness, sacrifice, dedication, noble, cause, p..."
2,I sure would like to see a resurrection of a up dated Seahunt series with the tech they have tod...,positive,"[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ...","[sure, would, like, see, resurrection, dated, seahunt, series, tech, today, would, bring, back, ..."
3,"This show was an amazing, fresh & innovative idea in the 70's when it first aired. The first 7 o...",negative,"[show, amazing, fresh, innovative, idea, 70s, first, aired, first, 7, 8, years, brilliant, thing...","[show, amazing, fresh, innovative, idea, 70, first, aired, first, 7, 8, year, brilliant, thing, ..."
4,Encouraged by the positive comments about this film on here I was looking forward to watching th...,negative,"[encouraged, positive, comments, film, looking, forward, watching, film, bad, mistake, ive, seen...","[encouraged, positive, comment, film, looking, forward, watching, film, bad, mistake, ive, seen,..."
5,Phil the Alien is one of those quirky films where the humour is based around the oddness of ever...,negative,"[phil, alien, one, quirky, films, humour, based, around, oddness, everything, rather, actual, pu...","[phil, alien, one, quirky, film, humour, based, around, oddness, everything, rather, actual, pun..."
6,I saw this movie when I was about 12 when it came out. I recall the scariest scene was the big b...,negative,"[saw, movie, 12, came, recall, scariest, scene, big, bird, eating, men, dangling, helplessly, pa...","[saw, movie, 12, came, recall, scariest, scene, big, bird, eating, men, dangling, helplessly, pa..."


# 4. TF-IDF Vectorization and Data Processing Pipeline
> Performing the process of removing punctuations, removing stopwords and doing tokenization on the data.  
After that TF-IDF Vectorization is applied to the cleaned data for converting the text into vectors for performing the classification process.

In [12]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()

data = pd.read_excel(file)

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(data['review'])

In [13]:
X_tfidf.shape

(25000, 114083)

> It can be observed that TF-IDF vectorization converted the cleaned tokenized texts into vectors spead over 114083 columns.

# 5. Classification using RandomForest and XGBoost

**Separating the sentiment column as target**

In [14]:
target = data['sentiment']
target.shape

(25000,)

## Hyperparameter Tuning of Random Forest Classifier using GridSearchCV

In [16]:
rf = RandomForestClassifier()
param = {'n_estimators': [25, 50, 100],
        'max_depth': [25, 50, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=2)
gs_fit = gs.fit(X_tfidf, target)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,33.718268,0.183859,0.324236,0.002039,50.0,100,"{'max_depth': 50, 'n_estimators': 100}",0.8464,0.8562,0.8532,0.8478,0.8458,0.84988,0.0041,1
8,67.319619,0.984776,0.450469,0.017627,,100,"{'max_depth': None, 'n_estimators': 100}",0.8522,0.8594,0.8458,0.8442,0.8454,0.8494,0.005724,2
2,15.498685,0.073754,0.248001,0.001774,25.0,100,"{'max_depth': 25, 'n_estimators': 100}",0.838,0.8486,0.8372,0.8402,0.8388,0.84056,0.00414,3
7,33.439558,0.230993,0.23896,0.001536,,50,"{'max_depth': None, 'n_estimators': 50}",0.8294,0.841,0.8326,0.8362,0.826,0.83304,0.005222,4
4,16.864103,0.070747,0.183527,0.002084,50.0,50,"{'max_depth': 50, 'n_estimators': 50}",0.8288,0.8366,0.8316,0.8372,0.8278,0.8324,0.003884,5


> It can be observed that the parameters 'max_depth' of 50 and 'n_estimators' of 100 had the highest mean accuracy of about **85 %**.

## Hyperparameter Tuning of Xtreme Gradient Boosting Classifier using GridSearchCV

In [18]:
xgb = XGBClassifier()
param = {
    'n_estimators': [25, 50, 100], 
    'eval_metric': ['rmse', 'mlogloss'],
    'learning_rate': [0.1, 1]
}


gs_xgb = GridSearchCV(xgb, param, cv=5, n_jobs=2)
cv_fit = gs_xgb.fit(X_tfidf, target)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_eval_metric,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,46.615969,0.594961,0.486483,0.020953,rmse,1.0,100,"{'eval_metric': 'rmse', 'learning_rate': 1, 'n_estimators': 100}",0.8396,0.8364,0.8348,0.8362,0.8394,0.83728,0.001896,1
11,44.742249,4.128671,0.43825,0.057772,mlogloss,1.0,100,"{'eval_metric': 'mlogloss', 'learning_rate': 1, 'n_estimators': 100}",0.8396,0.8364,0.8348,0.8362,0.8394,0.83728,0.001896,1
4,24.185407,0.314176,0.458525,0.023487,rmse,1.0,50,"{'eval_metric': 'rmse', 'learning_rate': 1, 'n_estimators': 50}",0.8276,0.8276,0.8282,0.8342,0.8316,0.82984,0.002639,3
10,23.724,0.936218,0.406835,0.07035,mlogloss,1.0,50,"{'eval_metric': 'mlogloss', 'learning_rate': 1, 'n_estimators': 50}",0.8276,0.8276,0.8282,0.8342,0.8316,0.82984,0.002639,3
2,48.896452,0.40003,0.496066,0.035706,rmse,0.1,100,"{'eval_metric': 'rmse', 'learning_rate': 0.1, 'n_estimators': 100}",0.826,0.8402,0.8276,0.831,0.8244,0.82984,0.005622,5


> It can be observed that the parameters 'n_estimator' of 100, 'learning_rate' of 1, and 'eval_metric' as 'rmse' had the highest mean accuracy of **83.7 %**.

## Splitting the data into test and training set

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, target, test_size=0.2)

In [20]:
print("No. of rows in training set: {} \nNo. of rows in test set: {}".format(X_train.shape[0], X_test.shape[0]))

No. of rows in training set: 20000 
No. of rows in test set: 5000


## Creating RandomForestClassifier with tuned hyperparameters

In [21]:
rf = RandomForestClassifier(n_estimators=100, max_depth=50, n_jobs=2)

start = time.time()
rf_model = rf.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = rf_model.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='positive', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 16.848 / Predict time: 0.175 ---- Precision: 0.841 / Recall: 0.86 / Accuracy: 0.851


> it can be observed that the RandomForestClassifier had a pretty decent Accuracy of '85%' and a predict time of 175 msecs.

## Creating XGBoostClassifier with tuned hyperparameters

In [22]:
xgb = XGBClassifier(learning_rate= 1, n_estimators= 100, eval_metric= 'rmse')

start = time.time()
xgb_model = xgb.fit(X_train, y_train)
end = time.time()
fit_time = (end - start)

start = time.time()
y_pred = xgb.predict(X_test)
end = time.time()
pred_time = (end - start)

precision, recall, fscore, train_support = score(y_test, y_pred, pos_label='positive', average='binary')
print('Fit time: {} / Predict time: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
    round(fit_time, 3), round(pred_time, 3), round(precision, 3), round(recall, 3), round((y_pred==y_test).sum()/len(y_pred), 3)))

Fit time: 28.759 / Predict time: 0.295 ---- Precision: 0.824 / Recall: 0.852 / Accuracy: 0.837


> it can be observed that the XtremeGradientBoostingClassifier had a pretty decent Accuracy of '83.7%' but a larger predict time of 295 msecs.

# Result:
The **RandomForestClassifier** performed better than **XtremeGradientBoostingClassifier** as the fit time was less than about 1.7 times and a faster predict time by about 1.7 times. Randorm Forest Classifier also had a better accuracy of 85% as compared to Extreme Boosting Classifier having accuracy of 83.7%.