In [9]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'C:\Users\hp\ENVS\ARIMA\Scripts\python.exe -m pip install --upgrade pip' command.


In [10]:
import pandas as pd
import string
import re
import nltk

In [11]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

## Q1: Preprocess Text Data
> (Remove punctuation, Perform Tokenization, Remove stopwords and Lemmatize/Stem)

In [12]:
data = pd.read_csv("data.csv", sep=',')
positive_records = data.loc[data["sentiment"] == "positive"][:3000]
negative_records = data.loc[data["sentiment"] == "negative"][:3000]
data = pd.concat([positive_records, negative_records], axis=0)
print(data.shape)
data.head()

(6000, 2)


Unnamed: 0,review,sentiment
0,I thought this was a wonderful way to spend ti...,positive
1,"Probably my all-time favorite movie, a story o...",positive
2,I sure would like to see a resurrection of a u...,positive
8,This a fantastic movie of three prisoners who ...,positive
11,"What an absolutely stunning movie, if you have...",positive


### 1. Remove punctuation

In [13]:
def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

data['review_clean'] = data['review'].apply(lambda x: remove_punct(x))

data.head()

Unnamed: 0,review,sentiment,review_clean
0,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...
1,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...
2,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...
8,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...
11,"What an absolutely stunning movie, if you have...",positive,What an absolutely stunning movie if you have ...


### 2.Tokenization

In [14]:
import re

def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

data['review_tokenized'] = data['review_clean'].apply(lambda x: tokenize(x.lower()))

data.head()

Unnamed: 0,review,sentiment,review_clean,review_tokenized
0,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,..."
1,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...,"[probably, my, alltime, favorite, movie, a, st..."
2,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio..."
8,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...,"[this, a, fantastic, movie, of, three, prisone..."
11,"What an absolutely stunning movie, if you have...",positive,What an absolutely stunning movie if you have ...,"[what, an, absolutely, stunning, movie, if, yo..."


### 3.Remove stopwords

In [15]:
stopword = nltk.corpus.stopwords.words('english')

In [16]:
def remove_stopwords(tokenized_list):
    text = [word for word in tokenized_list if word not in stopword]
    return text

data['review_nostop'] = data['review_tokenized'].apply(lambda x: remove_stopwords(x))

data.head()

Unnamed: 0,review,sentiment,review_clean,review_tokenized,review_nostop
0,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su..."
1,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...,"[probably, my, alltime, favorite, movie, a, st...","[probably, alltime, favorite, movie, story, se..."
2,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ..."
8,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...,"[this, a, fantastic, movie, of, three, prisone...","[fantastic, movie, three, prisoners, become, f..."
11,"What an absolutely stunning movie, if you have...",positive,What an absolutely stunning movie if you have ...,"[what, an, absolutely, stunning, movie, if, yo...","[absolutely, stunning, movie, 25, hrs, kill, w..."


### 4.Stem text

In [17]:
stopwords = nltk.corpus.stopwords.words('english')
ps = nltk.PorterStemmer()
def clean_text(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [word for word in tokens if word not in stopwords]
    return text

data['review_nostop'] = data['review'].apply(lambda x: clean_text(x.lower()))
def stemming(tokenized_text):
    text = [ps.stem(word) for word in tokenized_text]
    return text

data['review_stemmed'] = data['review_nostop'].apply(lambda x: stemming(x))

data.head()

Unnamed: 0,review,sentiment,review_clean,review_tokenized,review_nostop,review_stemmed
0,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
1,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...,"[probably, my, alltime, favorite, movie, a, st...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle..."
2,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah..."
8,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...,"[this, a, fantastic, movie, of, three, prisone...","[fantastic, movie, three, prisoners, become, f...","[fantast, movi, three, prison, becom, famou, o..."
11,"What an absolutely stunning movie, if you have...",positive,What an absolutely stunning movie if you have ...,"[what, an, absolutely, stunning, movie, if, yo...","[absolutely, stunning, movie, 25, hrs, kill, w...","[absolut, stun, movi, 25, hr, kill, watch, won..."


In [18]:
stopwords = nltk.corpus.stopwords.words('english')
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()
def clean_text_tfidf(text):
    text = "".join([word for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords]
    return text

data.head(10)

Unnamed: 0,review,sentiment,review_clean,review_tokenized,review_nostop,review_stemmed
0,I thought this was a wonderful way to spend ti...,positive,I thought this was a wonderful way to spend ti...,"[i, thought, this, was, a, wonderful, way, to,...","[thought, wonderful, way, spend, time, hot, su...","[thought, wonder, way, spend, time, hot, summe..."
1,"Probably my all-time favorite movie, a story o...",positive,Probably my alltime favorite movie a story of ...,"[probably, my, alltime, favorite, movie, a, st...","[probably, alltime, favorite, movie, story, se...","[probabl, alltim, favorit, movi, stori, selfle..."
2,I sure would like to see a resurrection of a u...,positive,I sure would like to see a resurrection of a u...,"[i, sure, would, like, to, see, a, resurrectio...","[sure, would, like, see, resurrection, dated, ...","[sure, would, like, see, resurrect, date, seah..."
8,This a fantastic movie of three prisoners who ...,positive,This a fantastic movie of three prisoners who ...,"[this, a, fantastic, movie, of, three, prisone...","[fantastic, movie, three, prisoners, become, f...","[fantast, movi, three, prison, becom, famou, o..."
11,"What an absolutely stunning movie, if you have...",positive,What an absolutely stunning movie if you have ...,"[what, an, absolutely, stunning, movie, if, yo...","[absolutely, stunning, movie, 25, hrs, kill, w...","[absolut, stun, movi, 25, hr, kill, watch, won..."
13,The Karen Carpenter Story shows a little more ...,positive,The Karen Carpenter Story shows a little more ...,"[the, karen, carpenter, story, shows, a, littl...","[karen, carpenter, story, shows, little, singe...","[karen, carpent, stori, show, littl, singer, k..."
16,"Taut and organically gripping, Edward Dmytryk'...",positive,Taut and organically gripping Edward Dmytryks ...,"[taut, and, organically, gripping, edward, dmy...","[taut, organically, gripping, edward, dmytryks...","[taut, organ, grip, edward, dmytryk, crossfir,..."
17,"""Ardh Satya"" is one of the finest film ever ma...",positive,Ardh Satya is one of the finest film ever made...,"[ardh, satya, is, one, of, the, finest, film, ...","[ardh, satya, one, finest, film, ever, made, i...","[ardh, satya, one, finest, film, ever, made, i..."
18,One of the most significant quotes from the en...,positive,One of the most significant quotes from the en...,"[one, of, the, most, significant, quotes, from...","[one, significant, quotes, entire, film, prono...","[one, signific, quot, entir, film, pronounc, h..."
24,"This movie is based on the book, ""A Many Splen...",positive,This movie is based on the book A Many Splendo...,"[this, movie, is, based, on, the, book, a, man...","[movie, based, book, many, splendored, thing, ...","[movi, base, book, mani, splendor, thing, han,..."


## Q2: TFIDF Vectorization

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer(analyzer=clean_text_tfidf)
X_tfidf = tfidf_vect.fit_transform(data['review'])
print(X_tfidf.shape)

(6000, 61307)


## Q3: Exploring parameter settings using GridSearchCV on Random Forest & Gradient Boosting Classifier.

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split

y=data['sentiment'].map({'positive':1,'negative':0})
X_train, X_test, y_train, y_test = train_test_split(X_tfidf,y, test_size=0.2)
def train_RF(n_est, depth):
    rf = RandomForestClassifier(n_estimators=n_est, max_depth=depth, n_jobs=-1)
    rf_model = rf.fit(X_train, y_train)
    y_pred = rf_model.predict(X_test)
    precision, recall, fscore, support = score(y_test, y_pred, average='binary')
    print('Est: {} / Depth: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        n_est, depth, round(precision, 3), round(recall, 3),
        round((y_pred==y_test).sum() / len(y_pred), 3)))
    
#               ***************REPLACE IT WITH GRIDSEARCH CV METHOD********************
# from sklearn.model_selection import GridSearchCV
# GridSearchCV(estimator=SVC(),param_grid=[10, 50])
for n_est in [10, 50,100]:
    for depth in [10, 20, 30, None]:
         train_RF(n_est, depth)  

Est: 10 / Depth: 10 ---- Precision: 0.678 / Recall: 0.746 / Accuracy: 0.704
Est: 10 / Depth: 20 ---- Precision: 0.682 / Recall: 0.707 / Accuracy: 0.698
Est: 10 / Depth: 30 ---- Precision: 0.72 / Recall: 0.732 / Accuracy: 0.732
Est: 10 / Depth: None ---- Precision: 0.76 / Recall: 0.635 / Accuracy: 0.725
Est: 50 / Depth: 10 ---- Precision: 0.782 / Recall: 0.828 / Accuracy: 0.804
Est: 50 / Depth: 20 ---- Precision: 0.805 / Recall: 0.83 / Accuracy: 0.82
Est: 50 / Depth: 30 ---- Precision: 0.827 / Recall: 0.842 / Accuracy: 0.838
Est: 50 / Depth: None ---- Precision: 0.827 / Recall: 0.804 / Accuracy: 0.823
Est: 100 / Depth: 10 ---- Precision: 0.805 / Recall: 0.851 / Accuracy: 0.828
Est: 100 / Depth: 20 ---- Precision: 0.831 / Recall: 0.852 / Accuracy: 0.844
Est: 100 / Depth: 30 ---- Precision: 0.829 / Recall: 0.856 / Accuracy: 0.844
Est: 100 / Depth: None ---- Precision: 0.852 / Recall: 0.832 / Accuracy: 0.848


In [21]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X_tfidf,y, test_size=0.2)

def train_GB(est, max_depth, lr):
    gb = GradientBoostingClassifier(n_estimators=est, max_depth=max_depth, learning_rate=lr)
    gb_model = gb.fit(X_train, y_train)
    y_pred = gb_model.predict(X_test)
    precision, recall, fscore, train_support = score(y_test, y_pred, average='binary')
    print('Est: {} / Depth: {} / LR: {} ---- Precision: {} / Recall: {} / Accuracy: {}'.format(
        est, max_depth, lr, round(precision, 3), round(recall, 3), 
        round((y_pred==y_test).sum()/len(y_pred), 3)))   
    
for n_est in [50, 100,150]:
    for max_depth in [3, 7,11,15]:
        for lr in [0.01,0.1, 1]:
            train_GB(n_est, max_depth, lr)

Est: 50 / Depth: 3 / LR: 0.01 ---- Precision: 0.631 / Recall: 0.916 / Accuracy: 0.694
Est: 50 / Depth: 3 / LR: 0.1 ---- Precision: 0.714 / Recall: 0.863 / Accuracy: 0.762
Est: 50 / Depth: 3 / LR: 1 ---- Precision: 0.756 / Recall: 0.821 / Accuracy: 0.781
Est: 50 / Depth: 7 / LR: 0.01 ---- Precision: 0.653 / Recall: 0.883 / Accuracy: 0.711
Est: 50 / Depth: 7 / LR: 0.1 ---- Precision: 0.75 / Recall: 0.84 / Accuracy: 0.782
Est: 50 / Depth: 7 / LR: 1 ---- Precision: 0.724 / Recall: 0.826 / Accuracy: 0.759
Est: 50 / Depth: 11 / LR: 0.01 ---- Precision: 0.669 / Recall: 0.843 / Accuracy: 0.717
Est: 50 / Depth: 11 / LR: 0.1 ---- Precision: 0.759 / Recall: 0.845 / Accuracy: 0.791
Est: 50 / Depth: 11 / LR: 1 ---- Precision: 0.73 / Recall: 0.826 / Accuracy: 0.763
Est: 50 / Depth: 15 / LR: 0.01 ---- Precision: 0.671 / Recall: 0.838 / Accuracy: 0.718
Est: 50 / Depth: 15 / LR: 0.1 ---- Precision: 0.754 / Recall: 0.861 / Accuracy: 0.793
Est: 50 / Depth: 15 / LR: 1 ---- Precision: 0.725 / Recall: 0.848

## Q4: Perform Final evaluation of models on the best parameter settings using the evaluation metrics

#### 1. Random Forest. 

In [22]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf = RandomForestClassifier()
param = {'n_estimators': [10, 150, 300],
        'max_depth': [30, 60, 90, None]}

gs = GridSearchCV(rf, param, cv=5, n_jobs=-1)
gs_fit = gs.fit(X_tfidf,y)
pd.DataFrame(gs_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
5,57.450892,0.838965,0.604457,0.032535,60.0,300,"{'max_depth': 60, 'n_estimators': 300}",0.831667,0.844167,0.848333,0.85,0.8625,0.847333,0.009936,1
8,68.441679,0.345525,0.579685,0.023108,90.0,300,"{'max_depth': 90, 'n_estimators': 300}",0.835833,0.840833,0.8525,0.836667,0.859167,0.845,0.00925,2
11,58.649823,2.181192,0.367862,0.072871,,300,"{'max_depth': None, 'n_estimators': 300}",0.826667,0.844167,0.860833,0.834167,0.853333,0.843833,0.01239,3
10,35.971511,1.322008,0.283747,0.022659,,150,"{'max_depth': None, 'n_estimators': 150}",0.826667,0.8375,0.855833,0.8325,0.861667,0.842833,0.013567,4
2,31.709712,0.403648,0.481467,0.015017,30.0,300,"{'max_depth': 30, 'n_estimators': 300}",0.830833,0.8375,0.854167,0.838333,0.851667,0.8425,0.008929,5


#### 2. GradientBoostingClassifier. 

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

gb = GradientBoostingClassifier()
param = {
    'n_estimators': [100, 150], 
    'max_depth': [7, 11, 15],
    'learning_rate': [0.1]
}

clf = GridSearchCV(gb, param, cv=5, n_jobs=-1)
cv_fit = clf.fit(X_tfidf,y)
pd.DataFrame(cv_fit.cv_results_).sort_values('mean_test_score', ascending=False)[0:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
1,244.76528,2.060118,0.035634,0.006181,0.1,7,150,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.7725,0.805833,0.8375,0.815833,0.844167,0.815167,0.025486,1
3,394.437976,1.58294,0.049506,0.008605,0.1,11,150,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.770833,0.8025,0.823333,0.811667,0.830833,0.807833,0.020887,2
0,167.157986,1.881748,0.028906,0.006685,0.1,7,100,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.7625,0.800833,0.8225,0.798333,0.828333,0.8025,0.023184,3
5,422.735294,13.814425,0.029606,0.007483,0.1,15,150,"{'learning_rate': 0.1, 'max_depth': 15, 'n_est...",0.766667,0.785,0.823333,0.8,0.836667,0.802333,0.025289,4
2,268.539047,3.734401,0.039477,0.00692,0.1,11,100,"{'learning_rate': 0.1, 'max_depth': 11, 'n_est...",0.765833,0.799167,0.82,0.799167,0.825833,0.802,0.021053,5


## Q5: Report the best performing model

In the experiment, the Random Forest has 
GradientBoostingClassifier is the best model. 

> Random Forest  = 0.304306 (mean_score_time) 	0.098282 	(std_score_time) <br>
> GradientBoostingClassifier = 0.020673 (mean_score_time)	0.002118 (std_score_time)	

<b>Thus, The Gradient Boster </b> has the least score time, so the best peroforming model is it.