# NATURAL LANGUAGE PROCESSING

# LOADING THE LIBRARIES

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
# package for NLP
import nltk
nltk.download('stopwords')
# stopwords removes irrelvent words from list of words 
from nltk.corpus import stopwords
# simplify the word because loved and love means the same
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


---
## 1. DATA PREPROCESSING

### 1.1 IMPORTING THE DATASET

In [47]:
# quoting = 3 means quote none
dataset = pd.read_csv('Restaurant_Reviews.tsv',sep='\t', quoting=3)

In [48]:
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


### 1.2 CLEANING THE DATASET

In [49]:
# for cleaning purpose
# import re
# import nltk
# nltk.download('stopwords')
# removes the irrelevent words
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer

In [50]:
ps = PorterStemmer()
# corpus is the collection of text in standard programming way
corpus = []
for i in np.arange(0,dataset.shape[0]):
    review = dataset.iloc[i,0].lower()
    review = re.findall(r'[a-z]+',review)
    # set(stopwords.words('english')) fast the process
    review = [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
    # ' '.join words with space between them
    review = ' '.join(review)
#     dataset.iloc[i,0] = review
    corpus.append(review)

In [51]:
# [ps.stem(word) for word in review if not word in set(stopwords.words('english'))]
# word for word in review if not word in set(stopwords.words('english'))
corpus[:6]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho']

---
## 2. CREATING THE BAG OF WORDS MODEL

In [52]:
# for Bag Of Words Model
from sklearn.feature_extraction.text import CountVectorizer

In [53]:
cv = CountVectorizer()

In [54]:
# creating sparse matrix
# first fitting to corpus and then converting into matrix
X = cv.fit_transform(corpus).toarray()
print('Spars Matrix X contain {0} rows and {1} columns.'.format(X.shape[0],X.shape[1]))

Spars Matrix X contain 1000 rows and 1565 columns.


In [55]:
# improving more by adding max_features parameter
# so we are taking only 1500 words that are the most frequent words out of 1565 words
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
print('Spars Matrix X contain {0} rows and {1} columns.'.format(X.shape[0],X.shape[1]))

Spars Matrix X contain 1000 rows and 1500 columns.


In [56]:
# depending for building the model
# creating one dimensional dependent array
Y = np.array(dataset.iloc[:,1])

---
## 3. BUILDING THE MODEL

### 3.1 SPLITTING THE DATASET INTO THE TRAINING SET AND TEST SET

In [57]:
from sklearn.model_selection import train_test_split as splt

In [58]:
X_train, X_test, Y_train, Y_test = splt(X, Y, test_size=0.2, random_state=0)

### 3.2 FEATURE SCALING

In [59]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

In [60]:
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

### 3.3 FITTING THE NAIVE BAYES TO THE TRAINING SET

In [61]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier
classifier = GaussianNB()
classifier2 = RandomForestClassifier()
classifier3 = GradientBoostingClassifier()
classifier4 = XGBClassifier()
classifier5 = XGBRFClassifier()

In [62]:
classifier.fit(X_train,Y_train)
classifier2.fit(X_train,Y_train)
classifier3.fit(X_train,Y_train)
classifier4.fit(X_train,Y_train)
classifier5.fit(X_train,Y_train)

XGBRFClassifier(base_score=0.5, colsample_bylevel=1, colsample_bynode=0.8,
                colsample_bytree=1, gamma=0, learning_rate=1, max_delta_step=0,
                max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
                n_jobs=1, nthread=None, objective='binary:logistic',
                random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                seed=None, silent=None, subsample=0.8, verbosity=1)

### 3.4 PREDICTING THE TEST SET RESULT

In [63]:
y_pred = classifier.predict(X_test)
y_pred2 = classifier2.predict(X_test)
y_pred3 = classifier3.predict(X_test)
y_pred4 = classifier4.predict(X_test)
y_pred5 = classifier5.predict(X_test)

### 3.5 MAKING THE CONFUSION MATRIX

In [64]:
from sklearn.metrics import confusion_matrix as cm

In [65]:
print(cm(Y_test, y_pred))
print(cm(Y_test, y_pred2))
print(cm(Y_test, y_pred3))
print(cm(Y_test, y_pred4))
print(cm(Y_test, y_pred5))

[[55 42]
 [12 91]]
[[87 10]
 [48 55]]
[[92  5]
 [47 56]]
[[93  4]
 [53 50]]
[[94  3]
 [74 29]]


In [67]:
print('Naive Bayes Accuracy is:',((55+91)/200)*100)
print('Random Forest Accuracy is:',((87+56)/200)*100)
print('Gradient Boosting Accuracy is:',((92+56)/200)*100)
print('Xtreme Gradient Boosting Accuracy is:',((93+50)/200)*100)
print('XGB Random Forest Accuracy is:',((94+29)/200)*100)

Naive Bayes Accuracy is: 73.0
Random Forest Accuracy is: 71.5
Gradient Boosting Accuracy is: 74.0
Xtreme Gradient Boosting Accuracy is: 71.5
XGB Random Forest Accuracy is: 61.5
