# Natural Language Processing    

Problem Statement :

Using NLP predict whether the review is positive or negative for a given dataset 
https://drive.google.com/open?id=1-TJWzdxapGhp2aElncd6RH6zOpSAf69X

In [1]:
#importing necessary library
#importing matplotlib 
import matplotlib.pyplot as plt
#importing seaborn
import seaborn as sea
#importing pandas 
import pandas as pd
#importing numpy
import numpy as np

#importing job-lib
import joblib
import re,nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

#importing scikit learn  library classes
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

# loading csv data from the file given in the url
data_set = pd.read_table("/home/admin3/Documents/MyDoc/data_sets/Restaurant_Reviews.tsv")
#prinding info of data
data_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
Review    1000 non-null object
Liked     1000 non-null int64
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


#### Data Cleaning

In [2]:
data_set.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
# this methods removes stopwords and stemwords from the row
def pre_processing_nltk(row,ps):
    row = re.sub('[^a-zA-Z]'," ",row).lower().split()
    row = [ps.stem(word) for word in row if not word in set(stopwords.words('english'))]
    row = " ".join(row)
    return row

In [4]:
def fit_transform_nltk(data,cv,flag=True):
    if flag!=True:
        return cv,cv.fit_transform(data).toarray()
    else:
        return cv,cv.transform(data).toarray()

In [5]:
corps=[]
ps = PorterStemmer()
for index in range(len(data_set)):
    corps.append(pre_processing_nltk(data_set['Review'][index],ps))

In [6]:
joblib.dump(ps,'/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/porter_stemmer.pkl')

['/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/porter_stemmer.pkl']

In [7]:
corps[:5]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price']

In [8]:
cv,x_values = fit_transform_nltk(corps,CountVectorizer(max_features=1000),False)

In [9]:
joblib.dump(cv,'/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/count_vectorizer.pkl')

['/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/count_vectorizer.pkl']

In [10]:
y_values = data_set['Liked'].values

#### Seperating train and test data

In [11]:
#obtaining splitted training and test data set
train_X, test_X, train_Y, test_Y = model_selection.train_test_split(x_values,y_values,test_size=0.3,random_state=0)

In [12]:
print(test_X.shape)
print(train_X.shape)

(300, 1000)
(700, 1000)


#### Building classififaction Model 

In [13]:
def fit_or_predict(x,y,classifier,task =0):
    if task !=0:
        classifier.fit(x,y)
        return classifier
    else:
        return classifier.predict(x)

In [14]:
classifier = fit_or_predict(train_X,train_Y,LogisticRegression(),1)

In [15]:
joblib.dump(classifier,'/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/logistic_classifier.pkl')

['/home/admin3/ml_with_phoenix/natural_language_processing/pkl_objects/logistic_classifier.pkl']

#### Predicting values and Checking Accuracy 

In [16]:
# predicting values of test data set and storing
test_prediction = fit_or_predict(test_X,test_Y,classifier)

# predicting values of train data set and storing
train_prediction = fit_or_predict(train_X,train_Y,classifier)

In [17]:
# checking confusion matrix  to see how many values are predicted correct and incorrect
confusion_matrix(test_prediction,test_Y)

array([[116,  61],
       [ 27,  96]])

In [18]:
# checking confusion matrix  to see how many values are predicted correct and incorrect
confusion_matrix(train_prediction,train_Y)

array([[349,  25],
       [  8, 318]])

In [19]:
# calculating and printing accuracy score
score=accuracy_score(train_Y,train_prediction)
print("accuracy percentage  : ",score*100)

accuracy percentage  :  95.28571428571428


In [20]:
score=accuracy_score(test_Y,test_prediction)
print("accuracy percentage  : ",score*100)

accuracy percentage  :  70.66666666666667


#### Using K-Fold cross validation to analys 

In [21]:
accuracies = cross_val_score(estimator = classifier, X = train_X, y = train_Y, cv = 10)
fold = 1
for accuracy in accuracies:
    print("accuray of {}".format(fold),"  fols is  >   ",accuracy*100)
    fold+=1

accuray of 1   fols is  >    78.57142857142857
accuray of 2   fols is  >    75.71428571428571
accuray of 3   fols is  >    80.0
accuray of 4   fols is  >    71.42857142857143
accuray of 5   fols is  >    80.0
accuray of 6   fols is  >    75.71428571428571
accuray of 7   fols is  >    74.28571428571429
accuray of 8   fols is  >    68.57142857142857
accuray of 9   fols is  >    82.85714285714286
accuray of 10   fols is  >    78.57142857142857


In [22]:
print("mean of all the above accuracies is :  ",accuracies.mean())
print("standard deviation of accuracies is :  ",accuracies.std())

mean of all the above accuracies is :   0.7657142857142857
standard deviation of accuracies is :   0.041007714555449506


In [23]:
accuracies = cross_val_score(estimator = classifier, X = test_X, y = test_Y, cv = 10)
fold = 1
for accuracy in accuracies:
    print("accuray of {}".format(fold),"  fols is  >   ",accuracy*100)
    fold+=1

accuray of 1   fols is  >    80.0
accuray of 2   fols is  >    70.0
accuray of 3   fols is  >    80.0
accuray of 4   fols is  >    66.66666666666666
accuray of 5   fols is  >    76.66666666666667
accuray of 6   fols is  >    70.0
accuray of 7   fols is  >    80.0
accuray of 8   fols is  >    56.666666666666664
accuray of 9   fols is  >    63.33333333333333
accuray of 10   fols is  >    63.33333333333333
