In [40]:
# ---------------------------------------------------------------------------
# Imports (standard library first, then third-party)
# ---------------------------------------------------------------------------
import re  # regular expressions, used later to strip non-letter characters

import nltk  # Natural Language Toolkit
import numpy as np
import pandas as pd
from nltk.corpus import stopwords  # a "corpus" is a collection of texts
from nltk.stem.porter import PorterStemmer  # reduces a word to its stem

# One-time setup: uncomment if the NLTK stop-word list is not installed yet.
#nltk.download('stopwords')

# ---------------------------------------------------------------------------
# Load the tab-separated review file
# ---------------------------------------------------------------------------
# NOTE(review): this is an absolute, machine-specific path; adjust it (or make
# it relative to the notebook) before running on another machine.
reviews_path = 'C:/Users/Lenovo/Desktop/PROJECTS/Restaurant_Reviews.tsv'
dataset = pd.read_csv(reviews_path, delimiter='\t')

# Empty list that will receive the cleaned review texts.
corpus = []

In [41]:
# Quick look at the loaded data. Each view is printed in turn, with a ruler
# line between consecutive views.
ruler = "________________________________________________________"
views = (
    dataset.head(),                         # first rows of the frame
    type(dataset),                          # confirms we hold a DataFrame
    dataset.iloc[:, 0],                     # iloc selects by integer position
    dataset.loc[:, 'Review'],               # loc selects by label
    dataset.loc[0:3, ['Review', 'Liked']],  # label slice 0..3, two columns
)
for position, view in enumerate(views):
    if position:  # no ruler before the very first view
        print(ruler)
    print(view)

                                              Review  Liked
0                           Wow... Loved this place.      1
1                                 Crust is not good.      0
2          Not tasty and the texture was just nasty.      0
3  Stopped by during the late May bank holiday of...      1
4  The selection on the menu was great and so wer...      1
________________________________________________________
<class 'pandas.core.frame.DataFrame'>
________________________________________________________
0                               Wow... Loved this place.
1                                     Crust is not good.
2              Not tasty and the texture was just nasty.
3      Stopped by during the late May bank holiday of...
4      The selection on the menu was great and so wer...
                             ...                        
995    I think food should have flavor and texture an...
996                             Appetite instantly gone.
997    Overall I was not impress

In [42]:
# Clean every review and collect the results in `corpus`.
#
# Steps per review:
#   1. replace every character that is not a letter (a-z, A-Z) with a space,
#      which removes punctuation, digits, etc.;
#   2. lower-case the text and split it into a list of words;
#   3. drop English stop words (words such as "the", "he", "have" that add
#      little meaning) and stem what is left, e.g. "loved" -> "love";
#   4. join the stems back into a single space-separated string.

# Start from an empty list in this cell so that re-running the cell does not
# append a second copy of every review to an already-filled `corpus`.
corpus = []

# Loop-invariant helpers: build them once instead of once per review (stemmer)
# or once per word (stop-word set).
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))
# NOTE(review): the NLTK English list appears to contain negations such as
# "not", so "Crust is not good" is reduced to "crust good". Confirm whether
# that is acceptable for sentiment classification.

# Iterate over the column itself rather than range(0, 1000): the loop then
# handles any number of rows, and `corpus` always has one entry per review.
for raw_review in dataset['Review']:
    # re.sub(pattern, replacement, text); [^a-zA-Z] matches any non-letter.
    review = re.sub('[^a-zA-Z]', ' ', raw_review)

    # Lower-case, then split on whitespace into a list of words.
    review = review.lower().split()

    # Stem only the words that are not stop words.
    review = [ps.stem(word) for word in review
              if word not in english_stopwords]

    # Re-join the stems into one string and store it.
    corpus.append(' '.join(review))
    

In [43]:
# Bag-of-Words model
# ------------------
# A bag of words represents the corpus as a matrix of word counts:
#   * every column corresponds to one unique word of the vocabulary;
#   * every row corresponds to one review;
#   * each cell holds the number of times that word occurs in that review
#     (0 when the word does not occur at all).
# CountVectorizer tokenizes the texts and builds this vocabulary of known
# words.

from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 1500 features. `max_features` is a parameter worth
# experimenting with to get better results.
vocabulary_cap = 1500
cv = CountVectorizer(max_features=vocabulary_cap)

In [44]:
# X: bag-of-words feature matrix built from `corpus` (the independent
# variables), converted from the vectorizer's output to a dense array.
word_counts = cv.fit_transform(corpus)
X = word_counts.toarray()
# Uncomment to inspect the learned vocabulary or the count matrix.
#print(cv.vocabulary_)
#print(X)
# y: the second column of the dataset, i.e. the label telling whether each
# review is positive or negative (the value the model has to predict).
y = dataset.iloc[:, 1].values

{'wow': 1482, 'love': 777, 'place': 963, 'crust': 310, 'good': 569, 'tasti': 1297, 'textur': 1309, 'nasti': 870, 'stop': 1246, 'late': 737, 'may': 809, 'bank': 92, 'holiday': 642, 'rick': 1084, 'steve': 1239, 'recommend': 1050, 'select': 1140, 'menu': 827, 'great': 583, 'price': 997, 'get': 553, 'angri': 33, 'want': 1432, 'damn': 319, 'pho': 951, 'honeslti': 645, 'tast': 1295, 'fresh': 528, 'potato': 989, 'like': 760, 'rubber': 1098, 'could': 283, 'tell': 1302, 'made': 788, 'ahead': 15, 'time': 1331, 'kept': 720, 'warmer': 1434, 'fri': 529, 'touch': 1349, 'servic': 1149, 'prompt': 1009, 'would': 1480, 'go': 563, 'back': 83, 'cashier': 204, 'care': 196, 'ever': 447, 'say': 1125, 'still': 1241, 'end': 431, 'wayyy': 1442, 'overpr': 916, 'tri': 1359, 'cape': 192, 'cod': 245, 'ravoli': 1040, 'chicken': 222, 'cranberri': 296, 'mmmm': 847, 'disgust': 370, 'pretti': 996, 'sure': 1281, 'human': 662, 'hair': 603, 'shock': 1158, 'sign': 1169, 'indic': 684, 'cash': 202, 'highli': 635, 'waitress': 

In [None]:
# Stop words are removed because they carry no unique information and occur in abundance; if they were kept, the vectors would only become longer.
# Stemming reduces each word to its root form, which also reduces the size of the vectors.

In [45]:
# Hold out part of the data for evaluation.
from sklearn.model_selection import train_test_split

# 25 % of the samples go to the test set; `test_size` is worth experimenting
# with. The fixed `random_state` makes the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)


In [46]:
# Gaussian Naive Bayes: train on the training set, evaluate on the test set.
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)
from sklearn.naive_bayes import GaussianNB

# Fit the model, then predict labels for the held-out reviews.
classifier = GaussianNB()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix:\n",cm)

# score1 = accuracy, score2 = precision, score3 = recall.
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("\n")
print("Accuracy is ", round(score1*100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))

Confusion Matrix:
 [[75 59]
 [18 98]]


Accuracy is  69.2 %
Precision is  0.62
Recall is  0.84


In [47]:
# Logistic Regression: train on the training set, evaluate on the test set.
from sklearn import linear_model
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    precision_score,
    recall_score,
)

# Fit the model (C=1.5), then predict labels for the held-out reviews.
classifier = linear_model.LogisticRegression(C=1.5)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# Confusion matrix of true labels (rows) against predicted labels (columns).
cm = confusion_matrix(y_test, y_pred)
print ("Confusion Matrix:\n",cm)

# score1 = accuracy, score2 = precision, score3 = recall.
score1, score2, score3 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score)
)
print("\n")
print("Accuracy is ", round(score1*100, 2), "%")
print("Precision is ", round(score2, 2))
print("Recall is ", round(score3, 2))



Confusion Matrix:
 [[106  28]
 [ 31  85]]


Accuracy is  76.4 %
Precision is  0.75
Recall is  0.73
