# Restaurant review project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("Restaurant_Reviews.tsv", delimiter = '\t')
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
df.isnull().sum()

Review    0
Liked     0
dtype: int64

# Data preprocessing

In [4]:
import nltk
import re
nltk.download('stopwords')
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shiva\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [6]:
from nltk.corpus import stopwords


In [7]:
corpus = []
ps = PorterStemmer()

for i in range(df.shape[0]):
    # Data Cleaning step
    clean = re.sub(pattern='^a-zA-Z', repl = " ", string = df.Review[i])
    clean = clean.lower()
    # Tokenizatiomn
    clean_split = clean.split()
    # Remove stop words
    clean_split = [word for word in clean_split if word not in set(stopwords.words('english'))]
    # Stemming
    review = [ps.stem(word) for word in clean_split]
    
    joined_words = " ".join(review)
    
    corpus.append(joined_words)
    

In [8]:
corpus[0]

'wow... love place.'

In [9]:
corpus[0:10]

['wow... love place.',
 'crust good.',
 'tasti textur nasty.',
 'stop late may bank holiday rick steve recommend love it.',
 'select menu great prices.',
 'get angri want damn pho.',
 'honeslti tast fresh.)',
 'potato like rubber could tell made ahead time kept warmer.',
 'fri great too.',
 'great touch.']

# Creating BAG of words model

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

In [11]:
cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, -1].values

In [12]:
from sklearn.model_selection import train_test_split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [15]:
X_train.shape

(800, 1500)

In [17]:
y_train.shape

(800,)

# Model building

In [18]:
from sklearn.naive_bayes import MultinomialNB

In [19]:
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

In [20]:
y_pred = clf.predict(X_test)
y_pred

array([1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1,
       0, 1], dtype=int64)

# Accuracy Score

In [21]:
from sklearn.metrics import accuracy_score
score = accuracy_score(y_test, y_pred)
print("Accuracy score: {}%".format(round(score*100, 2)))

Accuracy score: 77.0%


# hyperparamter tunning

In [43]:
score_list=[]
accu=0.0

for i in np.arange(0.1,1.1,0.1):
    test_classifier = MultinomialNB(alpha=i)
    test_classifier.fit(X_train, y_train)
    pred = test_classifier.predict(X_test)
    score = round(accuracy_score(y_test,pred)*100,2)
    if score>accu:
        accu=score
        print('The best accuracy is {}% with alpha value as {}'.format(accu, round(i,1)))



The best accuracy is 76.0% with alpha value as 0.1
The best accuracy is 76.5% with alpha value as 0.2
The best accuracy is 77.0% with alpha value as 0.4


In [44]:
classifier = MultinomialNB(alpha=0.4)
classifier.fit(X_train, y_train)

MultinomialNB(alpha=0.4)

## prediction

In [45]:
def predict_sentiment(sample_review):
    sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ', string = sample_review)
    sample_review = sample_review.lower()
    sample_review_words = sample_review.split()
    sample_review_words = [word for word in sample_review_words if not word in set(stopwords.words('english'))]
    ps = PorterStemmer()
    final_review = [ps.stem(word) for word in sample_review_words]
    final_review = ' '.join(final_review)

    temp = cv.transform([final_review]).toarray()
    return classifier.predict(temp)[0]

In [46]:
#Prediction 

sample_review = 'The food is really good here.'

if predict_sentiment(sample_review):
    print('GOOD review!.')
else:
    print('BAD review!')

GOOD review!.


In [47]:
# Predicting values
sample_review = 'Food was pretty bad and the service was very slow.'

if predict_sentiment(sample_review):
  print('GOOD review!.')
else:
  print('BAD review!')

BAD review!


In [48]:
# Predicting values
sample_review = 'Poor service, the waiter made me feel like I was stupid every time he came to the table.'

if predict_sentiment(sample_review):
  print('GOOD review!.')
else:
  print('BAD review!')

BAD review!


In [49]:
# Creating a pickle file for the CountVectorizer
import pickle

pickle.dump(cv, open('cv-transform.pkl', 'wb'))