In [1]:
# Importing essential libraries
import numpy as np
import pandas as pd

In [2]:
# Loading dataset 
df = pd.read_csv('restaurant_review_dataset.tsv', delimiter='\t', quoting=3)

In [3]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
df.shape

(1000, 2)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [6]:
df.describe()

Unnamed: 0,Liked
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [7]:
df.isnull().count()

Review    1000
Liked     1000
dtype: int64

In [8]:
df['Liked'].value_counts()

1    500
0    500
Name: Liked, dtype: int64

In [9]:
df.columns

Index(['Review', 'Liked'], dtype='object')

## Data Preprocessing 

In [10]:
# Importing essential libraries for performing Natural Language Processing
import nltk
import re
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:
corpus = []
for i in range(0,1000):

  # Cleaning special character from the reviews
  review = re.sub(pattern='[^a-zA-Z]',repl=' ', string=df['Review'][i])

  # Converting the entire review into lower case
  review = review.lower()

  # Tokenizing the review by words
  review_words = review.split()

  # Removing the stop words
  review_words = [word for word in review_words if not word in set(stopwords.words('english'))]

  # Stemming the words
  ps = PorterStemmer()
  review = [ps.stem(word) for word in review_words]

  # Joining the stemmed words
  review = ' '.join(review)

  # Creating a corpus
  corpus.append(review)

In [12]:
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [13]:
corpus[0:4]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love']

In [14]:
# Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=2000)
X = cv.fit_transform(corpus).toarray()
y = df.iloc[:, 1].values

In [15]:
X[0:10]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [16]:
y[0:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64)

## Model 


In [17]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)

In [18]:
X_train.shape

(800, 1565)

In [19]:
X_test.shape

(200, 1565)

### MultinomialNB Algorithm 

In [20]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import MultinomialNB
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [21]:
# Predicting the Test set results
y_pred_classifier = classifier.predict(X_test)

### Random Forest Classifier Algorithm

In [22]:
# Fitting RandomForestClassifier to the Training Set 
from sklearn.ensemble import RandomForestClassifier 
model = RandomForestClassifier(n_estimators = 1000, 
                            criterion = 'entropy') 
                              
model.fit(X_train, y_train) 

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [23]:
y_pred_model = model.predict(X_test)

### BernoulliNB Algorithm 

In [28]:
# Fitting Naive Bayes to the Training set
from sklearn.naive_bayes import BernoulliNB
classifier_bernoulli = BernoulliNB(alpha=0.8)
classifier_bernoulli.fit(X_train, y_train)

BernoulliNB(alpha=0.8, binarize=0.0, class_prior=None, fit_prior=True)

In [31]:
y_pred_classifier_bernoulli = classifier_bernoulli.predict(X_test)

## Metrics

In [32]:
# Accuracy, Precision, Recall and Confusion Matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import confusion_matrix

score1 = accuracy_score(y_test,y_pred_classifier)
score2 = precision_score(y_test,y_pred_classifier)
score3= recall_score(y_test,y_pred_classifier)
cm = confusion_matrix(y_test, y_pred_classifier)
print("***************  MULTINOMIAL NAIVE BAYES ALGORITHM  ***************")
print("\n")
print("-------------------------- METRICS --------------------------")
print("Accuracy score is: {}%".format(round(score1*100,2)))
print("Precision score is: {}".format(round(score2,2)))
print("Recall score is: {}".format(round(score3,2)))
print("-------------------------- CONFUSION MATRIX --------------------------")
print(cm)
print("\n")
print("\n")


score11 = accuracy_score(y_test,y_pred_model)
score22 = precision_score(y_test,y_pred_model)
score33= recall_score(y_test,y_pred_model)
cm1 = confusion_matrix(y_test, y_pred_model)
print("***************  RANDOM FOREST CLASSIFIER ALGORITHM  ***************")
print("\n")
print("-------------------------- METRICS --------------------------")
print("Accuracy score is: {}%".format(round(score11*100,2)))
print("Precision score is: {}".format(round(score22,2)))
print("Recall score is: {}".format(round(score33,2)))
print("-------------------------- CONFUSION MATRIX --------------------------")
print(cm1)
print("\n")
print("\n")

score111 = accuracy_score(y_test,y_pred_classifier_bernoulli)
score222 = precision_score(y_test,y_pred_classifier_bernoulli)
score333= recall_score(y_test,y_pred_classifier_bernoulli)
cm2 = confusion_matrix(y_test, y_pred_classifier_bernoulli)
print("***************  BERNOULLI'S NAIVE BAYES ALGORITHM  ***************")
print("\n")
print("-------------------------- METRICS --------------------------")
print("Accuracy score is: {}%".format(round(score111*100,2)))
print("Precision score is: {}".format(round(score222,2)))
print("Recall score is: {}".format(round(score333,2)))
print("-------------------------- CONFUSION MATRIX --------------------------")
print(cm2)




***************  MULTINOMIAL NAIVE BAYES ALGORITHM  ***************


-------------------------- METRICS --------------------------
Accuracy score is: 77.5%
Precision score is: 0.77
Recall score is: 0.8
-------------------------- CONFUSION MATRIX --------------------------
[[73 24]
 [21 82]]




***************  RANDOM FOREST CLASSIFIER ALGORITHM  ***************


-------------------------- METRICS --------------------------
Accuracy score is: 72.5%
Precision score is: 0.85
Recall score is: 0.56
-------------------------- CONFUSION MATRIX --------------------------
[[87 10]
 [45 58]]




***************  BERNOULLI'S NAIVE BAYES ALGORITHM  ***************


-------------------------- METRICS --------------------------
Accuracy score is: 77.5%
Precision score is: 0.77
Recall score is: 0.8
-------------------------- CONFUSION MATRIX --------------------------
[[73 24]
 [21 82]]


## Making prediction on custom data 

In [33]:
def predict_sentiment(sample_review):
  sample_review = re.sub(pattern='[^a-zA-Z]',repl=' ', string = sample_review)
  sample_review = sample_review.lower()
  sample_review_words = sample_review.split()
  sample_review_words = [word for word in sample_review_words if not word in set(stopwords.words('english'))]
  ps = PorterStemmer()
  final_review = [ps.stem(word) for word in sample_review_words]
  final_review = ' '.join(final_review)

  temp = cv.transform([final_review]).toarray()
  return classifier.predict(temp)

In [34]:
# Predicting values
sample_review = 'The ambience is very good.'

if predict_sentiment(sample_review):
  print('This is a POSITIVE review.')
else:
  print('This is a NEGATIVE review!')

This is a POSITIVE review.


In [35]:
# Predicting values
sample_review = 'The waiters are arrogant.'

if predict_sentiment(sample_review):
  print('This is a POSITIVE review.')
else:
  print('This is a NEGATIVE review!')

This is a NEGATIVE review!


In [36]:
# Predicting values
sample_review = 'The food was tasty.'

if predict_sentiment(sample_review):
  print('This is a POSITIVE review.')
else:
  print('This is a NEGATIVE review!')

This is a POSITIVE review.
