# Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix 

# Load the dataset

In [2]:
dataset = pd.read_csv("C:\\Users\\DELL\\Downloads\\Restaurant_Reviews.tsv", delimiter = '\t')
dataset

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


# Cleaning the data 

In [3]:
# Initialise empty array
# to append the clean text
corpus = []
# 1000 (reviews) rows to clean
for i in range(0,1000):
    # column : "Review", row ith
    review = re.sub('[^a-zA-Z]', ' ', dataset['Review'][i])
    # Convert all cases to lower cases
    review = review.lower()
    # split to array
    review = review.split()
    ps = PorterStemmer()
    # loop for stemming each word
    # in string arrays at ith row
    review = [ps.stem(word) for word in review
                if not word in set(stopwords.words('english'))]
    # rejoin all string array elements
    # to create back into a string
    review = ''.join(review)
    # append each string to create array of clean text
    corpus.append(review)
corpus

['wowloveplace',
 'crustgood',
 'tastitexturnasti',
 'stoplatemaybankholidayricksteverecommendlove',
 'selectmenugreatprice',
 'getangriwantdamnpho',
 'honesltitastfresh',
 'potatolikerubbercouldtellmadeaheadtimekeptwarmer',
 'frigreat',
 'greattouch',
 'servicprompt',
 'wouldgoback',
 'cashiercareeversaystillendwayyyoverpr',
 'tricapecodravolichickencranberrimmmm',
 'disgustprettisurehumanhair',
 'shocksignindiccash',
 'highlirecommend',
 'waitresslittlslowservic',
 'placeworthtimeletalonvega',
 'like',
 'burrittoblah',
 'foodamaz',
 'servicalsocute',
 'couldcarelessinteriorbeauti',
 'perform',
 'rightredvelvetcakeohhhstuffgood',
 'neverbroughtsaladask',
 'holewallgreatmexicanstreettacofriendlistaff',
 'tookhourgetfoodtablrestaurfoodlukewarmseverrunaroundliketotaloverwhelm',
 'worstsalmonsashimi',
 'alsocombolikeburgerfribeerdecentdeal',
 'likefinalblow',
 'foundplaceaccidcouldhappier',
 'seemlikegoodquickplacegrabbitefamiliarpubfoodfavorlookelsewher',
 'overallikeplacelot',
 'redeemq

# To extract the features

In [4]:
# To extract max 1500 features
# "max_attributes" is attribute to experiment with to get better results
cv = CountVectorizer(max_features=1500)
# X contains Corpus(dependant variable)
X = cv.fit_transform(corpus).toarray()
# y contains answers if review is positive or negative
y = dataset.iloc[:,1].values

# To split the data into training and testing set

In [5]:
# Experiment with test size to get better results
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.25)

# Using machine learning algorithm to create the model

In [6]:
# n_estimators can be said as number of trees, experiment with n_estimators to get better results
model = RandomForestClassifier(n_estimators=501,criterion='entropy')
model.fit(X,y)

RandomForestClassifier(criterion='entropy', n_estimators=501)

# Predicting the result by using the above model

In [7]:
# Predicting the result
y_pred = model.predict(X_test)
y_pred

array([0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 1, 0, 1, 1, 1, 1], dtype=int64)

# Creating confusion matrix to see accuracy

In [8]:
cm = confusion_matrix(y_test,y_pred)
cm

array([[130,   0],
       [  0, 120]], dtype=int64)