In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd


In [None]:
# Load the tab-separated review file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as plain text.
data_set = pd.read_csv(
    '/content/Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)
data_set

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


**Cleaning the text and building the bag of words**

In [None]:
# Look at the raw text of the first review (row label 0)
data_set.loc[0, 'Review']

'Wow... Loved this place.'

In [None]:
import re  # regular expressions: used to strip punctuation and digits during cleaning

# Swap every character that is not an ASCII letter for a space, leaving only the words
review = re.sub(pattern='[^a-zA-Z]', repl=" ", string=data_set['Review'][0])
review

'Wow    Loved this place '

In [None]:
# Normalise the cleaned text to lower case (e.g. 'Wow' -> 'wow')
review = review.lower()
review

'wow    loved this place '

In [None]:
# Break the text into a list of words; split() with no argument splits on runs of
# whitespace, so the repeated spaces left by the substitution yield no empty tokens
review = review.split()
review

['wow', 'loved', 'this', 'place']

In [None]:
# Stop words are very common words (a, the, on, ...) that are filtered out of the reviews in later cells.
# nltk.download fetches the stop-word corpus so stopwords.words(...) can read it.
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Drop English stop words from the first review before it is joined back into a sentence
# for the bag of words.
# The stop-word set is built once, outside the comprehension, so every membership test is
# a constant-time set lookup instead of re-reading and re-hashing the whole list per word.
english_stopwords = set(stopwords.words('english'))
review = [word for word in review if word not in english_stopwords]

review

['wow', 'loved', 'place']

In [None]:
# Reduce each word to its stem with the Porter stemmer, e.g. 'loved' -> 'love'.
# This is stemming, not lemmatization: a stem need not be a dictionary word.

from nltk.stem.porter import PorterStemmer

ps = PorterStemmer()

# Build the stop-word set once so the filter below costs one set lookup per word
english_stopwords = set(stopwords.words('english'))

review = [ps.stem(word) for word in review if word not in english_stopwords]

review

['wow', 'love', 'place']

In [None]:
# Join the list of stemmed words back into a single space-separated sentence

review = " ".join(review)
review

'wow love place'

In [None]:
# The full cleaning pipeline: apply the steps shown above to every review and
# collect the cleaned sentences in `corpus` (one string per review, in row order).
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

# Loop-invariant helpers are created once, not once per review (stemmer) or once per
# word (stop-word set); the cleaned output is the same, only the wasted work is removed.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

# NOTE(review): the English stop-word list includes negations such as 'not', so
# 'Crust is not good.' is reduced to 'crust good'. Consider keeping negations for
# sentiment classification -- left unchanged here so downstream results stay the same.
corpus = []
for raw_review in data_set['Review']:
  review = re.sub('[^a-zA-Z]', " ", raw_review)  # keep ASCII letters only
  review = review.lower().split()                 # normalise case, tokenize on whitespace
  review = [ps.stem(word) for word in review
            if word not in english_stopwords]     # drop stop words, stem the rest
  review = " ".join(review)
  corpus.append(review)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Peek at the first ten cleaned reviews
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

**bag of words**

In [None]:
# Build the bag-of-words model: one row per review, one column per vocabulary term

from sklearn.feature_extraction.text import CountVectorizer
# max_features=1500 caps the vocabulary size. CountVectorizer can lower-case, tokenize and
# strip punctuation by itself, but the manual cleaning above gives more control
# (e.g. for text obtained by web scraping).
cv = CountVectorizer(max_features=1500)
# fit_transform returns a sparse matrix; toarray() converts it to a dense NumPy array
X = cv.fit_transform(corpus).toarray()

In [None]:
# Wrap the count matrix in a DataFrame whose columns are named after the vocabulary terms
X_df = pd.DataFrame(
    data=X,
    columns=cv.get_feature_names_out(),
)
X_df

Unnamed: 0,absolut,absolutley,accid,accommod,accomod,accordingli,account,ach,acknowledg,across,...,year,yellow,yellowtail,yelper,yet,yucki,yukon,yum,yummi,zero
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Target labels: the 'Liked' column (1 = liked, 0 = not liked)
y = data_set['Liked']

In [None]:
# Preview the target labels taken from the Liked column
y

0      1
1      0
2      0
3      1
4      1
      ..
995    0
996    0
997    0
998    0
999    0
Name: Liked, Length: 1000, dtype: int64

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# A validation set could be carved out of the training portion with a second train_test_split call.

In [None]:
# Sanity check: (rows, features) of the training and test feature matrices
(X_train.shape, X_test.shape)

((800, 1500), (200, 1500))

In [None]:
# Sanity check: number of labels in the training and test targets
(y_train.shape, y_test.shape)

((800,), (200,))

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 trees; random_state=0 fixes the seed so reruns build the same model
classifier = RandomForestClassifier(n_estimators=20,random_state=0)
# Train on the bag-of-words features of the training split
classifier.fit(X_train , y_train)

In [None]:
# Predict the Liked label for every review in the held-out test split
y_preds = classifier.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix for the random forest: rows are true labels, columns are predicted labels
cm = pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_preds))
cm

Unnamed: 0,0,1
0,85,12
1,39,64


In [None]:
# Fraction of test reviews the random forest labelled correctly
accuracy_score(y_true=y_test, y_pred=y_preds)

0.745

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes with default settings, trained on the same training split
# NOTE(review): the features are word counts; MultinomialNB is the usual naive Bayes
# variant for count data -- worth comparing, TODO confirm.
naive = GaussianNB()
naive.fit(X_train,y_train)

In [None]:
# Predict on the test split with the naive Bayes model
naive_preds = naive.predict(X_test)

# Confusion matrix: rows are true labels, columns are predicted labels
cm = pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=naive_preds))
cm

Unnamed: 0,0,1
0,55,42
1,12,91


In [None]:
# Fraction of test reviews the naive Bayes model labelled correctly
accuracy_score(y_true=y_test, y_pred=naive_preds)

0.73

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single decision tree that chooses splits by the entropy criterion; random_state=0 fixes the seed
classfier_tree_entropy = DecisionTreeClassifier(criterion='entropy',random_state=0)

# Train on the bag-of-words features of the training split
classfier_tree_entropy.fit(X_train,y_train)


In [None]:
# Score the entropy-based decision tree on the held-out test split
y_pred_entropy = classfier_tree_entropy.predict(X_test)
print(f'accuracy from entropy : {accuracy_score(y_test,y_pred_entropy)}')

# Confusion matrix: rows are true labels, columns are predicted labels
cm2 = pd.DataFrame(data=confusion_matrix(y_true=y_test, y_pred=y_pred_entropy))
cm2

accuracy from entropy : 0.71


Unnamed: 0,0,1
0,74,23
1,35,68
