# Restaurant Review Prediction

In [1]:
import csv

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
# Load the tab-separated review file. csv.QUOTE_NONE (numeric value 3) makes
# pandas treat quote characters inside a review as ordinary text instead of
# as field delimiters.
df = pd.read_csv('Restaurant_Reviews.tsv', sep='\t', quoting=csv.QUOTE_NONE)
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [3]:
df.info()  # no null values: both columns report 1000 non-null entries for 1000 rows

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [4]:
df['Liked'].value_counts()  # the target column is balanced: 500 rows for each of the two labels

Liked
1    500
0    500
Name: count, dtype: int64

## Cleaning the text data

In [5]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [6]:
# Single Porter stemmer instance, reused by text_cleaning for every word.
ps = PorterStemmer()

In [7]:
_ENGLISH_STOPWORDS = None  # filled on first use so the NLTK corpus is read only once


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it on first call."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def text_cleaning(text, stop_words=None, stemmer=None):
    """Strip punctuation, drop stopwords and stem the remaining words of a review.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : collection of str, optional
        Lower-case words to discard. Defaults to NLTK's English stopword list,
        which is loaded once and cached instead of being re-read for every word.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level Porter stemmer ``ps``.

    Returns
    -------
    str
        The stemmed, stopword-free words joined by single spaces.

    Notes
    -----
    NOTE(review): the default stopword list is believed to contain negations
    such as "not"; dropping them may hide sentiment. Pass a custom
    ``stop_words`` to keep them - confirm against the NLTK list.
    """
    if stop_words is None:
        stop_words = _english_stopwords()
    if stemmer is None:
        stemmer = ps

    # Delete every ASCII punctuation character in a single pass.
    without_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    # The stopword test is case-insensitive; the stemmer receives the original token.
    stems = [
        stemmer.stem(word)
        for word in without_punctuation.split()
        if word.lower() not in stop_words
    ]
    return " ".join(stems)

In [11]:
# Store the text_cleaning() result for every review in a new 'cleaned_text' column.
df['cleaned_text'] = df['Review'].apply(text_cleaning)

### BAG OF WORDS

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [9]:
# Bag-of-words vectorizer; max_features=1500 caps the vocabulary at 1500 terms.
cv = CountVectorizer(max_features=1500)

In [12]:
# Learn the vocabulary from the cleaned reviews and build a dense count matrix.
# NOTE(review): the vectorizer is fitted on all rows before the train/test split,
# so the vocabulary is chosen with the test reviews included - consider fitting
# on the training rows only and transforming the test rows separately.
X = cv.fit_transform(df['cleaned_text']).toarray()

In [13]:
# Dimensions of the count matrix: one row per review, one column per vocabulary term.
X.shape

(1000, 1500)

In [14]:
# Target vector: the 'Liked' labels as a NumPy array.
y = df['Liked'].values

In [15]:
# Sanity check: the number of labels should match the number of rows in X.
y.shape

(1000,)

## Train/test split

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Model Building

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

### 1) Naive Bayes

In [19]:
# Gaussian Naive Bayes with default settings.
# NOTE(review): the features are word counts; a count-oriented variant such as
# MultinomialNB is the more common choice for bag-of-words input - worth comparing.
nb = GaussianNB()

In [20]:
# Fit the Naive Bayes model on the training split.
nb.fit(X_train,y_train)

In [21]:
# Predicted labels for the held-out test rows.
pred = nb.predict(X_test)

In [22]:
# Evaluate the Naive Bayes predictions on the test split, then print a report.
rule = "-----------------------------------------"
nb_report = classification_report(y_test, pred)
nb_accuracy = accuracy_score(y_test, pred)
nb_confusion = confusion_matrix(y_test, pred)

print(rule)
print('\t', "NAIVE BAYES CLASSIFIER")
print(rule)
print("Classification report:", '\n', nb_report, "\n")
print("The accuracy score is:", nb_accuracy, "\n")
print("confusion_matrix:", '\n\n', nb_confusion)

-----------------------------------------
	 NAIVE BAYES CLASSIFIER
-----------------------------------------
Classification report: 
               precision    recall  f1-score   support

           0       0.74      0.54      0.63        96
           1       0.66      0.83      0.74       104

    accuracy                           0.69       200
   macro avg       0.70      0.68      0.68       200
weighted avg       0.70      0.69      0.68       200
 

The accuracy score is: 0.69 

confusion_matrix: 

 [[52 44]
 [18 86]]


### 2) SVC

In [23]:
# Linear-kernel SVM with a large C (weak regularisation).
# gamma is only used by the 'rbf', 'poly' and 'sigmoid' kernels, so it is not
# passed here: with kernel='linear' it has no effect on the fitted model.
classifier2 = SVC(kernel='linear', C=100)

In [24]:
# Fit the SVM on the training split.
classifier2.fit(X_train,y_train)

In [25]:
# Predicted labels for the held-out test rows.
svm_pred = classifier2.predict(X_test)

In [26]:
# Evaluate the SVM predictions on the test split, then print a report.
rule = "-----------------------------------------"
svm_report = classification_report(y_test, svm_pred)
svm_accuracy = accuracy_score(y_test, svm_pred)
svm_confusion = confusion_matrix(y_test, svm_pred)

print(rule)
print('\t', "SUPPORT VECTOR MACHINE CLASSIFIER")
print(rule)
print("Classification report:", '\n', svm_report, "\n")
print("The accuracy score is:", svm_accuracy, "\n")
print("confusion_matrix:", '\n\n', svm_confusion)

-----------------------------------------
	 SUPPORT VECTOR MACHINE CLASSIFIER
-----------------------------------------
Classification report: 
               precision    recall  f1-score   support

           0       0.69      0.74      0.71        96
           1       0.74      0.69      0.72       104

    accuracy                           0.71       200
   macro avg       0.72      0.72      0.71       200
weighted avg       0.72      0.71      0.72       200
 

The accuracy score is: 0.715 

confusion_matrix: 

 [[71 25]
 [32 72]]


### 3) Random Forest Classifier

In [29]:
# Random forest of 150 trees. The forest is built with random bootstrapping and
# feature sampling, so random_state is fixed (same seed as the train/test split)
# to make the reported metrics reproducible across runs.
rf = RandomForestClassifier(n_estimators=150, random_state=42)

In [30]:
# Fit the random forest on the training split.
rf.fit(X_train,y_train)

In [32]:
# Predicted labels for the held-out test rows.
rf_pred = rf.predict(X_test)

In [33]:
# Evaluate the random forest predictions on the test split, then print a report.
rule = "-----------------------------------------"
rf_report = classification_report(y_test, rf_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)
rf_confusion = confusion_matrix(y_test, rf_pred)

print(rule)
print('\t', "RANDOM FOREST CLASSIFIER")
print(rule)
print("Classification report:", '\n', rf_report, "\n")
print("The accuracy score is:", rf_accuracy, "\n")
print("confusion_matrix:", '\n\n', rf_confusion)

-----------------------------------------
	 RANDOM FOREST CLASSIFIER
-----------------------------------------
Classification report: 
               precision    recall  f1-score   support

           0       0.65      0.82      0.73        96
           1       0.78      0.60      0.68       104

    accuracy                           0.70       200
   macro avg       0.72      0.71      0.70       200
weighted avg       0.72      0.70      0.70       200
 

The accuracy score is: 0.705 

confusion_matrix: 

 [[79 17]
 [42 62]]
