In [24]:
import pandas as pd
import numpy as np

import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

# For NLP classification, Naive Bayes, decision trees, and random forests tend to work well

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report, confusion_matrix

In [2]:
# Load the tab-separated review file into a DataFrame.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as plain text.
df = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=3,
)

In [3]:
# Preview the first rows to check the columns loaded as expected.
df.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Single stemmer instance, reused by the cleaning loop for every word.
ps = PorterStemmer()

### Cleaning

In [6]:
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per review, in the same order as the rows of `df`.
corpus = []

# Build the stop-word set once. It is loop-invariant, and rebuilding it for
# every word of every review dominated the run time of this cell.
# NOTE(review): the NLTK English list appears to include negations such as
# "not", so "not good" collapses to "good" -- confirm this is intended for
# sentiment classification.
english_stopwords = set(stopwords.words('english'))

# Iterate over the column itself rather than a hard-coded range(0, 1000), so
# the corpus always has exactly one entry per row and stays aligned with
# df['Liked'] regardless of the file's length.
for review in df['Review']:
    # Lower-case, then replace every character outside a-z with a space.
    words = re.sub('[^a-z]', ' ', review.lower()).split()
    # Drop stop words and reduce the remaining words to their stems.
    stems = [ps.stem(word) for word in words if word not in english_stopwords]
    corpus.append(' '.join(stems))
    

### Making Variables: Bag of Words

In [7]:
# Bag-of-words features: the vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

# Dense document-term count matrix, one row per cleaned review.
X = (
    cv
    .fit_transform(corpus)
    .toarray()
)

# Target labels taken from the 'Liked' column.
y = df['Liked']

### Split

In [10]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

## Models

### Naive Bayes

In [11]:
# Gaussian Naive Bayes fitted on the training split.
# NOTE(review): the features are word counts; a multinomial variant may be a
# better match than the Gaussian one -- worth comparing.
nb = GaussianNB()
nb.fit(X_train, y_train);  # trailing ';' keeps the cell output empty

In [13]:
# Predicted labels for the held-out test split from the Naive Bayes model.
y_pred = nb.predict(X_test)

In [18]:
# Confusion matrix for Naive Bayes: rows are true labels, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[55, 42],
       [12, 91]])

In [16]:
# Per-class precision / recall / F1 for Naive Bayes.
# print() is needed because the report is a multi-line string.
print(
    classification_report(y_true=y_test, y_pred=y_pred)
)

              precision    recall  f1-score   support

           0       0.82      0.57      0.67        97
           1       0.68      0.88      0.77       103

   micro avg       0.73      0.73      0.73       200
   macro avg       0.75      0.73      0.72       200
weighted avg       0.75      0.73      0.72       200



### Decision Tree

In [20]:
# Decision tree fitted on the training split.
# random_state is fixed (same seed as the train/test split) so the fitted tree
# and the metrics reported below are reproducible across runs; without it,
# ties between candidate splits are broken randomly.
dt = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)

In [21]:
# Predicted labels for the held-out test split from the decision tree.
y_pred_2 = dt.predict(X_test)

In [22]:
# Confusion matrix for the decision tree: rows are true labels, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred_2)

array([[70, 27],
       [38, 65]])

In [23]:
# Per-class precision / recall / F1 for the decision tree.
# print() is needed because the report is a multi-line string.
print(
    classification_report(y_true=y_test, y_pred=y_pred_2)
)

              precision    recall  f1-score   support

           0       0.65      0.72      0.68        97
           1       0.71      0.63      0.67       103

   micro avg       0.68      0.68      0.68       200
   macro avg       0.68      0.68      0.67       200
weighted avg       0.68      0.68      0.67       200



### Random Forest

In [25]:
# Random forest fitted on the training split.
# random_state is fixed (same seed as the train/test split) so the bootstrap
# samples and per-split feature subsets -- and therefore the metrics reported
# below -- are reproducible across runs.
# NOTE(review): n_estimators is left at the library default, which differs
# between scikit-learn releases; pin it explicitly if results must match
# across environments.
rf = RandomForestClassifier(random_state=0).fit(X_train, y_train)



In [27]:
# Predicted labels for the held-out test split from the random forest.
y_pred_3 = rf.predict(X_test)

In [28]:
# Confusion matrix for the random forest: rows are true labels, columns are predictions.
confusion_matrix(y_true=y_test, y_pred=y_pred_3)

array([[83, 14],
       [53, 50]])

In [29]:
# Per-class precision / recall / F1 for the random forest.
# print() is needed because the report is a multi-line string.
print(
    classification_report(y_true=y_test, y_pred=y_pred_3)
)

              precision    recall  f1-score   support

           0       0.61      0.86      0.71        97
           1       0.78      0.49      0.60       103

   micro avg       0.67      0.67      0.67       200
   macro avg       0.70      0.67      0.66       200
weighted avg       0.70      0.67      0.65       200



In [30]:
# TODO: re-evaluate all three models with k-fold cross-validation instead of a single train/test split