# Import Libraries

In [37]:
import pandas as pd
import numpy as np
import re
import csv

In [3]:
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# Load Dataset

In [5]:
# Tab-separated input. csv.QUOTE_NONE (numeric value 3) tells the parser to
# treat quote characters inside the reviews as ordinary text.
dataset = pd.read_csv(
    'Restaurant_Reviews.tsv',
    sep='\t',
    quoting=csv.QUOTE_NONE,
)

In [6]:
# (rows, columns) of the loaded frame.
dataset.shape

(1000, 2)

In [7]:
# Preview the first rows to check that the columns parsed as expected.
dataset.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


# Cleaning the texts

In [8]:
# Build the cleaned corpus: keep letters only, lowercase, drop English
# stopwords, and stem whatever remains.
#
# The stopword set and the stemmer do not change between iterations, so they
# are created once here instead of once per word / once per review.
# NOTE(review): the NLTK stopword list removes "not" (e.g. "Crust is not good."
# becomes "crust good"), which discards negation -- consider keeping it for
# sentiment classification.
english_stopwords = set(stopwords.words('english'))
ps = PorterStemmer()

corpus = []
# Iterate over the reviews themselves rather than a hard-coded range(0, 1000),
# so the cell keeps working if the number of rows changes.
for raw_review in dataset['Review']:
    review = re.sub('[^a-zA-Z]', ' ', raw_review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [10]:
# Sanity check: number of cleaned reviews collected.
len(corpus)

1000

In [11]:
# Peek at the first ten cleaned reviews.
corpus[:10]

['wow love place',
 'crust good',
 'tasti textur nasti',
 'stop late may bank holiday rick steve recommend love',
 'select menu great price',
 'get angri want damn pho',
 'honeslti tast fresh',
 'potato like rubber could tell made ahead time kept warmer',
 'fri great',
 'great touch']

# Creating the `Bag of Words` Model

In [12]:
# Bag-of-words features: one row per review, one column per vocabulary term,
# with the vocabulary limited to at most 1500 terms.
# NOTE(review): the vocabulary is learned from every review, including the
# ones that later land in the test split.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

# Labels come from the second column of the frame.
y = dataset.iloc[:, 1].values

In [13]:
# Dimensions of the feature matrix.
X.shape

(1000, 1500)

In [18]:
# First two feature rows.
X[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
# Dimensions of the label vector.
y.shape

(1000,)

In [20]:
# First ten labels.
y[:10]

array([1, 0, 0, 1, 1, 0, 0, 0, 1, 1], dtype=int64)

# Splitting `X` and `y` into the Train set and Test set

In [21]:
# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=0,
)

In [22]:
# Print the shape of each split, in the order train/test features then
# train/test labels.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(800, 1500)
(200, 1500)
(800,)
(200,)


In [23]:
# First two training feature rows.
X_train[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [24]:
# First two test feature rows.
X_test[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [25]:
# First two training labels.
y_train[:2]

array([1, 1], dtype=int64)

In [26]:
# First two test labels.
y_test[:2]

array([0, 0], dtype=int64)

# Fit the Naive Bayes Model on `X_train` and `y_train`

In [27]:
# Instantiate the Gaussian Naive Bayes classifier; the bare name on the last
# line makes the notebook display the estimator.
gauss_classifier = GaussianNB()
gauss_classifier

In [28]:
# Train the classifier on the training split.
gauss_classifier.fit(X_train, y_train)

# Predicting Labels for `X_test`

In [29]:
# Predict labels for the held-out test rows.
y_pred = gauss_classifier.predict(X_test)

In [30]:
# Shape of the prediction vector.
y_pred.shape

(200,)

In [31]:
# First ten predicted labels.
y_pred[:10]

array([1, 1, 1, 0, 0, 1, 1, 1, 1, 1], dtype=int64)

# Generate Confusion Matrix

In [32]:
# Confusion matrix of the test labels against the predictions; scikit-learn
# lays it out with true labels as rows and predicted labels as columns.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[55 42]
 [12 91]]


# Export as CSV Files

In [33]:
# Save the loaded reviews as a comma-separated file, without the index column.
dataset.to_csv('Restaurant_Reviews.csv', index=False)

In [44]:
# Write one cleaned review per row.
# csv.writer.writerows expects an iterable of rows, where each row is itself
# an iterable of fields. Passing the bare list of strings would make every
# *character* of a review a separate field ("w,o,w, ,l,o,v,e,..."), so each
# review is wrapped in a one-element row.
# newline='' is what the csv module asks for on files it writes; without it,
# extra blank lines can appear on platforms that translate line endings.
with open('corpus1.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([review] for review in corpus)

In [42]:
# Export the cleaned reviews via pandas as a single 'Cleaned_Reviews' column.
# The mapping gets its own name instead of `dict`, which would shadow the
# built-in dict type for the rest of the kernel session.
corpus_columns = {'Cleaned_Reviews': corpus}
df_corpus = pd.DataFrame(corpus_columns)
df_corpus.to_csv('corpus2.csv', index=False)

In [46]:
# Export the bag-of-words matrix via pandas, without the index column.
df_x = pd.DataFrame(X)
df_x.to_csv('X.csv', index=False)

In [53]:
# Export the feature matrix straight from NumPy.
# X holds integer counts (dtype=int64), so '%1.3f' writes each value in
# fixed-point form with three decimals (e.g. 0.000, 1.000).
np.savetxt(fname="X2.csv", X=X, delimiter=",", fmt='%1.3f')