In [1]:
import csv

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# The file is tab-separated. csv.QUOTE_NONE (the named form of the literal 3)
# disables quote handling, so quote characters inside a review are read as
# ordinary text instead of being treated as field delimiters.
data = pd.read_csv('Restaurant-Reviews.tsv', sep='\t', quoting=csv.QUOTE_NONE)

In [3]:
# Preview the first five rows of the loaded frame.
data.head(n=5)

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [4]:
# Class balance: number of reviews per label value.
data.loc[:, 'Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [5]:
import nltk
import re

In [6]:
# Fetch the NLTK stop-word corpus (reported as already up-to-date when it is
# present locally), then import the corpus reader used by the cells below.
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\varun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Take the first review as a worked example for the cleaning steps below.
review = data.loc[0, 'Review']
review

'Wow... Loved this place.'

In [8]:
# Replace every character that is not an ASCII letter with a single space.
review = re.compile('[^a-zA-Z]').sub(' ', review)
review

'Wow    Loved this place '

In [9]:
# Normalise case so that 'Loved' and 'loved' count as the same token.
review = str.lower(review)
review

'wow    loved this place '

In [10]:
# Tokenise on runs of whitespace; the string becomes a list of words.
review = str.split(review)
review

['wow', 'loved', 'this', 'place']

In [11]:
# English stop-word list from the NLTK corpus; the cells below use it to
# filter tokens out of each review.
en_StopWords = stopwords.words('english')

In [12]:
# Explicit-loop version of stop-word filtering: skip stop words, keep the rest.
preview = []
for token in review:
    if token in en_StopWords:
        continue
    preview.append(token)
print(preview)

['wow', 'loved', 'place']


In [13]:
# Same stop-word filtering as the loop above, applied to `review` itself.
review = list(filter(lambda token: token not in en_StopWords, review))

In [14]:
# Display the tokens that remain after stop-word filtering.
review

['wow', 'loved', 'place']

In [15]:
from nltk.stem.porter import PorterStemmer
# Stemmer instance used by the following cells to reduce each token to its stem.
ps = PorterStemmer()

In [16]:
# Stem every remaining token (e.g. 'loved' -> 'love').
review = list(map(ps.stem, review))
review

['wow', 'love', 'place']

In [17]:
# Re-assemble the stemmed tokens into one space-separated string.
review = str.join(" ", review)
print(review)

wow love place


In [18]:
# Build the cleaned corpus: one normalised, stemmed string per review.
corpus = []
ps = PorterStemmer()

# Membership tests against a set are O(1); testing against the stop-word list
# scanned the whole list once for every token of every review.
stop_word_set = set(en_StopWords)
# Compile the pattern once instead of passing it to re.sub on every iteration.
non_letters = re.compile('[^a-zA-Z]')

# NOTE(review): NLTK's English stop-word list includes negations such as
# 'not', so "Crust is not good." is reduced to "crust good". Consider whether
# negations should be kept for sentiment features -- behaviour left unchanged.

# Iterate over the column values directly so the loop does not rely on the
# frame having a default 0..n-1 index.
for raw_review in data['Review']:
    # Keep letters only, lower-case, and split on whitespace.
    tokens = non_letters.sub(' ', raw_review).lower().split()
    # Drop stop words, stem what remains, and re-join into a single string.
    review = " ".join(ps.stem(token) for token in tokens if token not in stop_word_set)
    corpus.append(review)

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
# Bag-of-words vectorizer with its vocabulary capped at 1500 terms.
cv = CountVectorizer(max_features=1500)

In [21]:
# Learn the vocabulary and count terms in one pass, then convert the result
# to a dense array for the classifier.
term_counts = cv.fit_transform(corpus)
X = term_counts.toarray()

In [22]:
# (number of reviews, vocabulary size)
np.shape(X)

(1000, 1500)

In [23]:
# Select the label column by name rather than by position, so the target does
# not silently become a different column if the column order changes.
y = data['Liked'].values

In [24]:
# One label per review.
np.shape(y)

(1000,)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [26]:
# Feature-matrix shapes for the train and test partitions.
(np.shape(X_train), np.shape(X_test))

((800, 1500), (200, 1500))

In [27]:
# Label-vector shapes for the train and test partitions.
(np.shape(y_train), np.shape(y_test))

((800,), (200,))

In [28]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes classifier with default settings.
# NOTE(review): the features here are word counts; MultinomialNB is the variant
# usually applied to count data -- consider comparing the two.
classifier = GaussianNB()

In [29]:
# Train on the 800-review training partition.
classifier.fit(X=X_train, y=y_train)

In [30]:
# Predicted labels for the held-out reviews.
y_pred = classifier.predict(X=X_test)

In [31]:
from sklearn.metrics import accuracy_score

# Fraction of held-out reviews whose predicted label matches the true label.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.73

In [32]:
# Number of correctly classified test reviews, derived from the test-set size
# and the computed accuracy so it cannot drift from the score reported above.
len(y_test) * accuracy_score(y_test, y_pred)

146.0