In [2]:
import numpy as np
import pandas as pd

data = pd.read_csv('https://github.com/jradha11/sentiment-analysis-nlp/raw/master/Restaurant_Reviews.tsv', delimiter = '\t', quoting = 3)

In [5]:
data.head()

Unnamed: 0,Review,Liked
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


In [6]:
data.groupby('Liked').size()

Liked
0    500
1    500
dtype: int64

In [7]:
data['Liked'].value_counts()

Liked
1    500
0    500
Name: count, dtype: int64

In [8]:
data.isnull().sum()

Review    0
Liked     0
dtype: int64

In [9]:
x = data['Review'][0]

In [10]:
import re
review = re.sub('[^a-zA-Z]',' ',x)
review

'Wow    Loved this place '

In [11]:
review = review.split()
review

['Wow', 'Loved', 'this', 'place']

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shash\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
from nltk.corpus import stopwords

In [14]:
review_1 = [word for word in review if not word in set(stopwords.words('english'))]

In [15]:
from nltk.stem.porter import PorterStemmer

In [16]:
ps = PorterStemmer()
review_1 = [ps.stem(word) for word in review_1]
review_1

['wow', 'love', 'place']

In [17]:
review_2 = ' '.join(review_1)
review_2

'wow love place'

# Data Cleaning

In [33]:
def cleaner(data):
    reviews = []
    for text in data:
        txt = re.sub('[^a-zA-Z]',' ',text)
        txt.lower()
        
        txt = txt.split()
        
        txt = [ ps.stem(word) for word in txt if not word in stopwords.words('english')]
        
        txt = ' '.join(txt)
        
        reviews.append(txt)
        
    return reviews    

In [35]:
cleaned_data = cleaner(data['Review'])

# Vectorization

In [36]:
from sklearn.feature_extraction.text import CountVectorizer

In [44]:
cv = CountVectorizer(max_features = 1500)
vec_data = cv.fit_transform(cleaned_data)

In [45]:
from sklearn.model_selection import train_test_split

In [46]:
vec_data = vec_data.toarray()

In [57]:
y = data['Liked']
X_tr,X_te,y_tr,y_te = train_test_split(vec_data,y,test_size = 0.2,stratify = y,random_state = 0)

In [58]:
from sklearn.naive_bayes import GaussianNB

In [59]:
model = GaussianNB()
model.fit(X_tr,y_tr)

In [60]:
y_pred = model.predict(X_te)

In [61]:
from sklearn.metrics import classification_report, accuracy_score

In [62]:
print(classification_report(y_te,y_pred))

              precision    recall  f1-score   support

           0       0.76      0.61      0.68       100
           1       0.68      0.81      0.74       100

    accuracy                           0.71       200
   macro avg       0.72      0.71      0.71       200
weighted avg       0.72      0.71      0.71       200



In [63]:
accuracy_score(y_te,y_pred)

0.71

# Using Model

In [64]:
text_1 = 'Floor was not properly cleaned, I fell due to wet soup spread over floor'
text_2 = 'completely cleaned dishes and table, I really liked it'
text_3 = 'I waited for like 1 hour just to get trash like french fries'
text_4 = 'I had to wait for 10 minutes but when the dish was served, it was worth it'

In [72]:
def func(text):
    cleaned_text = cleaner([text])
    vec_txt = cv.fit_transform(cleaned_text).toarray()
    ans = model.predict(vec_txt)
    if ans[0] == 1:
        print('Negative review')
    else:
        print('Positive Review')

In [73]:
func(text_1)

ValueError: X has 8 features, but GaussianNB is expecting 1500 features as input.