##### Sentiment Analysis for Yelp using LSTM and Naive Bayes

https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences#



### Pre-Processing

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import sklearn
import tensorflow as tf
import keras
import gensim
from nltk.tokenize.treebank import TreebankWordDetokenizer
from keras.models import Sequential
from keras import layers
from keras.optimizers import RMSprop,Adam
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras import regularizers
from keras import backend as K
from keras.callbacks import ModelCheckpoint
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Import dataset into pandas dataframe
reviews = pd.read_csv("https://raw.githubusercontent.com/TiffanyLo19/Yelp-Sentiment-Analysis/main/yelp_labelled.txt", sep ='delimiter', header = None, names = ["Review"])
reviews['Sentiment'] = reviews['Review'].str.strip().str[-1]
reviews['Review'] = reviews['Review'].str[:-2]
reviews

  return func(*args, **kwargs)


Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1
...,...,...
995,I think food should have flavor and texture an...,0
996,Appetite instantly gone.,0
997,Overall I was not impressed and would not go b...,0
998,"The whole experience was underwhelming, and I ...",0


In [None]:
# Assign labels to sentiment
reviews['Sentiment'] = reviews.Sentiment.map({'1': 'positive', '0': 'negative'})
reviews

Unnamed: 0,Review,Sentiment
0,Wow... Loved this place.,positive
1,Crust is not good.,negative
2,Not tasty and the texture was just nasty.,negative
3,Stopped by during the late May bank holiday of...,positive
4,The selection on the menu was great and so wer...,positive
...,...,...
995,I think food should have flavor and texture an...,negative
996,Appetite instantly gone.,negative
997,Overall I was not impressed and would not go b...,negative
998,"The whole experience was underwhelming, and I ...",negative


In [None]:
# Count of each sentiment
reviews.groupby('Sentiment').nunique()

Unnamed: 0_level_0,Review
Sentiment,Unnamed: 1_level_1
negative,497
positive,499


In [None]:
# Check for null values
reviews = reviews[['Review','Sentiment']]
reviews["Review"].isnull().sum()

0

In [None]:
reviews.dtypes

Review       object
Sentiment    object
dtype: object

In [None]:
# Create training and testing datasets
from sklearn.model_selection import train_test_split
X = reviews[['Review']]
Y = reviews[['Sentiment']]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)
X_train2, X_test2, Y_train2, Y_test2 = train_test_split(X, Y, test_size = 0.2, random_state=5)
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(800, 1)
(200, 1)
(800, 1)
(200, 1)


In [None]:
# Vectorize text reviews to numbers
vec = CountVectorizer(stop_words = 'english')
for i in X_train:
  X_train[i] = vec.fit_transform(X_train[i]).toarray()
for i in X_test:
  X_test[i] = vec.fit_transform(X_test[i]).toarray()

In [None]:
# Check training and testing dataset shapes
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(800, 1)
(200, 1)
(800, 1)
(200, 1)


### Sentiment Prediction Using LSTM

In [None]:
# Split the series into a list
temp = []
toList = reviews['Review'].values.tolist()
for i in range(len(toList)):
    temp.append(toList[i])
list(temp[:5])

# Remove punctuation
def toWords(sentences):
    for line in sentences:
        yield(gensim.utils.simple_preprocess(str(line), deacc = True))

sepWords = list(toWords(temp))

print(sepWords[:10])

[['wow', 'loved', 'this', 'place'], ['crust', 'is', 'not', 'good'], ['not', 'tasty', 'and', 'the', 'texture', 'was', 'just', 'nasty'], ['stopped', 'by', 'during', 'the', 'late', 'may', 'bank', 'holiday', 'off', 'rick', 'steve', 'recommendation', 'and', 'loved', 'it'], ['the', 'selection', 'on', 'the', 'menu', 'was', 'great', 'and', 'so', 'were', 'the', 'prices'], ['now', 'am', 'getting', 'angry', 'and', 'want', 'my', 'damn', 'pho'], ['honeslty', 'it', 'didn', 'taste', 'that', 'fresh'], ['the', 'potatoes', 'were', 'like', 'rubber', 'and', 'you', 'could', 'tell', 'they', 'had', 'been', 'made', 'up', 'ahead', 'of', 'time', 'being', 'kept', 'under', 'warmer'], ['the', 'fries', 'were', 'great', 'too'], ['great', 'touch']]


In [None]:
# Detokenize
def detokenize(text):
    return TreebankWordDetokenizer().detokenize(text)

# Lowercase all sentences
words = []
for i in range(len(sepWords)):
    words.append(detokenize(sepWords[i]))
print(words[:10])

['wow loved this place', 'crust is not good', 'not tasty and the texture was just nasty', 'stopped by during the late may bank holiday off rick steve recommendation and loved it', 'the selection on the menu was great and so were the prices', 'now am getting angry and want my damn pho', 'honeslty it didn taste that fresh', 'the potatoes were like rubber and you could tell they had been made up ahead of time being kept under warmer', 'the fries were great too', 'great touch']


In [None]:
# Convert categorical to float type
labels = np.array(reviews['Sentiment'])
y = []
for i in range(len(labels)):
    if labels[i] == 'negative':
        y.append(0)
    if labels[i] == 'positive':
        y.append(1)
y = np.array(y)
labels = tf.keras.utils.to_categorical(y, 2, dtype = "float32")
del y

In [None]:
len(labels)

1000

In [None]:
max_words = 400
max_len = 100

tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(words)
seq = tokenizer.texts_to_sequences(words)
words1 = pad_sequences(seq, maxlen = max_len)
print(words1)

[[  0   0   0 ... 162   7  13]
 [  0   0   0 ...   5  10  14]
 [  0   0   0 ...   1   3  44]
 ...
 [  0   0   0 ...  10  36  30]
 [  0   0   0 ... 134 156  38]
 [  0   0   0 ... 337   1 242]]


In [None]:
#print(labels)

In [None]:
# Training and Testing using same ratios
X_train1, X_test1, y_train1, y_test1 = train_test_split(words1, labels, test_size = 0.2, random_state = 99)
print (len(X_train1), len(X_test1), len(y_train1), len(y_test1))

800 200 800 200


In [None]:
# Create and tune model
model = Sequential()
model.add(layers.Embedding(max_words, 40, input_length = max_len))
model.add(layers.Bidirectional(layers.LSTM(30, dropout = 0.6)))
model.add(layers.Dense(2, activation = 'softmax'))
model.compile(optimizer = 'rmsprop', loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = model.fit(X_train1, y_train1, epochs = 40, validation_data = (X_test1, y_test1))


Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Find overall model accuracy
test_loss, accuracy = model.evaluate(X_test1, y_test1, verbose = 2)
print('Model Accuracy: ', accuracy)

7/7 - 0s - loss: 0.5449 - accuracy: 0.8200 - 130ms/epoch - 19ms/step
Model Accuracy:  0.8199999928474426


Sentence Testing

In [None]:
# Define sentiments
feeling = ['Negative', 'Positive']

In [None]:
seq = tokenizer.texts_to_sequences(['So dont go there if you are looking for good food...'])
test = pad_sequences(seq, maxlen = max_len)
feeling[np.around(model.predict(test)).argmax(axis = 1)[0]]



'Positive'

In [None]:
seq = tokenizer.texts_to_sequences(['If that bug never showed up I would have given a 4 for sure, but on the other side of the wall where this bug was climbing was the kitchen.'])
test = pad_sequences(seq, maxlen = max_len)
feeling[np.around(model.predict(test)).argmax(axis = 1)[0]]



'Negative'

In [None]:
seq = tokenizer.texts_to_sequences(['The warm beer didnt help.'])
test = pad_sequences(seq, maxlen = max_len)
feeling[np.around(model.predict(test)).argmax(axis = 1)[0]]



'Negative'

In [None]:
seq = tokenizer.texts_to_sequences(['The best place to go for a tasty bowl of Pho!'])
test = pad_sequences(seq, maxlen = max_len)
feeling[np.around(model.predict(test)).argmax(axis = 1)[0]]



'Positive'

### Sentiment Prediction Using Naive Bayes

In [None]:
# Import model and test hyperparamter
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB(priors=[0.1, 0.9])
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.49

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(priors=[0.5, 0.5])
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.525

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(var_smoothing = 1)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.525

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(priors=[0.95, 0.05])
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.525

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(var_smoothing = 100)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.49

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(var_smoothing = 0.0000001)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.525

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB(var_smoothing = 0)
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)
  n_ij = -0.5 * np.sum(np.log(2.0 * np.pi * self.var_[i, :]))
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)
  n_ij -= 0.5 * np.sum(((X - self.theta_[i, :]) ** 2) / (self.var_[i, :]), 1)


0.51

In [None]:
# Test unique hyperparamter combination
clf = GaussianNB()
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)

  y = column_or_1d(y, warn=True)


0.525

In [None]:
# Return 10 misclassified reviews
predictions = clf.predict(X_test)
Y_test.reset_index(inplace = True,drop = True)
X_test.reset_index(inplace = True,drop = True)
i = 0
j = 0
while (i < 10 and j < len(X_test)):
  if (predictions[j] != Y_test.iloc[j,0]):
      print(X_test2.iloc[j,0])
      i = i + 1
  j = j + 1

No, I'm going to eat the potato that I found some strangers hair in it.
Would not go back.
I had the mac salad and it was pretty bland so I will not be getting that again.
Not much flavor to them, and very poorly constructed.
Maybe if they weren't cold they would have been somewhat edible.
But the service was beyond bad.
The descriptions said "yum yum sauce" and another said "eel sauce", yet another said "spicy mayo"...well NONE of the rolls had sauces on them.
I kept looking at the time and it had soon become 35 minutes, yet still no food.
I don't know what the big deal is about this place, but I won't be back "ya'all".
After the disappointing dinner we went elsewhere for dessert.
