In [122]:
# import all review datasets: yelp, imdb and amazon
import pandas
import re
# Grab and process the raw data.
data_path = ("../../../Data & Script/sentiment labelled sentences/yelp_labelled.txt")
yelp = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/imdb_labelled.txt")
imdb = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/amazon_cells_labelled.txt")
amazon = pd.read_csv(data_path, delimiter= '\t', header=None)

yelp.columns = ['review', 'sentiment']
# convert to lower case
yelp['review'] = yelp['review'].str.lower()
# remove punctuation
yelp['review'] = yelp['review'].str.replace('[^\w\s]','')

yelp.sample(5)

Unnamed: 0,review,sentiment
838,food is way overpriced and portions are fuckin...,0
710,that said our mouths and bellies were still qu...,1
411,come hungry leave happy and stuffed,1
187,interesting decor,1
849,bad day or not i have a very low tolerance for...,0


In [123]:
# create a word count vector for positive and negative reviews separetly
from collections import Counter
# combine all strings in positive reviews and remove punctuation
positive_reviews = " ".join(yelp[yelp['sentiment'] == 1]['review']).lower()
# combine all strings in spam email and remove punctuation
negative_reviews = " ".join(yelp[yelp['sentiment'] == 0]['review']).lower()

positive_word_count = Counter(positive_reviews.split(" "))
negative_word_count = Counter(negative_reviews.split(" "))


In [124]:
# Ignore stop words and find most common words
from nltk.corpus import stopwords
# all english stop words
en_stops = set(stopwords.words('english'))
# take only non-stop words
top_positive_words = [w for w, c in positive_word_count.most_common() if w not in en_stops]
top_negative_words = [w for w, c in negative_word_count.most_common() if w not in en_stops]
print("A total of ", len(top_positive_words), " positve words")
print("A total of ", len(top_negative_words), " negative words")

A total of  1144  positve words
A total of  1288  negative words


In [125]:
# find top 50 positives
top_positive_words[:50]

['good',
 'great',
 'food',
 'place',
 'service',
 '',
 'friendly',
 'delicious',
 'back',
 'nice',
 'time',
 'really',
 'best',
 'amazing',
 'also',
 'like',
 'restaurant',
 'go',
 'love',
 'staff',
 'vegas',
 'first',
 'menu',
 'always',
 'fantastic',
 'experience',
 'awesome',
 'pretty',
 'made',
 'loved',
 'definitely',
 'fresh',
 'one',
 'steak',
 'excellent',
 'even',
 'pizza',
 'perfect',
 'prices',
 'server',
 'ever',
 'im',
 'chicken',
 'atmosphere',
 'selection',
 'could',
 'tasty',
 'came',
 'stars',
 'well']

In [126]:
# find top 50 negative words
top_negative_words[:50]
#selected_negative_words = ['go','dont','ever','never','bad','better','not','falvor','even','say']

['food',
 'place',
 'back',
 'service',
 'like',
 'go',
 'dont',
 'good',
 'never',
 'would',
 'time',
 '',
 'ever',
 'minutes',
 'bad',
 'one',
 'much',
 'got',
 'wont',
 'really',
 'disappointed',
 'worst',
 'think',
 'going',
 'wasnt',
 'ive',
 'came',
 'eat',
 'us',
 'im',
 'slow',
 'get',
 'wait',
 'bland',
 'better',
 'well',
 'way',
 'waited',
 'flavor',
 'probably',
 'terrible',
 'didnt',
 'times',
 'ordered',
 'also',
 'another',
 'even',
 'could',
 'say',
 'overpriced']

In [127]:
# choose positive words which didn't occur in the top 50 negative words
positive_words = [w for w in top_positive_words[:50] if w not in top_negative_words[:50]]
positive_words

['great',
 'friendly',
 'delicious',
 'nice',
 'best',
 'amazing',
 'restaurant',
 'love',
 'staff',
 'vegas',
 'first',
 'menu',
 'always',
 'fantastic',
 'experience',
 'awesome',
 'pretty',
 'made',
 'loved',
 'definitely',
 'fresh',
 'steak',
 'excellent',
 'pizza',
 'perfect',
 'prices',
 'server',
 'chicken',
 'atmosphere',
 'selection',
 'tasty',
 'stars']

In [128]:
# choose negative words which didn't occur in the top 50 positive words
negative_words = [w for w in top_negative_words[:50] if w not in top_positive_words[:50]]
negative_words

['dont',
 'never',
 'would',
 'minutes',
 'bad',
 'much',
 'got',
 'wont',
 'disappointed',
 'worst',
 'think',
 'going',
 'wasnt',
 'ive',
 'eat',
 'us',
 'slow',
 'get',
 'wait',
 'bland',
 'better',
 'way',
 'waited',
 'flavor',
 'probably',
 'terrible',
 'didnt',
 'times',
 'ordered',
 'another',
 'say',
 'overpriced']

In [129]:
# Next create the features
# take top 10 positives and 10 negative words to be new features
keywords = positive_words + negative_words
for key in keywords:
    yelp[str(key)] = yelp.review.str.contains(' ' + str(key) + ' ', case=False)

yelp.sample(5)

Unnamed: 0,review,sentiment,great,friendly,delicious,nice,best,amazing,restaurant,love,...,waited,flavor,probably,terrible,didnt,times,ordered,another,say,overpriced
364,server did a great job handling our large rowd...,1,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
513,reasonably priced also,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
564,generous portions and great taste,1,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
194,im not really sure how joeys was voted best ho...,0,False,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
304,good servicecheck,1,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [130]:
# Now train model and calculate accuracy on training set
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
data = yelp[keywords]
target = yelp['sentiment']
model.fit(data, target)
y_pred = model.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {}% ".format(data.shape[0],(target != y_pred).sum(), (target == y_pred).sum()/data.shape[0]))

Number of mislabeled points out of a total 1000 points : 324, accuracy: 0.676% 


In [131]:
data.columns

Index(['great', 'friendly', 'delicious', 'nice', 'best', 'amazing',
       'restaurant', 'love', 'staff', 'vegas', 'first', 'menu', 'always',
       'fantastic', 'experience', 'awesome', 'pretty', 'made', 'loved',
       'definitely', 'fresh', 'steak', 'excellent', 'pizza', 'perfect',
       'prices', 'server', 'chicken', 'atmosphere', 'selection', 'tasty',
       'stars', 'dont', 'never', 'would', 'minutes', 'bad', 'much', 'got',
       'wont', 'disappointed', 'worst', 'think', 'going', 'wasnt', 'ive',
       'eat', 'us', 'slow', 'get', 'wait', 'bland', 'better', 'way', 'waited',
       'flavor', 'probably', 'terrible', 'didnt', 'times', 'ordered',
       'another', 'say', 'overpriced'],
      dtype='object')