In [1]:
# import all review datasets: yelp, imdb and amazon
import pandas as pd
import re
# Grab and process the raw data.
data_path = ("../../../Data & Script/sentiment labelled sentences/yelp_labelled.txt")
yelp = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/imdb_labelled.txt")
imdb = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/amazon_cells_labelled.txt")
amazon = pd.read_csv(data_path, delimiter= '\t', header=None)

yelp.columns = ['review', 'sentiment']
# convert to lower case
yelp['review'] = yelp['review'].str.lower()
# remove punctuation
yelp['review'] = yelp['review'].str.replace('[^\w\s]','')
# remove numbers
yelp['review'] = yelp['review'].str.replace('\d','')

yelp.sample(5)

Unnamed: 0,review,sentiment
784,the chains which im no fan of beat this place ...,0
960,insults profound deuchebaggery and had to go o...,0
710,that said our mouths and bellies were still qu...,1
273,stopped by this place while in madison for the...,1
106,the food was delicious our bartender was atten...,1


In [2]:
# create a word count vector for positive and negative reviews separetly
from collections import Counter
# combine all strings in positive reviews and remove punctuation
positive_reviews = " ".join(yelp[yelp['sentiment'] == 1]['review']).lower()
# combine all strings in spam email and remove punctuation
negative_reviews = " ".join(yelp[yelp['sentiment'] == 0]['review']).lower()

positive_word_count = Counter(positive_reviews.split(" "))
negative_word_count = Counter(negative_reviews.split(" "))


In [3]:
# Ignore stop words and find most common words
from nltk.corpus import stopwords
# all english stop words
en_stops = set(stopwords.words('english'))
# take only non-stop words
top_positive_words = [w for w, c in positive_word_count.most_common() if w not in en_stops]
top_negative_words = [w for w, c in negative_word_count.most_common() if w not in en_stops]

print("A total of ", len(top_positive_words), " positve words")
print("A total of ", len(top_negative_words), " negative words")


A total of  1130  positve words
A total of  1265  negative words


In [4]:
# find top 50 positives
top_positive_words[:100]

['good',
 'great',
 'food',
 'place',
 '',
 'service',
 'friendly',
 'delicious',
 'back',
 'nice',
 'time',
 'really',
 'best',
 'amazing',
 'also',
 'like',
 'restaurant',
 'go',
 'love',
 'staff',
 'vegas',
 'first',
 'menu',
 'always',
 'fantastic',
 'experience',
 'awesome',
 'pretty',
 'made',
 'loved',
 'definitely',
 'fresh',
 'one',
 'steak',
 'excellent',
 'even',
 'atmosphere',
 'pizza',
 'perfect',
 'prices',
 'server',
 'ever',
 'im',
 'chicken',
 'selection',
 'could',
 'tasty',
 'came',
 'stars',
 'well',
 'say',
 'get',
 'spot',
 'come',
 'every',
 'happy',
 'beer',
 'want',
 'salad',
 'us',
 'wonderful',
 'ive',
 'buffet',
 'everything',
 'breakfast',
 'town',
 'night',
 'clean',
 'eat',
 'would',
 'taste',
 'tender',
 'attentive',
 'cooked',
 'sandwich',
 'inside',
 'sauce',
 'still',
 'went',
 'spicy',
 'sushi',
 'quite',
 'order',
 'cant',
 'recommend',
 'family',
 'tried',
 'quality',
 'thing',
 'right',
 'side',
 'ordered',
 'try',
 'meal',
 'better',
 'next',
 'w

In [5]:
# find top 50 negative words
top_negative_words[:100]
#selected_negative_words = ['go','dont','ever','never','bad','better','not','falvor','even','say']

['',
 'food',
 'place',
 'back',
 'service',
 'like',
 'go',
 'dont',
 'good',
 'never',
 'would',
 'time',
 'ever',
 'minutes',
 'bad',
 'one',
 'much',
 'got',
 'wont',
 'really',
 'disappointed',
 'worst',
 'think',
 'going',
 'wasnt',
 'ive',
 'came',
 'eat',
 'us',
 'im',
 'slow',
 'get',
 'wait',
 'bland',
 'better',
 'well',
 'way',
 'waited',
 'flavor',
 'probably',
 'terrible',
 'didnt',
 'times',
 'ordered',
 'also',
 'another',
 'even',
 'could',
 'say',
 'overpriced',
 'pretty',
 'salad',
 'took',
 'restaurant',
 'coming',
 'experience',
 'know',
 'rude',
 'quality',
 'burger',
 'best',
 'cold',
 'definitely',
 'getting',
 'feel',
 'enough',
 'people',
 'meat',
 'chicken',
 'sushi',
 'bit',
 'give',
 'impressed',
 'soon',
 'stars',
 'want',
 'taste',
 'made',
 'sure',
 'vegas',
 'around',
 'poor',
 'hard',
 'horrible',
 'nothing',
 'eating',
 'mediocre',
 'management',
 'many',
 'lunch',
 'take',
 'felt',
 'either',
 'said',
 'money',
 'went',
 'next',
 'meal',
 'tasted',
 

In [6]:
# choose positive words which didn't occur in the top 50 negative words
positive_words = [w for w in top_positive_words[:100] if w not in top_negative_words[:100]]
positive_words

['great',
 'friendly',
 'delicious',
 'nice',
 'amazing',
 'love',
 'staff',
 'first',
 'menu',
 'always',
 'fantastic',
 'awesome',
 'loved',
 'fresh',
 'steak',
 'excellent',
 'atmosphere',
 'pizza',
 'perfect',
 'prices',
 'server',
 'selection',
 'tasty',
 'spot',
 'come',
 'every',
 'happy',
 'beer',
 'wonderful',
 'buffet',
 'everything',
 'breakfast',
 'town',
 'night',
 'clean',
 'tender',
 'attentive',
 'cooked',
 'sandwich',
 'inside',
 'sauce',
 'spicy',
 'quite',
 'order',
 'cant',
 'recommend',
 'family',
 'tried',
 'thing',
 'right',
 'side',
 'try',
 'worth',
 'waitress',
 'ambiance',
 'day']

In [7]:
# choose negative words which didn't occur in the top 50 positive words
negative_words = [w for w in top_negative_words[:100] if w not in top_positive_words[:100]]
negative_words

['dont',
 'never',
 'minutes',
 'bad',
 'much',
 'got',
 'wont',
 'disappointed',
 'worst',
 'think',
 'going',
 'wasnt',
 'slow',
 'wait',
 'bland',
 'way',
 'waited',
 'flavor',
 'probably',
 'terrible',
 'didnt',
 'times',
 'another',
 'overpriced',
 'took',
 'coming',
 'know',
 'rude',
 'burger',
 'cold',
 'getting',
 'feel',
 'enough',
 'people',
 'meat',
 'bit',
 'give',
 'impressed',
 'soon',
 'sure',
 'around',
 'poor',
 'hard',
 'horrible',
 'nothing',
 'eating',
 'mediocre',
 'management',
 'many',
 'lunch',
 'take',
 'felt',
 'either',
 'said',
 'money',
 'tasted']

In [8]:
# Next create the features
# take all positive and negative words as features
keywords = positive_words + negative_words

for key in keywords:
    yelp[key] = yelp.review.str.contains(' ' + key + ' ', case=False)

if "review" in keywords:
    keywords.remove("review")
if "" in keywords:
    keywords.remove("")

In [9]:
# Now train model and calculate accuracy on training set
from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()
data = yelp[keywords]
target = yelp['sentiment']
model.fit(data, target)
y_pred = model.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 281, accuracy: 71.90% 


In [10]:
false_positives = yelp[(target != y_pred) & (target == 0)]
false_negatives = yelp[(target != y_pred) & (target == 1)]

false_positive_words = []
false_negative_words = []


In [11]:
false_positives = " ".join(false_positives['review']).lower()
false_negatives = " ".join(false_negatives['review']).lower()

fp_count = Counter(false_negatives.split(" "))
fn_count = Counter(false_negatives.split(" "))


top_fp_count = [w for w, c in fp_count.most_common() if w not in en_stops]
top_fn_count = [w for w, c in fn_count.most_common() if w not in en_stops]

In [12]:
# Add new words to existing keywords and retrain model

# combine new features with existing ones and find distinct features.
keywords = set(top_fp_count + top_fn_count + keywords)

if "review" in keywords:
    keywords.remove("review")
if "" in keywords:
    keywords.remove("")


for feature in keywords:
    # if feature is not already there
    if(feature not in yelp.columns):
        yelp[feature] = yelp.review.str.contains(' ' + feature + ' ', case=False)

model = BernoulliNB()
data = yelp[list(keywords)]
target = yelp['sentiment']
model.fit(data, target)
y_pred = model.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 230, accuracy: 77.00% 
