In [19]:
# import all review datasets: yelp, imdb and amazon
import pandas as pd
import re
# Grab and process the raw data.
data_path = ("../../../Data & Script/sentiment labelled sentences/yelp_labelled.txt")
yelp = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/imdb_labelled.txt")
imdb = pd.read_csv(data_path, delimiter= '\t', header=None)
data_path = ("../../../Data & Script/sentiment labelled sentences/amazon_cells_labelled.txt")
amazon = pd.read_csv(data_path, delimiter= '\t', header=None)
amazon.columns = ['review', 'sentiment']
yelp.columns = ['review', 'sentiment']
# convert to lower case
yelp['review'] = yelp['review'].str.lower()
amazon['review'] = amazon['review'].str.lower()

# remove punctuation
yelp['review'] = yelp['review'].str.replace('[^\w\s]','')
amazon['review'] = amazon['review'].str.replace('[^\w\s]','')
                  
# remove numbers
yelp['review'] = yelp['review'].str.replace('\d','')
amazon['review'] = amazon['review'].str.replace('\d','')
                  

yelp.sample(5)

Unnamed: 0,review,sentiment
138,he was extremely rude and really there are so ...,0
258,they know how to make them here,1
937,dont bother coming here,0
2,not tasty and the texture was just nasty,0
50,we ordered the duck rare and it was pink and t...,1


In [20]:
# create a word count vector for positive and negative reviews separetly
from collections import Counter
# combine all strings in positive reviews and remove punctuation
positive_reviews = " ".join(yelp[yelp['sentiment'] == 1]['review']).lower()
# combine all strings in spam email and remove punctuation
negative_reviews = " ".join(yelp[yelp['sentiment'] == 0]['review']).lower()

positive_word_count = Counter(positive_reviews.split(" "))
negative_word_count = Counter(negative_reviews.split(" "))

# Ignore stop words and find most common words
from nltk.corpus import stopwords
# all english stop words
en_stops = set(stopwords.words('english'))
# take only non-stop words
top_positive_words = [w for w, c in positive_word_count.most_common() if w not in en_stops]
top_negative_words = [w for w, c in negative_word_count.most_common() if w not in en_stops]

print("A total of ", len(top_positive_words), " positve words")
print("A total of ", len(top_negative_words), " negative words")

A total of  1130  positve words
A total of  1265  negative words


In [21]:
# find top 50 positives
top_positive_words[:50]

['good',
 'great',
 'food',
 'place',
 '',
 'service',
 'friendly',
 'delicious',
 'back',
 'nice',
 'time',
 'really',
 'best',
 'amazing',
 'also',
 'like',
 'restaurant',
 'go',
 'love',
 'staff',
 'vegas',
 'first',
 'menu',
 'always',
 'fantastic',
 'experience',
 'awesome',
 'pretty',
 'made',
 'loved',
 'definitely',
 'fresh',
 'one',
 'steak',
 'excellent',
 'even',
 'atmosphere',
 'pizza',
 'perfect',
 'prices',
 'server',
 'ever',
 'im',
 'chicken',
 'selection',
 'could',
 'tasty',
 'came',
 'stars',
 'well']

In [22]:
# find top 50 negative words
top_negative_words[:50]
#selected_negative_words = ['go','dont','ever','never','bad','better','not','falvor','even','say']

['',
 'food',
 'place',
 'back',
 'service',
 'like',
 'go',
 'dont',
 'good',
 'never',
 'would',
 'time',
 'ever',
 'minutes',
 'bad',
 'one',
 'much',
 'got',
 'wont',
 'really',
 'disappointed',
 'worst',
 'think',
 'going',
 'wasnt',
 'ive',
 'came',
 'eat',
 'us',
 'im',
 'slow',
 'get',
 'wait',
 'bland',
 'better',
 'well',
 'way',
 'waited',
 'flavor',
 'probably',
 'terrible',
 'didnt',
 'times',
 'ordered',
 'also',
 'another',
 'even',
 'could',
 'say',
 'overpriced']

In [23]:
def create_binary_data(data, keywords):
    # Next create the features
    # take all positive and negative words as features
    # copy by value(deep copy)
    data_mod = data.copy(deep=True)

    if "review" in keywords:
        keywords.remove("review")  
    if "" in keywords:
        keywords.remove("")
        
    for key in keywords:
        data_mod[key] = data_mod.review.str.contains(' ' + key + ' ', case=False) 
    
    return data_mod

In [24]:
# choose positive words which didn't occur in the top 1000/100 negative words
positive_words_big = [w for w in top_positive_words[:1000] if w not in top_negative_words[:1000]]
positive_words_small = [w for w in top_positive_words[:100] if w not in top_negative_words[:100]]

# choose negative words which didn't occur in the top 1000/100 positive words
negative_words_big = [w for w in top_negative_words[:1000] if w not in top_positive_words[:1000]]
negative_words_small = [w for w in top_negative_words[:100] if w not in top_positive_words[:100]]


In [25]:
# trying small set of features
keywords_small = positive_words_small + negative_words_small
yelp_small = create_binary_data(yelp, keywords_small)
# Now train model and calculate accuracy on training set
from sklearn.naive_bayes import BernoulliNB

model_small = BernoulliNB()
data = yelp_small[keywords_small]
target = yelp_small['sentiment']
model_small.fit(data, target)
y_pred = model_small.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 281, accuracy: 71.90% 


In [26]:
# trying big set of features
keywords_big = positive_words_big + negative_words_big
yelp_big = create_binary_data(yelp, keywords_big)
# Now train model and calculate accuracy on training set
from sklearn.naive_bayes import BernoulliNB

model_big = BernoulliNB()
data = yelp_big[keywords_big]
target = yelp_big['sentiment']
model_big.fit(data, target)
y_pred = model_big.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 163, accuracy: 83.70% 


In [27]:
# add some more features from misclassfied reviews
false_positives = yelp[(target != y_pred) & (target == 0)]
false_negatives = yelp[(target != y_pred) & (target == 1)]

false_positive_words = []
false_negative_words = []
false_positives = " ".join(false_positives['review']).lower()
false_negatives = " ".join(false_negatives['review']).lower()

fp_count = Counter(false_negatives.split(" "))
fn_count = Counter(false_negatives.split(" "))


top_fp_count = [w for w, c in fp_count.most_common() if w not in en_stops]
top_fn_count = [w for w, c in fn_count.most_common() if w not in en_stops]

# Add new words to existing big set of keywords and retrain model
# combine new features with existing ones and find distinct features.
keywords_combin = set(top_fp_count + top_fn_count + keywords_big)

yelp_combin = create_binary_data(yelp, keywords_combin)

model_combin = BernoulliNB()
data = yelp_combin[list(keywords_combin)]
target = yelp_combin['sentiment']
model_combin.fit(data, target)
y_pred = model_combin.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 156, accuracy: 84.40% 


In [28]:
# Doing Cross Validation
from sklearn.model_selection import cross_val_score
cross_val_score(model_combin, data, target, cv=10)

array([0.76, 0.76, 0.64, 0.73, 0.72, 0.75, 0.74, 0.73, 0.66, 0.67])

In [29]:
# try to use the model_combin for predicting amazon reviews, I achieved 84.40 % accuracy, now let me try to use it for amazon reviews
# create the features for amazon
amazon_combin = create_binary_data(amazon, keywords_combin)
data = amazon_combin[list(keywords_combin)]
target = amazon['sentiment']
y_pred = model_combin.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 431, accuracy: 56.90% 


In [30]:
amazon.head()

Unnamed: 0,review,sentiment
0,so there is no way for me to plug it in here i...,0
1,good case excellent value,1
2,great for the jawbone,1
3,tied to charger for conversations lasting more...,0
4,the mic is great,1


In [31]:
# I think my model is overfitting to yelp and it has a lot of features, let me try it with small set of features
# try to use the model for predicting amazon reviews, I achieved 84.40 % accuracy, now let me try to use it for amazon reviews
# create the features for amazon
amazon_small = create_binary_data(amazon, keywords_small)
data = amazon_small[list(keywords_small)]
target = amazon_small['sentiment']
y_pred = model_small.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 442, accuracy: 55.80% 


In [32]:
# now let me try it with big set of features excluding words from misclassfied yelp reviews
# I think my model is overfitting to yelp and it has a lot of features, let me try it with small set of features
# try to use the model for predicting amazon reviews, I achieved 84.40 % accuracy, now let me try to use it for amazon reviews
# create the features for amazon
amazon_big = create_binary_data(amazon, keywords_big)
data = amazon_big[list(keywords_big)]
target = amazon_big['sentiment']
y_pred = model_big.predict(data)
print("Number of mislabeled points out of a total {} points : {}, accuracy: {:.2f}% ".format(data.shape[0],(target != y_pred).sum(), ((target == y_pred).sum()/data.shape[0]) * 100))

Number of mislabeled points out of a total 1000 points : 431, accuracy: 56.90% 


**Conclusion:** A model with small set of features resulted low accuracy(71.90%) on training data. Using much larger feature set improved the accuracy to 83.70% and further adding some more words that occured in the misclassfied reviews increased the accuracy to 84.00%. The model is not as accurate when tested on amazon reviews, it is only 55.80 or 56.90% accurate. A better solution might be to train a model using reviews from all companies(yelp, amazon and imdb) and create a model that has small training error as well as generalize well across all companies reviews.