In [2]:
import graphlab
# Point GraphLab Canvas at the 'ipynb' target.
graphlab.canvas.set_target('ipynb')

# Words whose per-review counts are used as features further down.
selected_words = [
    'awesome', 'great', 'fantastic', 'amazing', 'love',
    'horrible', 'bad', 'terrible', 'awful', 'wow', 'hate'
]

# Load the review data and attach the word counts of each review's text.
products = graphlab.SFrame('amazon_baby.gl')
products['word_count'] = graphlab.text_analytics.count_words(
    products['review'])

This non-commercial license of GraphLab Create for academic use is assigned to sudhanshu.gupta@st.niituniversity.in and will expire on August 24, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\SUDHAN~1\AppData\Local\Temp\graphlab_server_1504499629.log.0


# Use .apply() to build a new feature with the counts for each of the selected_words

In [3]:
# Look up how often a single word occurs in a review's word-count mapping.
def count_word(all_words, word):
    """Return the count stored for `word` in `all_words`, or 0 if it is absent.

    all_words -- mapping from word to number of occurrences
    word      -- the word to look up
    """
    if word not in all_words:
        return 0
    return all_words[word]

# Create one column per entry of selected_words, holding the count of that
# word in each row's review.
for word in selected_words:
    # Bind the current word as a default argument. A plain closure would look
    # up `word` only when the lambda is finally called, so any deferred
    # evaluation could see a later word of the loop instead of this one.
    products[word] = products['word_count'].apply(
        lambda all_words, word=word: count_word(all_words, word))

In [4]:
# Total occurrences of each selected word across all reviews, in list order.
word_totals = [(word, products[word].sum()) for word in selected_words]

# Sentinels that remain in place only if there are no selected words at all.
max_count = min_count = -1
max_count_word = min_count_word = ''

if word_totals:
    # max()/min() return the first extreme they meet, so ties resolve to the
    # word that appears earliest in selected_words.
    max_count_word, max_count = max(word_totals, key=lambda pair: pair[1])
    min_count_word, min_count = min(word_totals, key=lambda pair: pair[1])

# [most used word, least used word]
answer1 = [max_count_word, min_count_word]

# Create a new sentiment analysis model using only the selected_words as features

In [6]:
# Prepare the labelled data BEFORE splitting. The split returns separate
# frames, so a column added to `products` afterwards would not appear in
# train_data / test_data and the 'sentiment' target would be missing when the
# models are trained.

# Drop neutral 3-star reviews; they are neither clearly positive nor negative.
products = products[products['rating'] != 3]

# Ratings of 4 and above are labelled positive, the remaining ones negative.
products['sentiment'] = products['rating'] >= 4

# 80/20 split with a fixed seed so the results are reproducible.
train_data, test_data = products.random_split(.8, seed=0)

In [7]:
# Logistic classifier that sees only the per-word count columns.
selected_words_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=selected_words,
    validation_set=test_data)

# All learned coefficients, largest value first.
descending_value = selected_words_model['coefficients'].sort('value', ascending=False)

# The coefficient table also contains the model's intercept row (reported by
# GraphLab under the name '(intercept)'). It is not a word, so leave it out
# before picking the extremes; otherwise it could be reported as the most
# positive or most negative "word".
word_weights = descending_value[descending_value['name'] != '(intercept)']

# [word with the most positive weight, word with the most negative weight]
answer2 = [word_weights[0]['name'], word_weights[-1]['name']]

# Comparing the accuracy of different sentiment analysis models

In [9]:
# Rebuild the in-class sentiment model: same training and validation data,
# but with the full 'word_count' column as its only feature.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data)

In [13]:
# Accuracy of each classifier on test_data.
selected_words_accuracy = selected_words_model.evaluate(test_data)['accuracy']
sentiment_accuracy = sentiment_model.evaluate(test_data)['accuracy']

# [selected_words_model accuracy, sentiment_model accuracy]
answer3 = [selected_words_accuracy, sentiment_accuracy]

# Interpreting the difference in performance between the models

In [16]:
# Keep only the reviews of one product and score each of them with the
# word_count-based sentiment model.
is_diaper_champ = products['name'] == 'Baby Trend Diaper Champ'
diaper_champ_reviews = products[is_diaper_champ]
diaper_champ_reviews['predicted_sentiment'] = sentiment_model.predict(
    diaper_champ_reviews,
    output_type='probability')

In [32]:
# Order the product's reviews from highest to lowest predicted sentiment.
diaper_champ_reviews = diaper_champ_reviews.sort('predicted_sentiment', ascending=False)

# The first row is the review the sentiment model rates highest; score that
# same review with the selected-words model for comparison.
top_review_score = diaper_champ_reviews[0]['predicted_sentiment']
top_review_frame = diaper_champ_reviews[0:1]
top_review_selected_words_score = selected_words_model.predict(
    top_review_frame, output_type='probability')[0]

# [sentiment_model probability, selected_words_model probability]
answer4 = [top_review_score, top_review_selected_words_score]

[0.9999999372669541, 0.796940851290671]


# Results

In [40]:
print 'Answer 1'
print '    Most used word: ', answer1[0]
print '    Least used word: ', answer1[1]
print 'Answer 2'
print '    Word with most positive weight: ', answer2[0]
print '    Word with most negative weight: ', answer2[1]
print 'Answer 3'
print '    Accuracy of selected_words_model: ', answer3[0]
print '    Accuracy of sentiment_model: ', answer3[1]
print 'Answer 4'
print '    Predicted sentiment for the most positive review: ', answer4[0]
print '    selected_words_model prediction for the most positive review: ', answer4[1]

Answer 1
    Most used word:  great
    Least used word:  wow
Answer 2
    Word with most positive weight:  love
    Word with most negative weight:  terrible
Answer 3
    Accuracy of selected_words_model:  0.843141964929
    Accuracy of sentiment_model:  0.916256305549
Answer 4
    Predicted sentiment for the most positive review:  0.999999937267
    selected_words_model prediction for the most positive review:  0.796940851291
