In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the product-review table from a CSV path relative to the working directory.
# The preview in the next cell shows name, review, rating and sentiment columns.
products = pd.read_csv('amazon_baby_subset.csv')

In [3]:
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [4]:
# Load the vocabulary used as features; later cells read the words from the
# first column of this frame.
important_words = pd.read_json('important_words.json')

In [5]:
# Replace missing reviews with empty strings so the string cleaning applied to
# every entry of this column in a later cell never receives a NaN.
products = products.fillna({'review':''})  # only the 'review' column is filled

In [6]:
def remove_punctuation(text):
    """Return ``text`` with every ASCII punctuation character removed.

    The characters dropped are exactly those in ``string.punctuation``; all
    other characters (letters, digits, whitespace, non-ASCII) are kept in
    their original order.

    The previous ``text.translate(None, string.punctuation)`` form only works
    on Python 2 byte strings: it raises ``TypeError`` for ``unicode`` input and
    on Python 3. Filtering character by character behaves the same for byte
    strings and also works for unicode text on either interpreter.

    Parameters
    ----------
    text : str
        The string to clean. An empty string yields an empty string.

    Returns
    -------
    str
        ``text`` without punctuation characters.
    """
    import string
    punctuation = frozenset(string.punctuation)  # O(1) membership tests
    return ''.join(ch for ch in text if ch not in punctuation)

In [7]:
# Store a punctuation-free copy of each review in a new 'review_clean' column;
# the original 'review' column is left untouched.
products['review_clean'] = products['review'].apply(remove_punctuation)

In [8]:
# Add one column per important word holding how many times that word occurs,
# as a whitespace-delimited token, in the cleaned review.
# The reviews are tokenised once up front instead of once per word.
review_tokens = products['review_clean'].str.split()
# `.ix` was removed in pandas 1.0; `.iloc[:, 0]` selects the first column by position.
for word in important_words.iloc[:, 0]:
    products[word] = review_tokens.apply(lambda tokens: tokens.count(word))

In [9]:
# Number of reviews in which the word 'perfect' appears at least once.
(products['perfect'] >= 1).sum()

2955

In [10]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with intercept column) and a label array.

    ``DataFrame.as_matrix()`` was removed in pandas 1.0, so the arrays are
    taken with ``.values``, which is available in old and new pandas alike.

    Side effect: a column named ``'constant'`` filled with 1 is added to (or
    overwritten on) ``dataframe`` in place; it becomes the first column of the
    returned matrix.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding the feature and label columns.
    features : sequence of column names
        Columns to use as features, in order. The caller's sequence is not
        modified.
    label : column name or list of column names
        A single name yields a 1-D label array; a list of names yields a 2-D
        array with one column per name.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(feature_matrix, label_array)`` where ``feature_matrix`` has
        ``len(features) + 1`` columns.
    """
    dataframe['constant'] = 1
    # list(...) lets callers pass any sequence of names, not only a list.
    columns = ['constant'] + list(features)
    feature_matrix = dataframe[columns].values
    label_array = dataframe[label].values
    return (feature_matrix, label_array)

In [11]:
# Build the model inputs. `.ix` was removed in pandas 1.0, so the word list is
# read positionally with `.iloc`. The label is passed as a one-element list,
# which makes `sentiment` a 2-D array with a single column.
feature_matrix, sentiment = get_numpy_data(
    products,
    list(important_words.iloc[:, 0]),
    ['sentiment'],
)

In [12]:
feature_matrix.shape

(53072L, 194L)

In [13]:
sentiment.shape

(53072L, 1L)

In [14]:
def predict_probability(feature_matrix, coefficients):
    """Return P(y_i = +1 | x_i, w) for every row of ``feature_matrix``.

    The score of each row is its dot product with ``coefficients``; the score
    is then mapped to a probability with the logistic (sigmoid) link
    ``1 / (1 + exp(-score))``.
    """
    linear_scores = np.dot(feature_matrix, coefficients)
    return 1.0 / (1.0 + np.exp(-linear_scores))

In [15]:
def feature_derivative(errors, feature):
    """Return the dot product of ``errors`` with one ``feature`` column.

    The training loop uses this value as the derivative with respect to the
    coefficient that belongs to ``feature``.
    """
    return np.dot(errors, feature)

In [16]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Return the log likelihood of the observed labels under the model.

    Computes ``sum_i ((1[y_i = +1] - 1) * score_i - log(1 + exp(-score_i)))``
    with ``score_i = feature_matrix[i] . coefficients``.

    The ``log(1 + exp(-score))`` term is evaluated with ``np.logaddexp`` so
    that strongly negative scores no longer overflow ``exp`` and turn the
    whole sum into ``-inf``.

    Parameters
    ----------
    feature_matrix : array of shape (n, d)
    sentiment : array-like of n labels; entries equal to +1 count as positive.
        Both a flat (n,) array and a single-column (n, 1) array are accepted.
    coefficients : array of shape (d,)

    Returns
    -------
    float
        The log likelihood (always <= 0).
    """
    # Flatten so a single-column (n, 1) label array lines up with the 1-D scores
    # instead of broadcasting to an (n, n) matrix.
    indicator = (np.ravel(sentiment) == +1).astype(int)
    scores = np.dot(feature_matrix, coefficients)
    # logaddexp(0, -s) == log(exp(0) + exp(-s)) == log(1 + exp(-s)), overflow-free.
    lp = np.sum((indicator - 1) * scores - np.logaddexp(0, -scores))
    return lp

In [17]:
from math import sqrt
def logistic_regression(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    """Fit logistic-regression coefficients by repeated gradient steps.

    Each iteration predicts P(y = +1) for every row, forms the errors
    ``indicator - predictions`` and moves every coefficient by
    ``step_size`` times the derivative for its feature column. The log
    likelihood is printed on a thinning schedule (every iteration up to 15,
    then every 10, 100, 1000 and 10000 iterations) so progress can be checked.

    Parameters
    ----------
    feature_matrix : array of shape (n, d)
    sentiment : array-like of n labels; entries equal to +1 are positives.
    initial_coefficients : array-like of d starting values (not modified).
    step_size : float
    max_iter : int
        Number of iterations; 0 returns the starting values unchanged.

    Returns
    -------
    numpy.ndarray
        Float array of the d fitted coefficients.
    """
    # Copy as float: with an integer starting array the in-place updates below
    # would be truncated to integers and the coefficients would never move.
    coefficients = np.array(initial_coefficients, dtype=float)

    # Indicator of (y_i == +1). The labels never change, so build it once
    # rather than on every iteration.
    indicator = np.array([1 if (x == +1) else 0 for x in sentiment])

    for itr in range(max_iter):
        # P(y_i = +1 | x_i, w) under the current coefficients.
        predictions = predict_probability(feature_matrix, coefficients)
        errors = indicator - predictions

        # All derivatives use the same `errors`, so this is a simultaneous
        # update even though coefficients are written one at a time.
        for j in range(len(coefficients)):
            derivative = feature_derivative(errors, feature_matrix[:, j])
            coefficients[j] += step_size * derivative

        # Report the log likelihood on a thinning schedule.
        if itr <= 15 or (itr <= 100 and itr % 10 == 0) or (itr <= 1000 and itr % 100 == 0) \
                or (itr <= 10000 and itr % 1000 == 0) or itr % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coefficients)
            # Single parenthesised argument: prints identically on Python 2 and 3.
            print('iteration %*d: log likelihood of observed labels = %.8f' %
                  (int(np.ceil(np.log10(max_iter))), itr, lp))
    return coefficients

In [18]:
# Train from an all-zero starting point. The number of coefficients is taken
# from the feature matrix (one per column, intercept included) instead of
# being hard-coded, so it stays correct if the vocabulary changes.
coefficients = logistic_regression(
    feature_matrix,
    sentiment,
    initial_coefficients=np.zeros(feature_matrix.shape[1]),
    step_size=1e-7,
    max_iter=301,
)

iteration   0: log likelihood of observed labels = -36780.91768478
iteration   1: log likelihood of observed labels = -36775.13434712
iteration   2: log likelihood of observed labels = -36769.35713564
iteration   3: log likelihood of observed labels = -36763.58603240
iteration   4: log likelihood of observed labels = -36757.82101962
iteration   5: log likelihood of observed labels = -36752.06207964
iteration   6: log likelihood of observed labels = -36746.30919497
iteration   7: log likelihood of observed labels = -36740.56234821
iteration   8: log likelihood of observed labels = -36734.82152213
iteration   9: log likelihood of observed labels = -36729.08669961
iteration  10: log likelihood of observed labels = -36723.35786366
iteration  11: log likelihood of observed labels = -36717.63499744
iteration  12: log likelihood of observed labels = -36711.91808422
iteration  13: log likelihood of observed labels = -36706.20710739
iteration  14: log likelihood of observed labels = -36700.5020

In [19]:
coefficients

array([  5.16220157e-03,   1.55656966e-02,  -8.50204675e-03,
         6.65460842e-02,   6.58907629e-02,   5.01743882e-03,
        -5.38601484e-02,  -3.50488413e-03,   6.47945868e-02,
         4.54356263e-02,   3.98353364e-03,   2.00775410e-02,
         3.01350011e-02,  -2.87115530e-02,   1.52161964e-02,
         2.72592062e-04,   1.19448177e-02,  -1.82461935e-02,
        -1.21706420e-02,  -4.15110334e-02,   2.76820391e-03,
         1.77031999e-02,  -4.39700067e-03,   4.49764014e-02,
         9.90916464e-03,   8.99239081e-04,  -1.36219516e-03,
         1.26859357e-02,   8.26466695e-03,  -2.77426972e-02,
         6.10128809e-04,   1.54084501e-02,  -1.32134753e-02,
        -3.00512492e-02,   2.97399371e-02,   1.84087080e-02,
         2.86178752e-03,  -1.05768015e-02,  -6.57350362e-04,
        -1.01476555e-02,  -4.79579528e-03,   7.50891810e-03,
         4.27938289e-03,   3.06785501e-03,  -2.20317661e-03,
         9.57273354e-03,   9.91666827e-05,  -1.98462567e-02,
         1.75702722e-02,

In [20]:
# Raw linear score of every review under the learned coefficients.
scores = feature_matrix.dot(coefficients)

In [21]:
scores

array([ 0.05104571, -0.02936473,  0.02411584, ..., -0.40986295,
        0.01411436, -0.06755923])

In [22]:
# Turn scores into class labels: +1 when the score is strictly positive,
# otherwise -1 (a score of exactly 0 is labelled -1).
products['prediction_algo'] = [1 if (x > 0) else -1 for x in scores]

In [23]:
# Number of reviews predicted as positive.
(products['prediction_algo'] > 0).sum()

25126

In [24]:
# Total number of reviews (rows) that were scored.
total_no = products.shape[0]

In [25]:
# Number of reviews whose predicted label equals the true sentiment.
correct_predict = int((products['sentiment'] == products['prediction_algo']).sum())

In [26]:
# Fraction of correct predictions; the float conversion avoids integer
# division on Python 2.
accuracy = float(correct_predict) / total_no

In [27]:
accuracy

0.7518653904130238

In [28]:
# Pair every important word with its learned weight, largest weight first.
# coefficients[0] belongs to the 'constant' (intercept) column, so it is skipped.
# The slice goes into its own name instead of overwriting `coefficients`:
# rebinding it made this cell drop one more entry on every re-run, silently
# misaligning words and weights.
word_weights = coefficients[1:]
# `.ix` was removed in pandas 1.0; `.iloc[:, 0]` reads the word column by position.
word_coefficient_tuples = sorted(
    zip(important_words.iloc[:, 0], word_weights),
    key=lambda pair: pair[1],
    reverse=True,
)

In [29]:
word_coefficient_tuples

[(u'great', 0.066546084170457695),
 (u'love', 0.065890762922123258),
 (u'easy', 0.06479458680257838),
 (u'little', 0.045435626308421365),
 (u'loves', 0.044976401394906038),
 (u'well', 0.030135001092107077),
 (u'perfect', 0.029739937104968462),
 (u'old', 0.020077541034775378),
 (u'nice', 0.018408707995268992),
 (u'daughter', 0.017703199905701694),
 (u'soft', 0.01757027224560289),
 (u'fits', 0.016882471071408719),
 (u'happy', 0.01680529588976808),
 (u'baby', 0.015565696580423507),
 (u'recommend', 0.015408450108008665),
 (u'also', 0.015216196422918844),
 (u'best', 0.014991791565630264),
 (u'comfortable', 0.013253990081584901),
 (u'car', 0.012685935745813375),
 (u'clean', 0.012018174433365525),
 (u'son', 0.011944817713693955),
 (u'bit', 0.011708248093123262),
 (u'works', 0.011703160621987424),
 (u'size', 0.010715966516270301),
 (u'stroller', 0.0099091646359727361),
 (u'room', 0.0097832410215680614),
 (u'price', 0.0095727335435901815),
 (u'play', 0.0091784289839843096),
 (u'easily', 0.00903