In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import string

import math
import json
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the review subset from a CSV file located relative to the working directory.
products = pd.read_csv("./amazon_baby_subset.csv")

In [3]:
# Preview the first rows to check which columns were loaded.
products.head()

Unnamed: 0,name,review,rating,sentiment
0,Stop Pacifier Sucking without tears with Thumb...,All of my kids have cried non-stop when I trie...,5,1
1,Nature's Lullabies Second Year Sticker Calendar,We wanted to get something to keep track of ou...,5,1
2,Nature's Lullabies Second Year Sticker Calendar,My daughter had her 1st baby over a year ago. ...,5,1
3,"Lamaze Peekaboo, I Love You","One of baby's first and favorite books, and it...",4,1
4,SoftPlay Peek-A-Boo Where's Elmo A Children's ...,Very cute interactive book! My son loves this ...,5,1


In [4]:
# Number of reviews labelled positive (sentiment == 1).
(products["sentiment"] == 1).sum()

26579

In [5]:
# Number of reviews carrying any label other than 1.
(products["sentiment"] != 1).sum()

26493

In [6]:
# Read the list of words used as features. The encoding is given explicitly
# so the result does not depend on the platform's default text encoding
# (JSON documents are expected to be UTF-8).
with open('./important_words.json', encoding='utf-8') as words_file:
    important_words = json.load(words_file)

In [7]:
# Replace missing values in the "review" column with empty strings; other columns are left as they are.
products = products.fillna({"review": ""})

In [8]:
# Translation table mapping every character of string.punctuation to None,
# i.e. str.translate deletes those characters.
translator = str.maketrans(dict.fromkeys(string.punctuation))

# Cast the review column to str before any string processing.
products["review"] = products["review"].astype(str)

def normalize_string(x):
    """Remove punctuation from, and lower-case, a whole column of strings.

    ``x`` must expose the pandas ``.str`` accessor. The module-level
    ``translator`` table is applied first, then the text is lower-cased.
    The transformed object is returned.
    """
    return x.str.translate(translator).str.lower()

def normalize_string_old(x):
    """Remove punctuation from, and lower-case, a single string.

    Scalar variant that works on one ``str`` at a time using the
    module-level ``translator`` table.
    """
    without_punctuation = x.translate(translator)
    return without_punctuation.lower()

In [9]:
%%timeit
# Column-wise cleaning: the whole "review" column goes through the .str accessor in one call.
products["review_clean"] = normalize_string(products["review"])

1 loop, best of 3: 312 ms per loop


In [37]:
%%timeit
# Same cleaning, but applied one review at a time through apply (timed for comparison).
products["review_clean"] = products["review"].apply(normalize_string_old)

1 loop, best of 3: 281 ms per loop


5)

In [10]:
%%timeit
# Add one count column per important word. Every review is split again for
# every word, so the split work is repeated len(important_words) times.
for word in important_words:
    products[word] = products["review_clean"].apply(lambda x: x.split().count(word))

1 loop, best of 3: 1min 34s per loop


In [16]:
# Tokenise each review once and reuse the token lists for every word.
# NOTE: `Series.str.count` counts regex matches inside *string* elements; on a
# column whose elements are lists it returns NaN for every row (that is where
# the NaNs came from, not from %%timeit). Counting occurrences in each token
# list needs `list.count`, applied element-wise.
products["list"] = products["review_clean"].str.split()

for word in important_words:
    products[word] = products["list"].apply(lambda tokens: tokens.count(word))

In [46]:
%%timeit
# Third approach: let CountVectorizer count only the words listed in important_words.
vectorizer_word_subset = CountVectorizer(vocabulary = important_words)
train_matrix_word_subset = vectorizer_word_subset.fit_transform(products['review_clean'])

1 loop, best of 3: 3.98 s per loop


In [None]:
# Show only a preview: the frame now carries one extra column per important
# word, so rendering it in full would flood the notebook output.
products.head()

In [47]:
# Number of reviews in which the "perfect" count is greater than zero.
(products["perfect"] > 0).sum()

3309

8)

In [11]:
def get_numpy_data(dataframe, features, label):
    """Extract a feature matrix (with an intercept column) and a label array.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source table. A column named "constant" filled with 1 is added to it
        (or overwritten) as a side effect.
    features : iterable of str
        Names of the columns to use as features, in the desired order.
    label : str
        Name of the target column.

    Returns
    -------
    tuple of numpy.ndarray
        ``(feature_matrix, label_array)``. The first column of
        ``feature_matrix`` is the constant 1, followed by ``features``.
    """
    dataframe["constant"] = 1
    # list(...) lets callers pass a tuple or an Index as well as a list.
    columns = ["constant"] + list(features)

    # DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() replaces it.
    feature_matrix = dataframe[columns].to_numpy()
    label_array = dataframe[label].to_numpy()
    return (feature_matrix, label_array)

In [12]:
# Build the model inputs: a constant column plus one column per important word, and the "sentiment" labels.
feature_matrix, label_array = get_numpy_data(products, important_words, "sentiment")

In [13]:
# Sanity check on the matrix dimensions (rows, constant + word columns).
feature_matrix.shape

(53072, 194)

10)

In [177]:
def predict_probability(feature_matrix, coefficients):
    """Return the sigmoid of the linear score of every row.

    The score is ``np.dot(feature_matrix, coefficients)``; each score ``s``
    is mapped to ``1 / (1 + exp(-s))``.
    """
    linear_score = np.dot(feature_matrix, coefficients)
    return 1 / (1 + np.exp(-linear_score))

11)

In [81]:
def feature_derivative(errors, feature):
    """Return the dot product of the error vector with one feature column."""
    derivative = np.dot(errors, feature)
    return derivative

12)

In [143]:
def compute_log_likelihood(feature_matrix, sentiment, coefficients):
    """Log likelihood of the observed labels under the logistic model.

    With ``s = np.dot(feature_matrix, coefficients)`` the value returned is
    ``sum((1[sentiment == 1] - 1) * s - log(1 + exp(-s)))``.

    Parameters
    ----------
    feature_matrix : array-like, one row per observation.
    sentiment : array-like of labels; entries equal to 1 are the positive class.
    coefficients : array-like, one weight per column of ``feature_matrix``.

    Returns
    -------
    The summed log likelihood as a NumPy float.
    """
    # asarray so a plain Python list of labels is compared element-wise.
    indicator = (np.asarray(sentiment) == 1)
    scores = np.dot(feature_matrix, coefficients)

    # log(1 + exp(-s)) computed as logaddexp(0, -s): the naive form overflows
    # to inf for very negative scores and turns the whole sum into -inf.
    log_one_plus_exp = np.logaddexp(0, -scores)

    lp = np.sum((indicator - 1) * scores - log_one_plus_exp)

    return lp

13)

In [213]:
def logistic_regression(feature_matrix, sentiment, initial_coeffs,
                        step_size, max_iter):
    """Fit logistic-regression coefficients by gradient ascent.

    Parameters
    ----------
    feature_matrix : 2-D array, one row per observation, one column per feature.
    sentiment : array of labels; entries equal to 1 are the positive class.
    initial_coeffs : array-like starting point, one value per feature.
        It is copied, so the caller's object is never modified.
    step_size : float multiplier applied to each derivative.
    max_iter : number of full passes over the coefficients.

    Returns
    -------
    numpy.ndarray of float with the coefficients after ``max_iter`` passes.

    The log likelihood is printed on a thinning schedule: every iteration up
    to 15, then every 10th up to 100, every 100th up to 1000, every 1000th up
    to 10000 and every 10000th afterwards.
    """
    # Force a float copy: with an integer array the in-place updates below
    # would be silently truncated and the coefficients would never move.
    coeffs = np.array(initial_coeffs, dtype=float)

    # The 0/1 indicator of the positive class does not change between
    # iterations, so it is computed once.
    indicator = 1 * (sentiment == 1)

    iteration = 0
    while iteration < max_iter:

        predictions = predict_probability(feature_matrix, coeffs)
        errors = indicator - predictions

        # All derivatives use the errors computed before any coefficient of
        # this pass was changed.
        for i in range(len(coeffs)):
            derivative = feature_derivative(errors, feature_matrix[:, i])
            coeffs[i] += step_size * derivative

        iteration += 1

        if iteration <= 15 or (iteration <= 100 and iteration % 10 == 0) or (iteration <= 1000 and iteration % 100 == 0) \
        or (iteration <= 10000 and iteration % 1000 == 0) or iteration % 10000 == 0:
            lp = compute_log_likelihood(feature_matrix, sentiment, coeffs)
            print ('iteration %*d: log likelihood of observed labels = %.8f' % (int(np.ceil(np.log10(max_iter))), iteration, lp))

    return coeffs

In [214]:
# Inputs for gradient ascent: data, labels, an all-zero starting point and
# the two hyperparameters.
fm = feature_matrix
st = label_array
initial_coeffs = np.zeros(feature_matrix.shape[1])
step_size = 1e-7
max_iter = 301

In [215]:
# Run gradient ascent; progress lines are printed by logistic_regression itself.
coefficients = logistic_regression(fm, st, initial_coeffs, step_size, max_iter)

iteration   1: log likelihood of observed labels = -36779.70627863
iteration   2: log likelihood of observed labels = -36772.71312191
iteration   3: log likelihood of observed labels = -36765.72767668
iteration   4: log likelihood of observed labels = -36758.74992322
iteration   5: log likelihood of observed labels = -36751.77984195
iteration   6: log likelihood of observed labels = -36744.81741345
iteration   7: log likelihood of observed labels = -36737.86261842
iteration   8: log likelihood of observed labels = -36730.91543769
iteration   9: log likelihood of observed labels = -36723.97585224
iteration  10: log likelihood of observed labels = -36717.04384317
iteration  11: log likelihood of observed labels = -36710.11939171
iteration  12: log likelihood of observed labels = -36703.20247923
iteration  13: log likelihood of observed labels = -36696.29308721
iteration  14: log likelihood of observed labels = -36689.39119727
iteration  15: log likelihood of observed labels = -36682.4967

15)

In [216]:
# Linear score of every review under the learned coefficients.
final_score = feature_matrix.dot(coefficients)

In [220]:
# Scores >= 0 are classified as +1, everything else as -1.
final_class = np.where(final_score >= 0, 1, -1)

In [224]:
# Predicted +1 count, then the count of everything else.
print((final_class == 1).sum(), (final_class != 1).sum())

25237 27835


16)

In [232]:
# Fraction of reviews whose predicted class equals the label.
acc = np.mean(final_class == label_array)

In [233]:
# Report the accuracy rounded to two decimals.
print(round(acc,2))

0.77


17)

In [234]:
# Drop the first coefficient (the one for the "constant" column) so the rest
# line up with important_words. The length guard keeps the cell idempotent:
# without it, every re-run would strip one more coefficient.
if len(coefficients) == len(important_words) + 1:
    coefficients = list(coefficients[1:])

In [236]:
# Pair every important word with its coefficient, in order.
word_coeffs_tuples = list(zip(important_words, coefficients))

In [249]:
# Order the (word, coefficient) pairs from the largest coefficient to the smallest.
word_coeffs_tuples = sorted(
    word_coeffs_tuples,
    key=lambda pair: pair[1],
    reverse=True,
)

In [250]:
# Ten words with the largest coefficients.
word_coeffs_tuples[:10]

[('love', 0.084119424383667249),
 ('great', 0.08311194239481122),
 ('easy', 0.073503830170933745),
 ('loves', 0.048081701602836098),
 ('little', 0.045369103717335692),
 ('perfect', 0.034437993486444413),
 ('well', 0.02765945177922657),
 ('nice', 0.020781358649221086),
 ('old', 0.019747740577050456),
 ('fits', 0.019133111494995469)]

In [251]:
# Ten words with the smallest (most negative) coefficients; the list is sorted in descending order.
word_coeffs_tuples[-10:]

[('return', -0.02680721350222864),
 ('waste', -0.027196962011447265),
 ('back', -0.028219842211087653),
 ('get', -0.029491013575623962),
 ('disappointed', -0.030485382478378471),
 ('even', -0.033105049970913902),
 ('work', -0.033181734209090492),
 ('money', -0.04028587082625365),
 ('product', -0.042707473571271297),
 ('would', -0.05259056720690225)]