# Final: Amazon Product Review Kaggle Competition
# Model 2
## DSE 220: Machine Learning
## Due Date: 11 June 2017
## Orysya Stus

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
import gzip
    
def readGz(f):
  """Lazily yield one parsed record per line of the gzip file at path ``f``.

  Each line is passed to ``eval`` and the resulting object is yielded.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed, instead of leaving it open.
  with gzip.open(f) as handle:
    for l in handle:
      # SECURITY: eval() executes whatever the line contains; only use this
      # on data files from a trusted source.
      yield eval(l)

def parse(path):
  """Lazily yield one parsed record per line of the gzip file at ``path``.

  The file is opened in binary mode; each line is passed to ``eval`` and the
  resulting object is yielded.
  """
  # The context manager closes the gzip handle once the generator is
  # exhausted or closed, instead of leaving it open.
  with gzip.open(path, 'rb') as g:
    for l in g:
      # SECURITY: eval() executes whatever the line contains; only use this
      # on data files from a trusted source.
      yield eval(l)

def getDF(path):
  """Load every record produced by ``parse(path)`` into a DataFrame.

  Records become rows, labelled 0, 1, 2, ... in the order they are read.
  """
  records = {row_id: record for row_id, record in enumerate(parse(path))}
  return pd.DataFrame.from_dict(records, orient='index')

# Read the training file first, then the test file, each into its own frame.
train_df, test_df = (
    getDF(file_name) for file_name in ('train.json.gz', 'test_Helpful.json.gz')
)

In [2]:
# Show the (rows, columns) size of the training frame.
print(train_df.shape)

(200000, 12)


In [3]:
# Count the missing values in each column.
train_df.isnull().sum()

categoryID             0
categories             0
itemID                 0
reviewerID             0
rating                 0
reviewText             0
reviewHash             0
reviewTime             0
summary                0
unixReviewTime         0
helpful                0
price             125851
dtype: int64

In [4]:
# Report the percentage of training rows whose price is missing.
print('Column price might need to be dropped since', train_df['price'].isnull().sum()/train_df.shape[0] * 100.0, '% of the data is null.')

Column price might need to be dropped since 62.9255 % of the data is null.


In [5]:
# Report how many distinct reviewHash values exist, then drop the column.
# NOTE(review): the column is removed from train_df in place, so running this
# cell a second time raises KeyError on the missing 'reviewHash' column.
print('Delete column reviewHash since', train_df['reviewHash'].nunique(), 'unique values, out of', train_df.shape[0], 'exists.')
del train_df['reviewHash']

Delete column reviewHash since 200000 unique values, out of 200000 exists.


### Data Pre-Processing & Feature Engineering

Data Cleaning
Done
- deleted 'reviewHash' because it is completely unique
- create helpful_rate (if 0/0 then fillna with 0), delete 'helpful'
- convert 'reviewTime' to datetime
- delete 'unixReviewTime' since 'reviewTime' is present
- created review_keyText dictionary -> removed punctuation, stopwords, list of remaining words
- created review_allText dictionary -> list of all words
- created review_keyText_count dictionary -> length of each wordlist in review_keyText
- created review_allText_count dictionary -> length of each wordlist in review_allText
- created a vocabulary list comprised of the set of review_keyText
- created a dataframe called itemID_RPD -> calculates the reviews per day for each itemID
- created a dataframe called reviewerID_RDP -> calculates the reviews per day (column reviewerID_RPD) for each reviewerID

To Do
- determine which users with best nHelpful count use which words --> count the number of words in each reviewText
- see distribution of helpful_rate via histogram
- match items by itemID; if the itemID is not present, fall back to similar categories and price (e.g. nearest neighbours)
- determine which words are used through distribution of helpful_rate
- build a dataframe for item which contains the categoryID, categories (binary w/dummy variables), price

In [6]:
# Expand each review's `helpful` dict into its own columns (one per dict key)
# and attach them to the training frame.
a = pd.DataFrame.from_dict(dict(train_df['helpful'])).T
train_df1 = pd.concat([train_df, a], axis=1)
# Share of votes that were helpful. 0/0 produces NaN, which is mapped to 0.
# The filled Series is assigned back explicitly: calling
# fillna(..., inplace=True) on a column selection is chained assignment and is
# not guaranteed to write through to train_df1.
train_df1['helpful_rate'] = (train_df1['nHelpful'] / train_df1['outOf']).fillna(0)
del train_df1['helpful']
# Keep the parsed datetime and drop the redundant unix timestamp.
train_df1['reviewTime'] = pd.to_datetime(train_df1['reviewTime'])
del train_df1['unixReviewTime']

In [7]:
from nltk.corpus import stopwords
import string

# Stopword sets already loaded, keyed by language name.
_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return the NLTK stopwords for ``language`` as a frozenset, loading them once."""
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def reviewText_listed(row):
    """Tokenise one review string.

    Parameters
    ----------
    row : str
        Raw review text.

    Returns
    -------
    tuple
        ``(all_words, len(all_words), subset_list, len(subset_list))`` where
        ``all_words`` is the lower-cased whitespace split of ``row`` and
        ``subset_list`` is ``all_words`` with punctuation characters stripped,
        empty tokens removed and English stopwords removed.
    """
    all_words = [w.lower() for w in row.split()]
    # Fetch the stopwords once per call (and load them once per session)
    # instead of re-reading the corpus list for every single word.
    stop_words = _stopword_set()
    stripped = (''.join(c for c in w if c not in string.punctuation) for w in all_words)
    subset_list = [w for w in stripped if w != '' and w not in stop_words]
    return all_words, len(all_words), subset_list, len(subset_list)

# Tokenise every training review, filling four lookups keyed by row position
# and collecting the per-review keyword lists in `vocabulary`.
reviewText = list(train_df1['reviewText'])
review_allText, review_allText_count = {}, {}
review_keyText, review_keyText_count = {}, {}
vocabulary = []
for position, text in enumerate(reviewText):
    words, n_words, keywords, n_keywords = reviewText_listed(text)
    review_allText[position] = words
    review_allText_count[position] = n_words
    review_keyText[position] = keywords
    review_keyText_count[position] = n_keywords
    vocabulary.append(keywords)
# `count` ends up equal to the number of reviews processed.
count = len(reviewText)

In [8]:
from itertools import chain
# Flatten the per-review keyword lists into the set of distinct keywords.
# NOTE(review): `vocabulary` is overwritten with its own flattened form, so
# this cell is only meaningful when run once after the tokenising cell.
vocabulary = set(chain.from_iterable(vocabulary))

In [9]:
# Compare the number of distinct reviewers and items with the number of rows.
print('There are', train_df1['reviewerID'].nunique(), 'unique reviewerIDS out of', train_df1.shape[0], 'training records.')
print('There are', train_df1['itemID'].nunique(), 'unique itemIDs out of', train_df1.shape[0], 'training records.')

There are 39249 unique reviewerIDS out of 200000 training records.
There are 19913 unique itemIDs out of 200000 training records.


In [10]:
def RPD(row):
    """Reviews per day for one aggregated row.

    ``row`` must provide 'count', 'max' and 'min'. The span is the whole-day
    difference ``(row['max'] - row['min']).days``; a zero-day span returns 0
    rather than dividing.
    """
    span_days = (row['max'] - row['min']).days
    return row['count'] / span_days if span_days != 0 else 0

In [11]:
# Per-reviewer review count and last / first review date.
reviewer_times = train_df1.groupby('reviewerID')['reviewTime']
rt_count = reviewer_times.count()
rt_max = reviewer_times.max()
rt_min = reviewer_times.min()

# `keys` names the three columns at concat time. Writing into
# `columns.values[...]` mutates the backing array of an immutable Index, which
# pandas does not support and which may fail or leave stale lookups.
reviewerID_RDP = pd.concat([rt_count, rt_max, rt_min], axis=1, join="inner",
                           keys=['count', 'max', 'min'])

# Reviews per day for each reviewer (0 when all reviews fall on the same day).
reviewerID_RDP['reviewerID_RPD'] = reviewerID_RDP.apply(RPD, axis=1)
reviewerID_RDP.head()

Unnamed: 0_level_0,count,max,min,reviewerID_RPD
reviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
U000005418,13,2014-01-25,2011-12-29,0.01715
U000025708,7,2013-12-03,2013-05-30,0.037433
U000095100,20,2014-06-07,2012-02-14,0.023697
U000129529,3,2014-01-01,2013-12-31,3.0
U000130531,7,2013-08-15,2013-03-29,0.05036


In [12]:
# Per-item review count and last / first review date.
item_times = train_df1.groupby('itemID')['reviewTime']
rt_count = item_times.count()
rt_max = item_times.max()
rt_min = item_times.min()

# `keys` names the three columns at concat time. Writing into
# `columns.values[...]` mutates the backing array of an immutable Index, which
# pandas does not support and which may fail or leave stale lookups.
itemID_RPD = pd.concat([rt_count, rt_max, rt_min], axis=1, join="inner",
                       keys=['count', 'max', 'min'])

# Reviews per day for each item (0 when all reviews fall on the same day).
itemID_RPD['itemID_RPD'] = itemID_RPD.apply(RPD, axis=1)
itemID_RPD.head()

Unnamed: 0_level_0,count,max,min,itemID_RPD
itemID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
I000059267,4,2014-02-26,2010-12-21,0.003439
I000139473,9,2014-04-10,2013-02-20,0.021739
I000159068,7,2014-02-09,2013-01-03,0.017413
I000235837,24,2014-07-02,2010-01-26,0.014833
I000384418,13,2014-05-24,2010-06-26,0.009104


In [13]:
# Preview the cleaned training frame.
train_df1.head(3)

Unnamed: 0,categoryID,categories,itemID,reviewerID,rating,reviewText,reviewTime,summary,price,nHelpful,outOf,helpful_rate
0,0,"[[Clothing, Shoes & Jewelry, Women], [Clothing...",I655355328,U745881038,3.0,"These are cute, but they are a little small. ...",2014-05-20,Cute,,0,0,0.0
1,0,"[[Clothing, Shoes & Jewelry, Women, Clothing, ...",I241092314,U023577405,4.0,"I love the look of this bra, it is what I want...",2013-02-07,Beautiful but size runs small,,0,0,0.0
2,0,"[[Clothing, Shoes & Jewelry, Wedding Party Gif...",I408260822,U441384838,3.0,it's better on a man's hand.I didn't find it v...,2014-05-13,Good price but...,19.99,2,2,1.0


In [14]:
# Confirm the row and column counts after cleaning.
train_df1.shape

(200000, 12)

In [62]:
# Turn the two per-review word-count lookups into frames indexed by row position
# (keyword counts first, then all-word counts).
review_keyword_length, review_allword_length = (
    pd.DataFrame.from_dict(word_counts, orient='index')
    for word_counts in (review_keyText_count, review_allText_count)
)

In [32]:
# `categories` holds, per review, a list of category paths; flatten once to get
# the paths and a second time to get the individual category values.
categories = list(train_df1['categories'])
g = list(chain.from_iterable(categories))
unique_category_values = set(chain.from_iterable(g))
print('There are', len(unique_category_values), 'unique categories values.')

There are 1042 unique categories values


In [34]:
# Number of distinct categoryID values (these become the dummy columns below).
print('There are', train_df1['categoryID'].nunique(), 'unique categoriesID values.')

5

In [295]:
# One indicator column per categoryID value.
dummies = pd.get_dummies(train_df1['categoryID']).rename(columns=lambda x: 'categoryID_'+str(x))
# Name the word-count columns before concatenating. Renaming afterwards through
# hard-coded positions in `master.columns.values` breaks as soon as the column
# count changes, and mutates the backing array of an immutable Index.
# Assigning a one-element list also asserts each frame has exactly one column.
content_len = review_keyword_length.copy()
content_len.columns = ['review_content_len']
all_len = review_allword_length.copy()
all_len.columns = ['review_all_len']
master = pd.concat([train_df1, dummies, content_len, all_len], axis=1)
# Share of a review's words that survive stopword / punctuation removal.
master['review_contentratio'] = master['review_content_len']/master['review_all_len']
del master['categoryID']
master.head(3)

Unnamed: 0,categories,itemID,reviewerID,rating,reviewText,reviewTime,summary,price,nHelpful,outOf,helpful_rate,categoryID_0,categoryID_1,categoryID_2,categoryID_3,categoryID_4,review_content_len,review_all_len,review_contentratio
0,"[[Clothing, Shoes & Jewelry, Women], [Clothing...",I655355328,U745881038,3.0,"These are cute, but they are a little small. ...",2014-05-20,Cute,,0,0,0.0,1,0,0,0,0,10,24,0.416667
1,"[[Clothing, Shoes & Jewelry, Women, Clothing, ...",I241092314,U023577405,4.0,"I love the look of this bra, it is what I want...",2013-02-07,Beautiful but size runs small,,0,0,0.0,1,0,0,0,0,25,57,0.438596
2,"[[Clothing, Shoes & Jewelry, Wedding Party Gif...",I408260822,U441384838,3.0,it's better on a man's hand.I didn't find it v...,2014-05-13,Good price but...,19.99,2,2,1.0,1,0,0,0,0,15,28,0.535714


In [281]:
# create a model using only rating, outOf and categoryID_0-4; predict nHelpful (only where outOf > 0)
# if outOf == 0 then nHelpful == 0 by definition, so those rows do not need to be modelled

In [282]:
# Keep only the rows whose outOf is non-zero; these are the rows used for fitting.
nonzero_outof = master['outOf'] != 0
model = master[nonzero_outof]
print('With all data', master.shape, ', when outOf != 0', model.shape)

With all data (200000, 19) , when outOf != 0 (63016, 19)


In [322]:
# Feature columns used by the model; the target is nHelpful.
columns = ['rating', 'outOf', 'categoryID_0', 'categoryID_1', 'categoryID_2', 'categoryID_3', 'categoryID_4']
X_train = pd.DataFrame(model, columns=columns)
# `.ix` has been removed from pandas; double-bracket selection returns the same
# single-column DataFrame (later cells read y_valid['nHelpful']).
y_train = model[['nHelpful']]

In [323]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; random_state=0 fixes the shuffle.
X_train1, X_valid, y_train1, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

In [299]:
### Test data
# Expand the `helpful` dict of every test review into its own columns.
a = pd.DataFrame.from_dict(dict(test_df['helpful'])).T
test_df1 = pd.concat([test_df, a], axis=1)

# Tokenise every test review, mirroring the lookups built for the training data.
reviewText = list(test_df1['reviewText'])
review1_allText, review1_allText_count = {}, {}
review1_keyText, review1_keyText_count = {}, {}
vocabulary1 = []
for position, text in enumerate(reviewText):
    words, n_words, keywords, n_keywords = reviewText_listed(text)
    review1_allText[position] = words
    review1_allText_count[position] = n_words
    review1_keyText[position] = keywords
    review1_keyText_count[position] = n_keywords
    vocabulary1.append(keywords)
# `count` ends up equal to the number of reviews processed.
count = len(reviewText)

In [300]:
# Turn the two test-set word-count lookups into frames indexed by row position
# (keyword counts first, then all-word counts).
review1_keyword_length, review1_allword_length = (
    pd.DataFrame.from_dict(word_counts, orient='index')
    for word_counts in (review1_keyText_count, review1_allText_count)
)

In [324]:
# One indicator column per categoryID value found in the test data.
dummies = pd.get_dummies(test_df1['categoryID']).rename(columns=lambda x: 'categoryID_'+str(x))
# Name the word-count columns before concatenating. Renaming afterwards through
# hard-coded positions in `master.columns.values` breaks as soon as the column
# count changes, and mutates the backing array of an immutable Index.
# Assigning a one-element list also asserts each frame has exactly one column.
content_len = review1_keyword_length.copy()
content_len.columns = ['review_content_len']
all_len = review1_allword_length.copy()
all_len.columns = ['review_all_len']
# NOTE: this rebinds `master`, which previously held the training frame.
master = pd.concat([test_df1, dummies, content_len, all_len], axis=1)
master['review_contentratio'] = master['review_content_len']/master['review_all_len']
del master['categoryID']
# Select the model's feature columns; a column absent from `master` comes back as NaN.
X_test = pd.DataFrame(master, columns=columns)
X_test.head()

Unnamed: 0,rating,outOf,categoryID_0,categoryID_1,categoryID_2,categoryID_3,categoryID_4,review_all_len
0,3.0,2,1,0,0,0,0,27
1,4.0,0,1,0,0,0,0,27
2,5.0,1,1,0,0,0,0,23
3,5.0,1,1,0,0,0,0,135
4,4.0,0,1,0,0,0,0,38


In [311]:
# Keep only the test rows whose outOf is non-zero.
nonzero_outof = X_test['outOf'] != 0
model1 = X_test[nonzero_outof]
print('With all data', X_test.shape, ', when outOf != 0', model1.shape)

With all data (14000, 8) , when outOf != 0 (4400, 8)


In effect, only the 63,016 training rows with outOf != 0 are used to predict the 4,400 test rows with outOf != 0; every other test row is predicted as 0.

# Gradient Boosting Regression
- without rounding 0.16543
- with rounding 0.16271

In [312]:
# GridSearchCV lives in sklearn.model_selection (already used above for
# train_test_split); the old sklearn.grid_search module has been removed.
from sklearn.model_selection import GridSearchCV
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

In [None]:
# Grid-search the learning rate and loss of a gradient-boosting regressor with a
# fixed number of trees and tree depth.
# NOTE(review): scikit-learn 1.0 renamed loss 'ls' to 'squared_error' and 'lad'
# to 'absolute_error'; update these strings on releases that dropped the old names.
gridparams = dict(learning_rate=[0.01, 0.1], loss=['ls', 'lad', 'huber', 'quantile'])
params = {'n_estimators': 100, 'max_depth': 4}
# The scorer is named 'neg_mean_absolute_error' (higher is better); the
# un-prefixed 'mean_absolute_error' name was deprecated and then removed.
gbclf = GridSearchCV(ensemble.GradientBoostingRegressor(**params), gridparams, scoring='neg_mean_absolute_error', n_jobs=-1)
# y_train1 is a single-column DataFrame; pass a flat 1-D array as the target.
gbclf.fit(X_train1, y_train1.values.ravel())

print("Best model:")
print(gbclf.best_estimator_)
print("")

# Score the selected model on the held-out validation rows.
y_pred = gbclf.predict(X_valid)
print("Mean absolute error: %0.3f" % mean_absolute_error(np.array(y_valid['nHelpful']), y_pred))

In [314]:
# Refit the chosen configuration on all training rows with outOf != 0.
# Only the tuned settings are spelled out. The earlier call also passed
# `min_impurity_split` and `presort` at their default values; both have since
# been removed from GradientBoostingRegressor and would raise TypeError.
# NOTE(review): scikit-learn 1.0 renamed loss 'lad' to 'absolute_error'; update
# this string on releases that dropped the old name.
gbreg1 = ensemble.GradientBoostingRegressor(learning_rate=0.1, loss='lad',
                                            max_depth=4, n_estimators=100)
# y_train is a single-column DataFrame; a flat 1-D target avoids the
# column-vector conversion warning.
gbreg1.fit(X_train, y_train.values.ravel())

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='lad', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False)

In [319]:
# Rows with outOf == 0 are predicted as 0; all other rows get the rounded model
# output. The model is called once on the whole 2-D selection: `.ix` has been
# removed from pandas, and predict() expects a 2-D input, not a single 1-D row.
nonzero_outof = (X_test['outOf'] != 0).values
rounded_predictions = np.zeros(len(X_test), dtype=int)
if nonzero_outof.any():  # predict() rejects an empty selection
    rounded_predictions[nonzero_outof] = np.rint(gbreg1.predict(X_test[nonzero_outof])).astype(int)
# Plain list of Python ints, one entry per test row, in row order.
gbreg1_predictions = rounded_predictions.tolist()















In [320]:
# with rounding
# The ids and outOf values are read from test_df1, whose rows are in the same
# order as gbreg1_predictions. The previously used names `test_user_id`,
# `test_item_id` and `outOf` are not defined anywhere in this notebook.
# NOTE(review): assumes test_df1 has the same reviewerID / itemID columns as the
# training data; confirm against the test file.
assert len(gbreg1_predictions) == len(test_df1), "one prediction is required per test row"
# The context manager closes the file so every row is flushed to disk.
with open("predictions_gbreg_rounding_Helpful.csv", 'w') as predictions:
    predictions.write('userID-itemID-outOf,prediction\n')
    rows = zip(test_df1['reviewerID'], test_df1['itemID'], test_df1['outOf'], gbreg1_predictions)
    for user, item, outof, prediction in rows:
        predictions.write(user + '-' + item + '-' + str(outof) + ',' + str(prediction) + '\n')