In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from util import *

# Load data

In [None]:
# Input files: the review dump and its companion product-metadata dump.
filename = './reviews_Office_Products.json'
meta_filename = './meta_Office_Products.json'

# Base name of the review file without directory or extension; used later
# as the prefix of the pickled feature cache.
review_basename = os.path.basename(filename)
filename_without_extension = os.path.splitext(review_basename)[0]

# Read the reviews with their metadata, then let populate_fields() add its
# fields to `data` (NOTE(review): presumably mutates `data` in place - confirm in util).
data = read_review_json_file(filename, meta_filename)
populate_fields(data)

# Seed numpy's global RNG and pass the same seed to the splitter so the
# train / validation / test partition is reproducible.
np.random.seed(42)
train_indexes, valid_indexes, test_indexes = get_data_split(
    len(data),
    seed=42,
)

# Set up target

In [None]:
# A review is a positive example when it received at least one vote and at
# least 70% of those votes were up-votes.
has_votes = data['totals'] >= 1
helpful_ratio = data['ups'] * 1.0 / data['totals']  # * 1.0 forces float division on Python 2
data['target'] = has_votes & (helpful_ratio >= 0.7)
data.describe()

# Extract Features

In [None]:
# Either recompute the feature dictionary or load it from the pickle cache.
#   recalculate_features: True  -> call extract_features() now
#                         False -> load the previously cached pickle
#   cache_features:       when recomputing, also write the result to the cache
recalculate_features = False
cache_features = False
feature_cache_path = filename_without_extension + '_name_to_feature_dict.pkl'

if recalculate_features:
    # NOTE(review): train_indexes is presumably passed so that vectorizers are
    # fit on training rows only - confirm in util.extract_features.
    name_to_feature_dict = extract_features(data, train_indexes, do_not_consider={'vectorized_summary', 'word_embedding'})
    if cache_features:
        # `with` guarantees the file is flushed and closed even if dump() raises.
        with open(feature_cache_path, 'wb') as cache_file:
            cPickle.dump(name_to_feature_dict, cache_file, protocol=cPickle.HIGHEST_PROTOCOL)
else:
    # SECURITY: unpickling can execute arbitrary code. Only load a cache file
    # that was produced locally by this notebook.
    with open(feature_cache_path, 'rb') as cache_file:
        name_to_feature_dict = cPickle.load(cache_file)

In [None]:
# Feature groups, keyed by the names used in name_to_feature_dict.
numerical_feature_names = [
    'length',
    'overall',
    'price',
    'review_lateness',
    'product_review_number',
    'review_year',
]
bag_feature_names = [
    'vectorized_reviews',
    'vectorized_product_title',
    'vectorized_reviewer_name',
    'vectorized_category',
]
# Added to name_to_feature_dict separately from the extracted features.
special_feature_names = ['leaf_feature']

In [None]:
# Calculate leaf feature
# Build the numerical feature matrix, derive the leaf feature from it via
# extract_leaf_feature(), and register the result under 'leaf_feature'.
# NOTE(review): the params dict looks like tree-ensemble settings - confirm
# which estimator extract_leaf_feature() builds.
leaf_model_params = {'n_estimators': 100, 'max_depth': 11}
numerical_features = get_features_by_names(name_to_feature_dict, numerical_feature_names)
leaf_feature = extract_leaf_feature(
    numerical_features,
    data['target'],
    train_indexes,
    leaf_model_params,
)
name_to_feature_dict['leaf_feature'] = leaf_feature

# Model and Hyperparameter selection

In [None]:
from hyperopt import fmin, tpe, hp, rand
def f(params):
    start_time = time.time()
    for name in clip_set:
        if name in params:
            params[name] = int(params[name])
    print 'using ', params
    model = clf.set_params(**params)
    model.fit(features[train_indexes], data.loc[train_indexes]['target'])
    train_prob_preds = model.predict_proba(features[train_indexes])
    training_auc = metrics.average_precision_score(data.loc[train_indexes]['target'], train_prob_preds[:,1])
    valid_prob_preds = model.predict_proba(features[valid_indexes])
    valid_auc = metrics.average_precision_score(data.loc[valid_indexes]['target'], valid_prob_preds[:,1])
    print 'training_auc', training_auc
    print 'valid_auc', valid_auc
    print 'time %.2f mins'%((time.time() - start_time) / 60)
    return -valid_auc

model_name = 'MultinomialNB'

if model_name == 'XGBClassifier':
    clf = XGBClassifier()
    space = {'n_estimators':hp.quniform('n_estimators', 100, 400, 5),
             'max_depth': hp.quniform('max_depth', 2, 20, 1),
             'min_child_weight':hp.uniform('min_child_weight', 0.00001, 20),
             'gamma':hp.uniform('gamma', 0.00001, 20)}
    clip_set = {'n_estimators', 'max_depth'}
    used_feature_names = bag_feature_names + numerical_feature_names
elif model_name == 'LogisticRegression':
    clf = LogisticRegression()
    space = {'C':hp.loguniform('C', -5, 5)}
    clip_set = {}
    used_feature_names = bag_feature_names
elif model_name == 'MultinomialNB':
    clf = naive_bayes.MultinomialNB()
    space = {'alpha':hp.uniform('alpha', 0, 10)}
    clip_set = {}
    used_feature_names = bag_feature_names
else:
    print model_name + "doesn't exist"

features = get_features_by_names(name_to_feature_dict, used_feature_names)
print 'feature shape:', features.shape
best = fmin(fn=f,
    space=space,
    algo=tpe.suggest,
    max_evals=40)
print best
f(best)

# Manually try a specific model and parameters based on the previous cell
Trained on the training set.

In [None]:
used_feature_names = bag_feature_names + numerical_feature_names
features = get_features_by_names(name_to_feature_dict, used_feature_names, scale_feature_names=numerical_feature_names)
print 'feature shape:', features.shape
# model = LogisticRegression(C=0.101)
# model = XGBClassifier(n_estimators=300, max_depth=6)
# model = sklearn.linear_model.SGDClassifier(loss='log', n_iter=40, alpha=0.00011)
# model = naive_bayes.MultinomialNB(alpha=0.154)
# model = DummyClassifier()
model = XGBClassifier(**{'n_estimators': 200, 'max_depth': 13, 'gamma': 13, 'min_child_weight': 6})

train_and_evaluate(model, features, data, train_indexes, valid_indexes, test_indexes)

# Add features one by one to evaluate features 

In [None]:
# Feature addition test
used_feature_names = ['vectorized_reviews', 'vectorized_product_title', 'length', 'overall', 'vectorized_reviewer_name',
                       'price', 'vectorized_category', 'review_lateness', 'product_review_number', 'review_year']
features = hstack(map(name_to_feature_dict.__getitem__, used_feature_names), format='csr')
training_aucs = []
valid_aucs = []
model = XGBClassifier(**{'n_estimators': 200, 'max_depth': 13, 'gamma': 13, 'min_child_weight': 6})
for i in range(0, len(used_feature_names)):
    print '------------------------------'
    temp_feature_names = used_feature_names[:i + 1]
    print 'use', temp_feature_names
    features = hstack(map(name_to_feature_dict.__getitem__, temp_feature_names), format='csr')
    model.fit(features[train_indexes], data.loc[train_indexes]['target'])

    print 'training set'
    train_prob_preds = model.predict_proba(features[train_indexes])
    evaluate(data.loc[train_indexes]['target'], train_prob_preds[:,1])
    training_auc = metrics.average_precision_score(data.loc[train_indexes]['target'], train_prob_preds[:,1])
    training_aucs.append(training_auc)

    print ''
    print 'validation set'
    valid_prob_preds = model.predict_proba(features[valid_indexes])
    evaluate(data.loc[valid_indexes]['target'], valid_prob_preds[:,1])
    
    valid_auc = metrics.average_precision_score(data.loc[valid_indexes]['target'], valid_prob_preds[:,1])
    valid_aucs.append(valid_auc)

print 'training AUC', zip(used_feature_names, map(lambda x:'%.4f'%(x), training_aucs))
print 'validation AUC', zip(used_feature_names, map(lambda x:'%.4f'%(x), valid_aucs)) 

# Final Result
Trained on the training and validation sets, tested on the test set.

In [None]:
dev_indexes = np.concatenate([train_indexes, valid_indexes])
final_name_to_feature_dict = extract_features(data, dev_indexes, do_not_consider={'vectorized_summary', 'word_embedding'})
used_feature_names = ['vectorized_reviews', 'vectorized_product_title', 'length', 'overall', 'vectorized_reviewer_name',
                      'price', 'vectorized_category', 'review_lateness', 'product_review_number', 'review_year']
final_features = get_features_by_names(final_name_to_feature_dict, used_feature_names)
print 'feature shape:', features.shape

final_model = XGBClassifier(**{'n_estimators': 200, 'max_depth': 13, 'gamma': 13, 'min_child_weight': 6})
final_model.fit(final_features[dev_indexes], data.loc[dev_indexes]['target'])

print ''
print 'dev set'
dev_prob_preds = final_model.predict_proba(final_features[dev_indexes])
evaluate(data.loc[dev_indexes]['target'], dev_prob_preds[:,1])

print ''
print 'test set'
test_prob_preds = final_model.predict_proba(final_features[test_indexes])
evaluate(data.loc[test_indexes]['target'], test_prob_preds[:,1], draw_roc=True, draw_precision_recall_curve=True)