============================================================================================

# XGBoost


============================================================================================


In [1]:
import warnings
warnings.filterwarnings('ignore', category=DeprecationWarning)

In [45]:
import xgboost as xgb
from bayes_opt import BayesianOptimization
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score

import pandas as pd
import numpy as np
import os
import re
import time
import random
from matplotlib import pyplot as plt

import fns_models as fns


from subprocess import check_output
print(check_output(['ls', 'data']).decode('utf-8'))

% matplotlib inline

athenaeum_authors_preview.csv
athenaeum_painting_filtered.csv
athenaeum_paintings.csv
athenaeum_paintings_sizes.csv
color_histograms.csv
complete_data.csv
images
images_athenaeum
images_sizes_2325.csv
net_predicted.csv
painter_info_clean.csv
painting_info_clean.csv
resized_200
test_author200.csv
test_data.csv
test_hist_author_knn.csv
test_hist_author_rf.csv
train_author200.csv
train_data.csv
train_hist_author_knn.csv
train_hist_author_rf.csv



In [3]:
train, train_labels, test, test_labels = fns.get_top_author(3)

[INFO] The size of train histogram for Random Forest(49890, 34)
[INFO] The size of test histogram for Random Forest(12473, 34)
24      1369
1793    1338
368     1335
Name: author_id, dtype: int64
24      342
1793    335
368     334
Name: author_id, dtype: int64
(4042,)
(4042, 34)


In [4]:
train = train.sample(20)
train_labels = train_labels.sample(20)
test = test.sample(15)
test_labels = test_labels.sample(15)

================================================================================================================

# Bayesian Optimization

[bayesian-optimization](https://scikit-optimize.github.io/notebooks/bayesian-optimization.html)

================================================================================================================

In [42]:
params = {}
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    
    params['min_child_weight'] = int(min_child_weight)
    params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    num_rounds = 3000
    random_state = 2017
    
    random_state = 2017
    
    cv_result = cross_val_score(
        xgb(
                 seed=random_state,
                 min_child_weight = int(min_child_weight),
                 cosample_bytree = max(min(colsample_bytree, 1), 0),
                 max_depth = int(max_depth),
                 subsample = max(min(subsample, 1), 0),
                 gamma = max(gamma, 0),
                 alpha = max(alpha, 0)        
             ),
        train, train_labels, 'f1', cv=5
    ).mean()
    return cv_result


def xgb_pca_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):

#     params['min_child_weight'] = int(min_child_weight)
#     params['cosample_bytree'] = max(min(colsample_bytree, 1), 0)
#     params['max_depth'] = int(max_depth)
#     params['subsample'] = max(min(subsample, 1), 0)
#     params['gamma'] = max(gamma, 0)
#     params['alpha'] = max(alpha, 0)
    

    random_state = 2017
    
    cv_result = cross_val_score(
        xgb(
                 seed=random_state,
                 min_child_weight = int(min_child_weight),
                 cosample_bytree = max(min(colsample_bytree, 1), 0),
                 max_depth = int(max_depth),
                 subsample = max(min(subsample, 1), 0),
                 gamma = max(gamma, 0),
                 alpha = max(alpha, 0)        
             ),
        pca_transformed, train_labels, 'f1', cv=5
    ).mean()
    return cv_result


In [43]:

def xgb_bo(xgb_fn = xgb_evaluate):
    start_time = time.time()
    
    
    random_state = 2017
    num_iter = 25
    init_points = 5
#     params = {
#         'eta': 0.1,
#         'silent': 1,
#         'eval_metric': 'mae',
#         'verbose_eval': True,
#         'seed': random_state
#     }

    xgbBO = BayesianOptimization(xgb_fn, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'max_depth': (5, 15),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })

    xgbBO.maximize(init_points=init_points, n_iter=num_iter)
    print('-' * 53)
    print '\n%f' % (time.time() - start_time)

    print('-' * 53)
    print('Final Results')
    print('XGboost: %f' % xgbBO.res['max']['max_val'])
    print('-' * 53)
    print('XGboost: %f' % xgbBO.res['max'])
    fns.plot_bo(xgb_fn, xgbBO)


In [46]:
# Run BO for color histogram
xgb_bo(xgb_evaluate)


[31mInitialization[0m
[94m---------------------------------------------------------------------------------------------------------------------------[0m
 Step |   Time |      Value |     alpha |   colsample_bytree |     gamma |   max_depth |   min_child_weight |   subsample | 


TypeError: 'module' object is not callable

In [None]:
# Choose the best params from BO results then predict 
best_min_child_weight = 
best_colsample_bytree = 
best_subsample = 
best_gamma = 
best_alpha = 
best_xgb = xgb(n_jobs = 4, min_child_weight=best_min_child_weight, colsample_bytree=best_colsample_bytree
               max_depth = best_max_depth, subsample = best_subsample, gamma=gamma, alpha=best_alpha)

best_xgb.fit(train, train_labels)

# use the best params to predict
xgb_true, xgb_pred = test_labels, best_xgb.predict(test)
xgb_pred

In [None]:
test_data_df = fns.result_table(xgb_true, xgb_pred)
test_data_df

In [None]:
test_data_df.groupby(['actual', 'predictions']).aggregate({'results': 'count'}).unstack()

# PCA + XGB

In [5]:
## Get 15 principal components
xgb_pca = PCA(n_components=15)
xgb_pca.fit(train)
xgb_pca_transformed = xgb_pca.transform(train)
xgb_pca_transformed_test = xgb_pca.transform(test)

NameError: name 'PCA' is not defined

In [None]:
plt.plot(pca.explained_variance_ratio_)
plt.title("Scree Plot: 10 Principal Components")
plt.xlabel("Number of Components")
plt.ylabel("% of Explained Variance")
sum(pca.explained_variance_ratio_)

In [None]:
# Run BO for pca of color histogram
xgb_bo(xgb_pca_evaluate)

In [None]:
# Choose the best params from BO results then predict 
best_min_child_weight = 
best_colsample_bytree = 
best_subsample = 
best_gamma = 
best_alpha = 
best_xgb = xgb(n_jobs = 4, min_child_weight=best_min_child_weight, colsample_bytree=best_colsample_bytree
               max_depth = best_max_depth, subsample = best_subsample, gamma=gamma, alpha=best_alpha)

best_xgb.fit(xgb_pca_transformed, train_labels)

# use the best params to predict
xgb_true, xgb_pred = test_labels, best_xgb.predict(xgb_pca_transformed_test)
xgb_pred

In [None]:
best_xgb.score(xgb_pca_transformed, train_labels)
print "Accuracy of best xgb model on training: %s" % str(best_xgb.score)

best_xgb.score(xgb_pca_transformed_test, test_labels)
print "Accuracy of best xgb model on testing: %s" % str(best_xgb.score)


In [None]:

test_data_df = fns.result_table(xgb_true, xgb_pred)
test_data_df

In [None]:
test_data_df.groupby(['actual', 'predictions']).aggregate({'results': 'count'}).unstack()