In [1]:
import warnings
# Silence DeprecationWarning everywhere so library deprecation chatter does not clutter cell output.
warnings.simplefilter('ignore', DeprecationWarning)

In [6]:
import xgboost as xgb
from bayes_opt import BayesianOptimization

import pandas as pd
import numpy as np
import os
import re
import random
from matplotlib import pyplot as plt

from subprocess import check_output
print(check_output(['ls', 'data']).decode('utf-8'))

% matplotlib inline

athenaeum_authors_preview.csv
athenaeum_painting_filtered.csv
athenaeum_paintings.csv
athenaeum_paintings_sizes.csv
color_histograms.csv
complete_data.csv
images
images_athenaeum
images_sizes_2325.csv
net_predicted.csv
painter_info_clean.csv
painting_info_clean.csv
resized_200
test_author200.csv
test_data.csv
test_hist_author_knn.csv
test_hist_author_rf.csv
train_author200.csv
train_data.csv
train_hist_author_knn.csv
train_hist_author_rf.csv



In [7]:
# Load the train/test tables (one row per painting; author_id and
# painting_id columns are dropped from the features in a later cell).
train_data = pd.read_csv("data/train_hist_author_rf.csv")
test_data = pd.read_csv("data/test_hist_author_rf.csv")

# print(...) with a single argument behaves identically on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print("[INFO] The size of train histogram for Random Forest" + str(train_data.shape))
print("[INFO] The size of test histogram for Random Forest" + str(test_data.shape))


[INFO] The size of train histogram for Random Forest(49890, 34)
[INFO] The size of test histogram for Random Forest(12473, 34)


In [8]:
def get_top_author(data, num_author=3):
    """Keep only the rows belonging to the most frequent authors.

    Args:
        data: DataFrame with an ``author_id`` column.
        num_author: how many of the most frequent ``author_id`` values to keep.

    Returns:
        The rows of ``data`` whose ``author_id`` is among the ``num_author``
        most common ones; original row order and index are preserved.
    """
    # value_counts() is sorted by descending frequency, so the head of its
    # index is exactly the set of most prolific authors.
    frequency = data['author_id'].value_counts()
    keep_mask = data['author_id'].isin(frequency.index[:num_author])
    return data.loc[keep_mask]

train = get_top_author(train_data, 3)
# Restrict the test set to the authors actually kept for training. Picking
# the test set's own top authors independently could select different
# classes from the ones the model is trained on.
test = test_data.loc[test_data['author_id'].isin(train['author_id'].unique())]

# Sanity check: both splits should list the same author ids.
print(train.author_id.value_counts())
print("[train above] " + '=' * 50 + "[test below]")
print(test.author_id.value_counts())

24      1369
1793    1338
368     1335
Name: author_id, dtype: int64
24      342
1793    335
368     334
Name: author_id, dtype: int64


In [9]:
# Separate the target (author_id) from the features and drop the identifier
# columns so they cannot be used as predictors.
# NOTE: this cell rebinds `train`/`test`; re-running it without first
# re-running the previous cell fails because author_id is already dropped.
train_labels = train.author_id
train = train.drop(['author_id', 'painting_id'], axis=1)
test_labels = test.author_id
test = test.drop(['author_id', 'painting_id'], axis=1)

# print(...) with a single argument works on both Python 2 and Python 3.
print(test_labels.value_counts())
print("-" * 50)
print(train_labels.value_counts())

24      342
1793    335
368     334
Name: author_id, dtype: int64
--------------------------------------------------
24      1369
1793    1338
368     1335
Name: author_id, dtype: int64


================================================================================================================

# Bayesian Optimization

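Background reading on the method: [Bayesian optimization tutorial (scikit-optimize)](https://scikit-optimize.github.io/notebooks/bayesian-optimization.html). Note that the code below uses the `bayes_opt` package (`BayesianOptimization`), not scikit-optimize.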

================================================================================================================

In [None]:
def xgb_evaluate(min_child_weight,
                 colsample_bytree,
                 max_depth,
                 subsample,
                 gamma,
                 alpha):
    """Objective for the Bayesian optimizer: cross-validated XGBoost score.

    Writes the proposed hyper-parameters into the notebook-level ``params``
    dict, runs 5-fold ``xgb.cv`` on the notebook-level ``xgtrain`` with early
    stopping, and returns the negated final ``test-mae-mean`` so that a
    maximizer ends up minimizing the MAE.

    Relies on these names being defined before the call: ``params``,
    ``xgtrain``, ``num_rounds`` and ``random_state``.

    Args:
        min_child_weight: proposed value, truncated to an int.
        colsample_bytree: proposed value, clamped to [0, 1].
        max_depth: proposed value, truncated to an int.
        subsample: proposed value, clamped to [0, 1].
        gamma: proposed value, floored at 0.
        alpha: proposed value, floored at 0.

    Returns:
        Negative of the last ``test-mae-mean`` entry of the CV result.
    """
    # The optimizer proposes floats; integer-valued parameters are truncated
    # and ratio parameters are clamped to their valid range.
    params['min_child_weight'] = int(min_child_weight)
    # Key must be spelled 'colsample_bytree'; the previous 'cosample_bytree'
    # typo meant the proposed value never reached XGBoost.
    params['colsample_bytree'] = max(min(colsample_bytree, 1), 0)
    params['max_depth'] = int(max_depth)
    params['subsample'] = max(min(subsample, 1), 0)
    params['gamma'] = max(gamma, 0)
    params['alpha'] = max(alpha, 0)

    cv_result = xgb.cv(params, xgtrain, num_boost_round=num_rounds, nfold=5,
                       seed=random_state,
                       callbacks=[xgb.callback.early_stop(50)])

    # Negate so that maximizing the returned value minimizes the MAE.
    return -cv_result['test-mae-mean'].values[-1]


In [None]:
def prepare_data():
    """Load ``../input/train.csv`` and wrap it in an ``xgb.DMatrix``.

    Every object-dtype column is replaced by integer codes, the ``loss``
    column becomes the label, and ``loss`` and ``id`` are removed from the
    feature matrix.

    Returns:
        xgb.DMatrix built from the encoded features with ``loss`` as label.
    """
    # Local name chosen so it does not shadow the notebook-level `train`.
    frame = pd.read_csv('../input/train.csv')
    categorical_columns = frame.select_dtypes(include=['object']).columns

    for column in categorical_columns:
        # Sorted factorization assigns codes by sorted unique value, which is
        # the same mapping a label encoder fitted on the column produces, and
        # it avoids the LabelEncoder/tqdm names this notebook never imports.
        # Missing values (if any) are coded as -1.
        codes, _ = pd.factorize(frame[column], sort=True)
        frame[column] = codes

    y = frame['loss']

    # Keyword axis: the positional form is not accepted by newer pandas.
    X = frame.drop(['loss', 'id'], axis=1)
    xgtrain = xgb.DMatrix(X, label=y)

    return xgtrain

In [None]:
if __name__ == '__main__':
    # Alternative input (a different dataset): xgtrain = prepare_data()
    # xgb.cv needs a DMatrix that carries both features and labels; the bare
    # feature DataFrame `train` has no labels attached.
    xgtrain = xgb.DMatrix(train, label=train_labels)

    # Search / training budget.
    num_rounds = 3000      # upper bound on boosting rounds per CV run
    random_state = 2017    # seed shared by the booster params and xgb.cv
    num_iter = 25          # optimization steps after the initial points
    init_points = 5        # initial points evaluated before optimization

    # NOTE(review): the labels are raw author ids and 'mae' scores them as a
    # numeric target - confirm this is intended. A multi-class objective
    # would need labels re-encoded to 0..k-1 and a matching metric in
    # xgb_evaluate.
    # NOTE(review): 'verbose_eval' looks like an xgb.cv argument rather than
    # a booster parameter - confirm whether it has any effect here.
    params = {
        'eta': 0.1,
        'silent': 1,
        'eval_metric': 'mae',
        'verbose_eval': True,
        'seed': random_state
    }

    # Bounds for each hyper-parameter explored by the optimizer; the keys
    # must match the argument names of xgb_evaluate.
    xgbBO = BayesianOptimization(xgb_evaluate, {'min_child_weight': (1, 20),
                                                'colsample_bytree': (0.1, 1),
                                                'max_depth': (5, 15),
                                                'subsample': (0.5, 1),
                                                'gamma': (0, 10),
                                                'alpha': (0, 10),
                                                })

    xgbBO.maximize(init_points=init_points, n_iter=num_iter)