In [24]:
import xgboost as xgb
import numpy as np
import matplotlib.pyplot as plt

In [25]:
data_path = "/Users/oliverpaul/Data_Science/idiap/lucideles/data"
data = np.load(data_path + '/train.npy')

In [26]:
data_x_dgp, data_y_dgp = data[:,:-2].copy(), data[:,-2].copy()
data_x_ill, data_y_ill = data[:,:-2].copy(), data[:,-1].copy()

In [27]:
data_dgp = xgb.DMatrix(data_x_dgp, label=data_y_dgp)
data_ill = xgb.DMatrix(data_x_ill, label=data_y_ill)

In [5]:
params_dgp = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':0.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:squarederror',
    'eval_metric': 'mae'
}

params_ill = {
    'max_depth':6,
    'min_child_weight': 1,
    'eta':0.3,
    'subsample': 1,
    'colsample_bytree': 1,
    'objective':'reg:squarederror',
    'eval_metric': 'mae'
}

In [7]:
model = xgb.cv(
    params_dgp, 
    data_dgp,
    num_boost_round=1000,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)

In [8]:
model

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,0.740245,0.000108,0.741046,0.000486
1,0.530992,0.000414,0.532178,0.000277
2,0.384049,0.000475,0.385804,0.000445
3,0.280006,0.000776,0.282145,0.000768
4,0.207283,0.000648,0.209743,0.000459
...,...,...,...,...
125,0.029533,0.000154,0.044766,0.000499
126,0.029404,0.000173,0.044726,0.000494
127,0.029354,0.000162,0.044719,0.000491
128,0.029288,0.000189,0.044694,0.000497


In [9]:
model['test-mae-mean'].min()

0.0446914

In [10]:
#finding best max depth and min_child weight params
gridsearch_params = [(max_depth, min_child_weight) for max_depth in range(5,15) for min_child_weight in range(1,8)]

def depth_child_weight(params, data):
    params_cv = params.copy()
    # Define initial best params and MAE
    min_mae = float("Inf")
    best_params = None
    for max_depth, min_child_weight in gridsearch_params:
        print("CV with max_depth = %i, min_child_weight = %i" % (max_depth,min_child_weight))
        # Update our parameters
        params_cv['max_depth'] = max_depth
        params_cv['min_child_weight'] = min_child_weight
        # Run CV
        cv_results = xgb.cv(
            params_cv,
            data,
            num_boost_round=1000,
            seed=42,
            nfold=5,
            metrics={'mae'},
            early_stopping_rounds=10
        )
        # Update best MAE
        mean_mae = cv_results['test-mae-mean'].min()
        boost_rounds = cv_results['test-mae-mean'].argmin()
        print("\tMAE %.5f after %.5f rounds" % (mean_mae, boost_rounds))
        if mean_mae < min_mae:
            min_mae = mean_mae
            best_params = (max_depth, min_child_weight)
    params['max_depth'] = best_params[0]
    params['min_child_weight'] = best_params[1]
    print("Best params: max_depth: %i, min_child_weight: %i, MAE: %.5f" % (best_params[0], best_params[1], min_mae))

In [11]:
depth_child_weight(params_dgp, data_dgp)

CV with max_depth = 5, min_child_weight = 1
	MAE 0.04868 after 298.00000 rounds
CV with max_depth = 5, min_child_weight = 2
	MAE 0.05033 after 172.00000 rounds
CV with max_depth = 5, min_child_weight = 3
	MAE 0.05063 after 167.00000 rounds
CV with max_depth = 5, min_child_weight = 4
	MAE 0.04941 after 276.00000 rounds
CV with max_depth = 5, min_child_weight = 5
	MAE 0.04930 after 301.00000 rounds
CV with max_depth = 5, min_child_weight = 6
	MAE 0.04999 after 227.00000 rounds
CV with max_depth = 5, min_child_weight = 7
	MAE 0.04996 after 260.00000 rounds
CV with max_depth = 6, min_child_weight = 1
	MAE 0.04469 after 129.00000 rounds
CV with max_depth = 6, min_child_weight = 2
	MAE 0.04405 after 242.00000 rounds
CV with max_depth = 6, min_child_weight = 3
	MAE 0.04539 after 109.00000 rounds
CV with max_depth = 6, min_child_weight = 4
	MAE 0.04501 after 167.00000 rounds
CV with max_depth = 6, min_child_weight = 5
	MAE 0.04539 after 135.00000 rounds
CV with max_depth = 6, min_child_weight 

In [13]:
#same for ill
depth_child_weight(params_ill, data_ill)

CV with max_depth = 5, min_child_weight = 1
	MAE 0.02533 after 999.00000 rounds
CV with max_depth = 5, min_child_weight = 2
	MAE 0.02582 after 999.00000 rounds
CV with max_depth = 5, min_child_weight = 3
	MAE 0.02626 after 999.00000 rounds
CV with max_depth = 5, min_child_weight = 4
	MAE 0.02699 after 765.00000 rounds
CV with max_depth = 5, min_child_weight = 5
	MAE 0.02657 after 999.00000 rounds
CV with max_depth = 5, min_child_weight = 6
	MAE 0.02698 after 999.00000 rounds
CV with max_depth = 5, min_child_weight = 7
	MAE 0.02845 after 544.00000 rounds
CV with max_depth = 6, min_child_weight = 1
	MAE 0.02167 after 999.00000 rounds
CV with max_depth = 6, min_child_weight = 2
	MAE 0.02149 after 999.00000 rounds
CV with max_depth = 6, min_child_weight = 3
	MAE 0.02210 after 999.00000 rounds
CV with max_depth = 6, min_child_weight = 4
	MAE 0.02286 after 999.00000 rounds
CV with max_depth = 6, min_child_weight = 5
	MAE 0.02349 after 589.00000 rounds
CV with max_depth = 6, min_child_weight 

In [14]:
#saving updated params
np.save(data_path + '/XG_DGP_PARAMS.npy', params_dgp) 
np.save(data_path + '/XG_ILL_PARAMS.npy', params_ill)

In [15]:
#load the params dicts
params_dgp = np.load(data_path + '/XG_DGP_PARAMS.npy',allow_pickle='TRUE').item()
params_ill = np.load(data_path + '/XG_ILL_PARAMS.npy',allow_pickle='TRUE').item()

In [28]:
params_dgp

{'max_depth': 12,
 'min_child_weight': 1,
 'eta': 0.3,
 'subsample': 1,
 'colsample_bytree': 1,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [29]:
params_ill

{'max_depth': 10,
 'min_child_weight': 1,
 'eta': 0.3,
 'subsample': 1,
 'colsample_bytree': 1,
 'objective': 'reg:squarederror',
 'eval_metric': 'mae'}

In [42]:
#Now the same for row subsample and colsample
gridsearch_params = [(subsample, colsample) for subsample in [i/10. for i in range(8,11)] for colsample in [i/10. for i in range(8,11)]]

def subsample_colsample(params, data):
    params_cv = params.copy()
    # Define initial best params and MAE
    min_mae = float("Inf")
    best_params = None
    for subsample, colsample in gridsearch_params:
        print("CV with subsample = %.1f, colsample = %.1f" % (subsample, colsample))
        # Update our parameters
        params_cv['subsample'] = subsample
        params_cv['colsample_bytree'] = colsample
        # Run CV
        cv_results = xgb.cv(
            params_cv,
            data,
            num_boost_round=1000,
            seed=42,
            nfold=5,
            metrics={'mae'},
            early_stopping_rounds=10
        )
        # Update best MAE
        mean_mae = cv_results['test-mae-mean'].min()
        boost_rounds = cv_results['test-mae-mean'].argmin()
        print("\tMAE %.5f after %.5f rounds" % (mean_mae, boost_rounds))
        if mean_mae < min_mae:
            min_mae = mean_mae
            best_params = (subsample, colsample)
    params['subsample'] = best_params[0]
    params['colsample_bytree'] = best_params[1]
    print("Best params: subsample: %.1f, colsample: %.1f, MAE: %.5f" % (best_params[0], best_params[1], min_mae))

In [34]:
subsample_colsample(params_dgp, data_dgp)

CV with subsample = 0.5, colsample = 0.5
	MAE 0.10079 after 998.00000 rounds
CV with subsample = 0.5, colsample = 0.6
	MAE 0.07393 after 777.00000 rounds
CV with subsample = 0.5, colsample = 0.7
	MAE 0.07393 after 777.00000 rounds
CV with subsample = 0.5, colsample = 0.8
	MAE 0.06006 after 410.00000 rounds
CV with subsample = 0.5, colsample = 0.9
	MAE 0.04797 after 159.00000 rounds
CV with subsample = 0.5, colsample = 1.0
	MAE 0.03678 after 46.00000 rounds
CV with subsample = 0.6, colsample = 0.5
	MAE 0.09945 after 999.00000 rounds
CV with subsample = 0.6, colsample = 0.6
	MAE 0.07218 after 999.00000 rounds
CV with subsample = 0.6, colsample = 0.7
	MAE 0.07218 after 999.00000 rounds
CV with subsample = 0.6, colsample = 0.8
	MAE 0.05847 after 619.00000 rounds
CV with subsample = 0.6, colsample = 0.9
	MAE 0.04564 after 352.00000 rounds
CV with subsample = 0.6, colsample = 1.0
	MAE 0.03608 after 43.00000 rounds
CV with subsample = 0.7, colsample = 0.5
	MAE 0.09831 after 999.00000 rounds
C

In [43]:
subsample_colsample(params_ill, data_ill)

CV with subsample = 0.8, colsample = 0.8
	MAE 0.03518 after 833.00000 rounds
CV with subsample = 0.8, colsample = 0.9
	MAE 0.02476 after 572.00000 rounds
CV with subsample = 0.8, colsample = 1.0
	MAE 0.01869 after 522.00000 rounds
CV with subsample = 0.9, colsample = 0.8
	MAE 0.03469 after 607.00000 rounds
CV with subsample = 0.9, colsample = 0.9
	MAE 0.02420 after 532.00000 rounds
CV with subsample = 0.9, colsample = 1.0
	MAE 0.01804 after 434.00000 rounds
CV with subsample = 1.0, colsample = 0.8
	MAE 0.03560 after 335.00000 rounds
CV with subsample = 1.0, colsample = 0.9
	MAE 0.02522 after 242.00000 rounds
CV with subsample = 1.0, colsample = 1.0
	MAE 0.01810 after 191.00000 rounds
Best params: subsample: 0.9, colsample: 1.0, MAE: 0.01804


In [44]:
#saving updated params
np.save(data_path + '/XG_DGP_PARAMS.npy', params_dgp) 
np.save(data_path + '/XG_ILL_PARAMS.npy', params_ill)

In [66]:
#load the params dicts
params_dgp = np.load(data_path + '/XG_DGP_PARAMS.npy',allow_pickle='TRUE').item()
params_ill = np.load(data_path + '/XG_ILL_PARAMS.npy',allow_pickle='TRUE').item()

In [60]:
#Now the same for eta
#gridsearch_params = [np.round(i, 2) for i in np.linspace(0.03, 0.07, 5)]
#gridsearch_params = [np.round(i, 2) for i in np.linspace(0.1, 0.5, 5)]
gridsearch_params = [np.round(i, 2) for i in np.linspace(0.05, 0.1, 5)]

def eta(params, data):
       
    params_cv = params.copy()
    # Define initial best params and MAE
    min_mae = float("Inf")
    best_params = None
    for eta in gridsearch_params:
        print("CV with eta = %.3f" % (eta))
        # Update our parameters
        params_cv['eta'] = eta
        # Run CV
        cv_results = xgb.cv(
            params_cv,
            data,
            num_boost_round=2000,
            seed=42,
            nfold=5,
            metrics={'mae'},
            early_stopping_rounds=10
        )
        # Update best MAE
        mean_mae = cv_results['test-mae-mean'].min()
        boost_rounds = cv_results['test-mae-mean'].argmin()
        print("\tMAE %.5f after %.5f rounds" % (mean_mae, boost_rounds))
        if mean_mae < min_mae:
            min_mae = mean_mae
            best_params = (eta)
    params['eta'] = best_params
    print("Best params: eta: %.3f, MAE: %.5f" % (best_params, min_mae))

In [65]:
#saving updated params
np.save(data_path + '/XG_DGP_PARAMS.npy', params_dgp) 
np.save(data_path + '/XG_ILL_PARAMS.npy', params_ill)

In [62]:
eta(params_ill, data_ill)

CV with eta = 0.050
	MAE 0.01546 after 1638.00000 rounds
CV with eta = 0.060
	MAE 0.01560 after 1298.00000 rounds
CV with eta = 0.080
	MAE 0.01565 after 1040.00000 rounds
CV with eta = 0.090
	MAE 0.01598 after 979.00000 rounds
CV with eta = 0.100
	MAE 0.01599 after 1015.00000 rounds
Best params: eta: 0.050, MAE: 0.01546
