In [1]:
import pandas as pd
import numpy as np

import random

import xgboost as xgb
import gc

import utils

# Reload in case utils.py updated
# import importlib
# utils = importlib.reload(utils)

In [2]:
import warnings
warnings.filterwarnings('ignore')

#  Calculate loss for every sample from LOO results

In [3]:
# loo = pd.read_csv('loo.csv', sep=';', header=None)
# loo.columns=['loo']
# loo['target']=y_train
# loo_loss = loo.apply(lambda x: log_loss([x['target']], [x['loo']], labels=[1, 0]), axis=1)
# pd.DataFrame(loo_loss).to_csv('loo_loss.csv', index=False, header=False, sep=';')

# Load data and generate new features

In [4]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data(train, more_clean=True)
test = utils.clean_data(test, more_clean=True)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic2 = data.groupby('age_group_orig')['age'].min().to_dict()
dic3 = data.groupby('age_group_orig')['age'].max().to_dict()
X_train['age_dif2'] = X_train[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)
X_test['age_dif2']  =  X_test[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)

# Calculte stratify groups

In [5]:
loo_loss = pd.read_csv('loo_loss.csv', sep=';', header=None)
strat = pd.qcut(loo_loss, 20, labels=False).astype(str)
strat = np.hstack((strat, y_train.reshape((-1,1))))
strat = np.apply_along_axis(lambda d: str(d[0]) + '_' + str(d[1]), 1, strat)

# Keras models

In [6]:
from keras.layers.core import Dense, Dropout
from keras.layers.advanced_activations import LeakyReLU
from keras.models import Sequential
from keras.optimizers import Adamax

from sklearn.preprocessing import StandardScaler

class KerasModel(object):
    def __init__(self,
                 var_num,
                 epochs=70,
                 learn_rate=0.1,
                 config=None,
                 batch_size=512,
                 verbose=0,
                 validation_split=0.2,
                 loss="binary_crossentropy"):

        self.epochs = epochs
        self.batch_size = batch_size
        self.verbose = verbose
        self.validation_split = validation_split
        
        self.model = Sequential()
        model = self.model
        
        if config is None:
            config =[(var_num, 0.0)]
        else:
            config = config.copy()
            
        n, dp = config.pop(0)

        model.add(Dense(n, input_dim=var_num, kernel_initializer='uniform'))
        model.add(LeakyReLU())
        if 0 < dp < 1:
            model.add(Dropout(dp))
        
        while config:
            n, dp = config.pop(0)
            model.add(Dense(n, kernel_initializer='uniform'))
            model.add(LeakyReLU())
            if 0 < dp < 1:
                model.add(Dropout(dp))


        model.add(Dense(1, activation='sigmoid'))
        opt = Adamax(lr=learn_rate)

        model.compile(loss=loss, optimizer=opt, metrics=['accuracy'])


    def fit(self, X, y, sample_weight=None, callbacks=[]):
        process_X = X.values if hasattr(X, 'iloc') else X
        process_y = y
        return self.model.fit(process_X, process_y, batch_size=self.batch_size,
                       epochs=self.epochs, verbose=self.verbose,
                       sample_weight=sample_weight,
                       callbacks=callbacks,
                       validation_split=self.validation_split,
                       shuffle=True)

    def predict_proba(self, X):
        process_x = X.values if hasattr(X, 'iloc') else X
        result  = self.model.predict(process_x)
        classone_probs = result
        classzero_probs = 1.0 - classone_probs
        return np.hstack((classzero_probs, classone_probs))
#         return result
    
    def predict(self, X):
        return np.argmax(self.predict_proba(X), axis=1)

Using Theano backend.


In [7]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_restored',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1)) #.values
X2 = pd.DataFrame(scaler.transform(X2)) #.values

def create(x1, x2):
    config = [(64,0.075), (64,0.025)]
    return KerasModel(var_num=len(use_columns),
                   epochs=200,
                   learn_rate=0.001,
                   config=config,
                   batch_size=1024,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="+-KERAS_4_hey3",
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )


10 folds logloss:
[0.53808503548482933, 0.53884686765635181, 0.54003997763976341, 0.53812692408851048, 0.53751825536679487, 0.53995355279650969, 0.53869798183479278, 0.53713969671228812, 0.53888532399449207, 0.53918895585501081]
mean: 0.538648257143
std: 0.000904036643709
5 Splits logloss:
[0.53943486353976744, 0.53942470778746621, 0.53994360993248192, 0.53803543300305801, 0.53956285256244951]
mean: 0.539280293365
std: 0.000650269690796
+-KERAS_4_hey3 results saved!


(0.53864825714293429, 0.53928029336504468)

In [8]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_restored',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]

X1 = X_train[use_columns]
X2 = X_test[use_columns]

X = pd.concat((X1,X2), axis=0)
scaler = StandardScaler().fit(X)
X1 = pd.DataFrame(scaler.transform(X1))
X2 = pd.DataFrame(scaler.transform(X2))

def create(x1, x2):
    config = [(150,0.5), (64,0.2)]
    return KerasModel(var_num=len(use_columns),
                   epochs=500,
                   learn_rate=0.001,
                   config=config,
                   batch_size=2000,
                   verbose=0,
                   validation_split=0.0)
utils.execute_model(None,
              X1,
              y_train,
              X2,
              model_name="+-KERAS_5_hey3",
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
              create_callback=create
             )


10 folds logloss:
[0.53761156280117783, 0.53892291320219388, 0.54010385235926828, 0.53782496609730723, 0.53629403980961943, 0.53930981373114251, 0.53902345156883069, 0.53752494536534967, 0.53881472586798329, 0.53895055540202796]
mean: 0.53843808262
std: 0.00104912802091
5 Splits logloss:
[0.53839273105703644, 0.53917217837549036, 0.53968607032778004, 0.53799788580222851, 0.53846568005629591]
mean: 0.538742909124
std: 0.000604709108768
+-KERAS_5_hey3 results saved!


(0.53843808262048998, 0.53874290912376632)

# XGB models

In [9]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
'age_dif2',
'ap_lo_mul_log_ap_hi',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]
model_name = '+-XGB_1.5_hey3'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,
    #'scale_pos_weight': 1.0008,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
model = xgb.XGBClassifier(**params)
utils.execute_model(model,
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53683087878243174, 0.53739440015392026, 0.53703107500107594, 0.53629097760746192, 0.53612553688910203, 0.53915086945768242, 0.53804198434940032, 0.53659263486673847, 0.53763225976905393, 0.53643972637262194]
mean: 0.537153034325
std: 0.000883736311121
5 Splits logloss:
[0.53684778302251579, 0.53814789299649146, 0.53768941315334473, 0.5381390738050853, 0.53736723468346259]
mean: 0.537638279532
std: 0.000492232765225
+-XGB_1.5_hey3 results saved!


46063

In [10]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]
model_name = '+-XGB_hist_last_hey3'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.5361988439327745, 0.53878553475211877, 0.53642550825857205, 0.53594055637981564, 0.5361002982727765, 0.5379512786791748, 0.53839335070994565, 0.53687083555218573, 0.53820052226766257, 0.53704927838189853]
mean: 0.537191600719
std: 0.00100157499167
5 Splits logloss:
[0.53745552328122514, 0.5381727428733416, 0.53720863004951247, 0.53826750450572447, 0.53750435736615743]
mean: 0.537721751615
std: 0.000420159697957
+-XGB_hist_last_hey3 results saved!


37980

In [11]:
use_columns = [
"gender",
"height",
"weight",
"ap_hi",
"ap_lo",
"cholesterol",
"height_div_ap_lo",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]

model_name = '+-XGB_5++_hey3'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 5555,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53679096203406984, 0.53686247819084321, 0.53722622819229526, 0.53670143130906356, 0.53668676167029405, 0.5388207351446832, 0.53853416463088055, 0.53751440324474176, 0.53710882082165756, 0.53709345427608357]
mean: 0.537333943951
std: 0.000716923158664
5 Splits logloss:
[0.53704756018075916, 0.53844297794997698, 0.53769848116522745, 0.53828573732823137, 0.53752307473602035]
mean: 0.537799566272
std: 0.000510395888075
+-XGB_5++_hey3 results saved!


98

In [12]:
use_columns = [
"gender",
"ap_hi",
"ap_lo",
"cholesterol",
"active_fair",
"smoke_restored",
"alco_restored",
"height_mul_log_cholesterol",
"height_mul_log_gluc",
"BMI",
"age_group",
"cholesterol_div_log_gluc",
"gluc_mul_log_age",
"ap_hi_mul_weight",
"age_dif2",
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]
model_name = '+-XGB_11.5_hey3'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53655731619564451, 0.53760840705419, 0.53614940550827284, 0.53577996441323572, 0.53642402483270102, 0.53844697005811515, 0.53818211341601374, 0.53721983370543724, 0.53748259387352615, 0.53680509493777651]
mean: 0.537065572399
std: 0.000829288977356
5 Splits logloss:
[0.5372267876168092, 0.53809160436867254, 0.53759290787825986, 0.53803163087527672, 0.53729765687989339]
mean: 0.537648117524
std: 0.000359764570448
+-XGB_11.5_hey3 results saved!


98

In [13]:
# 1500+ features, very long execution!
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]
model_name = 'XGB_16_hey3'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': -1,
    'random_state': 1223,
    'silent': True,
}

X1 = X_train[use_columns].copy()
X2 = X_test[use_columns].copy()

from itertools import combinations
from patsylearn import PatsyModel, PatsyTransformer

interactions = []
for i in range(2, 5):
    for comb in combinations(['alco_restored', 'smoke_restored', 'active_restored', 'age_group', 'gender', 'gluc', 'cholesterol'], i):
        interactions.append(':'.join(['C(%s)' % c for c in comb]))
formula = ' + '.join(interactions)
transformer = PatsyTransformer(formula)
transformer.fit(data)
X1 = pd.concat((X1, pd.DataFrame(transformer.transform(X_train))), axis=1)
X2 = pd.concat((X2, pd.DataFrame(transformer.transform(X_test))), axis=1)
utils.execute_model(xgb.XGBClassifier(**params),
              X1,
              y_train,
              X2,
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53584379411531313, 0.5379204437885805, 0.53685329615344046, 0.53593173720193965, 0.53589469605506002, 0.53835195593721119, 0.53813142447067774, 0.53708889733699383, 0.53744152613702623, 0.53700221169126938]
mean: 0.537045998289
std: 0.00088645551337
5 Splits logloss:
[0.53728746125722926, 0.5383591846114113, 0.53727894882041782, 0.53798174995254899, 0.53749453630812816]
mean: 0.53768037619
std: 0.000424657404709
XGB_16_hey3 results saved!


39

# Last model uses 'old' way of cleaning data

In [14]:
train = pd.read_csv('train.csv', sep=';')
test = pd.read_csv('test.csv', sep=';', na_values='None')

train = utils.clean_data_old(train)
test = utils.clean_data_old(test)

train = utils.new_features(train)
test = utils.new_features(test)

X_train = train.drop([ 'cardio'], axis=1)
y_train = train['cardio'].values.ravel()
X_test = test.drop([], axis=1)

data = pd.concat((train.drop('cardio', axis=1), test), axis=0)
dic2 = data.groupby('age_group_orig')['age'].min().to_dict()
dic3 = data.groupby('age_group_orig')['age'].max().to_dict()
X_train['age_dif2'] = X_train[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)
X_test['age_dif2']  =  X_test[['age_group_orig', 'age']].apply(lambda x: (x['age'] - dic2[x['age_group_orig']]) / dic3[x['age_group_orig']], axis=1)

In [16]:
from sklearn.pipeline import Pipeline, FeatureUnion
from utils import SmoothLikelihood4, ColumnsFilter
from sklearn.model_selection import StratifiedKFold

def wrap_classifier(clf, use_columns, mean_columns):
    fs = [("filter", ColumnsFilter(use_columns))]
    
    for i, cc in enumerate(mean_columns):
        fs.append(('mean_'+str(i), SmoothLikelihood4(cc, 0.5,
                                                     kf=StratifiedKFold(random_state=111111+i, n_splits=20, shuffle=True),
                                                     alpha=13,
                                                     seed=10+i,
                                                     std=0.0003)))
    combined_features = FeatureUnion(fs)
    return Pipeline([("features", combined_features), ("model", clf)])

use_columns = [
'gender',
'height',
'weight',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'ap_hi_mul_weight',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]

# Trying to use mean target for interactions of categorical features...
mean_columns = [
    ['cholesterol','gluc','smoke_restored','active_restored'],
    ['ap_hi_group', 'age_group', 'gender'],
    ['gender','cholesterol','age_group']
]
model_name = 'old2_XGB_9'
params = {
     'colsample_bytree': 0.875,
     'gamma': 0.05,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 5,
     'n_estimators': 369,

     'reg_alpha': 0,
     'reg_lambda': 10,
     'subsample': 0.7,
    
    'n_jobs': 1,
    'random_state': 5555,
    'silent': True,
}
model = wrap_classifier(xgb.XGBClassifier(**params), use_columns, mean_columns)
utils.execute_model(model,
              X_train,
              y_train,
              X_test,
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53603066476075734, 0.53755204237491194, 0.53756356279751649, 0.53699439413520189, 0.53617252208793764, 0.53904005470963801, 0.53820523700075829, 0.53770240983956663, 0.5374563776603839, 0.53750300052720534]
mean: 0.537422026589
std: 0.000838733595604
5 Splits logloss:
[0.53785442561283703, 0.53831680767131707, 0.53822322433459613, 0.53821570676032982, 0.5378486527302081]
mean: 0.538091763422
std: 0.000199359414591
old2_XGB_9 results saved!


1763

In [17]:
use_columns = [
'gender',
'ap_hi',
'ap_lo',
'cholesterol',
'active_fair',
'smoke_restored',
'alco_restored',
'height_mul_log_cholesterol',
'height_mul_log_gluc',
'BMI',
'age_group',
'cholesterol_div_log_gluc',
'gluc_mul_log_age',
'age_dif2',
'ap_lo_mul_log_ap_hi',
'age_group_div_height',
'age_group_mul_log_MAP',
'cholesterol_div_ap_hi',
'ap_hi_mul_log_gluc',
'BMI_div_ap_hi',
'BMI_div_log_age',
# 'gluc',
'gluc_mul_height',

'ap_hi_1',
'ap_lo_1',
'ap_hi_2',
'ap_lo_2',
]
model_name = '+-XGB_hist_last_hey3_old'
params = {
     'colsample_bytree': 0.95,
     'gamma': 0.55,
     'learning_rate': 0.02,
     'max_depth': 5,
     'min_child_weight': 3,
     'n_estimators': 392,
     'reg_alpha': 0,
     'reg_lambda': 0.4,
     'subsample': 0.85,

    'tree_method': 'hist',
    'grow_policy': 'lossguide',
    
    'n_jobs': 4,
    'random_state': 2222,
    'silent': True,
}
utils.execute_model(xgb.XGBClassifier(**params),
              X_train[use_columns],
              y_train,
              X_test[use_columns],
              model_name=model_name,
              n_splits=5,
              n_folds=10,
              stratification_groups=strat,
             )
gc.collect()


10 folds logloss:
[0.53629192696554262, 0.53912417668723811, 0.53648656223284907, 0.5359235939373741, 0.53602853457454958, 0.53778239749724188, 0.53858169374014364, 0.53673672400797168, 0.53811822237132778, 0.53733641252020892]
mean: 0.537241024453
std: 0.0010651473799
5 Splits logloss:
[0.53753664910305465, 0.53840526516904075, 0.53747071837882199, 0.53829987020072123, 0.53773954474651031]
mean: 0.53789040952
std: 0.000389045409912
+-XGB_hist_last_hey3_old results saved!


49

# Merge models and save results

In [18]:
models = [
    '+-KERAS_4_hey3',
    '+-KERAS_5_hey3',
    '+-XGB_11.5_hey3',
    '+-XGB_1.5_hey3',
    '+-XGB_hist_last_hey3',
    '+-XGB_hist_last_hey3_old',
    '+-XGB_5++_hey3',
    'old2_XGB_9',
    'XGB_16_hey3',
         ]
result = utils.merge_models(models, method='mean')
pd.DataFrame(result).to_csv('merged_models.csv', index=False, header=False, sep=';')


+-KERAS_4_hey3
0.538648315917	0.539280293323	0.538648315917	0.539280293323

+-KERAS_5_hey3
0.53843806856	0.538742909025	0.53822013196	0.53870561627

+-XGB_11.5_hey3
0.537065334861	0.537648117547	0.53713347719	0.537527681376

+-XGB_1.5_hey3
0.537152971163	0.537638279601	0.536881998939	0.537242136536

+-XGB_hist_last_hey3
0.537191400652	0.5377217516	0.53672805302	0.537086841736

+-XGB_hist_last_hey3_old
0.537240859634	0.537890409453	0.536685065957	0.537071911656

+-XGB_5++_hey3
0.537333747496	0.537799566163	0.536664326068	0.537043567908

old2_XGB_9
0.537421764496	0.538091763398	0.536615154218	0.537017652488

XGB_16_hey3
0.537045771061	0.53768037617	0.536604862479	0.537024535086


In [None]:
CV:      0.536604862479
Public:  0.5426653
Private: 0.5306077