In [4]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.utils import shuffle

from catboost import CatBoostClassifier
from catboost import Pool

import os

In [33]:
def getPredictions(X_train, y_train, X_test, params, n_seed = 5, n_iter = 1000, fName = None, verbose = True,
                   dirName = '/tmp/porto/catboost/'):
    """
    Fit CatBoost on the full training set once per seed and average the test predictions.

    :param X_train: training features (anything catboost.Pool accepts as data)
    :param y_train: training labels
    :param X_test: pandas dataframe with test features; its index is reused for the result
    :param params: dict with the keys 'depth', 'rate', 'l2' and 'T' (tree depth, learning rate,
                   l2_leaf_reg and bagging_temperature respectively)
    :param n_seed: number of models to fit; seeds 0 .. n_seed-1 are used
    :param n_iter: number of boosting iterations for every model
    :param fName: optional file name; when given, the averaged predictions are also written as CSV
                  with the index column labelled 'id'
    :param verbose: forwarded to CatBoostClassifier
    :param dirName: root directory under which each seed gets its own CatBoost train_dir
    :return: pandas dataframe with a single column 'target' holding the mean positive-class
             probability over all seeds
    :raises ValueError: if n_seed is smaller than 1
    """
    # Averaging over zero models would silently produce an all-NaN submission.
    if n_seed < 1:
        raise ValueError("n_seed must be at least 1, got %r" % (n_seed,))

    # one column of positive-class probabilities per seed
    prob = np.zeros([X_test.shape[0], n_seed])

    # create pool
    trainPool = Pool(X_train, y_train)
    testPool = Pool(X_test)

    for seed in range(n_seed):
        # NOTE(review): fit() below gets no eval_set, so the od_type / od_wait settings
        # presumably have no effect here - confirm against the CatBoost documentation.
        model = CatBoostClassifier(verbose=verbose, iterations=n_iter, eval_metric="AUC",
                                   depth=params['depth'], learning_rate=params['rate'],
                                   l2_leaf_reg=params['l2'], bagging_temperature=params['T'],
                                   od_type='Iter', od_wait=100,
                                   # os.path.join avoids the doubled slash of plain concatenation
                                   train_dir=os.path.join(dirName, str(seed)), random_seed=seed,
                                   gradient_iterations=4, rsm=0.9)

        model.fit(trainPool)
        prob[:, seed] = model.predict_proba(testPool)[:, 1]

    # blend the seeds with a plain arithmetic mean
    df = pd.DataFrame(prob.mean(axis=1), columns=['target'], index=X_test.index)
    if fName:
        df.to_csv(fName, index_label='id')

    return df

In [6]:
def prepareData(rawData):
    """
    Build the model-ready feature frame from a raw train/test table.

    The input frame is left untouched; a new frame is returned in which

    * every column whose name contains '_calc_' is dropped;
    * 'ps_car_15' is replaced by 'ps_car_15_mod', its square rounded to the nearest integer;
    * the flags 'ps_ind_06_bin' .. 'ps_ind_09_bin' are merged into the single integer column
      'ps_ind_69_cat' (06 -> 0, 07 -> 1, 08 -> 2, 09 -> 3);
    * 'ps_ind_14' is dropped.

    :param rawData: pandas dataframe containing the columns named above
    :return: pandas dataframe
    """

    # drop _calc_ features
    calcColumns = [col for col in rawData.columns if '_calc_' in col]
    df = rawData.drop(calcColumns, axis=1)

    # squared feature "ps_car_15"
    # Round before casting: a value stored with limited precision can square to just
    # below an integer (2.8284271247 ** 2 == 7.9999999997...), and a bare astype(int)
    # would truncate that to 7 instead of 8.
    squared = np.power(df.ps_car_15, 2).round().astype(int)
    df = df.assign(ps_car_15_mod=squared).drop("ps_car_15", axis=1)

    # inverse one-hot-encoding for ind_06 .. ind_09
    # (a row with none of the four flags set also ends up with code 0)
    indCode = 0*df.ps_ind_06_bin + df.ps_ind_07_bin + 2*df.ps_ind_08_bin + 3*df.ps_ind_09_bin
    df = df.assign(ps_ind_69_cat=indCode)
    df = df.drop(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin'], axis=1)

    # drop "ind_14"
    df = df.drop('ps_ind_14', axis=1)

    return df

In [7]:
# Load the training table (indexed by the 'id' column) and run it through prepareData.
train = prepareData(pd.read_csv('./data/train.csv', index_col='id'))

In [8]:
# Load the test table (indexed by the 'id' column) and apply the same feature preparation as for train.
test = prepareData(pd.read_csv('./data/test.csv', index_col='id'))

## Обучение модели без upsampling

In [22]:
# Baseline cross-validation run on the data as-is (no upsampling).
# NOTE(review): features and target are passed as two separate arguments here, which does not
# match the trainCatBoost(trainSet, params=...) definition further down in this notebook -
# presumably this cell was executed against an earlier version of the function; confirm before re-running.
params = {'depth': 7, 'l2': 5.5, 'rate': 0.055, 'T': 1.5}
model1 = trainCatBoost(train.drop("target", axis= 1), train.target, params= params, verbose= False)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.631454, tree count is 487
Fold #1
Local score is 0.644478, tree count is 547
Fold #2
Local score is 0.642380, tree count is 431
Fold #3
Local score is 0.644954, tree count is 360
Fold #4
Local score is 0.643083, tree count is 336


In [30]:
# Display the cross-validation summary returned for the baseline run.
model1

{'gini': 0.28241054440757107,
 'score': 0.64120527220378554,
 'scoreList': [0.63145397180268126,
  0.64447774808639191,
  0.64237993791191628,
  0.6449541257103576,
  0.64308274059338544],
 'treeList': [487, 547, 431, 360, 336]}

In [31]:
# Average number of trees per fold in the baseline run.
# Presumably used to pick n_iter for the full-data fits below - confirm.
np.array(model1['treeList']).mean(0)

432.19999999999999

In [35]:
# Fit on the full training set with 450 iterations per seed and write the averaged
# test predictions to submission25.csv.
pred = getPredictions(train.drop("target", axis= 1), train.target, test, params, n_iter= 450, fName= 'submission25.csv', verbose= False)

score 0.280

In [36]:
# Same full-data fit with 600 iterations per seed; predictions go to submission26.csv.
pred = getPredictions(train.drop("target", axis= 1), train.target, test, params, n_iter= 600, fName= 'submission26.csv', verbose = False)

score 0.280

## Добавлю upsampling в CatBoost

In [39]:
# Cross-validation with class weighting instead of duplicated rows.
# NOTE(review): the class_weight keyword and the separate features/target arguments are not
# accepted by the trainCatBoost definition further down in this notebook - presumably this cell
# was executed against an earlier version of the function; confirm before re-running.
params = {'depth': 7, 'l2': 5.5, 'rate': 0.055, 'T': 1.5}
model2 = trainCatBoost(train.drop("target", axis= 1), train.target, params= params, verbose= False, class_weight= [1, 2])

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.626639, tree count is 428
Fold #1
Local score is 0.641386, tree count is 1576
Fold #2
Local score is 0.640462, tree count is 1297
Fold #3
Local score is 0.642378, tree count is 1994
Fold #4
Local score is 0.640151, tree count is 422


In [40]:
# Display the cross-validation summary of the class-weighted run.
model2

{'gini': 0.27642036075049159,
 'score': 0.63821018037524579,
 'scoreList': [0.62663911578207776,
  0.64138649437504458,
  0.64046177186473319,
  0.6423781813920878,
  0.64015085372507363],
 'treeList': [428, 1576, 1297, 1994, 422]}

## Ручной upsampling

In [46]:
# Manual upsampling: append a second copy of every target == 1 row, then shuffle the rows.
# NOTE(review): shuffle() is called without random_state, so the row order differs between runs.
train_upsampled = pd.concat([train, train.query("target == 1")])
train_upsampled = shuffle(train_upsampled)

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15_mod,ps_ind_69_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,0,0,0,...,0,0,1,12,2,0.4,0.883679,0.37081,13,1
9,0,1,1,7,0,0,0,0,0,0,...,1,2,1,19,3,0.316228,0.618817,0.388716,6,2
13,0,5,4,9,1,0,0,0,0,0,...,1,2,1,60,1,0.316228,0.641586,0.347275,11,2
16,0,0,1,2,0,0,0,0,0,0,...,1,3,1,104,1,0.374166,0.542949,0.294958,4,0
17,0,0,2,0,1,0,0,0,0,0,...,1,2,1,82,3,0.31607,0.565832,0.365103,4,0


In [52]:
# Cross-validation on the globally upsampled frame, with the iteration cap raised to 5000.
# NOTE(review): the positive rows were duplicated before the folds are split, so copies of the
# same row can land in both the training and the validation part of a fold; the local scores
# printed by this cell are therefore presumably inflated and not comparable with the runs above.
# NOTE(review): the two-argument (features, target) call does not match the trainCatBoost
# definition further down in this notebook - confirm which version this was run against.
model3 = trainCatBoost(train_upsampled.drop("target", axis= 1), train_upsampled.target, params= params, verbose= False, maxIter= 5000)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.713011, tree count is 4992
Fold #1
Local score is 0.710714, tree count is 5000
Fold #2
Local score is 0.713906, tree count is 4996
Fold #3
Local score is 0.719510, tree count is 4998
Fold #4
Local score is 0.715529, tree count is 4994


In [55]:
# Fit on the upsampled training set (600 iterations per seed) and write submission27.csv.
# verbose=True makes CatBoost print one log line per iteration for every seed; the returned
# frame is the last expression of the cell and is therefore displayed.
getPredictions(train_upsampled.drop("target", axis= 1), train_upsampled.target, test, params, n_iter= 600, fName= 'submission27.csv', verbose = True)

Borders for float features generated
0:	learn 0.5585460748	total: 2.69s	remaining: 26m 53s
1:	learn 0.5813420721	total: 5.28s	remaining: 26m 19s
2:	learn 0.5929571282	total: 6.67s	remaining: 22m 8s
3:	learn 0.594548521	total: 8.01s	remaining: 19m 53s
4:	learn 0.5992992777	total: 9.25s	remaining: 18m 21s
5:	learn 0.6027363331	total: 9.82s	remaining: 16m 11s
6:	learn 0.606259153	total: 10.5s	remaining: 14m 52s
7:	learn 0.608193668	total: 11.9s	remaining: 14m 39s
8:	learn 0.6112627843	total: 13.1s	remaining: 14m 17s
9:	learn 0.6147275178	total: 13.7s	remaining: 13m 28s
10:	learn 0.6155198174	total: 14.8s	remaining: 13m 10s
11:	learn 0.6154578753	total: 15.3s	remaining: 12m 30s
12:	learn 0.6166126357	total: 16.1s	remaining: 12m 8s
13:	learn 0.6189890108	total: 17s	remaining: 11m 52s
14:	learn 0.6195942477	total: 17.5s	remaining: 11m 22s
15:	learn 0.6197846549	total: 18s	remaining: 10m 57s
16:	learn 0.6191868955	total: 18.4s	remaining: 10m 31s
17:	learn 0.6203395896	total: 19s	remaining: 10

151:	learn 0.6529597266	total: 1m 30s	remaining: 4m 26s
152:	learn 0.6530640928	total: 1m 30s	remaining: 4m 25s
153:	learn 0.6531732074	total: 1m 31s	remaining: 4m 24s
154:	learn 0.6532832185	total: 1m 31s	remaining: 4m 23s
155:	learn 0.6533295681	total: 1m 32s	remaining: 4m 23s
156:	learn 0.6534481748	total: 1m 32s	remaining: 4m 22s
157:	learn 0.6535136406	total: 1m 33s	remaining: 4m 21s
158:	learn 0.653683904	total: 1m 34s	remaining: 4m 20s
159:	learn 0.6537652744	total: 1m 34s	remaining: 4m 19s
160:	learn 0.6538726981	total: 1m 35s	remaining: 4m 19s
161:	learn 0.6539913929	total: 1m 35s	remaining: 4m 18s
162:	learn 0.6540853224	total: 1m 36s	remaining: 4m 17s
163:	learn 0.6542467849	total: 1m 36s	remaining: 4m 17s
164:	learn 0.6543168177	total: 1m 37s	remaining: 4m 16s
165:	learn 0.6544271699	total: 1m 37s	remaining: 4m 15s
166:	learn 0.6545747425	total: 1m 38s	remaining: 4m 14s
167:	learn 0.654755672	total: 1m 38s	remaining: 4m 13s
168:	learn 0.6550323274	total: 1m 39s	remaining: 4

299:	learn 0.6670223196	total: 2m 43s	remaining: 2m 43s
300:	learn 0.6670832795	total: 2m 44s	remaining: 2m 43s
301:	learn 0.6671306718	total: 2m 44s	remaining: 2m 42s
302:	learn 0.6672009635	total: 2m 45s	remaining: 2m 42s
303:	learn 0.6673230435	total: 2m 45s	remaining: 2m 41s
304:	learn 0.6674040077	total: 2m 46s	remaining: 2m 40s
305:	learn 0.6674903324	total: 2m 46s	remaining: 2m 40s
306:	learn 0.6675193347	total: 2m 47s	remaining: 2m 39s
307:	learn 0.6676458302	total: 2m 47s	remaining: 2m 39s
308:	learn 0.6676772238	total: 2m 48s	remaining: 2m 38s
309:	learn 0.6677712395	total: 2m 48s	remaining: 2m 37s
310:	learn 0.6678516531	total: 2m 49s	remaining: 2m 37s
311:	learn 0.6679298927	total: 2m 49s	remaining: 2m 36s
312:	learn 0.6679935689	total: 2m 50s	remaining: 2m 36s
313:	learn 0.6680841614	total: 2m 50s	remaining: 2m 35s
314:	learn 0.668132389	total: 2m 51s	remaining: 2m 34s
315:	learn 0.6682169827	total: 2m 51s	remaining: 2m 34s
316:	learn 0.6683045772	total: 2m 52s	remaining: 

447:	learn 0.6775374183	total: 3m 55s	remaining: 1m 19s
448:	learn 0.6775607043	total: 3m 56s	remaining: 1m 19s
449:	learn 0.6776276748	total: 3m 56s	remaining: 1m 18s
450:	learn 0.6777116524	total: 3m 57s	remaining: 1m 18s
451:	learn 0.6777722939	total: 3m 57s	remaining: 1m 17s
452:	learn 0.6779316245	total: 3m 58s	remaining: 1m 17s
453:	learn 0.6779821915	total: 3m 58s	remaining: 1m 16s
454:	learn 0.6780518417	total: 3m 59s	remaining: 1m 16s
455:	learn 0.6781216934	total: 3m 59s	remaining: 1m 15s
456:	learn 0.6781668085	total: 4m	remaining: 1m 15s
457:	learn 0.6782255665	total: 4m	remaining: 1m 14s
458:	learn 0.6783575677	total: 4m 1s	remaining: 1m 14s
459:	learn 0.67843686	total: 4m 1s	remaining: 1m 13s
460:	learn 0.6784949807	total: 4m 2s	remaining: 1m 13s
461:	learn 0.6785473927	total: 4m 2s	remaining: 1m 12s
462:	learn 0.6786528352	total: 4m 3s	remaining: 1m 12s
463:	learn 0.6787176443	total: 4m 3s	remaining: 1m 11s
464:	learn 0.6787371192	total: 4m 4s	remaining: 1m 10s
465:	lear

598:	learn 0.6868473703	total: 5m 16s	remaining: 529ms
599:	learn 0.6869058974	total: 5m 17s	remaining: 0us
Borders for float features generated
0:	learn 0.5729643569	total: 471ms	remaining: 4m 41s
1:	learn 0.6000099501	total: 934ms	remaining: 4m 39s
2:	learn 0.6023139347	total: 1.33s	remaining: 4m 24s
3:	learn 0.6033108254	total: 1.8s	remaining: 4m 28s
4:	learn 0.6103567604	total: 2.39s	remaining: 4m 44s
5:	learn 0.6098982165	total: 2.95s	remaining: 4m 52s
6:	learn 0.6115383837	total: 3.3s	remaining: 4m 39s
7:	learn 0.6146225225	total: 3.87s	remaining: 4m 46s
8:	learn 0.6175879169	total: 4.38s	remaining: 4m 47s
9:	learn 0.6204369382	total: 4.91s	remaining: 4m 49s
10:	learn 0.6210593111	total: 5.44s	remaining: 4m 51s
11:	learn 0.620758071	total: 5.99s	remaining: 4m 53s
12:	learn 0.6238843065	total: 6.5s	remaining: 4m 53s
13:	learn 0.6238748781	total: 6.82s	remaining: 4m 45s
14:	learn 0.6239909029	total: 7.29s	remaining: 4m 44s
15:	learn 0.6247023975	total: 7.9s	remaining: 4m 48s
16:	le

150:	learn 0.6518742373	total: 1m 21s	remaining: 4m 1s
151:	learn 0.6521128964	total: 1m 21s	remaining: 4m
152:	learn 0.6521635877	total: 1m 22s	remaining: 4m
153:	learn 0.6522138446	total: 1m 22s	remaining: 3m 59s
154:	learn 0.6523204667	total: 1m 23s	remaining: 3m 59s
155:	learn 0.6523566413	total: 1m 24s	remaining: 3m 59s
156:	learn 0.6524444213	total: 1m 24s	remaining: 3m 59s
157:	learn 0.652543044	total: 1m 25s	remaining: 3m 58s
158:	learn 0.6526505232	total: 1m 26s	remaining: 3m 58s
159:	learn 0.6528021819	total: 1m 26s	remaining: 3m 58s
160:	learn 0.6528804134	total: 1m 27s	remaining: 3m 57s
161:	learn 0.653102112	total: 1m 27s	remaining: 3m 57s
162:	learn 0.6532319925	total: 1m 28s	remaining: 3m 56s
163:	learn 0.6533450521	total: 1m 28s	remaining: 3m 56s
164:	learn 0.6534542411	total: 1m 29s	remaining: 3m 56s
165:	learn 0.6535670412	total: 1m 30s	remaining: 3m 55s
166:	learn 0.653647391	total: 1m 30s	remaining: 3m 55s
167:	learn 0.6537498506	total: 1m 31s	remaining: 3m 54s
168:

298:	learn 0.6655832743	total: 2m 47s	remaining: 2m 48s
299:	learn 0.6656648148	total: 2m 47s	remaining: 2m 47s
300:	learn 0.6657346477	total: 2m 48s	remaining: 2m 47s
301:	learn 0.6658147337	total: 2m 48s	remaining: 2m 46s
302:	learn 0.6658718068	total: 2m 49s	remaining: 2m 45s
303:	learn 0.665943133	total: 2m 49s	remaining: 2m 45s
304:	learn 0.6660015893	total: 2m 50s	remaining: 2m 44s
305:	learn 0.6661297081	total: 2m 51s	remaining: 2m 44s
306:	learn 0.6662206471	total: 2m 51s	remaining: 2m 43s
307:	learn 0.6662651018	total: 2m 52s	remaining: 2m 43s
308:	learn 0.6663702348	total: 2m 52s	remaining: 2m 42s
309:	learn 0.6664902951	total: 2m 53s	remaining: 2m 42s
310:	learn 0.666637715	total: 2m 54s	remaining: 2m 41s
311:	learn 0.6667363331	total: 2m 54s	remaining: 2m 41s
312:	learn 0.6668045747	total: 2m 55s	remaining: 2m 40s
313:	learn 0.6668807629	total: 2m 55s	remaining: 2m 40s
314:	learn 0.6670211417	total: 2m 56s	remaining: 2m 39s
315:	learn 0.6670559766	total: 2m 57s	remaining: 2

446:	learn 0.6767707673	total: 4m 9s	remaining: 1m 25s
447:	learn 0.6768445626	total: 4m 10s	remaining: 1m 24s
448:	learn 0.6768895863	total: 4m 10s	remaining: 1m 24s
449:	learn 0.6769240666	total: 4m 11s	remaining: 1m 23s
450:	learn 0.6769473727	total: 4m 12s	remaining: 1m 23s
451:	learn 0.67701174	total: 4m 12s	remaining: 1m 22s
452:	learn 0.6770663902	total: 4m 13s	remaining: 1m 22s
453:	learn 0.6771353329	total: 4m 13s	remaining: 1m 21s
454:	learn 0.6772174993	total: 4m 14s	remaining: 1m 21s
455:	learn 0.6772674991	total: 4m 15s	remaining: 1m 20s
456:	learn 0.6774201603	total: 4m 15s	remaining: 1m 20s
457:	learn 0.6774715158	total: 4m 16s	remaining: 1m 19s
458:	learn 0.6775249249	total: 4m 17s	remaining: 1m 18s
459:	learn 0.6776413921	total: 4m 17s	remaining: 1m 18s
460:	learn 0.6777055726	total: 4m 18s	remaining: 1m 17s
461:	learn 0.6777434805	total: 4m 18s	remaining: 1m 17s
462:	learn 0.6778161933	total: 4m 19s	remaining: 1m 16s
463:	learn 0.6778509048	total: 4m 19s	remaining: 1m

596:	learn 0.6860015629	total: 5m 34s	remaining: 1.68s
597:	learn 0.6860324958	total: 5m 35s	remaining: 1.12s
598:	learn 0.6861072857	total: 5m 36s	remaining: 561ms
599:	learn 0.6861480747	total: 5m 36s	remaining: 0us
Borders for float features generated
0:	learn 0.5467678305	total: 476ms	remaining: 4m 44s
1:	learn 0.6000009912	total: 961ms	remaining: 4m 47s
2:	learn 0.604919081	total: 1.49s	remaining: 4m 55s
3:	learn 0.6110460995	total: 2.01s	remaining: 4m 59s
4:	learn 0.6112660009	total: 2.54s	remaining: 5m 1s
5:	learn 0.609450709	total: 2.85s	remaining: 4m 41s
6:	learn 0.6121729795	total: 3.33s	remaining: 4m 41s
7:	learn 0.6168750991	total: 3.82s	remaining: 4m 42s
8:	learn 0.6181734834	total: 4.3s	remaining: 4m 42s
9:	learn 0.6176462527	total: 4.81s	remaining: 4m 43s
10:	learn 0.6193038719	total: 5.3s	remaining: 4m 43s
11:	learn 0.619340617	total: 5.8s	remaining: 4m 44s
12:	learn 0.6196740396	total: 6.28s	remaining: 4m 43s
13:	learn 0.6200208881	total: 6.75s	remaining: 4m 42s
14:	le

148:	learn 0.6526285221	total: 1m 13s	remaining: 3m 41s
149:	learn 0.6527855155	total: 1m 13s	remaining: 3m 40s
150:	learn 0.6528506479	total: 1m 14s	remaining: 3m 40s
151:	learn 0.6529489389	total: 1m 14s	remaining: 3m 39s
152:	learn 0.6531765205	total: 1m 15s	remaining: 3m 39s
153:	learn 0.6533337837	total: 1m 15s	remaining: 3m 38s
154:	learn 0.6534171053	total: 1m 15s	remaining: 3m 38s
155:	learn 0.6535408375	total: 1m 16s	remaining: 3m 37s
156:	learn 0.6537212959	total: 1m 16s	remaining: 3m 37s
157:	learn 0.6538119573	total: 1m 17s	remaining: 3m 36s
158:	learn 0.6538955378	total: 1m 17s	remaining: 3m 36s
159:	learn 0.6539813877	total: 1m 18s	remaining: 3m 35s
160:	learn 0.6540357627	total: 1m 18s	remaining: 3m 35s
161:	learn 0.6541070513	total: 1m 19s	remaining: 3m 34s
162:	learn 0.6541928855	total: 1m 19s	remaining: 3m 34s
163:	learn 0.654271055	total: 1m 20s	remaining: 3m 33s
164:	learn 0.6544072488	total: 1m 20s	remaining: 3m 33s
165:	learn 0.6545429788	total: 1m 21s	remaining: 

296:	learn 0.6664575241	total: 2m 29s	remaining: 2m 32s
297:	learn 0.6665313562	total: 2m 30s	remaining: 2m 32s
298:	learn 0.6665818091	total: 2m 30s	remaining: 2m 31s
299:	learn 0.6666894952	total: 2m 31s	remaining: 2m 31s
300:	learn 0.6667991106	total: 2m 31s	remaining: 2m 30s
301:	learn 0.6669147964	total: 2m 32s	remaining: 2m 30s
302:	learn 0.6669746397	total: 2m 32s	remaining: 2m 29s
303:	learn 0.667034737	total: 2m 33s	remaining: 2m 29s
304:	learn 0.6671199295	total: 2m 33s	remaining: 2m 28s
305:	learn 0.6672784187	total: 2m 34s	remaining: 2m 28s
306:	learn 0.6673146364	total: 2m 34s	remaining: 2m 27s
307:	learn 0.6673639446	total: 2m 35s	remaining: 2m 27s
308:	learn 0.6674207959	total: 2m 35s	remaining: 2m 26s
309:	learn 0.6675317622	total: 2m 36s	remaining: 2m 26s
310:	learn 0.6676111629	total: 2m 36s	remaining: 2m 25s
311:	learn 0.6677289238	total: 2m 37s	remaining: 2m 25s
312:	learn 0.6678193283	total: 2m 37s	remaining: 2m 24s
313:	learn 0.6678594454	total: 2m 38s	remaining: 

444:	learn 0.6763612516	total: 3m 51s	remaining: 1m 20s
445:	learn 0.6764158693	total: 3m 52s	remaining: 1m 20s
446:	learn 0.6764921377	total: 3m 52s	remaining: 1m 19s
447:	learn 0.6765549864	total: 3m 53s	remaining: 1m 19s
448:	learn 0.6766262459	total: 3m 53s	remaining: 1m 18s
449:	learn 0.6766851832	total: 3m 54s	remaining: 1m 18s
450:	learn 0.6767293563	total: 3m 54s	remaining: 1m 17s
451:	learn 0.6767944559	total: 3m 55s	remaining: 1m 17s
452:	learn 0.6768586787	total: 3m 55s	remaining: 1m 16s
453:	learn 0.6769130142	total: 3m 56s	remaining: 1m 16s
454:	learn 0.6769338052	total: 3m 57s	remaining: 1m 15s
455:	learn 0.6769710173	total: 3m 57s	remaining: 1m 15s
456:	learn 0.6770268718	total: 3m 58s	remaining: 1m 14s
457:	learn 0.6771563714	total: 3m 58s	remaining: 1m 14s
458:	learn 0.6772219238	total: 3m 59s	remaining: 1m 13s
459:	learn 0.6772723873	total: 4m	remaining: 1m 13s
460:	learn 0.6773234415	total: 4m	remaining: 1m 12s
461:	learn 0.6773753635	total: 4m 1s	remaining: 1m 12s
4

595:	learn 0.6854202083	total: 5m 13s	remaining: 2.1s
596:	learn 0.685514442	total: 5m 13s	remaining: 1.57s
597:	learn 0.6855995189	total: 5m 14s	remaining: 1.05s
598:	learn 0.6856340146	total: 5m 14s	remaining: 525ms
599:	learn 0.6857214975	total: 5m 15s	remaining: 0us
Borders for float features generated
0:	learn 0.5549947986	total: 439ms	remaining: 4m 23s
1:	learn 0.5903191168	total: 951ms	remaining: 4m 44s
2:	learn 0.600980047	total: 1.46s	remaining: 4m 50s
3:	learn 0.6108485169	total: 1.95s	remaining: 4m 49s
4:	learn 0.6114593543	total: 2.53s	remaining: 5m 1s
5:	learn 0.6146878934	total: 3.01s	remaining: 4m 58s
6:	learn 0.6207569421	total: 3.51s	remaining: 4m 57s
7:	learn 0.6205845857	total: 4.03s	remaining: 4m 58s
8:	learn 0.6214955178	total: 4.54s	remaining: 4m 58s
9:	learn 0.6225969117	total: 5.06s	remaining: 4m 58s
10:	learn 0.6240849425	total: 5.55s	remaining: 4m 57s
11:	learn 0.623944257	total: 6.05s	remaining: 4m 56s
12:	learn 0.6259540371	total: 6.56s	remaining: 4m 56s
13:

147:	learn 0.6526602746	total: 1m 15s	remaining: 3m 49s
148:	learn 0.6529176184	total: 1m 15s	remaining: 3m 49s
149:	learn 0.6529807673	total: 1m 16s	remaining: 3m 48s
150:	learn 0.6530515086	total: 1m 16s	remaining: 3m 48s
151:	learn 0.6531947542	total: 1m 17s	remaining: 3m 48s
152:	learn 0.6532809742	total: 1m 17s	remaining: 3m 47s
153:	learn 0.6534296045	total: 1m 18s	remaining: 3m 46s
154:	learn 0.6535231619	total: 1m 18s	remaining: 3m 46s
155:	learn 0.6536412276	total: 1m 19s	remaining: 3m 45s
156:	learn 0.6537936362	total: 1m 19s	remaining: 3m 45s
157:	learn 0.6538720867	total: 1m 20s	remaining: 3m 44s
158:	learn 0.6539589999	total: 1m 20s	remaining: 3m 44s
159:	learn 0.6540583568	total: 1m 21s	remaining: 3m 43s
160:	learn 0.6541532363	total: 1m 21s	remaining: 3m 43s
161:	learn 0.6542166005	total: 1m 22s	remaining: 3m 42s
162:	learn 0.6543573694	total: 1m 22s	remaining: 3m 42s
163:	learn 0.6546012436	total: 1m 23s	remaining: 3m 41s
164:	learn 0.6547281915	total: 1m 23s	remaining:

295:	learn 0.6668836333	total: 2m 31s	remaining: 2m 35s
296:	learn 0.6669817992	total: 2m 32s	remaining: 2m 35s
297:	learn 0.6670426096	total: 2m 32s	remaining: 2m 34s
298:	learn 0.6670861745	total: 2m 33s	remaining: 2m 34s
299:	learn 0.6671441287	total: 2m 33s	remaining: 2m 33s
300:	learn 0.6672194105	total: 2m 34s	remaining: 2m 33s
301:	learn 0.6673051666	total: 2m 34s	remaining: 2m 32s
302:	learn 0.6674635141	total: 2m 35s	remaining: 2m 32s
303:	learn 0.6675616875	total: 2m 35s	remaining: 2m 31s
304:	learn 0.6676419244	total: 2m 36s	remaining: 2m 30s
305:	learn 0.6677453687	total: 2m 36s	remaining: 2m 30s
306:	learn 0.6678674845	total: 2m 37s	remaining: 2m 29s
307:	learn 0.6679112217	total: 2m 37s	remaining: 2m 29s
308:	learn 0.6679994715	total: 2m 38s	remaining: 2m 28s
309:	learn 0.6680423579	total: 2m 38s	remaining: 2m 28s
310:	learn 0.6681738663	total: 2m 39s	remaining: 2m 27s
311:	learn 0.6683175931	total: 2m 39s	remaining: 2m 27s
312:	learn 0.6683838682	total: 2m 40s	remaining:

443:	learn 0.6775350931	total: 3m 47s	remaining: 1m 20s
444:	learn 0.6776882957	total: 3m 48s	remaining: 1m 19s
445:	learn 0.677771687	total: 3m 48s	remaining: 1m 19s
446:	learn 0.6778521926	total: 3m 49s	remaining: 1m 18s
447:	learn 0.6779616203	total: 3m 49s	remaining: 1m 17s
448:	learn 0.6780152786	total: 3m 50s	remaining: 1m 17s
449:	learn 0.6780650055	total: 3m 50s	remaining: 1m 16s
450:	learn 0.6781083824	total: 3m 51s	remaining: 1m 16s
451:	learn 0.6782178737	total: 3m 51s	remaining: 1m 15s
452:	learn 0.6782595258	total: 3m 52s	remaining: 1m 15s
453:	learn 0.6783591007	total: 3m 52s	remaining: 1m 14s
454:	learn 0.6784206917	total: 3m 53s	remaining: 1m 14s
455:	learn 0.6784448773	total: 3m 53s	remaining: 1m 13s
456:	learn 0.6784813663	total: 3m 54s	remaining: 1m 13s
457:	learn 0.6786007915	total: 3m 54s	remaining: 1m 12s
458:	learn 0.6786242185	total: 3m 55s	remaining: 1m 12s
459:	learn 0.6786868992	total: 3m 55s	remaining: 1m 11s
460:	learn 0.6786838948	total: 3m 56s	remaining: 

594:	learn 0.6867452851	total: 5m 5s	remaining: 2.57s
595:	learn 0.6867914111	total: 5m 5s	remaining: 2.05s
596:	learn 0.6868694995	total: 5m 6s	remaining: 1.54s
597:	learn 0.6869312121	total: 5m 6s	remaining: 1.03s
598:	learn 0.6870290676	total: 5m 7s	remaining: 513ms
599:	learn 0.687059568	total: 5m 7s	remaining: 0us
Borders for float features generated
0:	learn 0.5620229788	total: 366ms	remaining: 3m 39s
1:	learn 0.5782487322	total: 752ms	remaining: 3m 44s
2:	learn 0.5943675795	total: 1.3s	remaining: 4m 18s
3:	learn 0.5998080931	total: 1.77s	remaining: 4m 23s
4:	learn 0.6010947611	total: 2.28s	remaining: 4m 31s
5:	learn 0.6071204205	total: 2.78s	remaining: 4m 35s
6:	learn 0.614365203	total: 3.28s	remaining: 4m 37s
7:	learn 0.620687321	total: 3.79s	remaining: 4m 40s
8:	learn 0.6213283998	total: 4.26s	remaining: 4m 40s
9:	learn 0.6192729081	total: 4.78s	remaining: 4m 42s
10:	learn 0.6206863481	total: 5.29s	remaining: 4m 43s
11:	learn 0.6209815702	total: 5.8s	remaining: 4m 44s
12:	lear

294:	learn 0.6675415202	total: 2m 32s	remaining: 2m 38s
295:	learn 0.6675919357	total: 2m 33s	remaining: 2m 37s
296:	learn 0.667614938	total: 2m 33s	remaining: 2m 36s
297:	learn 0.6676478549	total: 2m 34s	remaining: 2m 36s
298:	learn 0.6676888159	total: 2m 34s	remaining: 2m 35s
299:	learn 0.6677186543	total: 2m 35s	remaining: 2m 35s
300:	learn 0.667748683	total: 2m 35s	remaining: 2m 34s
301:	learn 0.6678105713	total: 2m 36s	remaining: 2m 34s
302:	learn 0.6678535293	total: 2m 36s	remaining: 2m 33s
303:	learn 0.6679660586	total: 2m 37s	remaining: 2m 33s
304:	learn 0.6680371726	total: 2m 37s	remaining: 2m 32s
305:	learn 0.6680958263	total: 2m 38s	remaining: 2m 32s
306:	learn 0.6681482597	total: 2m 38s	remaining: 2m 31s
307:	learn 0.6682031781	total: 2m 39s	remaining: 2m 31s
308:	learn 0.6682768116	total: 2m 39s	remaining: 2m 30s
309:	learn 0.6683994704	total: 2m 40s	remaining: 2m 29s
310:	learn 0.6684546686	total: 2m 40s	remaining: 2m 29s
311:	learn 0.6685132456	total: 2m 41s	remaining: 2

442:	learn 0.6780677575	total: 3m 52s	remaining: 1m 22s
443:	learn 0.6781423865	total: 3m 53s	remaining: 1m 22s
444:	learn 0.6781920578	total: 3m 54s	remaining: 1m 21s
445:	learn 0.6783061405	total: 3m 54s	remaining: 1m 21s
446:	learn 0.6783654862	total: 3m 55s	remaining: 1m 20s
447:	learn 0.6784460429	total: 3m 55s	remaining: 1m 20s
448:	learn 0.6785336234	total: 3m 56s	remaining: 1m 19s
449:	learn 0.6785956113	total: 3m 57s	remaining: 1m 19s
450:	learn 0.6786210891	total: 3m 57s	remaining: 1m 18s
451:	learn 0.6787022194	total: 3m 58s	remaining: 1m 18s
452:	learn 0.6787788786	total: 3m 58s	remaining: 1m 17s
453:	learn 0.6788401102	total: 3m 59s	remaining: 1m 17s
454:	learn 0.6788886659	total: 4m	remaining: 1m 16s
455:	learn 0.6789661679	total: 4m	remaining: 1m 16s
456:	learn 0.6790595222	total: 4m 1s	remaining: 1m 15s
457:	learn 0.6792161637	total: 4m 1s	remaining: 1m 14s
458:	learn 0.6792558045	total: 4m 2s	remaining: 1m 14s
459:	learn 0.6792909803	total: 4m 3s	remaining: 1m 13s
460:

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0.048814
1,0.048920
2,0.046441
3,0.027008
4,0.065030
5,0.080789
6,0.029823
8,0.067785
10,0.098166
11,0.113069


Score 0.281

Модифицировал процедуру обучения и внёс upsampling в каждый фолд для предотвращения утечки данных.

In [36]:
def trainCatBoost(trainSet, params= {'depth': 7, 'rate': 0.03, 'l2': 8, 'T': 1},
                  folds= 5, maxIter= 2000, verbose= True, dirName= '/tmp/porto/catboost/', upsampling = False):
    """
    Cross-validate a CatBoost classifier with stratified folds and report out-of-fold AUC.

    :param trainSet: pandas dataframe with the feature columns plus a 'target' column
    :param params: dict with the keys 'depth', 'rate', 'l2' and 'T' (tree depth, learning rate,
                   l2_leaf_reg and bagging_temperature respectively); it is only read, never modified
    :param folds: number of stratified folds
    :param maxIter: upper bound on boosting iterations per fold (early stopping may use fewer)
    :param verbose: forwarded to CatBoostClassifier
    :param dirName: root log directory; a sub-directory named after the parameter values is
                    created below it, with one CatBoost train_dir per fold inside
    :param upsampling: when True, every target == 1 row of the training part of each fold is
                       duplicated once; the validation part is never touched
    :return: dict with
             'score'     - AUC over all out-of-fold predictions,
             'gini'      - 2 * score - 1,
             'scoreList' - AUC per fold,
             'treeList'  - number of trees kept per fold,
             'models'    - the fitted model of every fold
    """
    # create log directory
    dirName = dirName + str(params.get('depth')) + '_' + str(params.get('rate')) + \
              '_' + str(params.get('l2')) + '_' + str(params.get('T'))

    # An already existing directory is the normal case on a re-run and needs no message;
    # any other failure is still only reported, not raised, as before.
    if not os.path.isdir(dirName):
        try:
            os.makedirs(dirName)
        except OSError as inst:
            print(inst)

    treeList = list()
    scoreList = list()
    modelList = list()
    # out-of-fold positive-class probabilities, filled fold by fold
    prob = np.zeros([trainSet.shape[0]])

    features = trainSet.drop("target", axis=1)
    # Names must describe the feature columns only: trainSet.columns still contains 'target'
    # and would therefore be one entry longer than the data handed to Pool.
    featureNames = features.columns.tolist()

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    for i, (train_index, val_index) in enumerate(skf.split(features, trainSet.target)):
        # single parenthesised argument: valid as Python 2 statement and Python 3 call
        print('Fold #%d' % i)
        model = CatBoostClassifier(verbose=verbose, iterations=maxIter, eval_metric="AUC",
                                   depth=params['depth'], learning_rate=params['rate'],
                                   l2_leaf_reg=params['l2'], bagging_temperature=params['T'],
                                   od_type='Iter', od_wait=100,
                                   gradient_iterations=4, rsm=0.9,
                                   train_dir=os.path.join(dirName, str(i)), random_seed=i)

        trainFold = trainSet.iloc[train_index]
        valFold = trainSet.iloc[val_index]

        if upsampling:
            # Duplicate positives inside the training part only, so no copy of a validation
            # row can leak into training; seed the shuffle to keep the run reproducible.
            trainFold = pd.concat([trainFold, trainFold.query("target == 1")])
            trainFold = shuffle(trainFold, random_state=i)

        # create pool
        trainPool = Pool(trainFold.drop("target", axis=1), trainFold.target,
                         feature_names=featureNames)
        valPool = Pool(valFold.drop("target", axis=1), valFold.target,
                       feature_names=featureNames)

        # fit and estimate the model
        model.fit(trainPool, eval_set=valPool, use_best_model=True)
        prob[val_index] = model.predict_proba(valPool)[:, 1]
        localScore = roc_auc_score(valFold.target, prob[val_index])

        treeList.append(model.tree_count_)
        scoreList.append(localScore)
        modelList.append(model)
        print('Local score is %f, tree count is %d' % (localScore, model.tree_count_))

    score = roc_auc_score(trainSet.target, prob)
    return {'score': score, 'treeList': treeList, 'scoreList': scoreList, 'gini': 2*score - 1, 'models': modelList}

In [33]:
# Baseline cross-validation with the trainCatBoost defined above: learning rate 0.055, no upsampling.
params = {'depth': 7, 'l2': 5.5, 'rate': 0.055, 'T': 1.5}
model1 = trainCatBoost(train, params= params, verbose= False)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.631454, tree count is 487
Fold #1
Local score is 0.644478, tree count is 547
Fold #2
Local score is 0.642380, tree count is 431
Fold #3
Local score is 0.644954, tree count is 360
Fold #4
Local score is 0.643083, tree count is 336


In [37]:
# Same parameters, but with the positive rows duplicated inside every training fold.
# NOTE(review): this rebinds the name model2 that an earlier cell used for the class-weight run.
model2 = trainCatBoost(train, params= params, verbose= False, upsampling= True)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.631361, tree count is 409
Fold #1
Local score is 0.644713, tree count is 324
Fold #2
Local score is 0.641957, tree count is 433
Fold #3
Local score is 0.645733, tree count is 411
Fold #4
Local score is 0.642546, tree count is 246


In [38]:
# Learning rate lowered to 0.03: cross-validate without (model3) and with (model4) per-fold upsampling.
# NOTE(review): model3 was already bound by an earlier cell (the globally upsampled run) and is overwritten here.
params = {'depth': 7, 'l2': 5.5, 'rate': 0.03, 'T': 1.5}
model3 = trainCatBoost(train, params= params, verbose= False)
model4 = trainCatBoost(train, params= params, verbose= False, upsampling= True)

Fold #0
Local score is 0.632157, tree count is 612
Fold #1
Local score is 0.644707, tree count is 905
Fold #2
Local score is 0.642789, tree count is 737
Fold #3
Local score is 0.645000, tree count is 914
Fold #4
Local score is 0.642460, tree count is 609
[Errno 17] File exists: '/tmp/porto/catboost/7_0.03_5.5_1.5'
Fold #0
Local score is 0.633088, tree count is 791
Fold #1
Local score is 0.645483, tree count is 636
Fold #2
Local score is 0.643230, tree count is 780
Fold #3
Local score is 0.645424, tree count is 586
Fold #4
Local score is 0.643887, tree count is 585


In [39]:
# Learning rate lowered further to 0.02: cross-validate without (model5) and with (model6) per-fold upsampling.
params = {'depth': 7, 'l2': 5.5, 'rate': 0.02, 'T': 1.5}
model5 = trainCatBoost(train, params= params, verbose= False)
model6 = trainCatBoost(train, params= params, verbose= False, upsampling= True)

Fold #0
Local score is 0.632346, tree count is 1117
Fold #1
Local score is 0.645180, tree count is 1546
Fold #2
Local score is 0.643523, tree count is 1279
Fold #3
Local score is 0.645027, tree count is 1449
Fold #4
Local score is 0.642703, tree count is 877
[Errno 17] File exists: '/tmp/porto/catboost/7_0.02_5.5_1.5'
Fold #0
Local score is 0.633103, tree count is 1041
Fold #1
Local score is 0.645278, tree count is 1002
Fold #2
Local score is 0.643400, tree count is 1186
Fold #3
Local score is 0.645622, tree count is 1212
Fold #4
Local score is 0.643543, tree count is 1010


In [1]:
# NOTE(review): this cell carries execution count 1, i.e. it was run on a fresh kernel in which
# the cells that define model5 had not been executed - hence the NameError recorded as its output.
# Re-run the notebook from the top before inspecting the result.
model5

NameError: name 'model5' is not defined