In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier
from catboost import Pool

import os

In [51]:
def trainCatBoost(trainSet, targetSet, params= {'depth': 7, 'rate': 0.03, 'l2': 8, 'T': 1},
                  folds= 5, maxIter= 2000, verbose= True, dirName= '/tmp/porto/catboost/', class_weight = [1, 1]):
    # create log directory
    dirName = dirName + str(params.get('depth')) + '_' + str(params.get('rate')) + \
              '_' + str(params.get('l2')) + '_' + str(params.get('T'))

    try:
        os.makedirs(dirName)
    except Exception as inst:
        print inst  # __str__ allows args to be printed directly

    treeList = list()
    scoreList = list()
    modelList = list()
    prob = np.zeros([trainSet.shape[0]])

    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    for i, (train_index, val_index) in enumerate(skf.split(trainSet, targetSet)):
        print 'Fold #%d'%i
        model = CatBoostClassifier(verbose=verbose, iterations=maxIter, eval_metric="AUC", class_weights= class_weight,
                                   depth=params['depth'], learning_rate=params['rate'],
                                   l2_leaf_reg=params['l2'], bagging_temperature=params['T'],
                                   od_type='Iter', od_wait=100,
                                   gradient_iterations = 4, rsm= 0.9,
                                   train_dir=dirName +"/"+ str(i), random_seed=i)

        # create pool
        trainPool = Pool(trainSet.iloc[train_index], targetSet.iloc[train_index],
                         feature_names=trainSet.columns.tolist())

        valPool = Pool(trainSet.iloc[val_index], targetSet.iloc[val_index],
                         feature_names=trainSet.columns.tolist())

        # fit and estimate the model
        model.fit(trainPool, eval_set=valPool, use_best_model=True)
        prob[val_index] = model.predict_proba(valPool)[:, 1]
        localScore = roc_auc_score(targetSet.iloc[val_index], prob[val_index])

        treeList.append(model.tree_count_)
        scoreList.append(localScore)
        modelList.append(model)
        print 'Local score is %f, tree count is %d'%(localScore, model.tree_count_)
        
    score = roc_auc_score(targetSet, prob)
    return {'score':score, 'treeList':treeList, 'scoreList':scoreList, 'gini':2*score-1, 'models':modelList}

In [33]:
def getPredictions(X_train, y_train, X_test, params, n_seed = 5, n_iter = 1000, fName = None, verbose = True):
    prob = np.zeros([X_test.shape[0],n_seed])
    dirName= '/tmp/porto/catboost/'
    
    # create pool
    trainPool = Pool(X_train, y_train)
    testPool =  Pool(X_test)
    
    for seed in range(0,n_seed):
        model = CatBoostClassifier(verbose= verbose, iterations= n_iter, eval_metric="AUC",
                                           depth=params['depth'], learning_rate=params['rate'],
                                           l2_leaf_reg=params['l2'], bagging_temperature=params['T'],
                                           od_type='Iter', od_wait=100,
                                           train_dir=dirName +"/"+ str(seed), random_seed= seed,
                                           gradient_iterations = 4, rsm= 0.9)
         
        model.fit(trainPool)
        prob[:,seed] = model.predict_proba(testPool)[:,1]
    
    df = pd.DataFrame(prob.mean(axis = 1), columns=['target'], index=X_test.index)
    if fName:
        df.to_csv(fName,index_label='id')
    
    return df

In [9]:
def prepareData(rawData):
    """
    Return Pandas dataframe for training.
    :return: pandas dataframe
    """

    #rawData = pd.read_csv('../data/train.csv', index_col='id')

    # drop _calc_ features
    dropList = list()
    for fName in rawData.columns.tolist():
        if fName.find('_calc_') > (-1):
            dropList.append(fName)
    df = rawData.drop(dropList, axis=1)

    # squared feature "ps_car_15"
    df = df.assign(ps_car_15_mod = np.power(df.ps_car_15,2).astype(int)).drop("ps_car_15", axis = 1)

    # inverse one-hot-encoding for ind_06 % ind_09
    df = df.assign(ps_ind_69_cat = 0*df.ps_ind_06_bin+df.ps_ind_07_bin+2*df.ps_ind_08_bin+3*df.ps_ind_09_bin)
    df.drop(['ps_ind_06_bin','ps_ind_07_bin','ps_ind_08_bin','ps_ind_09_bin'], inplace=True, axis = 1)

    #drop "ind_14"
    df.drop('ps_ind_14', axis = 1, inplace = True)

    return df

In [10]:
# get train data
train = prepareData(pd.read_csv('./data/train.csv', index_col='id'))

In [11]:
test = prepareData(pd.read_csv('./data/test.csv', index_col='id'))

## Обучение модели без upsampling

In [22]:
params = {'depth': 7, 'l2': 5.5, 'rate': 0.055, 'T': 1.5}
model1 = trainCatBoost(train.drop("target", axis= 1), train.target, params= params, verbose= False)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.631454, tree count is 487
Fold #1
Local score is 0.644478, tree count is 547
Fold #2
Local score is 0.642380, tree count is 431
Fold #3
Local score is 0.644954, tree count is 360
Fold #4
Local score is 0.643083, tree count is 336


In [30]:
model1

{'gini': 0.28241054440757107,
 'score': 0.64120527220378554,
 'scoreList': [0.63145397180268126,
  0.64447774808639191,
  0.64237993791191628,
  0.6449541257103576,
  0.64308274059338544],
 'treeList': [487, 547, 431, 360, 336]}

In [31]:
np.array(model1['treeList']).mean(0)

432.19999999999999

In [35]:
pred = getPredictions(train.drop("target", axis= 1), train.target, test, params, n_iter= 450, fName= 'submission25.csv', verbose= False)

score 0.280

In [36]:
pred = getPredictions(train.drop("target", axis= 1), train.target, test, params, n_iter= 600, fName= 'submission26.csv', verbose = False)

score 0.280

## Добавлю upsampling в CatBoost

In [39]:
params = {'depth': 7, 'l2': 5.5, 'rate': 0.055, 'T': 1.5}
model2 = trainCatBoost(train.drop("target", axis= 1), train.target, params= params, verbose= False, class_weight= [1, 2])

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.626639, tree count is 428
Fold #1
Local score is 0.641386, tree count is 1576
Fold #2
Local score is 0.640462, tree count is 1297
Fold #3
Local score is 0.642378, tree count is 1994
Fold #4
Local score is 0.640151, tree count is 422


In [40]:
model2

{'gini': 0.27642036075049159,
 'score': 0.63821018037524579,
 'scoreList': [0.62663911578207776,
  0.64138649437504458,
  0.64046177186473319,
  0.6423781813920878,
  0.64015085372507363],
 'treeList': [428, 1576, 1297, 1994, 422]}

## Ручной upsamppling

In [47]:
from sklearn.utils import shuffle

In [46]:
train_upsampled = pd.concat([train, train.query("target == 1")])
train_upsampled.head()

Unnamed: 0_level_0,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,...,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15_mod,ps_ind_69_cat
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7,0,2,2,5,1,0,0,0,0,0,...,0,0,1,12,2,0.4,0.883679,0.37081,13,1
9,0,1,1,7,0,0,0,0,0,0,...,1,2,1,19,3,0.316228,0.618817,0.388716,6,2
13,0,5,4,9,1,0,0,0,0,0,...,1,2,1,60,1,0.316228,0.641586,0.347275,11,2
16,0,0,1,2,0,0,0,0,0,0,...,1,3,1,104,1,0.374166,0.542949,0.294958,4,0
17,0,0,2,0,1,0,0,0,0,0,...,1,2,1,82,3,0.31607,0.565832,0.365103,4,0


In [48]:
train_upsampled = shuffle(train_upsampled)

In [52]:
model3 = trainCatBoost(train_upsampled.drop("target", axis= 1), train_upsampled.target, params= params, verbose= False, maxIter= 5000)

[Errno 17] File exists: '/tmp/porto/catboost/7_0.055_5.5_1.5'
Fold #0
Local score is 0.713011, tree count is 4992
Fold #1
Local score is 0.710714, tree count is 5000
Fold #2
Local score is 0.713906, tree count is 4996
Fold #3
Local score is 0.719510, tree count is 4998
Fold #4
Local score is 0.715529, tree count is 4994
