In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import shap

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

import optuna

In [None]:
# Apply seaborn's default theme to all figures drawn later in the notebook.
sns.set()

# TYPE DICT

In [None]:
# Column -> dtype mapping passed to pd.read_csv so the shards load in compact dtypes.
# The features listed here (plus the target) are read as int8; every other
# feature_1..feature_500 column is read as float16.
_int8_feature_ids = (
    31, 43, 61, 64, 80, 83, 92, 133, 143, 191, 201, 209, 251, 253, 299, 300,
    343, 382, 392, 406, 423, 446, 449, 459, 490,
    17, 21, 22, 27, 166, 173, 347, 405, 434, 492,
)

# Insertion order: int8 features (in the order above), target, then float16
# features in ascending numeric order.
types_dict = {f'feature_{idx}': 'int8' for idx in _int8_feature_ids}
types_dict['target'] = 'int8'
types_dict.update(
    (f'feature_{idx}', 'float16')
    for idx in range(1, 501)
    if idx not in _int8_feature_ids
)

# LOAD DATA

Загружаем данные

In [None]:
# Read the first training shard, forcing the compact dtypes declared in types_dict.
train = pd.read_csv('train_1.csv', dtype=types_dict)

In [None]:
# Append shards 2..10 with a single concat. Concatenating inside a loop copies the
# ever-growing frame on every iteration, which is expensive at this data size.
# Row labels are left untouched (they restart in every shard), exactly as before.
train = pd.concat(
    [train] + [pd.read_csv(f'train_{i}.csv', dtype=types_dict) for i in range(2, 11)]
)

# DATA PREP

Применяем предобработку данных из ноутбука eda_and_data_prep

In [None]:
# Remove the 'smpl' and 'id' columns; they are not used as model features.
train = train.drop(columns=['smpl', 'id'])

In [None]:
# Preview the combined training frame (pandas truncates the display of large frames).
train

Unnamed: 0,target,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_491,feature_492,feature_493,feature_494,feature_495,feature_496,feature_497,feature_498,feature_499,feature_500
0,0,0.372314,1.500977,2.132812,-0.957520,-0.119019,0.122925,0.151001,0.506836,-0.839355,...,-1.254883,0,0.803711,-1.017578,-0.520996,0.646973,1.454102,-0.833496,0.184082,-0.438232
1,0,0.382324,0.962402,-0.192505,-1.019531,-1.330078,-0.100159,-1.130859,-1.172852,-1.790039,...,-0.884277,0,-1.643555,-1.264648,-1.523438,0.604980,0.491943,-0.003685,0.469727,-1.094727
2,0,0.472412,-0.695312,0.538086,-0.032990,-0.364746,-0.441895,-0.035950,-0.921387,0.746582,...,-0.322754,0,-0.496826,-0.051941,0.743164,-1.395508,0.147949,-0.007553,-0.981445,0.270996
3,0,0.512207,-1.232422,0.555176,1.457031,1.435547,0.168579,-0.628906,0.249756,1.475586,...,0.976562,0,1.182617,-0.178711,1.467773,-0.791016,0.000645,0.674805,0.803223,1.037109
4,0,1.488281,-0.164429,-1.197266,1.548828,0.952637,1.144531,-0.963867,0.031891,1.262695,...,-1.655273,0,1.076172,-0.555664,1.249023,-0.485840,0.458740,-0.444580,-0.333740,0.380127
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
449049,1,-0.276367,-0.344482,0.609863,-0.214722,-0.345947,0.772461,-0.074585,2.414062,2.994141,...,2.000000,0,2.474609,1.364258,2.972656,1.421875,-0.688965,1.313477,0.201904,0.562988
449050,0,-0.270996,-1.203125,-0.068298,0.587891,0.373535,1.384766,-0.474854,1.772461,0.625000,...,0.808594,0,3.041016,0.568359,0.613281,-1.115234,2.103516,1.550781,-0.843262,-0.333740
449051,0,0.616211,-1.386719,1.023438,-1.549805,1.183594,3.136719,1.113281,3.097656,2.240234,...,3.128906,0,2.220703,-0.923340,2.207031,3.003906,0.151123,1.565430,-0.528809,-0.351562
449052,0,-2.146484,-1.831055,0.590820,0.657227,-1.164062,0.397461,0.155518,0.782715,-0.567383,...,1.198242,0,-0.649902,0.685059,-0.877441,-0.434326,-1.021484,-0.267090,-0.418457,-0.224731


In [None]:
# Summarise row count, dtypes and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4490468 entries, 0 to 449053
Columns: 501 entries, target to feature_500
dtypes: float16(465), int8(36)
memory usage: 4.1 GB


In [None]:
# Integer-coded columns that are one-hot encoded with pd.get_dummies further below.
cat_features = ['feature_31','feature_61','feature_64','feature_80','feature_83','feature_92','feature_133','feature_143','feature_201','feature_209','feature_251','feature_253',
                'feature_300','feature_343','feature_382','feature_406','feature_423','feature_446','feature_449','feature_459','feature_490']

In [None]:
# Raw feature columns removed from the training frame before any encoding.
cols_to_drop = ['feature_17', 'feature_21', 'feature_27', 'feature_166', 'feature_173', 'feature_299', 'feature_424', 'feature_434', 'feature_191'] # duplicates and highly correlated

In [None]:
# Drop the columns listed in cols_to_drop.
train = train.drop(columns=cols_to_drop)

In [None]:
def preprocess_83(x):
    """Map selected feature_83 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    NOTE(review): the codes are presumably rare levels found in the
    eda_and_data_prep notebook — confirm.
    """
    merged_codes = (31, 14, 33, 27, 34, 15, 17, 32, 36, 30, 18)
    return -1 if x in merged_codes else x

def preprocess_133(x):
    """Map selected feature_133 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (11, 15, 8, 17, 19, 9, 7, 10, 12, 6, 16, 18, 20)
    return -1 if x in merged_codes else x

def preprocess_201(x):
    """Map feature_201 codes 22 and 14 to -1; any other value is returned unchanged."""
    merged_codes = (22, 14)
    return -1 if x in merged_codes else x

def preprocess_251(x):
    """Map selected feature_251 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (17, 7, 20, 13, 22, 11, 19, 10, 21, 12, 16, 24, 15, 23)
    return -1 if x in merged_codes else x

def preprocess_253(x):
    """Map selected feature_253 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (31, 35, 34, 29, 27, 26, 19, 25, 32, 30)
    return -1 if x in merged_codes else x

def preprocess_343(x):
    """Map selected feature_343 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (29, 14, 15, 30, 32, 25, 31, 28, 21, 34, 6, 11, 13)
    return -1 if x in merged_codes else x

def preprocess_382(x):
    """Map feature_382 codes 12 and 11 to -1; any other value is returned unchanged."""
    merged_codes = (12, 11)
    return -1 if x in merged_codes else x

def preprocess_406(x):
    """Map selected feature_406 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (10, 14, 17, 11, 12, 15, 3)
    return -1 if x in merged_codes else x

def preprocess_423(x):
    """Map selected feature_423 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (11, 14, 12, 13, 10, 8, 18, 6, 15, 17, 9)
    return -1 if x in merged_codes else x

def preprocess_449(x):
    """Map selected feature_449 codes to the catch-all value -1.

    Returns -1 when ``x`` is one of the listed codes, otherwise ``x`` unchanged.
    """
    merged_codes = (15, 14, 12, 13, 10)
    return -1 if x in merged_codes else x

def preprocess_490(x):
    """Map feature_490 codes 58, 53 and 55 to -1; any other value is returned unchanged."""
    merged_codes = (58, 53, 55)
    return -1 if x in merged_codes else x

In [None]:
# Separate the label from the feature matrix.
y = train['target']
train = train.drop(columns='target')

In [None]:
# Collapse the selected codes of each categorical column to -1 using the
# preprocess_* helpers defined earlier in the notebook.
# Each helper is evaluated once per distinct code and the result is applied to the
# whole column with Series.map, instead of calling a Python function for every row.
rare_code_encoders = {
    'feature_83': preprocess_83,
    'feature_133': preprocess_133,
    'feature_201': preprocess_201,
    'feature_251': preprocess_251,
    'feature_253': preprocess_253,
    'feature_343': preprocess_343,
    'feature_382': preprocess_382,
    'feature_406': preprocess_406,
    'feature_423': preprocess_423,
    'feature_449': preprocess_449,
    'feature_490': preprocess_490,
}
for column, encode in rare_code_encoders.items():
    # Every value of the column is a key of the lookup, so map() cannot produce NaN.
    code_lookup = {int(code): int(encode(int(code))) for code in train[column].unique()}
    train[column] = train[column].map(code_lookup).astype('int8')

In [None]:
# Frame dimensions before one-hot encoding.
train.shape

(4490468, 491)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level of each.
train = pd.get_dummies(train, columns=cat_features, drop_first=True)

In [None]:
# Frame dimensions after one-hot encoding.
train.shape

(4490468, 725)

In [None]:
# Dummy columns removed from the training frame before modelling.
# NOTE(review): the selection criterion is not shown in this notebook — presumably it
# comes from the eda_and_data_prep notebook; confirm.
no_cols = ['feature_83_4', 'feature_83_7', 'feature_83_8', 'feature_83_12', 'feature_83_13', 'feature_83_20', 'feature_83_22',
           'feature_83_24', 'feature_133_4', 'feature_251_3', 'feature_251_5', 'feature_251_6', 'feature_251_9', 'feature_343_3',
           'feature_343_8', 'feature_343_12', 'feature_343_18', 'feature_343_19', 'feature_406_6', 'feature_446_7', 'feature_459_2']

In [None]:
# Columns removed from the training frame before modelling (mostly dummy columns, plus
# the raw feature_22).
# NOTE(review): the name suggests zero importance in an earlier model — confirm the source.
zero_weight_cols = ['feature_490_24','feature_253_9','feature_83_1','feature_406_4','feature_64_3','feature_31_2','feature_253_21','feature_490_50',
                    'feature_61_2','feature_253_17','feature_253_11','feature_490_43','feature_490_40','feature_64_2','feature_300_5','feature_490_18',
                    'feature_83_35','feature_446_3','feature_382_4','feature_382_6','feature_446_6','feature_446_4','feature_423_5','feature_423_4',
                    'feature_382_8','feature_406_9','feature_382_9','feature_406_7','feature_251_4','feature_449_7','feature_490_15','feature_343_23',
                    'feature_490_14','feature_92_8','feature_22','feature_92_9','feature_490_5','feature_343_35','feature_490_4','feature_92_12',
                    'feature_343_26','feature_343_33','feature_143_5']

In [None]:
# Dummy columns removed from the training frame before modelling.
# NOTE(review): the name suggests low importance in an earlier model — confirm the source.
weak_cols = ['feature_490_13','feature_490_46','feature_201_5','feature_490_48','feature_201_15','feature_490_35','feature_343_17',
             'feature_490_54','feature_251_8','feature_61_3','feature_406_5','feature_253_12','feature_343_27','feature_92_11',
             'feature_382_13','feature_253_18','feature_83_10','feature_253_20','feature_201_10','feature_143_7','feature_343_24',
             'feature_201_18','feature_83_6','feature_253_23','feature_253_13','feature_253_22','feature_253_28','feature_92_13',
             'feature_406_16','feature_253_7','feature_201_8','feature_343_16','feature_83_19','feature_83_28']

In [None]:
# Remove every column named in no_cols, zero_weight_cols and weak_cols.
columns_to_remove = no_cols + zero_weight_cols + weak_cols
train = train.drop(columns=columns_to_remove)

In [None]:
# Frame dimensions after dropping the selected columns.
train.shape

(4490468, 627)

In [None]:
# Hold out 10% of the rows for validation, stratified on the target, with a fixed seed.
x_train, x_valid, y_train, y_valid = train_test_split(train, y, test_size=0.1, stratify=y, random_state=42)

In [None]:
# Wrap the train and validation splits (features + labels) in CatBoost Pool objects.
train_pool = Pool(data=x_train, label=y_train)
valid_pool = Pool(data=x_valid, label=y_valid)

In [None]:
# Pool over the complete training set; the final model is fitted on it.
full_data_pool = Pool(data=train, label=y)

# MODELING

Подберем лучшую модель с помощью optuna

In [None]:
# Best boosting iteration of every finished trial, keyed by the Optuna trial number.
# A dict (instead of an append-only list) keeps lookups such as n_rounds[16] tied to the
# right trial even when the study cell is re-run or a trial is interrupted before it
# reports, both of which would shift the positions of a list.
n_rounds = {}
def objective(trial):
    """Optuna objective: fit CatBoost on train_pool and return the best validation AUC.

    Samples learning_rate, depth, l2_leaf_reg and min_data_in_leaf from the trial and
    trains on GPU with early stopping against valid_pool.

    Side effects: stores the model's best iteration in ``n_rounds[trial.number]`` and in
    the trial's ``best_iteration`` user attribute.

    Returns the best AUC reached on the validation pool.
    """
    space = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.04, log=True),
        "depth": trial.suggest_int("depth", 11, 13),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 0, 100, step=1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 500, 6000),
        "objective": "Logloss",
        # Upper bound only; early stopping decides the actual number of trees.
        'iterations': 32768,
        'eval_metric': 'AUC',
        'early_stopping_rounds': 30,
        'random_state': 666,
        'verbose': False,
        'task_type': 'GPU'
    }

    # NOTE(review): the quantization spec addresses features by position (41, 396, 364),
    # so it depends on the column order of the training frame — confirm after any
    # change to the column set.
    model = CatBoostClassifier(**space,
                per_float_feature_quantization=['41:border_count=1024','396:border_count=1024','364:border_count=1024'])

    model.fit(train_pool, eval_set=valid_pool)

    # Record the best iteration under the trial's own number, and on the trial itself
    # so it can also be read back from study.best_trial.user_attrs.
    n_rounds[trial.number] = model.best_iteration_
    trial.set_user_attr("best_iteration", model.best_iteration_)

    res = model.best_score_['validation']['AUC']
    return res

In [None]:
# Seed the TPE sampler so the sequence of suggested hyperparameters is repeatable.
tpe_sampler = optuna.samplers.TPESampler(seed=666)
# The objective returns validation AUC, so the study maximises it.
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)
study.optimize(objective, n_trials=100)

[I 2024-11-09 16:05:40,537] A new study created in memory with name: no-name-bab44aed-57a4-4a17-a1ca-500332226b11
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-11-09 16:32:02,998] Trial 0 finished with value: 0.8642334342002869 and parameters: {'learning_rate': 0.026406154958457, 'depth': 13, 'l2_leaf_reg': 68.0, 'min_data_in_leaf': 4503}. Best is trial 0 with value: 0.8642334342002869.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-11-09 16:39:23,486] Trial 1 finished with value: 0.8617404699325562 and parameters: {'learning_rate': 0.037396828233921084, 'depth': 11, 'l2_leaf_reg': 41.0, 'min_data_in_leaf': 768}. Best is trial 0 with value: 0.8642334342002869.
Default metric period is 5 because AUC is/are not implemented for GPU
[I 2024-11-09 16:39:49,661] Trial 2 finished with value: 0.8192185759544373 and parameters: {'learning_rate': 0.011485845992452866, 'depth': 12, 'l2_leaf_reg': 20.0, 'min_data_in_leaf': 4593}. Best 

KeyboardInterrupt: 

In [None]:
# Hyperparameters of the best trial found by the study.
best_params = study.best_params

In [None]:
# Train the final model for as many trees as the best trial needed. Iteration indices
# start at 0 (see the training log), hence the +1.
# The entry is looked up via the best trial's number rather than a hard-coded position,
# so the cell stays correct when the search is re-run and a different trial wins.
best_params['iterations'] = n_rounds[study.best_trial.number] + 1

In [None]:
# Show the final hyperparameter set, including the fixed iteration count.
best_params

{'learning_rate': 0.015795105281243844,
 'depth': 13,
 'l2_leaf_reg': 78.0,
 'min_data_in_leaf': 3958,
 'iterations': 8911}

Обучим лучшую конфигурацию на всех данных

In [None]:
# Final model: tuned hyperparameters plus the same seed, metric, loss, device and
# quantization settings that objective() uses.
# NOTE(review): per_float_feature_quantization addresses features by position
# (41, 396, 364), so it depends on the column order of the training frame — confirm
# after any change to the column set.
model = CatBoostClassifier(**best_params, random_state=666, verbose=1000, eval_metric='AUC', objective='Logloss', task_type='GPU',
                           per_float_feature_quantization=['41:border_count=1024','396:border_count=1024','364:border_count=1024'])

In [None]:
# Fit on the complete training set; no eval set is passed, so all iterations are run.
model.fit(full_data_pool)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 337ms	remaining: 50m 5s
1000:	total: 6m	remaining: 47m 31s
2000:	total: 11m 58s	remaining: 41m 19s
3000:	total: 17m 59s	remaining: 35m 26s
4000:	total: 24m 3s	remaining: 29m 31s
5000:	total: 30m 9s	remaining: 23m 34s
6000:	total: 36m 37s	remaining: 17m 45s
7000:	total: 42m 53s	remaining: 11m 42s
8000:	total: 49m 4s	remaining: 5m 34s
8910:	total: 54m 47s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x1d13c20410>

# LOAD TEST DATA AND PREDICT

Загрузим тестовые данные и сделаем сабмит

In [None]:
# Read the first test shard with the same dtype mapping as the training data.
test = pd.read_csv('test_1.csv', dtype=types_dict)

In [None]:
# Append shards 2..10 with a single concat. Concatenating inside a loop copies the
# growing frame on every iteration.
# Row labels are left untouched, exactly as before.
test = pd.concat(
    [test] + [pd.read_csv(f'test_{i}.csv', dtype=types_dict) for i in range(2, 11)]
)

In [None]:
# Start the submission frame from the test ids. The explicit copy makes it an
# independent frame, so adding the prediction column later does not trigger
# SettingWithCopyWarning.
submit = test[['id']].copy()

In [None]:
# Apply the same code collapsing to the test set as to the training set, using the
# preprocess_* helpers defined earlier in the notebook.
# Each helper is evaluated once per distinct code and the result is applied to the
# whole column with Series.map, instead of calling a Python function for every row.
rare_code_encoders = {
    'feature_83': preprocess_83,
    'feature_133': preprocess_133,
    'feature_201': preprocess_201,
    'feature_251': preprocess_251,
    'feature_253': preprocess_253,
    'feature_343': preprocess_343,
    'feature_382': preprocess_382,
    'feature_406': preprocess_406,
    'feature_423': preprocess_423,
    'feature_449': preprocess_449,
    'feature_490': preprocess_490,
}
for column, encode in rare_code_encoders.items():
    # Every value of the column is a key of the lookup, so map() cannot produce NaN.
    code_lookup = {int(code): int(encode(int(code))) for code in test[column].unique()}
    test[column] = test[column].map(code_lookup).astype('int8')

In [None]:
# One-hot encode the test set with the same settings as the training set.
# NOTE(review): the encoding is derived from the test data alone, so the resulting dummy
# columns can differ from the training ones if the observed category levels differ.
test = pd.get_dummies(test, columns=cat_features, drop_first=True)

In [None]:
# Score the test rows with the training feature columns, in the training order.
# The test set was one-hot encoded on its own, so a level that never occurs in test has
# no dummy column. reindex adds such columns filled with 0 (the correct one-hot value)
# instead of raising KeyError, and drops test-only columns.
test_features = test.reindex(columns=x_valid.columns, fill_value=0)
submit['target'] = model.predict_proba(test_features)[:, 1]

In [None]:
# Preview the submission frame (id plus predicted probability of the positive class).
submit

Unnamed: 0,id,target
0,4490468,0.010694
1,4490469,0.008120
2,4490470,0.013523
3,4490471,0.117551
4,4490472,0.407733
...,...,...
509527,4999995,0.080105
509528,4999996,0.010839
509529,4999997,0.025132
509530,4999998,0.035854


In [None]:
# Write the submission file without the index column.
submit.to_csv('catboost_tunned_09_11_2024_13_depth.csv', index=False)

# GENERATING PREDS ON TRAIN DATA

Сохраним предсказания модели по всем обучающим данным для стекинга

In [None]:
# Replace the row labels, which repeat across the concatenated shards, with a unique
# 0..n-1 range.
train = train.reset_index(drop=True)

In [None]:
# Expose the row position as an explicit id column for the out-of-fold predictions.
train['id'] = train.index

In [None]:
# Out-of-fold predictions for stacking: each training row is scored by a model that was
# fitted on the other four folds.
oof_column = 'catboost_86.11_proba'
feature_columns = [col for col in train.columns if col != 'id']

# Per-fold frames are collected here and concatenated once after the loop. Concatenating
# onto an initially empty frame inside the loop emits a pandas FutureWarning.
fold_predictions = []

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_index, valid_index) in enumerate(skf.split(train, y)):

    # Fold-local names: the earlier x_train/x_valid split and the full-data `model`
    # are left untouched, so the prediction cell above still works after this cell runs.
    fold_train, fold_y_train = train.iloc[train_index, :], y.iloc[train_index]
    fold_valid, fold_y_valid = train.iloc[valid_index, :], y.iloc[valid_index]

    fold_model = CatBoostClassifier(**best_params, random_state=666, verbose=1000, eval_metric='AUC', objective='Logloss', task_type='GPU',
                           per_float_feature_quantization=['41:border_count=1024','396:border_count=1024','364:border_count=1024'])

    # 'id' is only a row key and must not be used as a feature.
    fold_model.fit(fold_train[feature_columns], fold_y_train)

    preds = fold_model.predict_proba(fold_valid[feature_columns])[:, 1]
    valid_roc_auc = roc_auc_score(fold_y_valid, preds)

    # No eval set is passed to fit(), so best_iteration_ is always None here and is
    # no longer printed.
    print(f'Fold {i+1}:')
    print(f'validation roc auc: {valid_roc_auc}')

    # assign() returns a new frame, which avoids writing into a slice of `train`.
    fold_predictions.append(fold_valid[['id']].assign(**{oof_column: preds}))

cat_boost_preds = pd.concat(fold_predictions)

print('Done!')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 341ms	remaining: 50m 37s
1000:	total: 5m 25s	remaining: 42m 51s
2000:	total: 10m 52s	remaining: 37m 34s
3000:	total: 16m 21s	remaining: 32m 12s
4000:	total: 21m 51s	remaining: 26m 49s
5000:	total: 27m 23s	remaining: 21m 25s
6000:	total: 32m 58s	remaining: 15m 59s
7000:	total: 38m 37s	remaining: 10m 32s
8000:	total: 44m 18s	remaining: 5m 2s
8910:	total: 49m 28s	remaining: 0us
Fold 1:
best_iteration: None, validation roc auc: 0.8632530882598113


  cat_boost_preds = pd.concat([cat_boost_preds, scored_sample])
Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 214ms	remaining: 31m 43s
1000:	total: 5m 27s	remaining: 43m 4s
2000:	total: 10m 58s	remaining: 37m 52s
3000:	total: 16m 31s	remaining: 32m 33s
4000:	total: 22m 9s	remaining: 27m 10s
5000:	total: 27m 49s	remaining: 21m 45s
6000:	total: 33m 26s	remaining: 16m 12s
7000:	total: 38m 58s	remaining: 10m 37s
8000:	total: 44m 32s	remaining: 5m 3s
8910:	total: 49m 38s	remaining: 0us
Fold 2:
best_iteration: None, validation roc auc: 0.8637066036875847


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 308ms	remaining: 45m 45s
1000:	total: 5m 24s	remaining: 42m 41s
2000:	total: 10m 50s	remaining: 37m 27s
3000:	total: 16m 19s	remaining: 32m 9s
4000:	total: 21m 51s	remaining: 26m 49s
5000:	total: 27m 24s	remaining: 21m 25s
6000:	total: 32m 57s	remaining: 15m 58s
7000:	total: 38m 34s	remaining: 10m 31s
8000:	total: 44m 12s	remaining: 5m 1s
8910:	total: 49m 18s	remaining: 0us
Fold 3:
best_iteration: None, validation roc auc: 0.8611133270654701


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 217ms	remaining: 32m 12s
1000:	total: 5m 24s	remaining: 42m 42s
2000:	total: 10m 51s	remaining: 37m 31s
3000:	total: 16m 20s	remaining: 32m 10s
4000:	total: 21m 47s	remaining: 26m 44s
5000:	total: 27m 14s	remaining: 21m 17s
6000:	total: 32m 40s	remaining: 15m 50s
7000:	total: 38m 12s	remaining: 10m 25s
8000:	total: 43m 47s	remaining: 4m 58s
8910:	total: 48m 52s	remaining: 0us
Fold 4:
best_iteration: None, validation roc auc: 0.8648427874651434


Default metric period is 5 because AUC is/are not implemented for GPU


0:	total: 213ms	remaining: 31m 37s
1000:	total: 5m 24s	remaining: 42m 46s
2000:	total: 10m 52s	remaining: 37m 31s
3000:	total: 16m 22s	remaining: 32m 13s
4000:	total: 21m 52s	remaining: 26m 51s
5000:	total: 27m 25s	remaining: 21m 26s
6000:	total: 32m 57s	remaining: 15m 58s
7000:	total: 38m 30s	remaining: 10m 30s
8000:	total: 44m 7s	remaining: 5m 1s
8910:	total: 49m 13s	remaining: 0us
Fold 5:
best_iteration: None, validation roc auc: 0.8648171781012188
Done!


In [None]:
# Preview the out-of-fold predictions (rows are in fold order, not id order).
cat_boost_preds

Unnamed: 0,id,catboost_86.11_proba
9,9,0.013441
15,15,0.001824
31,31,0.007838
32,32,0.000588
35,35,0.006474
...,...,...
4490442,4490442,0.116213
4490447,4490447,0.015797
4490449,4490449,0.007971
4490461,4490461,0.012226


In [None]:
# Save the out-of-fold predictions ordered by id, without the index column.
cat_boost_preds.sort_values('id', ascending=True).to_csv('catboost_86.11_train_preds.csv', index=False)