## Загрузка библиотек и датасета, создание необходимых директорий

In [23]:
import pandas as pd
import numpy as np
import joblib
import glob
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_auc_score

from catboost import CatBoostClassifier

from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task

In [27]:
# Working directories: input CSVs, pickled models and per-model prediction files.
path_train = './data/train'
path_test = './data/test'
path_saved_models = './data/saved_models'
path_saved_predictions = './data/saved_predictions'

paths = [path_train, path_test, path_saved_models, path_saved_predictions]

# makedirs also creates the missing parent ('./data') and, with exist_ok=True,
# is a no-op for directories that are already there.
for path in paths:
    os.makedirs(path, exist_ok=True)

In [3]:
# Read every CSV shard from the train directory into one DataFrame.
# glob returns matches in a filesystem-dependent order, so sort them: the row
# order of train_data feeds the seeded train_test_split calls further down.
filenames_train = sorted(glob.glob(path_train + "/*.csv"))
if not filenames_train:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no CSV files found in {path_train}")

data_files_train = []
for filename in filenames_train:
    data_files_train.append(pd.read_csv(filename))
    print(f"completed {filename}")

train_data = pd.concat(data_files_train, ignore_index=True)
# Drop the per-file frames so only the concatenated copy stays in memory.
del data_files_train, filenames_train

completed ./data/train\train_1-006.csv
completed ./data/train\train_10-005.csv
completed ./data/train\train_2-007.csv
completed ./data/train\train_3-009.csv
completed ./data/train\train_4-008.csv
completed ./data/train\train_5-010.csv
completed ./data/train\train_6-002.csv
completed ./data/train\train_7-001.csv
completed ./data/train\train_8-001.csv
completed ./data/train\train_9-004.csv


In [14]:
# Base names (without extension) of the saved model files in ./data/saved_models
# and of the prediction CSVs in ./data/saved_predictions.
# The DESOLATOR names are used for the CatBoost models, the RADIC names for the
# LightAutoML models.
MODEL1_NAME = "1-DESOLATOR-importance-50k-0.005-d6"
MODEL2_NAME = "2-DESOLATOR-normid-drop400-25k-0.01-d6"
MODEL3_NAME = "3-DESOLATOR-drop300-25k-0.01-d6"
# NOTE(review): the name says "0.05", but the cell that trains model 4 sets
# learning_rate=0.005. The string is the file name of an existing artifact,
# so it is left as is.
MODEL4_NAME = "4-DESOLATOR-drop350-50k-0.05-d6"
MODEL5_NAME = "5-DESOLATOR-drop400-25k-0.01-d6"
MODEL6_NAME = "6-DESOLATOR-drop350-25k-0.01-d6"
MODEL7_NAME = "7-DESOLATOR-500k-0.001-d6"
MODEL8_NAME = "8-RADIC-3h"
MODEL9_NAME = "9-RADIC-drop350-6h"
MODEL10_NAME = "10-RADIC-drop400-normid-14ks"

## CatBoost модели

#### Первая модель, используем для сортировки фич по значимости (не используется в финальном блендинге)
##### обучается в том числе и на id

In [None]:
# Model 1: CatBoost on every column except 'target' and 'smpl' ('id' stays in
# as a feature). The saved model is later used to rank features by importance.
y = train_data['target']
X = train_data.drop(columns=['target', 'smpl'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=50000,
    learning_rate=0.005,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    verbose=1000,
)
model = CatBoostClassifier(**catboost_params)

# The 10% hold-out doubles as the eval set passed to fit.
model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# Second column of predict_proba is scored against the hold-out labels.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f"./data/saved_models/{MODEL1_NAME}.pkl")

#### Получаем X (300, 350, 400) самых незначимых фич

In [21]:
# Print the 400 least important features of model 1 as a quoted, comma-separated
# list (least important first) that can be pasted into a Python list literal.
# NOTE: joblib.load unpickles the file - only load models you saved yourself.
loaded_model = joblib.load(f"./data/saved_models/{MODEL1_NAME}.pkl")
feature_importance = loaded_model.feature_importances_
# argsort is ascending, so the lowest-importance features come first.
sorted_idx = np.argsort(feature_importance)
# Take the names from the model rather than formatting 'feature_<index>':
# the positional index only equals the numeric suffix while 'id' is column 0.
feature_names = loaded_model.feature_names_
# Slicing (instead of range(0, 400)) also works when there are fewer than 400 features.
s = ", ".join(f"'{feature_names[i]}'" for i in sorted_idx[:400])

print(s)
del loaded_model

'feature_22', 'feature_17', 'feature_492', 'feature_459', 'feature_21', 'feature_347', 'feature_64', 'feature_43', 'feature_61', 'feature_299', 'feature_405', 'feature_27', 'feature_406', 'feature_92', 'feature_446', 'feature_173', 'feature_209', 'feature_300', 'feature_191', 'feature_423', 'feature_382', 'feature_143', 'feature_425', 'feature_470', 'feature_303', 'feature_219', 'feature_305', 'feature_226', 'feature_414', 'feature_343', 'feature_184', 'feature_249', 'feature_75', 'feature_224', 'feature_57', 'feature_73', 'feature_189', 'feature_62', 'feature_83', 'feature_483', 'feature_204', 'feature_294', 'feature_419', 'feature_278', 'feature_458', 'feature_55', 'feature_429', 'feature_99', 'feature_91', 'feature_286', 'feature_16', 'feature_156', 'feature_181', 'feature_409', 'feature_416', 'feature_497', 'feature_4', 'feature_489', 'feature_132', 'feature_283', 'feature_96', 'feature_398', 'feature_158', 'feature_36', 'feature_408', 'feature_485', 'feature_58', 'feature_328', 'f

In [9]:
# Features to drop, pasted from the output of the feature-importance cell:
# ordered by ascending importance of model 1 (least important first).
drop_400 = ['feature_22', 'feature_17', 'feature_492', 'feature_459', 'feature_21', 'feature_347',
           'feature_64', 'feature_43', 'feature_61', 'feature_299', 'feature_405', 'feature_27', 'feature_406',
           'feature_92', 'feature_446', 'feature_173', 'feature_209', 'feature_300', 'feature_191', 'feature_423',
           'feature_382', 'feature_143', 'feature_425', 'feature_470', 'feature_303', 'feature_219', 'feature_305',
           'feature_226', 'feature_414', 'feature_343', 'feature_184', 'feature_249', 'feature_75', 'feature_224',
           'feature_57', 'feature_73', 'feature_189', 'feature_62', 'feature_83', 'feature_483', 'feature_204',
           'feature_294', 'feature_419', 'feature_278', 'feature_458', 'feature_55', 'feature_429', 'feature_99',
           'feature_91', 'feature_286', 'feature_16', 'feature_156', 'feature_181', 'feature_409', 'feature_416',
           'feature_497', 'feature_4', 'feature_489', 'feature_132', 'feature_283', 'feature_96', 'feature_398',
           'feature_158', 'feature_36', 'feature_408', 'feature_485', 'feature_58', 'feature_328', 'feature_498',
           'feature_95', 'feature_440', 'feature_327', 'feature_49', 'feature_258', 'feature_70', 'feature_307',
           'feature_206', 'feature_29', 'feature_445', 'feature_231', 'feature_233', 'feature_315', 'feature_400',
           'feature_166', 'feature_46', 'feature_243', 'feature_320', 'feature_437', 'feature_210', 'feature_89',
           'feature_330', 'feature_203', 'feature_464', 'feature_281', 'feature_93', 'feature_214', 'feature_499',
           'feature_358', 'feature_71', 'feature_88', 'feature_475', 'feature_26', 'feature_438', 'feature_279',
           'feature_149', 'feature_30', 'feature_298', 'feature_312', 'feature_211', 'feature_207', 'feature_442',
           'feature_200', 'feature_254', 'feature_488', 'feature_473', 'feature_311', 'feature_297', 'feature_256',
           'feature_391', 'feature_469', 'feature_38', 'feature_180', 'feature_289', 'feature_182', 'feature_63',
           'feature_317', 'feature_145', 'feature_169', 'feature_344', 'feature_34', 'feature_479', 'feature_468',
           'feature_420', 'feature_394', 'feature_23', 'feature_216', 'feature_326', 'feature_478', 'feature_220',
           'feature_272', 'feature_32', 'feature_155', 'feature_194', 'feature_177', 'feature_337', 'feature_275',
           'feature_127', 'feature_238', 'feature_199', 'feature_309', 'feature_443', 'feature_240', 'feature_192',
           'feature_237', 'feature_352', 'feature_59', 'feature_222', 'feature_486', 'feature_165', 'feature_339',
           'feature_267', 'feature_28', 'feature_363', 'feature_282', 'feature_118', 'feature_104', 'feature_474',
           'feature_185', 'feature_60', 'feature_431', 'feature_484', 'feature_41', 'feature_413', 'feature_472',
           'feature_263', 'feature_261', 'feature_348', 'feature_295', 'feature_304', 'feature_85', 'feature_415',
           'feature_273', 'feature_432', 'feature_372', 'feature_374', 'feature_109', 'feature_7', 'feature_10',
           'feature_229', 'feature_15', 'feature_385', 'feature_236', 'feature_471', 'feature_321', 'feature_2',
           'feature_375', 'feature_163', 'feature_111', 'feature_287', 'feature_313', 'feature_449', 'feature_477',
           'feature_384', 'feature_274', 'feature_69', 'feature_399', 'feature_430', 'feature_113', 'feature_94',
           'feature_3', 'feature_314', 'feature_227', 'feature_453', 'feature_373', 'feature_125', 'feature_221',
           'feature_345', 'feature_79', 'feature_389', 'feature_319', 'feature_116', 'feature_175', 'feature_144',
           'feature_426', 'feature_455', 'feature_357', 'feature_52', 'feature_188', 'feature_380', 'feature_346',
           'feature_110', 'feature_215', 'feature_140', 'feature_121', 'feature_342', 'feature_54', 'feature_450',
           'feature_250', 'feature_90', 'feature_213', 'feature_417', 'feature_316', 'feature_260', 'feature_266',
           'feature_360', 'feature_359', 'feature_130', 'feature_301', 'feature_285', 'feature_228', 'feature_187',
           'feature_76', 'feature_117', 'feature_370', 'feature_351', 'feature_280', 'feature_276', 'feature_288',
           'feature_126', 'feature_333', 'feature_171', 'feature_377', 'feature_115', 'feature_72', 'feature_310',
           'feature_20', 'feature_476', 'feature_162', 'feature_159', 'feature_105', 'feature_269', 'feature_397',
           'feature_245', 'feature_457', 'feature_355', 'feature_302', 'feature_107', 'feature_183', 'feature_123',
           'feature_447', 'feature_481', 'feature_354', 'feature_350', 'feature_271', 'feature_422', 'feature_217',
           'feature_448', 'feature_137', 'feature_500', 'feature_436', 'feature_172', 'feature_306', 'feature_362',
           'feature_401', 'feature_170', 'feature_427', 'feature_308', 'feature_235', 'feature_193', 'feature_241',
           'feature_134', 'feature_277', 'feature_257', 'feature_196', 'feature_101', 'feature_176', 'feature_112',
           'feature_493', 'feature_160', 'feature_456', 'feature_480', 'feature_135', 'feature_338', 'feature_268',
           'feature_13', 'feature_24', 'feature_120', 'feature_218', 'feature_67', 'feature_122', 'feature_368',
           'feature_361', 'feature_292', 'feature_255', 'feature_324', 'feature_465', 'feature_65', 'feature_369',
           'feature_77', 'feature_466', 'feature_293', 'feature_403', 'feature_190', 'feature_404', 'feature_441',
           'feature_349', 'feature_25', 'feature_433', 'feature_367', 'feature_202', 'feature_396', 'feature_119',
           'feature_290', 'feature_74', 'feature_402', 'feature_78', 'feature_128', 'feature_150', 'feature_340',
           'feature_322', 'feature_291', 'feature_487', 'feature_395', 'feature_146', 'feature_460', 'feature_39',
           'feature_325', 'feature_205', 'feature_418', 'feature_411', 'feature_82', 'feature_106', 'feature_253',
           'feature_407', 'feature_152', 'feature_393', 'feature_197', 'feature_40', 'feature_179', 'feature_102',
           'feature_496', 'feature_252', 'feature_387', 'feature_37', 'feature_463', 'feature_284', 'feature_296',
           'feature_1', 'feature_14', 'feature_335', 'feature_491', 'feature_225', 'feature_461', 'feature_334',
           'feature_87', 'feature_494', 'feature_6', 'feature_147', 'feature_248', 'feature_31', 'feature_84',
           'feature_329', 'feature_33', 'feature_56', 'feature_178', 'feature_42', 'feature_174', 'feature_366',
           'feature_136', 'feature_148']
# Shorter drop lists are prefixes of drop_400: cutting the last 50 / 100 entries
# keeps the entries that come first in the ascending-importance order.
# NOTE(review): these are "350"/"300" only while drop_400 has exactly 400 entries.
drop_350 = drop_400[:-50]
drop_300 = drop_400[:-100]

#### Вторая модель, используем MinMaxScaler для id и отбрасываем 400 самых незначащих фич (используется в блендинге)

In [22]:
# Model 2: CatBoost without the drop_400 features and with a min-max scaled 'id'.
X = train_data.drop(columns=[*drop_400, 'smpl', 'target'])
# NOTE(review): the scaler is fitted on the ids of the whole training frame
# (before the split) and is not saved next to the model.
scaler = MinMaxScaler()
X['id'] = scaler.fit_transform(train_data[['id']])
y = train_data['target']
# This model holds out 5% for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=25000,
    learning_rate=0.01,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL2_NAME}.pkl')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7811395	best: 0.7811395 (0)	total: 58.8ms	remaining: 24m 28s
1000:	test: 0.8459162	best: 0.8459162 (1000)	total: 45.9s	remaining: 18m 20s
2000:	test: 0.8516487	best: 0.8516487 (2000)	total: 1m 30s	remaining: 17m 16s
3000:	test: 0.8544977	best: 0.8544977 (3000)	total: 2m 13s	remaining: 16m 21s
4000:	test: 0.8561102	best: 0.8561102 (4000)	total: 2m 59s	remaining: 15m 41s
5000:	test: 0.8571810	best: 0.8571810 (5000)	total: 3m 43s	remaining: 14m 54s
6000:	test: 0.8579650	best: 0.8579650 (6000)	total: 4m 28s	remaining: 14m 8s
7000:	test: 0.8585862	best: 0.8585862 (7000)	total: 5m 12s	remaining: 13m 24s
8000:	test: 0.8590645	best: 0.8590672 (7995)	total: 5m 56s	remaining: 12m 38s
9000:	test: 0.8594635	best: 0.8594642 (8970)	total: 6m 40s	remaining: 11m 52s
10000:	test: 0.8598075	best: 0.8598075 (10000)	total: 7m 24s	remaining: 11m 6s
11000:	test: 0.8601505	best: 0.8601505 (11000)	total: 8m 7s	remaining: 10m 20s
12000:	test: 0.8604147	best: 0.8604147 (12000)	total: 8m 50s	remaining

['./data/saved_models/2-DESOLATOR-normid-drop400-25k-0.01-d6.pkl']

#### Третья модель, отбрасываем 300 самых незначащих фич (используется в блендинге)

In [25]:
# Model 3: CatBoost without the drop_300 features ('id' is kept unscaled).
y = train_data['target']
X = train_data.drop(columns=[*drop_300, 'smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=25000,
    learning_rate=0.01,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL3_NAME}.pkl')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7829268	best: 0.7829268 (0)	total: 68.1ms	remaining: 28m 23s
100:	test: 0.8154120	best: 0.8154120 (100)	total: 6.4s	remaining: 26m 17s
200:	test: 0.8287149	best: 0.8287149 (200)	total: 12.9s	remaining: 26m 31s
300:	test: 0.8348721	best: 0.8348721 (300)	total: 19.1s	remaining: 26m 6s
400:	test: 0.8386215	best: 0.8386215 (400)	total: 25.2s	remaining: 25m 44s
500:	test: 0.8411649	best: 0.8411649 (500)	total: 31.3s	remaining: 25m 28s
600:	test: 0.8431700	best: 0.8431700 (600)	total: 37.3s	remaining: 25m 13s
700:	test: 0.8446877	best: 0.8446877 (700)	total: 43.2s	remaining: 24m 58s
800:	test: 0.8459573	best: 0.8459573 (800)	total: 49.3s	remaining: 24m 49s
900:	test: 0.8469706	best: 0.8469706 (900)	total: 55.3s	remaining: 24m 38s
1000:	test: 0.8479531	best: 0.8479531 (1000)	total: 1m 1s	remaining: 24m 33s
1100:	test: 0.8488008	best: 0.8488008 (1100)	total: 1m 7s	remaining: 24m 23s
1200:	test: 0.8495265	best: 0.8495265 (1200)	total: 1m 13s	remaining: 24m 13s
1300:	test: 0.8501871	b

['./data/saved_models/3-DESOLATOR-drop300-25k-0.01-d6.pkl']

#### Четвёртая модель, отбрасываем 350 самых незначащих фич (используется в блендинге)

In [26]:
# Model 4: CatBoost without the drop_350 features, 50k iterations.
y = train_data['target']
X = train_data.drop(columns=[*drop_350, 'smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=50000,
    # NOTE(review): MODEL4_NAME reads "0.05", the value used here is 0.005.
    learning_rate=0.005,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL4_NAME}.pkl')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7829268	best: 0.7829268 (0)	total: 55.8ms	remaining: 46m 32s
100:	test: 0.8020344	best: 0.8020344 (100)	total: 5.3s	remaining: 43m 36s
200:	test: 0.8157485	best: 0.8157485 (200)	total: 10.5s	remaining: 43m 16s
300:	test: 0.8237808	best: 0.8237808 (300)	total: 15.6s	remaining: 43m
400:	test: 0.8286671	best: 0.8286671 (400)	total: 20.9s	remaining: 43m 11s
500:	test: 0.8321014	best: 0.8321014 (500)	total: 26.1s	remaining: 43m 3s
600:	test: 0.8346680	best: 0.8346680 (600)	total: 31.3s	remaining: 42m 51s
700:	test: 0.8367104	best: 0.8367104 (700)	total: 36.4s	remaining: 42m 39s
800:	test: 0.8384460	best: 0.8384460 (800)	total: 41.5s	remaining: 42m 30s
900:	test: 0.8398820	best: 0.8398820 (900)	total: 46.6s	remaining: 42m 20s
1000:	test: 0.8411036	best: 0.8411036 (1000)	total: 51.7s	remaining: 42m 11s
1100:	test: 0.8421664	best: 0.8421664 (1100)	total: 56.8s	remaining: 42m 1s
1200:	test: 0.8430812	best: 0.8430812 (1200)	total: 1m 1s	remaining: 41m 50s
1300:	test: 0.8438844	best: 0

['./data/saved_models/4-DESOLATOR-drop350-50k-0.05-d6.pkl']

#### Пятая модель, отбрасываем 400 самых незначащих фич (используется в блендинге)

In [27]:
# Model 5: CatBoost without the drop_400 features ('id' is kept unscaled).
y = train_data['target']
X = train_data.drop(columns=[*drop_400, 'smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=25000,
    learning_rate=0.01,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL5_NAME}.pkl')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7830338	best: 0.7830338 (0)	total: 50ms	remaining: 20m 51s
100:	test: 0.8156499	best: 0.8156499 (100)	total: 4.6s	remaining: 18m 53s
200:	test: 0.8288191	best: 0.8288191 (200)	total: 9.16s	remaining: 18m 49s
300:	test: 0.8349061	best: 0.8349061 (300)	total: 13.7s	remaining: 18m 44s
400:	test: 0.8386211	best: 0.8386211 (400)	total: 18.2s	remaining: 18m 34s
500:	test: 0.8411810	best: 0.8411810 (500)	total: 22.6s	remaining: 18m 25s
600:	test: 0.8431681	best: 0.8431681 (600)	total: 27s	remaining: 18m 16s
700:	test: 0.8446616	best: 0.8446616 (700)	total: 31.4s	remaining: 18m 8s
800:	test: 0.8459177	best: 0.8459177 (800)	total: 35.9s	remaining: 18m 3s
900:	test: 0.8469667	best: 0.8469667 (900)	total: 40.3s	remaining: 17m 57s
1000:	test: 0.8479019	best: 0.8479019 (1000)	total: 45s	remaining: 17m 58s
1100:	test: 0.8486876	best: 0.8486876 (1100)	total: 49.6s	remaining: 17m 57s
1200:	test: 0.8494065	best: 0.8494065 (1200)	total: 54s	remaining: 17m 50s
1300:	test: 0.8500735	best: 0.850

['./data/saved_models/5-DESOLATOR-drop400-25k-0.01-d6.pkl']

#### Шестая модель, отбрасываем 350 фич (но в отличие от предыдущей с 350 мы тут ставим 25к итераций и learning_rate=0.01)

In [28]:
# Model 6: same feature set as model 4 (drop_350), but 25k iterations and
# learning_rate=0.01.
y = train_data['target']
X = train_data.drop(columns=[*drop_350, 'smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=25000,
    learning_rate=0.01,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL6_NAME}.pkl')

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.7829268	best: 0.7829268 (0)	total: 56ms	remaining: 23m 19s
100:	test: 0.8156181	best: 0.8156181 (100)	total: 5.24s	remaining: 21m 31s
200:	test: 0.8286188	best: 0.8286188 (200)	total: 10.4s	remaining: 21m 27s
300:	test: 0.8347054	best: 0.8347054 (300)	total: 15.6s	remaining: 21m 18s
400:	test: 0.8385237	best: 0.8385237 (400)	total: 20.7s	remaining: 21m 10s
500:	test: 0.8411992	best: 0.8411992 (500)	total: 25.8s	remaining: 21m 2s
600:	test: 0.8431656	best: 0.8431656 (600)	total: 30.9s	remaining: 20m 52s
700:	test: 0.8446100	best: 0.8446100 (700)	total: 35.9s	remaining: 20m 45s
800:	test: 0.8458527	best: 0.8458527 (800)	total: 40.9s	remaining: 20m 36s
900:	test: 0.8468942	best: 0.8468942 (900)	total: 46s	remaining: 20m 30s
1000:	test: 0.8478534	best: 0.8478534 (1000)	total: 50.9s	remaining: 20m 21s
1100:	test: 0.8486596	best: 0.8486596 (1100)	total: 55.8s	remaining: 20m 11s
1200:	test: 0.8494233	best: 0.8494233 (1200)	total: 1m	remaining: 20m 2s
1300:	test: 0.8501065	best: 0.8

['./data/saved_models/6-DESOLATOR-drop350-25k-0.01-d6.pkl']

#### Седьмая модель, обучаем на всех фичах, но 500к итераций и learning_rate=0.001

In [None]:
# Model 7: CatBoost on all features (only 'smpl' and 'target' are removed),
# 500k iterations with learning_rate=0.001.
y = train_data['target']
X = train_data.drop(columns=['smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

catboost_params = dict(
    iterations=500000,
    learning_rate=0.001,
    depth=6,
    # Ratio of negatives to positives, computed over the full target column.
    scale_pos_weight=(y == 0).sum() / (y == 1).sum(),
    eval_metric='AUC',
    task_type="GPU",
    border_count=254,
    boosting_type="Plain",
    verbose=100,
)
model = CatBoostClassifier(**catboost_params)

model.fit(X_train, y_train, eval_set=(X_test, y_test), use_best_model=True)

# ROC-AUC of the second predict_proba column on the hold-out.
y_pred = pd.Series(model.predict_proba(X_test)[:, 1])
metric = roc_auc_score(y_test, y_pred)
print(f'Метрика roc-auc на валидационных данных имеет значение: {metric}')

joblib.dump(model, f'./data/saved_models/{MODEL7_NAME}.pkl')

## LightAutoML модели

#### Восьмая модель, на 3 часа (используется в блендинге)

In [4]:
# Model 8: LightAutoML preset on all features with a 3-hour budget.
y = train_data['target']
X = train_data.drop(columns=['target', 'smpl'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
del X, y
# The target is appended as a column: fit_predict below receives a single frame
# and finds the target through `roles`.
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)

task = Task('binary')
bst_models = ["lgb", "cb"]
linear_models = ["linear_l2"]
all_models = [*bst_models, *linear_models]

automl_settings = dict(
    task=task,
    timeout=3 * 3600,  # seconds
    cpu_limit=15,
    general_params={"use_algos": [all_models]},
    # NOTE(review): nn_params is passed although use_algos lists no NN model.
    nn_params={"n_epochs": 200, "bs": 512, "num_workers": 0, "path_to_save": None},
    reader_params={'n_jobs': 15},
)
automl = TabularAutoML(**automl_settings)

oof_pred = automl.fit_predict(X_train, roles={'target': 'target'}, verbose=4)

# The hold-out frame passed to predict still contains the target column.
test_pred = automl.predict(X_test)

holdout_auc = roc_auc_score(y_test, test_pred.data[:, 0])
print('AUC на тестовой выборке:', holdout_auc)

joblib.dump(automl, f"./data/saved_models/{MODEL8_NAME}.pkl")

[23:02:46] Stdout logging level is DEBUG.
[23:02:46] Copying TaskTimer may affect the parent PipelineTimer, so copy will create new unlimited TaskTimer
[23:02:46] Task: binary

[23:02:46] Start automl preset with listed constraints:
[23:02:46] - time: 10800.00 seconds
[23:02:46] - CPU: 15 cores
[23:02:46] - memory: 16 GB

[23:02:46] [1mTrain data shape: (4041421, 502)[0m

[23:04:45] Feats was rejected during automatic roles guess: []
[23:05:15] Layer [1m1[0m train process start. Time left 10651.19 secs
[23:09:24] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[23:09:24] Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [499, 500, 501, 502, 503, 504, 505, 506, 507, 508, 509, 510, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 

['./data/saved_models/8-RADIC-3h.pkl']

#### Девятая модель, на 6ч, отбрасываем 350 самых незначащих фич (используется в блендинге)

In [11]:
# Model 9: LightAutoML preset without the drop_350 features, 6-hour budget.
y = train_data['target']
X = train_data.drop(columns=[*drop_350, 'smpl', 'target'])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
del X, y
# The target is appended as a column: fit_predict below receives a single frame
# and finds the target through `roles`.
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)

task = Task('binary')
bst_models = ["lgb", "cb"]
linear_models = ["linear_l2"]
all_models = [*bst_models, *linear_models]

automl_settings = dict(
    task=task,
    timeout=6 * 3600,  # seconds
    cpu_limit=15,
    general_params={"use_algos": [all_models]},
    reader_params={'n_jobs': 15},
)
automl = TabularAutoML(**automl_settings)

oof_pred = automl.fit_predict(X_train, roles={'target': 'target'}, verbose=4)

# The hold-out frame passed to predict still contains the target column.
test_pred = automl.predict(X_test)

holdout_auc = roc_auc_score(y_test, test_pred.data[:, 0])
print('AUC на тестовой выборке:', holdout_auc)

joblib.dump(automl, f"./data/saved_models/{MODEL9_NAME}.pkl")

[07:06:03] Stdout logging level is DEBUG.
[07:06:03] Task: binary

[07:06:03] Start automl preset with listed constraints:
[07:06:03] - time: 21600.00 seconds
[07:06:03] - CPU: 15 cores
[07:06:03] - memory: 16 GB

[07:06:03] [1mTrain data shape: (4041421, 152)[0m

[07:06:29] Feats was rejected during automatic roles guess: []
[07:06:32] Layer [1m1[0m train process start. Time left 21570.70 secs
[07:08:50] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[07:08:50] Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188], 'embed_sizes': array([30, 22, 17, 17, 15, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 1

['./data/saved_models/9-RADIC-drop350-6h.pkl']

#### Десятая модель, обучаем 14к секунд, используем MinMaxScaler для id и отбрасываем 400 самых незначащих фич (используется в блендинге)

In [12]:
# Model 10: LightAutoML preset without the drop_400 features, with a min-max
# scaled 'id' and a 14000-second budget.
y = train_data['target']
X = train_data.drop(columns=[*drop_400, 'smpl', 'target'])
# NOTE(review): the scaler is fitted on the ids of the whole training frame
# (before the split) and is not saved next to the model.
scaler = MinMaxScaler()
X['id'] = scaler.fit_transform(train_data[['id']])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)
del X, y
# The target is appended as a column: fit_predict below receives a single frame
# and finds the target through `roles`.
X_train = pd.concat([X_train, y_train], axis=1)
X_test = pd.concat([X_test, y_test], axis=1)

task = Task('binary')
bst_models = ["lgb", "cb"]
linear_models = ["linear_l2"]
all_models = [*bst_models, *linear_models]

automl_settings = dict(
    task=task,
    timeout=14000,  # seconds
    cpu_limit=15,
    general_params={"use_algos": [all_models]},
    reader_params={'n_jobs': 15},
)
automl = TabularAutoML(**automl_settings)

oof_pred = automl.fit_predict(X_train, roles={'target': 'target'}, verbose=4)

# The hold-out frame passed to predict still contains the target column.
test_pred = automl.predict(X_test)

holdout_auc = roc_auc_score(y_test, test_pred.data[:, 0])
print('AUC на тестовой выборке:', holdout_auc)

joblib.dump(automl, f"./data/saved_models/{MODEL10_NAME}.pkl")

[09:09:15] Stdout logging level is DEBUG.
[09:09:15] Task: binary

[09:09:15] Start automl preset with listed constraints:
[09:09:15] - time: 14000.00 seconds
[09:09:15] - CPU: 15 cores
[09:09:15] - memory: 16 GB

[09:09:15] [1mTrain data shape: (4041421, 102)[0m

[09:09:36] Feats was rejected during automatic roles guess: []
[09:09:38] Layer [1m1[0m train process start. Time left 13977.33 secs
[09:11:40] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
[09:11:40] Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133], 'embed_sizes': array([17, 15, 22, 17, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11]), 'data_size': 134}
[09:11:

['./data/saved_models/10-RADIC-drop400-normid-14ks.pkl']

## Получим результаты этих моделей на тестовой выборке

In [16]:
def _load_saved_model(name):
    """Load the pickled model stored under ./data/saved_models/<name>.pkl."""
    # NOTE: joblib.load unpickles the file - only load models you saved yourself.
    return joblib.load(f'./data/saved_models/{name}.pkl')


# Models 2-10 are loaded back from disk; model 1 is not loaded here.
loaded_model_2 = _load_saved_model(MODEL2_NAME)
loaded_model_3 = _load_saved_model(MODEL3_NAME)
loaded_model_4 = _load_saved_model(MODEL4_NAME)
loaded_model_5 = _load_saved_model(MODEL5_NAME)
loaded_model_6 = _load_saved_model(MODEL6_NAME)
loaded_model_7 = _load_saved_model(MODEL7_NAME)
loaded_model_8 = _load_saved_model(MODEL8_NAME)
loaded_model_9 = _load_saved_model(MODEL9_NAME)
loaded_model_10 = _load_saved_model(MODEL10_NAME)

#### Загрузим тестовые данные

In [18]:
# Read every CSV shard from the test directory into one DataFrame.
# glob returns matches in a filesystem-dependent order; sorting keeps the row
# order of test_data (and of the saved prediction files) stable between runs.
filenames_test = sorted(glob.glob(path_test + "/*.csv"))
if not filenames_test:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"no CSV files found in {path_test}")

data_files_test = []
for filename in filenames_test:
    data_files_test.append(pd.read_csv(filename))
    print(f"completed {filename}")

test_data = pd.concat(data_files_test, ignore_index=True)
# Drop the per-file frames so only the concatenated copy stays in memory.
del data_files_test, filenames_test

completed ./data/test\test_1.csv
completed ./data/test\test_10.csv
completed ./data/test\test_2.csv
completed ./data/test\test_3.csv
completed ./data/test\test_4.csv
completed ./data/test\test_5.csv
completed ./data/test\test_6.csv
completed ./data/test\test_7.csv
completed ./data/test\test_8.csv
completed ./data/test\test_9.csv


#### Создаем предсказания для тестовых данных

In [34]:
# Model 2: score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_400])
# NOTE(review): a new MinMaxScaler is fitted on the test ids here; the one
# fitted on the training ids is not reused. Confirm this is intended.
scaler = MinMaxScaler()
X_real_test['id'] = scaler.fit_transform(test_data[['id']])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_2.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL2_NAME}.csv", columns=['id', 'target'], index=False)

In [35]:
# Model 3: score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_300])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_3.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL3_NAME}.csv", columns=['id', 'target'], index=False)

In [36]:
# Model 4: score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_350])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_4.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL4_NAME}.csv", columns=['id', 'target'], index=False)

In [37]:
# Model 5: score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_400])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_5.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL5_NAME}.csv", columns=['id', 'target'], index=False)

In [38]:
# Model 6: score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_350])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_6.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL6_NAME}.csv", columns=['id', 'target'], index=False)

In [39]:
# Model 7: score the test set (all features, only 'smpl' removed) and save the
# `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl'])
# Keep the second predict_proba column as the prediction.
y_pred = pd.Series(loaded_model_7.predict_proba(X_real_test)[:, 1])
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL7_NAME}.csv", columns=['id', 'target'], index=False)

In [41]:
# Model 8 (LightAutoML): score the test set (only 'smpl' removed) and save the
# `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl'])
# The first column of the prediction's .data array is used as the target.
y_pred = loaded_model_8.predict(X_real_test).data[:, 0]
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL8_NAME}.csv", columns=['id', 'target'], index=False)

In [42]:
# Model 9 (LightAutoML): score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_350])
# The first column of the prediction's .data array is used as the target.
y_pred = loaded_model_9.predict(X_real_test).data[:, 0]
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL9_NAME}.csv", columns=['id', 'target'], index=False)

In [43]:
# Model 10 (LightAutoML): score the test set and save the `id` / `target` pairs as CSV.
X_real_test = test_data.drop(columns=['smpl', *drop_400])
# NOTE(review): a new MinMaxScaler is fitted on the test ids here; the one
# fitted on the training ids is not reused. Confirm this is intended.
scaler = MinMaxScaler()
X_real_test['id'] = scaler.fit_transform(test_data[['id']])
# The first column of the prediction's .data array is used as the target.
y_pred = loaded_model_10.predict(X_real_test).data[:, 0]
res = test_data[['id']].copy()
res['target'] = y_pred
res.to_csv(f"{path_saved_predictions}/{MODEL10_NAME}.csv", columns=['id', 'target'], index=False)