In [1]:
import pandas as pd
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt    
from fastparquet import ParquetFile

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from utils import remove_highly_correlated_features, feature_drop

import warnings
warnings.filterwarnings("ignore")

  from pandas.core import (
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
file_path = "Data/train_ai_comp_final_dp.parquet"
pf = ParquetFile(file_path)
df = pf.to_pandas()

In [3]:
fature_to_drop = remove_highly_correlated_features(df, threshold=0.94)
df.drop(columns=fature_to_drop, inplace=True)
df = feature_drop(df)
df = df.fillna(0)

Optuna

In [6]:
import optuna
from catboost import CatBoostClassifier

X = df.drop(["target"], axis = 1)
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

def objective(trial):
    param = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_loguniform('l2_leaf_reg', 1e-8, 100),
        'border_count': trial.suggest_int('border_count', 1, 255),
        'bagging_temperature': trial.suggest_loguniform('bagging_temperature', 1.0, 100.0),
        'random_strength': trial.suggest_loguniform('random_strength', 1e-8, 1.0),
    }

    model = CatBoostClassifier(**param, loss_function='Logloss')
    model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

best_params = study.best_params
best_model = CatBoostClassifier(**best_params)
best_model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

[I 2024-03-11 17:29:26,666] A new study created in memory with name: no-name-4f12ccd2-964b-4d7e-b14b-3f8da52dbf00


0:	learn: 0.6901089	test: 0.6901159	best: 0.6901159 (0)	total: 80.9ms	remaining: 30.6s
100:	learn: 0.4589726	test: 0.4594493	best: 0.4594493 (100)	total: 6.34s	remaining: 17.5s
200:	learn: 0.3284557	test: 0.3293326	best: 0.3293326 (200)	total: 12.5s	remaining: 11.1s
300:	learn: 0.2540422	test: 0.2552969	best: 0.2552969 (300)	total: 18.7s	remaining: 4.84s
378:	learn: 0.2182098	test: 0.2197252	best: 0.2197252 (378)	total: 23.8s	remaining: 0us

bestTest = 0.2197252273
bestIteration = 378



[I 2024-03-11 17:29:51,337] Trial 0 finished with value: 0.9641080415307487 and parameters: {'iterations': 379, 'learning_rate': 0.0018214514701896244, 'depth': 8, 'l2_leaf_reg': 0.13923648506539132, 'border_count': 107, 'bagging_temperature': 6.388009669759876, 'random_strength': 6.065009944146732e-07}. Best is trial 0 with value: 0.9641080415307487.


0:	learn: 0.5540710	test: 0.5544735	best: 0.5544735 (0)	total: 89.8ms	remaining: 1m 9s


[I 2024-03-11 17:29:57,114] Trial 1 finished with value: 0.9640406839679379 and parameters: {'iterations': 771, 'learning_rate': 0.0907337864350032, 'depth': 9, 'l2_leaf_reg': 2.9955721382375857e-06, 'border_count': 19, 'bagging_temperature': 3.515027877040358, 'random_strength': 0.004076859439917506}. Best is trial 0 with value: 0.9641080415307487.


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1452641565
bestIteration = 21

Shrink model to first 22 iterations.
0:	learn: 0.6873074	test: 0.6873260	best: 0.6873260 (0)	total: 232ms	remaining: 1m 31s
100:	learn: 0.3349630	test: 0.3358271	best: 0.3358271 (100)	total: 20.7s	remaining: 1m
200:	learn: 0.2145154	test: 0.2163504	best: 0.2163504 (200)	total: 41.4s	remaining: 40s
300:	learn: 0.1697562	test: 0.1725962	best: 0.1725962 (300)	total: 1m 2s	remaining: 19.4s
394:	learn: 0.1519065	test: 0.1557666	best: 0.1557666 (394)	total: 1m 21s	remaining: 0us

bestTest = 0.1557666168
bestIteration = 394



[I 2024-03-11 17:31:20,149] Trial 2 finished with value: 0.9640984190217757 and parameters: {'iterations': 395, 'learning_rate': 0.0035831057248123578, 'depth': 10, 'l2_leaf_reg': 0.076498894834829, 'border_count': 232, 'bagging_temperature': 7.108296475382758, 'random_strength': 0.14380376944283146}. Best is trial 0 with value: 0.9641080415307487.


0:	learn: 0.6898225	test: 0.6898392	best: 0.6898392 (0)	total: 69.9ms	remaining: 1m 5s
100:	learn: 0.4431325	test: 0.4435830	best: 0.4435830 (100)	total: 7.27s	remaining: 1m
200:	learn: 0.3112511	test: 0.3120807	best: 0.3120807 (200)	total: 14.4s	remaining: 52.8s
300:	learn: 0.2395895	test: 0.2407818	best: 0.2407818 (300)	total: 21.5s	remaining: 45.5s
400:	learn: 0.1994598	test: 0.2009661	best: 0.2009661 (400)	total: 28.7s	remaining: 38.4s
500:	learn: 0.1763489	test: 0.1781209	best: 0.1781209 (500)	total: 35.9s	remaining: 31.3s
600:	learn: 0.1625342	test: 0.1645480	best: 0.1645480 (600)	total: 43.3s	remaining: 24.2s
700:	learn: 0.1540853	test: 0.1563250	best: 0.1563250 (700)	total: 50.7s	remaining: 17.1s
800:	learn: 0.1487635	test: 0.1512193	best: 0.1512193 (800)	total: 58.3s	remaining: 9.89s
900:	learn: 0.1453181	test: 0.1479684	best: 0.1479684 (900)	total: 1m 5s	remaining: 2.63s


[I 2024-03-11 17:32:29,757] Trial 3 finished with value: 0.9640984190217757 and parameters: {'iterations': 937, 'learning_rate': 0.0020024066381994477, 'depth': 7, 'l2_leaf_reg': 0.015859389032544465, 'border_count': 163, 'bagging_temperature': 42.71968524796292, 'random_strength': 0.001565059724409236}. Best is trial 0 with value: 0.9641080415307487.


936:	learn: 0.1443936	test: 0.1471144	best: 0.1471144 (936)	total: 1m 8s	remaining: 0us

bestTest = 0.1471144364
bestIteration = 936

0:	learn: 0.6843353	test: 0.6843572	best: 0.6843572 (0)	total: 69.7ms	remaining: 15.9s
100:	learn: 0.2569721	test: 0.2581369	best: 0.2581369 (100)	total: 5.79s	remaining: 7.34s
200:	learn: 0.1700091	test: 0.1719763	best: 0.1719763 (200)	total: 11.9s	remaining: 1.65s


[I 2024-03-11 17:32:44,263] Trial 4 finished with value: 0.9640791740038298 and parameters: {'iterations': 229, 'learning_rate': 0.005360310662197945, 'depth': 7, 'l2_leaf_reg': 2.7773734847722597e-08, 'border_count': 234, 'bagging_temperature': 4.126566968676439, 'random_strength': 0.0008487848645879283}. Best is trial 0 with value: 0.9641080415307487.


228:	learn: 0.1609451	test: 0.1630872	best: 0.1630872 (228)	total: 13.6s	remaining: 0us

bestTest = 0.1630872456
bestIteration = 228

0:	learn: 0.6164309	test: 0.6166670	best: 0.6166670 (0)	total: 105ms	remaining: 11.6s
100:	learn: 0.1319618	test: 0.1404441	best: 0.1404441 (100)	total: 9.01s	remaining: 892ms


[I 2024-03-11 17:32:55,296] Trial 5 finished with value: 0.9641753990935596 and parameters: {'iterations': 111, 'learning_rate': 0.04826417429001852, 'depth': 9, 'l2_leaf_reg': 0.08398314732164144, 'border_count': 98, 'bagging_temperature': 55.89212412368227, 'random_strength': 0.05471602596321361}. Best is trial 5 with value: 0.9641753990935596.


110:	learn: 0.1313431	test: 0.1403357	best: 0.1403357 (110)	total: 9.96s	remaining: 0us

bestTest = 0.1403356677
bestIteration = 110

0:	learn: 0.6502169	test: 0.6503526	best: 0.6503526 (0)	total: 133ms	remaining: 2m 8s
100:	learn: 0.1386792	test: 0.1421137	best: 0.1421137 (100)	total: 10.9s	remaining: 1m 33s
200:	learn: 0.1349914	test: 0.1402683	best: 0.1402683 (200)	total: 22.7s	remaining: 1m 26s
300:	learn: 0.1340230	test: 0.1400058	best: 0.1400058 (300)	total: 32.9s	remaining: 1m 12s
400:	learn: 0.1324027	test: 0.1396788	best: 0.1396778 (399)	total: 43.2s	remaining: 1m 1s
500:	learn: 0.1311118	test: 0.1394748	best: 0.1394748 (500)	total: 53.4s	remaining: 49.6s
600:	learn: 0.1306527	test: 0.1394618	best: 0.1394603 (557)	total: 1m 2s	remaining: 38.1s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1394603195
bestIteration = 557

Shrink model to first 558 iterations.


[I 2024-03-11 17:33:59,584] Trial 6 finished with value: 0.9641850216025326 and parameters: {'iterations': 967, 'learning_rate': 0.02658897321419524, 'depth': 9, 'l2_leaf_reg': 69.82128787993716, 'border_count': 173, 'bagging_temperature': 6.442235736265833, 'random_strength': 0.04395871461511254}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6899998	test: 0.6900132	best: 0.6900132 (0)	total: 217ms	remaining: 3m 21s
100:	learn: 0.4523545	test: 0.4529686	best: 0.4529686 (100)	total: 20.5s	remaining: 2m 47s
200:	learn: 0.3213814	test: 0.3224855	best: 0.3224855 (200)	total: 40.7s	remaining: 2m 27s
300:	learn: 0.2475363	test: 0.2491239	best: 0.2491239 (300)	total: 1m	remaining: 2m 5s
400:	learn: 0.2050552	test: 0.2071000	best: 0.2071000 (400)	total: 1m 19s	remaining: 1m 44s
500:	learn: 0.1800323	test: 0.1824910	best: 0.1824910 (500)	total: 1m 39s	remaining: 1m 24s
600:	learn: 0.1646640	test: 0.1675301	best: 0.1675301 (600)	total: 1m 58s	remaining: 1m 4s
700:	learn: 0.1549935	test: 0.1582545	best: 0.1582545 (700)	total: 2m 18s	remaining: 44.9s
800:	learn: 0.1487269	test: 0.1523606	best: 0.1523606 (800)	total: 2m 37s	remaining: 25.2s
900:	learn: 0.1445653	test: 0.1485610	best: 0.1485610 (900)	total: 2m 57s	remaining: 5.5s
928:	learn: 0.1436537	test: 0.1477496	best: 0.1477496 (928)	total: 3m 2s	remaining: 0us

bestTest

[I 2024-03-11 17:37:03,176] Trial 7 finished with value: 0.9641176640397218 and parameters: {'iterations': 929, 'learning_rate': 0.0018789283893821023, 'depth': 10, 'l2_leaf_reg': 5.277859430285405, 'border_count': 170, 'bagging_temperature': 1.341974740167281, 'random_strength': 0.023284862185227195}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6883409	test: 0.6883493	best: 0.6883493 (0)	total: 49.7ms	remaining: 8.15s
100:	learn: 0.3713537	test: 0.3718453	best: 0.3718453 (100)	total: 3.76s	remaining: 2.38s


[I 2024-03-11 17:37:10,072] Trial 8 finished with value: 0.9640984190217757 and parameters: {'iterations': 165, 'learning_rate': 0.0029606729656976344, 'depth': 5, 'l2_leaf_reg': 0.1868449634798751, 'border_count': 95, 'bagging_temperature': 3.5279259838056953, 'random_strength': 0.00926659943107888}. Best is trial 6 with value: 0.9641850216025326.


164:	learn: 0.2774901	test: 0.2782638	best: 0.2782638 (164)	total: 6.1s	remaining: 0us

bestTest = 0.2782637948
bestIteration = 164

0:	learn: 0.5877857	test: 0.5880079	best: 0.5880079 (0)	total: 175ms	remaining: 2m 22s


[I 2024-03-11 17:37:24,399] Trial 9 finished with value: 0.9641080415307487 and parameters: {'iterations': 815, 'learning_rate': 0.06888180724705877, 'depth': 10, 'l2_leaf_reg': 0.00010930994189362305, 'border_count': 42, 'bagging_temperature': 65.68430176588623, 'random_strength': 0.17327256057937157}. Best is trial 6 with value: 0.9641850216025326.


Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1427787064
bestIteration = 37

Shrink model to first 38 iterations.
0:	learn: 0.6653243	test: 0.6653969	best: 0.6653969 (0)	total: 49.2ms	remaining: 30.9s
100:	learn: 0.1481570	test: 0.1500759	best: 0.1500759 (100)	total: 3.42s	remaining: 17.9s
200:	learn: 0.1400032	test: 0.1426463	best: 0.1426463 (200)	total: 6.9s	remaining: 14.7s
300:	learn: 0.1387216	test: 0.1417320	best: 0.1417320 (300)	total: 10.4s	remaining: 11.3s
400:	learn: 0.1380391	test: 0.1412759	best: 0.1412759 (400)	total: 13.9s	remaining: 7.89s
500:	learn: 0.1375551	test: 0.1409804	best: 0.1409804 (500)	total: 17.5s	remaining: 4.46s
600:	learn: 0.1372384	test: 0.1408050	best: 0.1408050 (600)	total: 20.8s	remaining: 968ms


[I 2024-03-11 17:37:47,072] Trial 10 finished with value: 0.9640887965128028 and parameters: {'iterations': 629, 'learning_rate': 0.017394460499227316, 'depth': 4, 'l2_leaf_reg': 26.202596280754044, 'border_count': 181, 'bagging_temperature': 18.422950281790417, 'random_strength': 1.0125359144673647e-05}. Best is trial 6 with value: 0.9641850216025326.


628:	learn: 0.1371499	test: 0.1407587	best: 0.1407587 (628)	total: 21.7s	remaining: 0us

bestTest = 0.1407586857
bestIteration = 628

0:	learn: 0.6538794	test: 0.6539581	best: 0.6539581 (0)	total: 77.6ms	remaining: 37.8s
100:	learn: 0.1409111	test: 0.1436078	best: 0.1436078 (100)	total: 8.62s	remaining: 33s
200:	learn: 0.1365552	test: 0.1409055	best: 0.1409055 (200)	total: 17.3s	remaining: 24.7s
300:	learn: 0.1347193	test: 0.1402647	best: 0.1402647 (300)	total: 26s	remaining: 16.2s
400:	learn: 0.1333699	test: 0.1399223	best: 0.1399223 (400)	total: 34.5s	remaining: 7.48s
487:	learn: 0.1324551	test: 0.1397677	best: 0.1397676 (486)	total: 41.8s	remaining: 0us

bestTest = 0.1397676434
bestIteration = 486

Shrink model to first 487 iterations.


[I 2024-03-11 17:38:29,950] Trial 11 finished with value: 0.9641850216025326 and parameters: {'iterations': 488, 'learning_rate': 0.02604600844351376, 'depth': 8, 'l2_leaf_reg': 27.10416864989904, 'border_count': 71, 'bagging_temperature': 19.836877254791684, 'random_strength': 0.9543806349350501}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6624999	test: 0.6625842	best: 0.6625842 (0)	total: 76.1ms	remaining: 40.7s
100:	learn: 0.1447682	test: 0.1469256	best: 0.1469256 (100)	total: 8.51s	remaining: 36.6s
200:	learn: 0.1378701	test: 0.1414254	best: 0.1414254 (200)	total: 17.2s	remaining: 28.6s
300:	learn: 0.1361020	test: 0.1406030	best: 0.1406030 (300)	total: 25.9s	remaining: 20.2s
400:	learn: 0.1349114	test: 0.1401735	best: 0.1401735 (400)	total: 34.5s	remaining: 11.5s
500:	learn: 0.1339021	test: 0.1399007	best: 0.1399007 (500)	total: 43s	remaining: 2.92s
534:	learn: 0.1335875	test: 0.1398095	best: 0.1398095 (534)	total: 45.9s	remaining: 0us

bestTest = 0.1398094569
bestIteration = 534



[I 2024-03-11 17:39:16,794] Trial 12 finished with value: 0.9641465315666407 and parameters: {'iterations': 535, 'learning_rate': 0.02024433407608719, 'depth': 8, 'l2_leaf_reg': 57.900138763498944, 'border_count': 71, 'bagging_temperature': 17.80206081478529, 'random_strength': 0.8216478400112532}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6509446	test: 0.6510665	best: 0.6510665 (0)	total: 58.6ms	remaining: 34.6s
100:	learn: 0.1403687	test: 0.1430229	best: 0.1430229 (100)	total: 4.86s	remaining: 23.6s
200:	learn: 0.1372101	test: 0.1408298	best: 0.1408298 (200)	total: 9.67s	remaining: 18.8s
300:	learn: 0.1359765	test: 0.1402798	best: 0.1402798 (300)	total: 14.4s	remaining: 13.9s
400:	learn: 0.1350516	test: 0.1399807	best: 0.1399805 (399)	total: 19.1s	remaining: 9.07s
500:	learn: 0.1343067	test: 0.1397711	best: 0.1397711 (500)	total: 23.6s	remaining: 4.24s
590:	learn: 0.1337794	test: 0.1396887	best: 0.1396879 (584)	total: 27.4s	remaining: 0us

bestTest = 0.1396878648
bestIteration = 584

Shrink model to first 585 iterations.


[I 2024-03-11 17:39:45,180] Trial 13 finished with value: 0.9641272865486947 and parameters: {'iterations': 591, 'learning_rate': 0.0261720462957187, 'depth': 6, 'l2_leaf_reg': 4.237867819218878, 'border_count': 140, 'bagging_temperature': 18.921126756050928, 'random_strength': 5.788866454696822e-05}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6775860	test: 0.6776403	best: 0.6776403 (0)	total: 94.3ms	remaining: 41.3s
100:	learn: 0.1803669	test: 0.1825293	best: 0.1825293 (100)	total: 7.57s	remaining: 25.3s
200:	learn: 0.1438489	test: 0.1468288	best: 0.1468288 (200)	total: 15.1s	remaining: 17.9s
300:	learn: 0.1387645	test: 0.1431852	best: 0.1431852 (300)	total: 22.9s	remaining: 10.5s
400:	learn: 0.1368870	test: 0.1421402	best: 0.1421402 (400)	total: 30.3s	remaining: 2.87s


[I 2024-03-11 17:40:19,193] Trial 14 finished with value: 0.9641369090576677 and parameters: {'iterations': 439, 'learning_rate': 0.009338487232988092, 'depth': 8, 'l2_leaf_reg': 0.00125534984885279, 'border_count': 201, 'bagging_temperature': 1.4472513013016313, 'random_strength': 4.9752454371711714e-08}. Best is trial 6 with value: 0.9641850216025326.


438:	learn: 0.1363059	test: 0.1419594	best: 0.1419594 (438)	total: 33.1s	remaining: 0us

bestTest = 0.1419593796
bestIteration = 438

0:	learn: 0.6353823	test: 0.6355263	best: 0.6355263 (0)	total: 89.4ms	remaining: 1m 5s
100:	learn: 0.1359161	test: 0.1413828	best: 0.1413828 (100)	total: 8.25s	remaining: 51.6s
200:	learn: 0.1310888	test: 0.1402165	best: 0.1402165 (200)	total: 16.5s	remaining: 43.6s
300:	learn: 0.1280165	test: 0.1398502	best: 0.1398502 (300)	total: 24.5s	remaining: 35.1s
400:	learn: 0.1256624	test: 0.1395746	best: 0.1395746 (400)	total: 32.1s	remaining: 26.6s
500:	learn: 0.1235217	test: 0.1395579	best: 0.1395017 (450)	total: 39.7s	remaining: 18.4s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1395016907
bestIteration = 450

Shrink model to first 451 iterations.


[I 2024-03-11 17:40:59,927] Trial 15 finished with value: 0.9641369090576677 and parameters: {'iterations': 733, 'learning_rate': 0.038135356050125746, 'depth': 9, 'l2_leaf_reg': 1.5574897780323855, 'border_count': 63, 'bagging_temperature': 27.993485332250682, 'random_strength': 0.7557112326889771}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6756135	test: 0.6756734	best: 0.6756734 (0)	total: 63.1ms	remaining: 19.4s
100:	learn: 0.1714021	test: 0.1728897	best: 0.1728897 (100)	total: 4.66s	remaining: 9.55s
200:	learn: 0.1430181	test: 0.1453223	best: 0.1453223 (200)	total: 9.37s	remaining: 4.99s
300:	learn: 0.1392702	test: 0.1421013	best: 0.1421013 (300)	total: 14.2s	remaining: 330ms


[I 2024-03-11 17:41:15,355] Trial 16 finished with value: 0.9640791740038298 and parameters: {'iterations': 308, 'learning_rate': 0.010639728965247551, 'depth': 6, 'l2_leaf_reg': 82.25015117356021, 'border_count': 137, 'bagging_temperature': 10.940021844888722, 'random_strength': 0.00014292338868302366}. Best is trial 6 with value: 0.9641850216025326.


307:	learn: 0.1391566	test: 0.1420178	best: 0.1420178 (307)	total: 14.5s	remaining: 0us

bestTest = 0.1420177963
bestIteration = 307

0:	learn: 0.6757679	test: 0.6758100	best: 0.6758100 (0)	total: 82.8ms	remaining: 40.1s
100:	learn: 0.1703369	test: 0.1719740	best: 0.1719740 (100)	total: 5.74s	remaining: 21.8s
200:	learn: 0.1424574	test: 0.1452271	best: 0.1452271 (200)	total: 11.5s	remaining: 16.2s
300:	learn: 0.1381571	test: 0.1421693	best: 0.1421693 (300)	total: 17.4s	remaining: 10.7s
400:	learn: 0.1362753	test: 0.1413428	best: 0.1413428 (400)	total: 23.2s	remaining: 4.87s
484:	learn: 0.1351256	test: 0.1409823	best: 0.1409823 (484)	total: 28.1s	remaining: 0us

bestTest = 0.140982342
bestIteration = 484



[I 2024-03-11 17:41:44,390] Trial 17 finished with value: 0.9640984190217757 and parameters: {'iterations': 485, 'learning_rate': 0.011320119277877723, 'depth': 8, 'l2_leaf_reg': 0.0023694907076421703, 'border_count': 8, 'bagging_temperature': 10.908807521927466, 'random_strength': 0.7549839007890735}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6301151	test: 0.6303463	best: 0.6303463 (0)	total: 130ms	remaining: 1m 27s
100:	learn: 0.1337407	test: 0.1412205	best: 0.1412205 (100)	total: 11s	remaining: 1m 2s
200:	learn: 0.1280735	test: 0.1404198	best: 0.1404198 (200)	total: 21.3s	remaining: 50.3s
300:	learn: 0.1236387	test: 0.1487911	best: 0.1402385 (282)	total: 31.5s	remaining: 39.2s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1402384906
bestIteration = 282

Shrink model to first 283 iterations.


[I 2024-03-11 17:42:20,098] Trial 18 finished with value: 0.9641850216025326 and parameters: {'iterations': 675, 'learning_rate': 0.03897146595264372, 'depth': 9, 'l2_leaf_reg': 4.388842739409182e-05, 'border_count': 215, 'bagging_temperature': 37.60823804412858, 'random_strength': 0.024314872567937438}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6669845	test: 0.6670613	best: 0.6670613 (0)	total: 60.5ms	remaining: 53.6s
100:	learn: 0.1487554	test: 0.1508869	best: 0.1508869 (100)	total: 4.72s	remaining: 36.8s
200:	learn: 0.1388232	test: 0.1420179	best: 0.1420179 (200)	total: 9.46s	remaining: 32.4s
300:	learn: 0.1370878	test: 0.1410822	best: 0.1410822 (300)	total: 14.1s	remaining: 27.6s
400:	learn: 0.1359528	test: 0.1406965	best: 0.1406965 (400)	total: 18.7s	remaining: 22.7s
500:	learn: 0.1350312	test: 0.1404731	best: 0.1404731 (500)	total: 23.2s	remaining: 17.9s
600:	learn: 0.1342063	test: 0.1402834	best: 0.1402834 (600)	total: 27.6s	remaining: 13.2s
700:	learn: 0.1334358	test: 0.1401674	best: 0.1401674 (700)	total: 31.8s	remaining: 8.49s
800:	learn: 0.1327382	test: 0.1400564	best: 0.1400564 (800)	total: 36.3s	remaining: 3.94s
887:	learn: 0.1321880	test: 0.1399882	best: 0.1399882 (887)	total: 39.8s	remaining: 0us

bestTest = 0.1399881759
bestIteration = 887



[I 2024-03-11 17:43:00,935] Trial 19 finished with value: 0.9641465315666407 and parameters: {'iterations': 888, 'learning_rate': 0.01613041049950429, 'depth': 6, 'l2_leaf_reg': 1.0289368990261868e-08, 'border_count': 123, 'bagging_temperature': 2.145380597579815, 'random_strength': 0.00036256205069602537}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6824691	test: 0.6825130	best: 0.6825130 (0)	total: 66.1ms	remaining: 19.8s
100:	learn: 0.2278266	test: 0.2289900	best: 0.2289900 (100)	total: 6.02s	remaining: 11.9s
200:	learn: 0.1581400	test: 0.1600654	best: 0.1600654 (200)	total: 12.3s	remaining: 6.06s


[I 2024-03-11 17:43:20,467] Trial 20 finished with value: 0.9640984190217757 and parameters: {'iterations': 300, 'learning_rate': 0.0064297953404262465, 'depth': 7, 'l2_leaf_reg': 5.377716817754953, 'border_count': 67, 'bagging_temperature': 99.46574830126507, 'random_strength': 4.259009323313773e-06}. Best is trial 6 with value: 0.9641850216025326.


299:	learn: 0.1438662	test: 0.1463626	best: 0.1463626 (299)	total: 18.6s	remaining: 0us

bestTest = 0.1463626078
bestIteration = 299

0:	learn: 0.6344628	test: 0.6346573	best: 0.6346573 (0)	total: 131ms	remaining: 1m 27s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1585924301
bestIteration = 35

Shrink model to first 36 iterations.


[I 2024-03-11 17:43:30,804] Trial 21 finished with value: 0.9640599289858838 and parameters: {'iterations': 674, 'learning_rate': 0.03621156640968378, 'depth': 9, 'l2_leaf_reg': 0.00011809723250671714, 'border_count': 211, 'bagging_temperature': 30.15235342695951, 'random_strength': 0.03784267704983452}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.6439145	test: 0.6440336	best: 0.6440336 (0)	total: 124ms	remaining: 2m 3s
100:	learn: 0.1372416	test: 0.8546970	best: 0.1456733 (63)	total: 10.7s	remaining: 1m 34s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.145673307
bestIteration = 63

Shrink model to first 64 iterations.


[I 2024-03-11 17:43:43,779] Trial 22 finished with value: 0.9640791740038298 and parameters: {'iterations': 995, 'learning_rate': 0.03125742185973268, 'depth': 9, 'l2_leaf_reg': 3.80014646505508e-06, 'border_count': 201, 'bagging_temperature': 29.728127692927778, 'random_strength': 0.1311106348339184}. Best is trial 6 with value: 0.9641850216025326.


0:	learn: 0.5985717	test: 0.5988757	best: 0.5988757 (0)	total: 92.8ms	remaining: 1m 3s


[W 2024-03-11 17:43:49,673] Trial 23 failed with parameters: {'iterations': 690, 'learning_rate': 0.06005759286395733, 'depth': 8, 'l2_leaf_reg': 6.839404574591889e-06, 'border_count': 244, 'bagging_temperature': 14.32040651248944, 'random_strength': 0.003056473630504032} because of the following error: CatBoostError('/src/catboost/catboost/private/libs/algo/tensor_search_helpers.cpp:99: This should be unreachable').
Traceback (most recent call last):
  File "/cephfs/projects/ppashin/.local/lib/python3.11/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/tmp/ipykernel_152839/816748112.py", line 20, in objective
    model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)
  File "/cephfs/projects/ppashin/.local/lib/python3.11/site-packages/catboost/core.py", line 5201, in fit
    self._fit(X, y, cat_features, text_features, embedding_features, None, sample_we

CatBoostError: /src/catboost/catboost/private/libs/algo/tensor_search_helpers.cpp:99: This should be unreachable

In [5]:
y_pred_proba = best_model.predict_proba(X_test)
y_pred = best_model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {classification_report(y_test, y_pred)}')

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba.ravel()[1::2])

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC AUC: {roc_auc}')

Accuracy:               precision    recall  f1-score   support

           0       0.96      1.00      0.98    100192
           1       0.65      0.01      0.01      3731

    accuracy                           0.96    103923
   macro avg       0.80      0.50      0.50    103923
weighted avg       0.95      0.96      0.95    103923

Precision: 0.6451612903225806
Recall: 0.005360493165371214
ROC AUC: 0.7381234061157389


Категориальные фичи 

In [15]:
# Artem cat features 
cat_features_tmp = ['feature5','feature6','feature7','feature9','feature74','feature85','feature96','feature106','feature117','feature127','feature138','feature150','feature163','feature174','feature215','feature293','feature343','feature344','feature345','feature346','feature347','feature348','feature349','feature350','feature352','feature356','feature360','feature363','feature364','feature374','feature379','feature381','feature383','feature389','feature393','feature395','feature396','feature397','feature398','feature399','feature400','feature401','feature402','feature403','feature404','feature408','feature409','feature410','feature411','feature412','feature413','feature415','feature416','feature417','feature419','feature421','feature422','feature425','feature427','feature428','feature430','feature431','feature432','feature436','feature437','feature438','feature439','feature441','feature442','feature443','feature444','feature445','feature446','feature447','feature448','feature449','feature450','feature452','feature453','feature454','feature459','feature460','feature463','feature465','feature466','feature472','feature474','feature475','feature476','feature477','feature478','feature479','feature480','feature481','feature483','feature485','feature486','feature488','feature490','feature491','feature492','feature493','feature494','feature497','feature498','feature499','feature500','feature501','feature502','feature504','feature505','feature508','feature509','feature510','feature512','feature513','feature514','feature515','feature516','feature517','feature519','feature520','feature521','feature524','feature525','feature526','feature527','feature528','feature534','feature535','feature538','feature539','feature541','feature542','feature543','feature544','feature547','feature548','feature549','feature550','feature552','feature553','feature554','feature557','feature558','feature559','feature560','feature577','feature580','feature581','feature594','feature596','feature597','feature598','feature599','feature600','feature601','feature602','feature604','feature606','feature608','feature609','feature610','feature611','feature612','feature613','feature616','feature617','feature619','feature620','feature621','feature622','feature631','feature634','feature639','feature644','feature647','feature650','feature654','feature660','feature664','feature666','feature668','feature672','feature673','feature682','feature692','feature695','feature698','feature705','feature712','feature740','feature741','feature743','feature744','feature746','feature747','feature750','feature751','feature753','feature754','feature757','feature758','feature760','feature763','feature766','feature767','feature769','feature770','feature771','feature773','feature776','feature777','feature786','feature787','feature788','feature789','feature790','feature791','feature792','feature794','feature795','feature796','feature797','feature800','feature803','feature804','feature805','feature811','feature812','feature813','feature814','feature815','feature835','feature838','feature842','feature843','feature845','feature851','feature853','feature855','feature857','feature861','feature863','feature864','feature866','feature868','feature870','feature873','feature875','feature877','feature879','feature881','feature884','feature887','feature927','feature928','feature929','feature930','feature931','feature932','feature937','feature991','feature992','feature993','feature994','feature995','feature996','feature997','feature998','feature999','feature1000','feature1003','feature1062','feature1063','feature1064','feature1065','feature1066','feature1068','feature1069','feature1070','feature1071','feature1072','feature1073','feature1074','feature1075','feature1076']
cat_features = [feature for feature in cat_features_tmp if feature in df.columns]

In [16]:
cat_features = [col for col in df.columns if df[col].nunique() < 5 and col != "target"]

190

In [17]:
# convert cats to string
for col in cat_features:
    df[col] = df[col].astype(str)

In [18]:
X = df.drop(["target"], axis = 1)
y = df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

model = CatBoostClassifier(iterations=1000, learning_rate=0.01, depth=8, loss_function='Logloss', cat_features=cat_features)
model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100)

0:	learn: 0.6779823	test: 0.6779615	best: 0.6779615 (0)	total: 793ms	remaining: 13m 11s
100:	learn: 0.1840523	test: 0.1828898	best: 0.1828898 (100)	total: 1m 49s	remaining: 16m 12s
200:	learn: 0.1466391	test: 0.1459123	best: 0.1459123 (200)	total: 4m 29s	remaining: 17m 52s
300:	learn: 0.1403589	test: 0.1404598	best: 0.1404598 (300)	total: 7m 31s	remaining: 17m 28s
400:	learn: 0.1382218	test: 0.1390445	best: 0.1390445 (400)	total: 10m 19s	remaining: 15m 25s
500:	learn: 0.1370145	test: 0.1384657	best: 0.1384657 (500)	total: 13m 15s	remaining: 13m 11s
600:	learn: 0.1361270	test: 0.1381147	best: 0.1381147 (600)	total: 16m 29s	remaining: 10m 56s
700:	learn: 0.1354130	test: 0.1378616	best: 0.1378616 (700)	total: 19m 45s	remaining: 8m 25s
800:	learn: 0.1348415	test: 0.1377058	best: 0.1377058 (800)	total: 22m 56s	remaining: 5m 41s
900:	learn: 0.1343423	test: 0.1375738	best: 0.1375738 (900)	total: 26m 1s	remaining: 2m 51s
999:	learn: 0.1339338	test: 0.1374693	best: 0.1374693 (999)	total: 29m 7s

<catboost.core.CatBoostClassifier at 0x7f1625c6cb50>

In [23]:
feature_weights = {feature: weight for feature, weight in zip(X.columns, model.feature_importances_)}
dict(sorted(feature_weights.items(), key=lambda item: item[1], reverse=True))

{'feature1004': 10.25550868976953,
 'feature341': 5.448038390055345,
 'feature318': 5.088408990697241,
 'feature1000': 3.4621059234682305,
 'feature994': 2.8880895645245,
 'feature988': 2.589907079047815,
 'feature920': 2.2319737769760652,
 'feature210': 2.2288415732108433,
 'feature356': 2.068587209501762,
 'feature951': 2.014806758640155,
 'feature320': 1.9601388914492945,
 'feature208': 1.9003488756101612,
 'feature713': 1.8027410623113953,
 'feature936': 1.7247850117706516,
 'feature942': 1.6868011865573591,
 'feature922': 1.6207836059054634,
 'feature950': 1.6150634588144794,
 'feature357': 1.5422446849839793,
 'feature349': 1.4763411804980442,
 'feature941': 1.3130462302408705,
 'feature935': 1.1653183636486504,
 'feature444': 1.1559362876802721,
 'feature783': 1.1156465260760717,
 'feature940': 1.106815295659792,
 'feature1': 1.100492397898269,
 'feature551': 1.0996554580530191,
 'feature990': 1.077964538732958,
 'feature191': 1.0667436536315469,
 'feature534': 0.998438245418040

In [19]:
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {classification_report(y_test, y_pred)}')

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba.ravel()[1::2])

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC AUC: {roc_auc}')

Accuracy:               precision    recall  f1-score   support

           0       0.96      1.00      0.98     75146
           1       0.60      0.00      0.01      2797

    accuracy                           0.96     77943
   macro avg       0.78      0.50      0.49     77943
weighted avg       0.95      0.96      0.95     77943

Precision: 0.6
Recall: 0.0032177332856632105
ROC AUC: 0.7418843837886655


Отбор по feature importance

In [6]:
feature_weights = {feature: weight for feature, weight in zip(X.columns, model.feature_importances_)}
selected_features = [k for k, v in feature_weights.items() if v > 0]
f" отобралось {len(selected_features)} фич из {len(feature_weights)} "

(561, 749)

In [11]:
X_sel = df[selected_features]
y = df['target']
X_train, X_test, y_train, y_test = train_test_split(X_sel, y, test_size=0.2, random_state=42)

model = CatBoostClassifier(iterations=1000, learning_rate=0.01, depth=8, loss_function='Logloss')
model.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

0:	learn: 0.6777413	test: 0.6777841	best: 0.6777841 (0)	total: 90.8ms	remaining: 1m 30s
100:	learn: 0.1816663	test: 0.1828781	best: 0.1828781 (100)	total: 7.47s	remaining: 1m 6s
200:	learn: 0.1452007	test: 0.1475164	best: 0.1475164 (200)	total: 15.5s	remaining: 1m 1s
300:	learn: 0.1393600	test: 0.1426631	best: 0.1426631 (300)	total: 23.9s	remaining: 55.6s
400:	learn: 0.1374007	test: 0.1415068	best: 0.1415068 (400)	total: 32.1s	remaining: 48s
500:	learn: 0.1361880	test: 0.1409738	best: 0.1409738 (500)	total: 39.8s	remaining: 39.7s
600:	learn: 0.1353036	test: 0.1406660	best: 0.1406660 (600)	total: 47.4s	remaining: 31.5s
700:	learn: 0.1345623	test: 0.1404493	best: 0.1404493 (700)	total: 54.8s	remaining: 23.4s
800:	learn: 0.1338575	test: 0.1402639	best: 0.1402639 (800)	total: 1m 2s	remaining: 15.5s
900:	learn: 0.1332664	test: 0.1401084	best: 0.1401084 (900)	total: 1m 9s	remaining: 7.63s
999:	learn: 0.1327011	test: 0.1399924	best: 0.1399924 (999)	total: 1m 16s	remaining: 0us

bestTest = 0.1

<catboost.core.CatBoostClassifier at 0x7fa3d0084310>

In [12]:
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {classification_report(y_test, y_pred)}')

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba.ravel()[1::2])

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC AUC: {roc_auc}')

Accuracy:               precision    recall  f1-score   support

           0       0.96      1.00      0.98    100192
           1       0.77      0.00      0.01      3731

    accuracy                           0.96    103923
   macro avg       0.87      0.50      0.49    103923
weighted avg       0.96      0.96      0.95    103923

Precision: 0.7692307692307693
Recall: 0.002680246582685607
ROC AUC: 0.7406038834812663


In [9]:
feature_weights = {feature: weight for feature, weight in zip(X.columns, model.feature_importances_)}
selected_features = [k for k, v in dict(sorted(feature_weights.items(), key=lambda item: item[1], reverse=True)).items() if v > 0]
len(selected_features), len(feature_weights)

(561, 749)

In [10]:
feature_weights

{'id': 0.03807066600838285,
 'sample_ml_new': 0.0,
 'feature1': 1.2488766221397836,
 'feature2': 0.1598991122892265,
 'feature3': 0.08070800397400199,
 'feature4': 0.1116965235891375,
 'feature5': 0.06052095579327117,
 'feature6': 0.18210495553714393,
 'feature7': 0.05118671231306596,
 'feature9': 0.09000518155062921,
 'feature12': 0.07257652168078543,
 'feature18': 0.13056974251327808,
 'feature19': 0.0552117134006944,
 'feature22': 0.016558461033134034,
 'feature23': 2.685235435876654e-05,
 'feature24': 0.0056580833675225805,
 'feature25': 0.07387581848902283,
 'feature26': 0.0,
 'feature27': 3.734147911872943e-05,
 'feature28': 0.00016417105350564678,
 'feature30': 0.016597221193952943,
 'feature31': 0.010995943441797522,
 'feature33': 0.0,
 'feature35': 0.45975151153987803,
 'feature36': 0.008317434195305954,
 'feature37': 0.09306898550918699,
 'feature38': 0.0327386469618588,
 'feature41': 0.035665957179934295,
 'feature42': 0.02063912858843175,
 'feature43': 0.242487277290389,
 '

In [None]:
import pandas as pd
from sklearn.linear_model import Lasso
import matplotlib.pyplot as plt    
from fastparquet import ParquetFile

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import precision_score, recall_score, roc_auc_score

from utils import remove_highly_correlated_features, feature_drop

import warnings
warnings.filterwarnings("ignore")


file_path = "Data/train_ai_comp_final_dp.parquet"
pf = ParquetFile(file_path)
df = pf.to_pandas()

fature_to_drop = remove_highly_correlated_features(df, threshold=0.94)
df.drop(columns=fature_to_drop, inplace=True)
df = feature_drop(df)
df = df.fillna(0)

# Artem cat features 
cat_features_tmp = ['feature5','feature6','feature7','feature9','feature74','feature85','feature96','feature106','feature117','feature127','feature138','feature150','feature163','feature174','feature215','feature293','feature343','feature344','feature345','feature346','feature347','feature348','feature349','feature350','feature352','feature356','feature360','feature363','feature364','feature374','feature379','feature381','feature383','feature389','feature393','feature395','feature396','feature397','feature398','feature399','feature400','feature401','feature402','feature403','feature404','feature408','feature409','feature410','feature411','feature412','feature413','feature415','feature416','feature417','feature419','feature421','feature422','feature425','feature427','feature428','feature430','feature431','feature432','feature436','feature437','feature438','feature439','feature441','feature442','feature443','feature444','feature445','feature446','feature447','feature448','feature449','feature450','feature452','feature453','feature454','feature459','feature460','feature463','feature465','feature466','feature472','feature474','feature475','feature476','feature477','feature478','feature479','feature480','feature481','feature483','feature485','feature486','feature488','feature490','feature491','feature492','feature493','feature494','feature497','feature498','feature499','feature500','feature501','feature502','feature504','feature505','feature508','feature509','feature510','feature512','feature513','feature514','feature515','feature516','feature517','feature519','feature520','feature521','feature524','feature525','feature526','feature527','feature528','feature534','feature535','feature538','feature539','feature541','feature542','feature543','feature544','feature547','feature548','feature549','feature550','feature552','feature553','feature554','feature557','feature558','feature559','feature560','feature577','feature580','feature581','feature594','feature596','feature597','feature598','feature599','feature600','feature601','feature602','feature604','feature606','feature608','feature609','feature610','feature611','feature612','feature613','feature616','feature617','feature619','feature620','feature621','feature622','feature631','feature634','feature639','feature644','feature647','feature650','feature654','feature660','feature664','feature666','feature668','feature672','feature673','feature682','feature692','feature695','feature698','feature705','feature712','feature740','feature741','feature743','feature744','feature746','feature747','feature750','feature751','feature753','feature754','feature757','feature758','feature760','feature763','feature766','feature767','feature769','feature770','feature771','feature773','feature776','feature777','feature786','feature787','feature788','feature789','feature790','feature791','feature792','feature794','feature795','feature796','feature797','feature800','feature803','feature804','feature805','feature811','feature812','feature813','feature814','feature815','feature835','feature838','feature842','feature843','feature845','feature851','feature853','feature855','feature857','feature861','feature863','feature864','feature866','feature868','feature870','feature873','feature875','feature877','feature879','feature881','feature884','feature887','feature927','feature928','feature929','feature930','feature931','feature932','feature937','feature991','feature992','feature993','feature994','feature995','feature996','feature997','feature998','feature999','feature1000','feature1003','feature1062','feature1063','feature1064','feature1065','feature1066','feature1068','feature1069','feature1070','feature1071','feature1072','feature1073','feature1074','feature1075','feature1076']
cat_features = [feature for feature in cat_features_tmp if feature in df.columns]

# convert cats to string
for col in cat_features:
    df[col] = df[col].astype(str)


X = df.drop(["target"], axis = 1)
y = df['target']
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

model = CatBoostClassifier(iterations=1000, learning_rate=0.01, depth=8, loss_function='Logloss', cat_features=cat_features)
model.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=50, verbose=100)


# metrics
y_pred_proba = model.predict_proba(X_test)
y_pred = model.predict(X_test)
# accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {classification_report(y_test, y_pred)}')

precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred_proba.ravel()[1::2])

print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'ROC AUC: {roc_auc}')


# sorted feature importance
feature_weights = {feature: weight for feature, weight in zip(X.columns, model.feature_importances_)}
dict(sorted(feature_weights.items(), key=lambda item: item[1], reverse=True))