In [2]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import sys
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import SMOTE, ADASYN
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings("ignore")
import os
from tqdm.auto import tqdm
import numpy as np
import optuna
import seaborn as sns
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
import datetime

In [2]:
from sklearn.model_selection import StratifiedKFold

In [2]:
path = 'train_data/'

In [3]:
target = pd.read_csv('train_target.csv')

In [4]:
#check for na values and '-' values
'''dataset_paths = sorted([os.path.join(path, filename) for filename in os.listdir(path)
                            if filename.startswith('train')], key=len)
for i in range(len(dataset_paths)):
    dfr = pd.read_parquet(dataset_paths[i],columns=cols_to_start)
    print(dataset_paths[i], dfr.isna().sum().sum(), dfr.min().min())
    del dfr'''
    

"dataset_paths = sorted([os.path.join(path, filename) for filename in os.listdir(path)\n                            if filename.startswith('train')], key=len)\nfor i in range(len(dataset_paths)):\n    dfr = pd.read_parquet(dataset_paths[i],columns=cols_to_start)\n    print(dataset_paths[i], dfr.isna().sum().sum(), dfr.min().min())\n    del dfr"

Making tgqm progress bar look transparent

In [5]:
%%html
<style>
.cell-output-ipywidget-background {
    background-color: transparent !important;
}
:root {
    --jp-widgets-color: var(--vscode-editor-foreground);
    --jp-widgets-font-size: var(--vscode-editor-font-size);
}  
</style>

Reading separated datased files one by one to save memory.

In [6]:
def read_parquet_dataset_from_local(path_to_dataset: str, start_from: int = 0,
                                     num_parts_to_read: int = 2, columns=None, verbose=True) -> pd.DataFrame:
    
    df1 = pd.DataFrame()
    dataset_paths = sorted([os.path.join(path_to_dataset, filename) for filename in os.listdir(path_to_dataset)
                              if filename.startswith('train')], key=len)
    start_from = max(0, start_from)
    chunks = dataset_paths[start_from: start_from + num_parts_to_read]
    
    if verbose:
        names = [chunk.split('_')[-1].split('.')[0] for chunk in chunks]
        print(f'Reading chunks No:{names}')
    for chunk_path in tqdm(chunks, desc="Progress"):
        chunk = pd.read_parquet(chunk_path,columns=columns)
        #making feature 'credit amount':
        chunk['rn'] = 1
        #this one is for memory saving
        chunk[chunk.drop('id', axis=1).columns] = chunk[chunk.drop('id', axis=1).columns].astype('Int8')
        chunk['id'] = chunk['id'].astype('Int32')
                      
        df1 = pd.concat([df1, chunk]).reset_index(drop=True)
        print(f'chunk {chunk_path.split('_')[-1].split('.')[0]} appended')  
    
    return df1

The features, that positively effects the model scoring.
Determined within the additional experiment, with deleting features one by one and fitting model on each iteration.

In [7]:
cols = ['id', 'pre_since_opened', 'pre_since_confirmed', 'pre_pterm',
       'pre_till_pclose', 'pre_till_fclose',
       'pre_loans_credit_limit', 'pre_loans_next_pay_summ',
       'pre_loans_total_overdue',
       'pre_loans_max_overdue_sum', 'pre_loans_credit_cost_rate', 'pre_loans5',
       'pre_loans530', 'pre_loans3060', 'pre_loans6090', 'pre_loans90', 'pre_util', 'pre_over2limit',
       'pre_maxover2limit', 'enc_paym_0', 'enc_paym_1', 'enc_paym_2',
       'enc_paym_3', 'enc_paym_4', 'enc_paym_5', 'enc_paym_6', 'enc_paym_7',
       'enc_paym_8', 'enc_paym_9', 'enc_paym_10', 'enc_paym_11', 'enc_paym_12',
       'enc_paym_13', 'enc_paym_14', 'enc_paym_15', 'enc_paym_16',
       'enc_paym_17', 'enc_paym_18', 'enc_paym_19', 'enc_paym_20',
       'enc_paym_21', 'enc_paym_22', 'enc_paym_23', 'enc_paym_24',
       'enc_loans_account_holder_type', 'enc_loans_credit_status',
       'enc_loans_credit_type', 'enc_loans_account_cur','rn', 'pre_fterm', 'pre_loans_outstanding']

In [8]:
df = read_parquet_dataset_from_local(path_to_dataset=path, start_from=0, num_parts_to_read=12, columns=cols)

Reading chunks No:['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11']


Progress:   0%|          | 0/12 [00:00<?, ?it/s]

chunk 0 appended
chunk 1 appended
chunk 2 appended
chunk 3 appended
chunk 4 appended
chunk 5 appended
chunk 6 appended
chunk 7 appended
chunk 8 appended
chunk 9 appended
chunk 10 appended
chunk 11 appended


Encoding features by OHE. By columns, aimed to save memory. 
All except 'rn' are categoriical, due to the previousli made binarisation as it is said in instruction file.
Additionally cheching the file size in each iteration.

In [9]:
length = len(df.drop(['id', 'rn'], axis=1).columns)
for i in df.drop(['id', 'rn'], axis=1):
    ohe = OneHotEncoder(sparse_output=False, dtype=np.int8)      
    new = ohe.fit_transform(df[[i]])
    df[ohe.get_feature_names_out()] = new #Добавляем новыее фичи в датафрейм
    df.drop(i, axis=1, inplace=True) #убрали старые фичи после их кодирования
    length-=1
    print(f'{i} successfully encoded, {length} left.\n Size of df is now {round(sys.getsizeof(df)/1000000000,2)}gb')

pre_since_opened successfully encoded, 48 left.
 Size of df is now 3.22gb
pre_since_confirmed successfully encoded, 47 left.
 Size of df is now 3.64gb
pre_pterm successfully encoded, 46 left.
 Size of df is now 4.06gb
pre_till_pclose successfully encoded, 45 left.
 Size of df is now 4.45gb
pre_till_fclose successfully encoded, 44 left.
 Size of df is now 4.81gb
pre_loans_credit_limit successfully encoded, 43 left.
 Size of df is now 5.28gb
pre_loans_next_pay_summ successfully encoded, 42 left.
 Size of df is now 5.42gb
pre_loans_total_overdue successfully encoded, 41 left.
 Size of df is now 5.42gb
pre_loans_max_overdue_sum successfully encoded, 40 left.
 Size of df is now 5.47gb
pre_loans_credit_cost_rate successfully encoded, 39 left.
 Size of df is now 5.78gb
pre_loans5 successfully encoded, 38 left.
 Size of df is now 6.07gb
pre_loans530 successfully encoded, 37 left.
 Size of df is now 6.54gb
pre_loans3060 successfully encoded, 36 left.
 Size of df is now 6.75gb
pre_loans6090 succ

Grouping dataset rows by id, all feature value.
Memory control on each step.

In [10]:
df = df.groupby('id').sum()
round(sys.getsizeof(df)/1000000000,2)

1.22

Connecting features with the target value colums.
Cleaning memory from unused items.

In [11]:
df = pd.merge(df,target, on='id')
df.drop('id', axis=1, inplace=True)
round(sys.getsizeof(df)/1000000000,2)
del target


In [12]:
df.drop_duplicates(inplace=True)
round(sys.getsizeof(df)/1000000000,2)

1.24

Saving the encoded dataset.

In [13]:
df.to_parquet('Encoded_dataset_0907.parquet', index=False)

In [6]:
df = pd.read_parquet('Encoded_dataset_0907.parquet')
round(sys.getsizeof(df)/1000000000,2)
df.shape

(2964447, 401)

In [7]:
X,y = df.drop('flag', axis=1), df['flag']
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.4, random_state=1, stratify=y)
Xval, Xtest, yval, ytest = train_test_split(Xtest, ytest, test_size=0.5, random_state=1, stratify=ytest)

In [8]:
del df

In [128]:
models_basic_params = [LGBMClassifier(), CatBoostClassifier(), XGBClassifier()]
stat = pd.DataFrame(columns=['model', 'params', 'learning_time', 'train_auc_score', 'test_auc_score','val_auc_score', 'cv_mean_score',	'cv_std'])

In [129]:
for i in tqdm(models_basic_params, desc="Progress"):
    model = i
    start = datetime.datetime.now()
    model.fit(Xtrain, ytrain)
    learning_time = str(datetime.datetime.now() - start)
    predtrain = model.predict_proba(Xtrain)
    predtest = model.predict_proba(Xtest)
    predval = model.predict_proba(Xval)
    score_train = roc_auc_score(ytrain, predtrain[:,1])
    score_test = roc_auc_score(ytest, predtest[:,1])
    score_val = roc_auc_score(yval, predval[:,1])
    

    res = pd.DataFrame({
        'model': [str(type(model).__name__)],
        'params': 'basic',
        'learning_time': [learning_time],
        'train_auc_score': [score_train],
        'test_auc_score': [score_test], 
        'val_auc_score': [score_val], 
        'cv_mean_score': '-',
        'cv_std': '-'})
    
    stat = pd.concat([stat, res]).reset_index(drop=True)

Progress:   0%|          | 0/3 [00:00<?, ?it/s]

[LightGBM] [Info] Number of positive: 63311, number of negative: 1715357
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 2.925133 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 5433
[LightGBM] [Info] Number of data points in the train set: 1778668, number of used features: 383
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035595 -> initscore=-3.299317
[LightGBM] [Info] Start training from score -3.299317
Learning rate set to 0.251608
0:	learn: 0.3910949	total: 560ms	remaining: 9m 18s
1:	learn: 0.2596401	total: 1.36s	remaining: 11m 19s
2:	learn: 0.2024698	total: 2.05s	remaining: 11m 21s
3:	learn: 0.1752776	total: 3.06s	remaining: 12m 43s
4:	learn: 0.1605731	total: 4.34s	remaining: 14m 23s
5:	learn: 0.1540344	total: 5.31s	remaining: 14m 40s
6:	learn: 0.1497395	total: 6.57s	remaining: 15m 31s
7:	learn: 0.1472283	total: 7.75s	remaining: 

In [130]:
stat

Unnamed: 0,model,params,learning_time,train_auc_score,test_auc_score,val_auc_score,cv_mean_score,cv_std
0,LGBMClassifier,basic,0:00:51.981009,0.768016,0.753927,0.752722,-,-
1,CatBoostClassifier,basic,0:17:28.451842,0.792411,0.755332,0.754277,-,-
2,XGBClassifier,basic,0:01:36.324102,0.79552,0.753115,0.750227,-,-


Cheching if there are features with zero importance. Will not delete them for a while, because it doesn't effect the score. Moreover, they can be different for different models.

In [10]:
LGBMC_best_params = {'max_depth': 51, 'n_estimators': 219, 'reg_alpha': 24, 'reg_lambda': 183, 'min_child_weight': 7, 'colsample_bytree': 0.9223492385632168}

In [11]:
CBC_best_params = {'learning_rate': 0.1, 'colsample_bylevel': 0.0966815210492446, 'boosting_type': 'Ordered'}

In [12]:
XGBC_best_params = {'max_depth': 9, 'n_estimators': 197, 'reg_alpha': 60, 'reg_lambda': 88, 'min_child_weight': 20, 'colsample_bytree': 0.37523253349507046}

In [10]:
models_optuned_params = [XGBClassifier(**XGBC_best_params, random_state=42), 
                         CatBoostClassifier(random_state=42, **CBC_best_params),
                         LGBMClassifier(random_state=42, force_col_wise=True, **LGBMC_best_params)]

In [135]:
for i in tqdm(models_optuned_params, desc="Progress"):
    model = i
    start = datetime.datetime.now()
    model.fit(Xtrain, ytrain)
    learning_time = str(datetime.datetime.now() - start)
    predtrain = model.predict_proba(Xtrain)
    predtest = model.predict_proba(Xtest)
    predval = model.predict_proba(Xval)
    score_train = roc_auc_score(ytrain, predtrain[:,1])
    score_test = roc_auc_score(ytest, predtest[:,1])
    score_val = roc_auc_score(yval, predval[:,1])
    
    mod_for_cv = i
    cv = StratifiedKFold(n_splits=5)
    scores = cross_val_score(mod_for_cv, Xtrain, ytrain, cv=cv, scoring = 'roc_auc')
    
    res = pd.DataFrame({
        'model': [str(type(model).__name__)],
        'params': 'optuned',
        'learning_time': [learning_time],
        'train_auc_score': [score_train],
        'test_auc_score': [score_test], 
        'val_auc_score': [score_val], 
        'cv_mean_score': [np.mean(scores)],
        'cv_std': [np.std(scores)]})
    
    stat = pd.concat([stat, res]).reset_index(drop=True)

Progress:   0%|          | 0/3 [00:00<?, ?it/s]

0:	learn: 0.5533570	total: 1.79s	remaining: 29m 51s
1:	learn: 0.4496636	total: 3.7s	remaining: 30m 44s
2:	learn: 0.3740174	total: 5.69s	remaining: 31m 29s
3:	learn: 0.3176939	total: 7.51s	remaining: 31m 10s
4:	learn: 0.2770382	total: 9.63s	remaining: 31m 56s
5:	learn: 0.2463584	total: 11.8s	remaining: 32m 35s
6:	learn: 0.2232416	total: 13.8s	remaining: 32m 42s
7:	learn: 0.2063911	total: 15.9s	remaining: 32m 57s
8:	learn: 0.1933869	total: 18s	remaining: 33m 4s
9:	learn: 0.1829892	total: 20.2s	remaining: 33m 22s
10:	learn: 0.1750881	total: 22.3s	remaining: 33m 20s
11:	learn: 0.1691850	total: 24.5s	remaining: 33m 35s
12:	learn: 0.1643340	total: 26.3s	remaining: 33m 15s
13:	learn: 0.1609154	total: 28.2s	remaining: 33m 7s
14:	learn: 0.1577310	total: 30.3s	remaining: 33m 6s
15:	learn: 0.1556260	total: 32.2s	remaining: 33m 1s
16:	learn: 0.1537153	total: 34.2s	remaining: 32m 59s
17:	learn: 0.1519597	total: 36.5s	remaining: 33m 10s
18:	learn: 0.1508945	total: 39.2s	remaining: 33m 41s
19:	learn:

In [136]:
stat = stat.sort_values(by='test_auc_score', ascending=False).reset_index(drop=True)

In [137]:
stat.to_csv('stat_1407.csv', index=False)

In [138]:
stat

Unnamed: 0,model,params,learning_time,train_auc_score,test_auc_score,val_auc_score,cv_mean_score,cv_std
0,LGBMClassifier,optuned,0:02:10.785736,0.772767,0.758499,0.757047,0.757573,0.00215
1,XGBClassifier,optuned,0:04:15.088610,0.80552,0.758275,0.757756,0.758157,0.002002
2,CatBoostClassifier,optuned,0:33:51.306004,0.777942,0.756921,0.756264,0.755699,0.002276
3,CatBoostClassifier,basic,0:17:28.451842,0.792411,0.755332,0.754277,-,-
4,LGBMClassifier,basic,0:00:51.981009,0.768016,0.753927,0.752722,-,-
5,XGBClassifier,basic,0:01:36.324102,0.79552,0.753115,0.750227,-,-


In [None]:
'''#XGBClassifier with params
mod_XGBClassifier = XGBClassifier(**XGBC_best_params, random_state=42, eval_metric='auc', early_stopping_rounds = 10)
mod_XGBClassifier.fit(Xtrain, ytrain, eval_set=[(Xval, yval)])
predtrain = mod_XGBClassifier.predict_proba(Xtrain)
predtest = mod_XGBClassifier.predict_proba(Xtest)
validate = mod_XGBClassifier.predict_proba(Xval)
print(f'Roc_auc_score with tuned params:\n train: {roc_auc_score(ytrain,predtrain[:,1])}\ntest: {roc_auc_score(ytest,predtest[:,1])}\nvalidate:{roc_auc_score(yval,validate[:,1])}')'''

In [5]:
mod_LGBMC = LGBMClassifier(random_state=42, force_col_wise=True,**LGBMC_best_params)
mod_LGBMC.fit(Xtrain, ytrain)
LGBMC_val_predict = mod_LGBMC.predict_proba(Xval)

mod_XGBC = XGBClassifier(**XGBC_best_params, random_state=42)
mod_XGBC.fit(Xtrain, ytrain)
XGBC_val_predict = mod_XGBC.predict_proba(Xval)

print(f'Basic auc scores un val sample:\nLGBMC - {roc_auc_score(yval,LGBMC_val_predict[:,1])}\nXGBC - {roc_auc_score(yval,XGBC_val_predict[:,1])}')

NameError: name 'LGBMC_best_params' is not defined

In [144]:
LGBMC_test_predict = mod_LGBMC.predict_proba(Xtest)
XGBC_test_predict = mod_XGBC.predict_proba(Xtest)
print(f'Basic auc scores un test sample:\nLGBMC - {roc_auc_score(ytest,LGBMC_test_predict[:,1])}\nXGBC - {roc_auc_score(ytest,XGBC_test_predict[:,1])}')

Basic auc scores un test sample:
LGBMC - 0.7584991662925701
XGBC - 0.7582746830868854


In [143]:
#LEts try to ensemble them
auc_top_n = -1
alpha_n = -1
for a in np.arange(0.01,1,0.01):
    ensemble_predict = a*LGBMC_val_predict[:,1] + (1-a)*XGBC_val_predict[:,1]
    auc = roc_auc_score(yval, ensemble_predict)
    if auc > auc_top_n:
        auc_top_n = auc
        alpha_n = a
print(f'Best ROC_AUC score on val sample = {auc_top_n} при альфа = {alpha_n}')  

Best ROC_AUC score on val sample = 0.7600147867597477 при альфа = 0.48000000000000004


In [146]:
a=0.48
ensemble_predict = a*LGBMC_test_predict[:,1] + (1-a)*XGBC_test_predict[:,1]
auc = roc_auc_score(ytest, ensemble_predict)
print(f'Best ROC_AUC score on test sample = {auc} при альфа = {0.48}')  

Best ROC_AUC score on test sample = 0.760914639546205 при альфа = 0.48



ДАЛЕЕ ПРОБУЕМ STACKING CLASSIFIER
----------------------------------------------------------------------------

In [11]:
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import StackingClassifier

In [17]:
from sklearn.linear_model import LogisticRegression

In [None]:
models_optuned_params = [XGBClassifier(**XGBC_best_params, random_state=42), 
                         CatBoostClassifier(random_state=42, **CBC_best_params),
                         LGBMClassifier(random_state=42, force_col_wise=True, **LGBMC_best_params)]

In [13]:
estimators = [
    ('XGBC', XGBClassifier(**XGBC_best_params, random_state=42)),
    ('LGBMC', LGBMClassifier(random_state=42, force_col_wise=True, **LGBMC_best_params))
]

In [14]:
clf = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)
clf.fit(Xtrain, ytrain)

[LightGBM] [Info] Number of positive: 63311, number of negative: 1715357
[LightGBM] [Info] Total Bins 5381
[LightGBM] [Info] Number of data points in the train set: 1778668, number of used features: 385
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035595 -> initscore=-3.299317
[LightGBM] [Info] Start training from score -3.299317
[LightGBM] [Info] Number of positive: 50649, number of negative: 1372285
[LightGBM] [Info] Total Bins 5385
[LightGBM] [Info] Number of data points in the train set: 1422934, number of used features: 384
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035595 -> initscore=-3.299313
[LightGBM] [Info] Start training from score -3.299313
[LightGBM] [Info] Number of positive: 50649, number of negative: 1372285
[LightGBM] [Info] Total Bins 5398
[LightGBM] [Info] Number of data points in the train set: 1422934, number of used features: 386
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035595 -> initscore=-3.299313
[LightGBM] [Info] Start training from score

In [17]:
clf_test_predict = clf.predict_proba(Xtest)
clf_val_predict = clf.predict_proba(Xval)
auc_test = roc_auc_score(ytest, clf_test_predict[:,1])
auc_val = roc_auc_score(yval, clf_val_predict[:,1])

In [23]:
print(f'Stacking ROC_AUC score on val sample = {auc_val}\
      \nStacking ROC_AUC score on test sample = {auc_test}')  

Stacking ROC_AUC score on val sample = 0.760013689796613      
Stacking ROC_AUC score on test sample = 0.7609203782685181


А ТЕПЕРЬ НА 3 МОДЕЛЯХ

In [24]:
estimators2 = [
    ('XGBC', XGBClassifier(**XGBC_best_params, random_state=42)),
    ('LGBMC', LGBMClassifier(random_state=42, force_col_wise=True, **LGBMC_best_params)),
    ('CBC', CatBoostClassifier(random_state=42, **CBC_best_params))
]

In [26]:
clf2 = StackingClassifier(
    estimators=estimators2, final_estimator=LogisticRegression()
)
clf2.fit(Xtrain, ytrain)

[LightGBM] [Info] Number of positive: 63311, number of negative: 1715357
[LightGBM] [Info] Total Bins 5381
[LightGBM] [Info] Number of data points in the train set: 1778668, number of used features: 385
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.035595 -> initscore=-3.299317
[LightGBM] [Info] Start training from score -3.299317
0:	learn: 0.5533570	total: 1.63s	remaining: 27m 4s
1:	learn: 0.4496636	total: 3.2s	remaining: 26m 37s
2:	learn: 0.3740174	total: 4.87s	remaining: 26m 58s
3:	learn: 0.3176939	total: 6.49s	remaining: 26m 57s
4:	learn: 0.2770382	total: 8.03s	remaining: 26m 38s
5:	learn: 0.2463584	total: 9.63s	remaining: 26m 36s
6:	learn: 0.2232416	total: 11.2s	remaining: 26m 26s
7:	learn: 0.2063911	total: 12.8s	remaining: 26m 26s
8:	learn: 0.1933869	total: 14.4s	remaining: 26m 26s
9:	learn: 0.1829892	total: 16s	remaining: 26m 25s
10:	learn: 0.1750881	total: 17.5s	remaining: 26m 14s
11:	learn: 0.1691850	total: 19.2s	remaining: 26m 22s
12:	learn: 0.1643340	total: 20.9s	remaini

In [27]:
clf_test_predict2 = clf2.predict_proba(Xtest)
clf_val_predict2 = clf2.predict_proba(Xval)
auc_test2 = roc_auc_score(ytest, clf_test_predict2[:,1])
auc_val2 = roc_auc_score(yval, clf_val_predict2[:,1])
print(f'Stacking ROC_AUC score on val sample = {auc_val2}\
      \nStacking ROC_AUC score on test sample = {auc_test2}')  

Stacking ROC_AUC score on val sample = 0.7604986650131851      
Stacking ROC_AUC score on test sample = 0.7613360578695326


In [28]:
os.system(f'telegram-send "Stacking ROC_AUC score on test sample = {auc_test2}\nStacking ROC_AUC score on val sample = {auc_val2}"')

0


ТУТ ПОДБОР ГИПЕРПАРАМЕТРОВ ОТДЕЛЬНЫХ МОДЕЛЕЙ С OPTUNA
----------------------------------------------------------------------------

In [None]:
def objective_LGBMClassifier(trial):
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 100),
        'n_estimators': trial.suggest_int('n_estimators', 5, 250),
        'reg_alpha': trial.suggest_int('reg_alpha', 10, 70),
        'reg_lambda': trial.suggest_int('reg_lambda', 20, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
    }
    
    model = LGBMClassifier(random_state=42, force_col_wise=True,
                          **params)
    model.fit(Xtrain, ytrain)
    preds = model.predict_proba(Xtest)

    return roc_auc_score(ytest,preds[:,1])

study_LGBMClassifier = optuna.create_study(direction='maximize', pruner='MedianPruner')
study_LGBMClassifier.optimize(objective_LGBMClassifier, n_trials=50)
print('Best one:', study_LGBMClassifier.best_trial.params, study_LGBMClassifier.best_value)
os.system(f'telegram-send "study_LGBMClassifier best score:{study_LGBMClassifier.best_value}"')

LGBMClassifier best: {'max_depth': 51, 'n_estimators': 219, 'reg_alpha': 24, 'reg_lambda': 183, 'min_child_weight': 7, 'colsample_bytree': 0.9223492385632168} 0.7584991662925701

In [None]:
optuna.visualization.plot_param_importances(study_LGBMClassifier)

In [8]:
def objective_XGBClassifier(trial):
    
    params = {
        'max_depth': trial.suggest_int('max_depth', 5, 100),
        'n_estimators': trial.suggest_int('n_estimators', 5, 250),
        'reg_alpha': trial.suggest_int('reg_alpha', 10, 70),
        'reg_lambda': trial.suggest_int('reg_lambda', 20, 200),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 20),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.1, 1.0),
    }
    
    model = XGBClassifier(random_state=42,
                          **params)
    model.fit(Xtrain, ytrain)
    preds = model.predict_proba(Xtest)

    return roc_auc_score(ytest,preds[:,1])

study_XGBClassifier = optuna.create_study(direction='maximize', pruner='MedianPruner')
study_XGBClassifier.optimize(objective_XGBClassifier, n_trials=50)
print('Best one:', study_XGBClassifier.best_trial.params, study_XGBClassifier.best_value)
os.system(f'telegram-send "study_XGBClassifier best score:{study_XGBClassifier.best_value}"')

[I 2024-07-12 09:52:55,066] A new study created in memory with name: no-name-6ba725f3-08cb-4245-b73b-09de81570db1
[I 2024-07-12 09:53:55,035] Trial 0 finished with value: 0.7555577220965439 and parameters: {'max_depth': 31, 'n_estimators': 42, 'reg_alpha': 63, 'reg_lambda': 119, 'min_child_weight': 15, 'colsample_bytree': 0.3033590310538479}. Best is trial 0 with value: 0.7555577220965439.
[I 2024-07-12 09:57:41,556] Trial 1 finished with value: 0.7582746830868854 and parameters: {'max_depth': 9, 'n_estimators': 197, 'reg_alpha': 60, 'reg_lambda': 88, 'min_child_weight': 20, 'colsample_bytree': 0.37523253349507046}. Best is trial 1 with value: 0.7582746830868854.
[I 2024-07-12 09:58:46,773] Trial 2 finished with value: 0.7516535937038156 and parameters: {'max_depth': 49, 'n_estimators': 23, 'reg_alpha': 64, 'reg_lambda': 72, 'min_child_weight': 4, 'colsample_bytree': 0.921563984008409}. Best is trial 1 with value: 0.7582746830868854.
[I 2024-07-12 10:02:11,386] Trial 3 finished with va

Best one: {'max_depth': 9, 'n_estimators': 197, 'reg_alpha': 60, 'reg_lambda': 88, 'min_child_weight': 20, 'colsample_bytree': 0.37523253349507046} 0.7582746830868854


0

In [32]:
second_iter_opt = study.best_trial.params
second_iter_opt

{'max_depth': 31,
 'n_estimators': 154,
 'reg_alpha': 63,
 'reg_lambda': 189,
 'min_child_weight': 2,
 'colsample_bytree': 0.25727950771502256}

{'max_depth': 31,
 'n_estimators': 154,
 'reg_alpha': 63,
 'reg_lambda': 189,
 'min_child_weight': 2,
 'colsample_bytree': 0.25727950771502256}

Roc_auc_score with tuned params:
 train: 0.8182570023564028
test: 0.7547398270802788

In [98]:
#XGBOOST with params
mod_xgb = XGBClassifier(**second_iter_opt, random_state = 42)
mod_xgb.fit(Xtrain, ytrain)
predtrain = mod_xgb.predict_proba(Xtrain)
predtest = mod_xgb.predict_proba(Xtest)
validate = mod_xgb.predict_proba(Xval)
print(f'Roc_auc_score with tuned params:\n train: {roc_auc_score(ytrain,predtrain[:,1])}\ntest: {roc_auc_score(ytest,predtest[:,1])}\nvalidate: {roc_auc_score(yval,validate[:,1])}')

Roc_auc_score with tuned params:
 train: 0.8182570023564028
test: 0.7547398270802788
validate: 0.7552807972596255


Roc_auc_score with tuned params:
 train: 0.8182570023564028
test: 0.7547398270802788
validate: 0.7552807972596255


In [124]:
import dill

In [127]:
'''with open('XGBC_075auc.pkl', 'wb') as file:
    dill.dump({
    'model': mod_xgb,
    'metadata': {
    'name': 'Prediclion with XGBClassifier',
    'ver': 1,
    'author': 'pelmen',
    'type': 'XGBClassifier',
    'stat': 'roc_auc=0.75'
        }
    }, file)'''

In [92]:
from sklearn.model_selection import StratifiedKFold

In [93]:
from sklearn.model_selection import StratifiedKFold
mod_for_cv = XGBClassifier(**second_iter_opt, random_state = 42)
cv = StratifiedKFold(n_splits=5)
scores = cross_val_score(mod_for_cv, Xtrain, ytrain, cv=cv, scoring = 'roc_auc')
print(f'Mean ROC_AOC score after CV:{np.mean(scores)}')

Mean ROC_AOC score after CV:0.752577127488072


In [95]:
np.std(scores)

0.001611085564567733

In [96]:
from catboost import CatBoostClassifier

In [122]:
#CatBooost
CBC = CatBoostClassifier(iterations=1000, eval_metric='AUC',use_best_model=True,random_seed=42)
CBC.fit(Xtrain, ytrain, eval_set=[(Xtest, ytest)])

Learning rate set to 0.201819
0:	test: 0.5017068	best: 0.5017068 (0)	total: 479ms	remaining: 7m 58s
1:	test: 0.5405583	best: 0.5405583 (1)	total: 961ms	remaining: 7m 59s
2:	test: 0.5733628	best: 0.5733628 (2)	total: 1.87s	remaining: 10m 21s
3:	test: 0.6105319	best: 0.6105319 (3)	total: 3.11s	remaining: 12m 54s
4:	test: 0.6666042	best: 0.6666042 (4)	total: 4.45s	remaining: 14m 45s
5:	test: 0.6807469	best: 0.6807469 (5)	total: 5.43s	remaining: 14m 59s
6:	test: 0.6859012	best: 0.6859012 (6)	total: 6.46s	remaining: 15m 16s
7:	test: 0.6940623	best: 0.6940623 (7)	total: 7.61s	remaining: 15m 43s
8:	test: 0.7008880	best: 0.7008880 (8)	total: 8.96s	remaining: 16m 26s
9:	test: 0.7047223	best: 0.7047223 (9)	total: 10.3s	remaining: 17m 3s
10:	test: 0.7089712	best: 0.7089712 (10)	total: 11.3s	remaining: 16m 59s
11:	test: 0.7133956	best: 0.7133956 (11)	total: 12.7s	remaining: 17m 26s
12:	test: 0.7145378	best: 0.7145378 (12)	total: 14.1s	remaining: 17m 47s
13:	test: 0.7159341	best: 0.7159341 (13)	tot

<catboost.core.CatBoostClassifier at 0x1ab004b2540>

bestTest = 0.7537068693
bestIteration = 836

0.7533120413652735

In [20]:
def objective_CBC(trial):
    
    params = {
        "learning_rate": trial.suggest_categorical("learning_rate", [0.001,0.01,0.1,0.5]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    }
    
    model = CatBoostClassifier(eval_metric='AUC', **params, iterations=700, early_stopping_rounds=10)
    model.fit(Xtrain, ytrain, eval_set=[(Xtest, ytest)])
    preds = model.predict_proba(Xtest)

    return roc_auc_score(ytest,preds[:,1])

study_CBC = optuna.create_study(direction="maximize")
study_CBC.optimize(objective_CBC, n_trials=50, timeout=600)

[I 2024-07-12 15:00:55,793] A new study created in memory with name: no-name-dfe44fdb-7b68-4626-b739-136bdbe7654e


0:	test: 0.5409708	best: 0.5409708 (0)	total: 324ms	remaining: 3m 46s
1:	test: 0.5656870	best: 0.5656870 (1)	total: 668ms	remaining: 3m 53s
2:	test: 0.5762302	best: 0.5762302 (2)	total: 1.01s	remaining: 3m 53s
3:	test: 0.5863431	best: 0.5863431 (3)	total: 1.35s	remaining: 3m 54s
4:	test: 0.5995680	best: 0.5995680 (4)	total: 1.69s	remaining: 3m 55s
5:	test: 0.5998654	best: 0.5998654 (5)	total: 2.21s	remaining: 4m 15s
6:	test: 0.6323744	best: 0.6323744 (6)	total: 2.74s	remaining: 4m 31s
7:	test: 0.6549313	best: 0.6549313 (7)	total: 3.29s	remaining: 4m 44s
8:	test: 0.6597412	best: 0.6597412 (8)	total: 3.83s	remaining: 4m 53s
9:	test: 0.6766054	best: 0.6766054 (9)	total: 4.4s	remaining: 5m 3s
10:	test: 0.6766705	best: 0.6766705 (10)	total: 4.92s	remaining: 5m 8s
11:	test: 0.6916596	best: 0.6916596 (11)	total: 5.46s	remaining: 5m 13s
12:	test: 0.6961233	best: 0.6961233 (12)	total: 6.02s	remaining: 5m 17s
13:	test: 0.7039483	best: 0.7039483 (13)	total: 6.56s	remaining: 5m 21s
14:	test: 0.705

[I 2024-07-12 15:07:25,811] Trial 0 finished with value: 0.7553258208739864 and parameters: {'learning_rate': 0.1, 'colsample_bylevel': 0.09156306866729402, 'boosting_type': 'Plain'}. Best is trial 0 with value: 0.7553258208739864.


0:	test: 0.5425719	best: 0.5425719 (0)	total: 537ms	remaining: 6m 15s
1:	test: 0.5585611	best: 0.5585611 (1)	total: 1.16s	remaining: 6m 46s
2:	test: 0.5592448	best: 0.5592448 (2)	total: 1.77s	remaining: 6m 51s
3:	test: 0.5634678	best: 0.5634678 (3)	total: 2.49s	remaining: 7m 12s
4:	test: 0.5663501	best: 0.5663501 (4)	total: 3.12s	remaining: 7m 13s
5:	test: 0.5664967	best: 0.5664967 (5)	total: 3.78s	remaining: 7m 17s
6:	test: 0.5725723	best: 0.5725723 (6)	total: 4.34s	remaining: 7m 10s
7:	test: 0.5732403	best: 0.5732403 (7)	total: 4.96s	remaining: 7m 8s
8:	test: 0.5782178	best: 0.5782178 (8)	total: 5.55s	remaining: 7m 6s
9:	test: 0.5794122	best: 0.5794122 (9)	total: 6.17s	remaining: 7m 5s
10:	test: 0.5889919	best: 0.5889919 (10)	total: 6.77s	remaining: 7m 4s
11:	test: 0.5890170	best: 0.5890170 (11)	total: 7.33s	remaining: 7m
12:	test: 0.5891747	best: 0.5891747 (12)	total: 7.96s	remaining: 7m
13:	test: 0.5941388	best: 0.5941388 (13)	total: 8.57s	remaining: 7m
14:	test: 0.5973214	best: 0.

[I 2024-07-12 15:08:19,895] Trial 1 finished with value: 0.6511914386397026 and parameters: {'learning_rate': 0.001, 'colsample_bylevel': 0.09725125242456334, 'boosting_type': 'Plain'}. Best is trial 0 with value: 0.7553258208739864.


0:	test: 0.5401635	best: 0.5401635 (0)	total: 1.73s	remaining: 20m 7s
1:	test: 0.5482782	best: 0.5482782 (1)	total: 3.7s	remaining: 21m 30s
2:	test: 0.5829567	best: 0.5829567 (2)	total: 5.57s	remaining: 21m 32s
3:	test: 0.6015653	best: 0.6015653 (3)	total: 7.53s	remaining: 21m 50s
4:	test: 0.6052514	best: 0.6052514 (4)	total: 9.57s	remaining: 22m 10s
5:	test: 0.6203285	best: 0.6203285 (5)	total: 11.6s	remaining: 22m 16s
6:	test: 0.6314380	best: 0.6314380 (6)	total: 13.4s	remaining: 22m 9s
7:	test: 0.6377656	best: 0.6377656 (7)	total: 15.4s	remaining: 22m 14s
8:	test: 0.6660550	best: 0.6660550 (8)	total: 17.4s	remaining: 22m 19s
9:	test: 0.6743389	best: 0.6743389 (9)	total: 19.3s	remaining: 22m 15s
10:	test: 0.6831330	best: 0.6831330 (10)	total: 21.2s	remaining: 22m 10s
11:	test: 0.6861317	best: 0.6861317 (11)	total: 23.3s	remaining: 22m 13s
12:	test: 0.6853999	best: 0.6861317 (11)	total: 25.2s	remaining: 22m 13s
13:	test: 0.6870620	best: 0.6870620 (13)	total: 27.4s	remaining: 22m 23s
1

[I 2024-07-12 15:28:39,674] Trial 2 finished with value: 0.7554056127240402 and parameters: {'learning_rate': 0.1, 'colsample_bylevel': 0.0966815210492446, 'boosting_type': 'Ordered'}. Best is trial 2 with value: 0.7554056127240402.


In [21]:
print('Best one:', study_CBC.best_trial.params, study_CBC.best_value)
os.system(f'telegram-send "study_CBC best score:{study_CBC.best_value}"')

Best one: {'learning_rate': 0.1, 'colsample_bylevel': 0.0966815210492446, 'boosting_type': 'Ordered'} 0.7554056127240402


0

In [14]:
study_CBC.best_trial.params

{'objective': 'CrossEntropy',
 'colsample_bylevel': 0.07962812891530555,
 'depth': 8,
 'boosting_type': 'Plain'}

In [None]:
study_CBC.optimize(objective_CBC, n_trials=50, timeout=600)
print('Best one:', study_CBC.best_trial.params, study_CBC.best_value)
os.system(f'telegram-send "study_CBC best score:{study_CBC.best_value}"')