In [1]:
import os
import sys


#Visualization
from matplotlib import pyplot as plt
from tqdm import tqdm

#Generic
import numpy as np
import pandas as pd
import polars as pl
import time
import scipy.stats as stats
import scipy
import math

#Models
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from catboost import CatBoostClassifier, Pool

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor

from fastai import *
from fastai.tabular.all import *

#Feature Importance
from sklearn.inspection import permutation_importance

#Pipeline
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import cross_val_score,cross_validate
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, ShuffleSplit, GroupKFold, StratifiedKFold, KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif, GenericUnivariateSelect
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin

#Scoring
from sklearn.metrics import mean_absolute_error, roc_auc_score
from sklearn.metrics import mean_squared_error

import pickle

import optuna

# Run-mode switches read by the cells below.
# is_submission=False makes the data-loading and training cells execute.
is_submission = False
# In the visible cells these two flags are only consulted by the data-loading
# guards; presumably they toggled Optuna searches -- TODO confirm.
OPTUNING = False
OPTUNING_CAT = False

# Base path of previously saved artefacts; in the visible cells it is only
# referenced from commented-out loading code in the submission cell.
loadingpath = '/kaggle/input/catboost_v1/other/default/1/CatModellingInts'

In [2]:
def savedumping(variables_names, basename, savepath='/kaggle/working/'):
    """Pickle several objects, one file per object.

    Parameters
    ----------
    variables_names : iterable of sequences
        Each entry holds the object to pickle at position 0 and the file-name
        suffix at position 1.
    basename : str
        Common file-name prefix.
    savepath : str
        Directory prefix. It is concatenated verbatim with ``basename`` and
        the suffix, so it must already end with a path separator.

    Raises
    ------
    Whatever ``open`` or ``pickle.dump`` raise; the file is closed either way.
    """
    for entry in variables_names:
        # The context manager guarantees the handle is released even when
        # pickling fails part-way through.
        with open(savepath + basename + entry[1], 'wb') as f:
            pickle.dump(entry[0], f)

In [3]:
class Loan_grade_enc(BaseEstimator, TransformerMixin):
    """Stateless transformer that turns loan-grade letters into integers.

    'A' through 'G' become 1 through 7; every other value becomes 0.
    """

    # Letter -> ordinal lookup shared by all instances.
    _GRADE_RANKS = dict(zip('ABCDEFG', range(1, 8)))

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` so the object can be chained."""
        return self

    def transform(self,X,y=None):
        """Map every element of ``X`` through the grade lookup (unknown -> 0)."""
        ranks = self._GRADE_RANKS

        def encode(grade):
            return ranks.get(grade, 0)

        return X.map(encode)

    def get_feature_names_out(self,name):
        """Output names equal the input names; ``name`` is returned unchanged."""
        return name

In [4]:
def find_types(targ_col):
    """Return the narrowest type label that represents a column losslessly.

    The result is 'category' for categorical columns, otherwise the first of
    'bool', 'int' whose cast leaves every value unchanged, else 'float'.
    """
    if targ_col.dtype == 'category':
        return 'category'
    # Try the narrowest cast first; a zero round-trip difference on every
    # element means the cast loses nothing.
    for caster, label in ((bool, 'bool'), (int, 'int')):
        round_trip_error = targ_col.astype(caster) - targ_col
        if (round_trip_error == 0).all():
            return label
    return 'float'


In [5]:
class Data_processor:
    """Read the raw CSV files, impute two columns and build the model inputs.

    Attributes set by the constructor:
      y_targ       -- the ``y_col`` column of the combined raw frame
      transformer  -- the fitted ``Data_transformer``
      X_processed  -- the frame returned by ``transformer.fit_train_set``
    """
    def __init__(self, paths, y_col = 'loan_status', all_cat = False):
        frames = {path: pd.read_csv(path) for path in paths}
        # Concatenating a dict yields a (path, row) MultiIndex; replace it with
        # a plain 0..n-1 index.
        rawin = pd.concat(frames, axis=0)
        rawin.index = range(0, rawin.shape[0])
        # Fixed fill values for the two imputed columns.
        for column, filler in (('person_emp_length', 123), ('loan_int_rate', 0.01)):
            rawin[column] = rawin[column].fillna(filler)
        self.transformer = Data_transformer(all_cat)
        self.y_targ = rawin[y_col]
        self.X_processed = self.transformer.fit_train_set(rawin.drop(columns=[y_col]))
        
class Data_transformer:
    """Encode the raw loan columns, add engineered features and cast dtypes.

    ``fit_train_set`` learns the encoders and the per-column type labels on the
    training frame; ``test_transform`` replays the same steps on new data.
    """

    def __init__(self, all_cat):
        # When True, apply_types stores 'float' columns as integers (value * 100).
        self.all_cat = all_cat

    def fit_train_set(self,rawin):
        """Fit the encoders on ``rawin`` and return the fully processed frame.

        Side effects: sets ``gen_enc`` (the fitted ColumnTransformer),
        ``groupbymeans`` (per-grade means of five numeric columns) and
        ``intypes`` (column name -> label produced by ``find_types``).
        """
        encoder_steps = [
            ('oneh', OneHotEncoder(handle_unknown='ignore'), ['person_home_ownership','loan_intent']),
            ('ordi', OrdinalEncoder(handle_unknown='error'), ['cb_person_default_on_file']),
            ('lngr', Loan_grade_enc(), ['loan_grade']),
        ]
        self.gen_enc = ColumnTransformer(encoder_steps, remainder='passthrough')
        self.gen_enc.fit(rawin)

        encoded = self.totalTransform(rawin)
        averaged_columns = [
            'remainder__person_income',
            'remainder__loan_amnt',
            'remainder__loan_int_rate',
            'remainder__loan_percent_income',
            'remainder__cb_person_cred_hist_length',
        ]
        self.groupbymeans = encoded.groupby('lngr__loan_grade')[averaged_columns].mean()

        engineered = self.newfeat(encoded)
        self.intypes = engineered.apply(find_types)
        return self.apply_types(engineered)

    def totalTransform(self,X_raw):
        """Run the fitted encoders and return a named frame without the id column."""
        encoded_values = self.gen_enc.transform(X_raw)
        X_targ = pd.DataFrame(encoded_values, index=X_raw.index)
        X_targ.columns = self.gen_enc.get_feature_names_out(X_raw.columns)
        return X_targ.drop(columns='remainder__id')

    def newfeat(self,X_targ):
        """Append five engineered columns to ``X_targ`` in place and return it."""
        age = X_targ['remainder__person_age']
        history = X_targ['remainder__cb_person_cred_hist_length']
        income = X_targ['remainder__person_income']
        amount = X_targ['remainder__loan_amnt']
        rate = X_targ['remainder__loan_int_rate']
        pct_income = X_targ['remainder__loan_percent_income']

        # Operand order is kept exactly as-is so floating-point results match.
        X_targ['age_min_cred_length'] = age - history
        X_targ['payment_per_income'] = income / amount / rate
        X_targ['loan_income_check'] = amount / income - pct_income
        X_targ['rem_life'] = (90 - age) / amount * income / 100
        X_targ['rate_ratio_diff'] = rate - pct_income
        return X_targ

    def apply_types(self,X_targ):
        """Cast each column listed in ``self.intypes`` to its recorded type.

        With ``all_cat`` set, 'float' columns are multiplied by 100 and stored
        as integers instead of being kept as floats.
        """
        for column, label in self.intypes.items():
            if self.all_cat and label == 'float':
                X_targ[column] = (X_targ[column]*100).astype('int')
            else:
                X_targ[column] = X_targ[column].astype(label)
        return X_targ

    def test_transform(self,X_test):
        """Apply the already-fitted encode -> feature -> dtype steps to new data."""
        encoded = self.totalTransform(X_test)
        engineered = self.newfeat(encoded)
        return self.apply_types(engineered)

In [6]:
# Read both CSV files and fit the preprocessing (default all_cat=False).
# Skipped only when is_submission is True and both OPTUNING flags are False;
# in that case `testproc` is never defined for the later cells.
if not is_submission or OPTUNING_CAT or OPTUNING:
    testproc = Data_processor(['/kaggle/input/playground-series-s4e10/train.csv','/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv'])

In [7]:
# Same inputs as `testproc`, but with all_cat=True so float columns are stored
# as integers (value * 100) by Data_transformer.apply_types.
# NOTE(review): `testproc_cat` is not used by any of the visible cells -- confirm it is still needed.
if not is_submission or OPTUNING_CAT or OPTUNING:
    testproc_cat = Data_processor(['/kaggle/input/playground-series-s4e10/train.csv','/kaggle/input/loan-approval-prediction/credit_risk_dataset.csv'], all_cat = True)

In [8]:
class timer():
    """Minimal stopwatch: ``tic`` stores a start time, ``toc`` prints the time since it."""

    def __init__(self):
        # Start counting from construction so toc() works without a prior tic().
        self.t=time.time()

    def tic(self):
        """Restart the stopwatch."""
        self.t=time.time()

    def toc(self):
        """Print the elapsed hours, minutes and seconds since the last start."""
        elapsed = time.time() - self.t
        whole_hours = np.floor(elapsed / 3600)
        whole_minutes = np.floor(elapsed / 60) % 60
        leftover_seconds = elapsed % 60
        print(f"time elapsed hours: {whole_hours!s} minutes: {whole_minutes!s} seconds: {leftover_seconds!s}")

# Module-level stopwatch shared by the feature-importance helpers below.
base_timer=timer()
# Smoke test: start and immediately stop the timer (prints a near-zero duration).
base_timer.tic()
base_timer.toc()
# In submission mode restart the clock so a later toc() measures from here.
if is_submission:
    base_timer.tic()

time elapsed hours: 0.0 minutes: 0.0 seconds: 0.0004048347473144531


In [9]:
# Collects the fitted models from every training cell; the submission cell
# passes this list to infer_catboost to average their predictions.
aggr_model = []

In [10]:
def feat_importance_func(model,X,y,**kwargs):
    """Permutation importance of ``model`` on ``(X, y)`` as a tidy frame.

    Parameters
    ----------
    model : fitted estimator handed to ``permutation_importance``.
    X : DataFrame whose columns label the rows of the result.
    y : target values matching ``X``.
    **kwargs : forwarded to ``permutation_importance``. ``n_repeats=10``,
        ``random_state=42`` and ``n_jobs=2`` are used for any that are
        omitted. (Previously these keywords were accepted but ignored.)

    Returns
    -------
    DataFrame indexed by ``X.columns`` with columns 'mean' and 'std'.
    """
    print('feat_importance_func running')
    # Historical hard-coded settings act as defaults; callers may override.
    perm_args = {'n_repeats': 10, 'random_state': 42, 'n_jobs': 2}
    perm_args.update(kwargs)
    base_timer.tic()
    result = permutation_importance(model, X, y, **perm_args)
    base_timer.toc()
    return pd.DataFrame(
        np.transpose([result.importances_mean, result.importances_std]),
        index=X.columns,
        columns=['mean', 'std'],
    )

def feat_importance_fast_func(model,X,y,**kwargs):
    """Wrap the model's built-in ``feature_importances_`` in the same layout
    that ``feat_importance_func`` returns.

    ``y`` and ``**kwargs`` are accepted only for signature compatibility and
    are not used.

    Returns
    -------
    DataFrame indexed by ``X.columns`` with column 'mean' holding
    ``model.feature_importances_`` and column 'std' set to 0.
    """
    # The message now names this function; it previously reported the slower
    # permutation-based helper, which made the logs misleading.
    print('feat_importance_fast_func running')
    base_timer.tic()
    result = pd.DataFrame(model.feature_importances_, index=X.columns)
    result.columns = ['mean']
    # No spread is available from a single importance vector.
    result['std'] = 0
    base_timer.toc()
    return result


In [11]:
def feat_imp_plotter(feat_imps):
    """Aggregate per-fold importances, draw a bar chart and print the rankings.

    ``feat_imps`` has one row per feature; columns whose name contains 'mean'
    or 'std' are treated as per-fold means and standard deviations.
    """
    mean_cols = [name for name in feat_imps.columns if 'mean' in name]
    std_cols = [name for name in feat_imps.columns if 'std' in name]

    # Means are summed across folds; stds are combined in quadrature.
    summed_mean = feat_imps[mean_cols].sum(axis=1)
    combined_std = feat_imps[std_cols].map(lambda x:x**2).sum(axis=1).map(lambda x:x**0.5)
    feat_imps_agg = pd.concat([summed_mean, combined_std], axis=1)
    feat_imps_agg.columns = ['mean','std']
    # Fold-to-fold spread relative to the summed mean (epsilon avoids 0-division).
    fold_spread = feat_imps[mean_cols].std(axis=1)
    feat_imps_agg['std_cvtocv'] = fold_spread/(feat_imps_agg['mean']+1e-9)

    feat_imps_agg = feat_imps_agg.sort_values(by='mean', ascending=False)
    ax = feat_imps_agg['mean'].plot(kind="bar", yerr=feat_imps_agg['std'])
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    ax.figure.tight_layout()
    plt.show()

    # Ranking by aggregated importance: names, values, then error bars.
    for values in (feat_imps_agg.index, feat_imps_agg['mean'], feat_imps_agg['std']):
        print(values.to_numpy().tolist())

    # Ranking by relative fold-to-fold variability.
    feat_imps_agg = feat_imps_agg.sort_values(by='std_cvtocv', ascending=False)
    for values in (feat_imps_agg.index, feat_imps_agg['std_cvtocv']):
        print(values.to_numpy().tolist())
    
def feat_imp_plotter_ratios(feat_imps,mask):
    """Plot and print a log-ratio score contrasting two groups of folds.

    Parameters
    ----------
    feat_imps : DataFrame with one row per feature; columns containing 'mean'
        or 'std' hold per-fold importances and their standard deviations.
    mask : mapping indexed by the 'mean' column names. A value of 1 puts that
        column in the denominator of the score, anything else in the
        numerator. ``mask.shape[0]`` is used as the exponent of the summed
        importance.
    """
    col_mean = [col for col in feat_imps.columns if 'mean' in col]
    col_std = [col for col in feat_imps.columns if 'std' in col]

    feat_imps_agg = pd.concat([feat_imps[col_mean].sum(axis=1),feat_imps[col_std].map(lambda x:x**2).sum(axis=1).map(lambda x:x**0.5)],axis=1)
    feat_imps_agg.columns = ['mean','std']
    # Start from the summed importance raised to the number of mask entries.
    # (An empty placeholder Series used to be allocated here and was
    # overwritten immediately; it has been removed.)
    change_imps_score = feat_imps_agg['mean']**mask.shape[0]
    for col in col_mean:
        # The 0.0001 offset keeps the ratio finite for zero importances.
        if mask[col]==1:
            change_imps_score = change_imps_score /(feat_imps[col]+0.0001)
        else:
            change_imps_score = change_imps_score *(feat_imps[col]+0.0001)
    change_imps_score = np.log(change_imps_score)
    change_imps_score = change_imps_score.sort_values(ascending=False)
    ax=change_imps_score.plot(kind="bar", yerr=feat_imps_agg['std'])
    ax.set_title("Feature importances using MDI")
    ax.set_ylabel("Mean decrease in impurity")
    ax.figure.tight_layout()
    plt.show()
    print(change_imps_score.index.to_numpy().tolist())
    print(change_imps_score.to_numpy().tolist())

In [12]:
def train_lgb(X, y, 
              feat_importance = None, 
              feat_importance_args = None, feat_importances_plot = None, cv_scoring = None,
              lgbm_params = {
                  'verbose' : 1,
                  'colsample_bytree':0.5,
                  'n_estimators': 1000,
                  'reg_lambda': 10,
                  'reg_alpha': 0.6,
                  'learning_rate': 0.02,
              },
              random_state = None):
    """Train one LightGBM classifier per stratified fold.

    Parameters
    ----------
    X, y : feature frame and target series (any index; rows are selected
        positionally).
    feat_importance : optional callable ``(model, X_valid, y_valid, **args)``
        returning a DataFrame; called once per fold.
    feat_importance_args : optional dict of keyword arguments for
        ``feat_importance``; ``None`` means no extra arguments.
    feat_importances_plot : optional callable receiving the combined
        importance frame after all folds.
    cv_scoring : optional callable ``(y_true, y_prob)``; its result is
        collected per fold.
    lgbm_params : keyword arguments for ``lgb.LGBMClassifier``. The diagnostic
        plots are drawn when its 'verbose' entry is >= 1 (a missing entry is
        treated as 0). The dict is never modified here.
    random_state : seed for the shuffled ``StratifiedKFold``; ``None`` keeps
        the previous unseeded behaviour.

    Returns
    -------
    (models, feat_importances, scores): the fitted models, the per-fold
    importance columns concatenated side by side (column names suffixed with
    the fold number), and the list of ``cv_scoring`` results.
    """
    if feat_importance_args is None:
        # `**None` would raise; treat the default as "no extra arguments".
        feat_importance_args = {}
    s_kfold = StratifiedKFold(n_splits=Config.n_splits,shuffle=True,random_state=random_state)

    models = []
    scores = []
    fold_importances = []
    for fi, (train_idx, valid_idx) in enumerate(s_kfold.split(X, y)):
        print(f'Fold {fi+1}/{Config.n_splits} ...')
        # split() yields positional indices, so every selection uses .iloc;
        # label-based .loc is only equivalent for a 0..n-1 index.
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

        model = lgb.LGBMClassifier(**lgbm_params)
        model.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid),(X_train, y_train)],
            eval_metric=['average_precision','auc'],
            callbacks=[lgb.early_stopping(Config.early_stop)]
        )
        models.append(model)
        if lgbm_params.get('verbose', 0)>=1:
            lgb.plot_metric(model,'auc')
            lgb.plot_metric(model,'average_precision')
            valid_labels = model.predict(X_valid)
            plt.scatter(y_valid,valid_labels)
            plt.show()
            plt.hist2d(y_valid.to_numpy().flatten(),valid_labels.flatten(),bins=[21,21])
            plt.show()
        if cv_scoring is not None:
            scores.append(cv_scoring(y_valid,model.predict_proba(X_valid)[:,1]))
        if feat_importance is not None:
            next_feat_imp = feat_importance(model, X_valid, y_valid, **feat_importance_args)
            next_feat_imp.columns = [col + str(fi) for col in next_feat_imp.columns]
            fold_importances.append(next_feat_imp)
    # Concatenate once instead of growing a frame inside the loop.
    feat_importances = pd.concat(fold_importances,axis = 1) if fold_importances else pd.DataFrame()
    if feat_importances_plot is not None:
        feat_importances_plot(feat_importances)
    return models, feat_importances, scores

def infer_lgb(data, models):
    """Average the positive-class probability over several fitted models.

    Parameters
    ----------
    data : feature matrix accepted by each model's ``predict_proba``; its
        ``shape[0]`` gives the number of rows.
    models : non-empty sequence of fitted binary classifiers.

    Returns
    -------
    1-D array with one averaged class-1 probability per row of ``data``.

    Raises
    ------
    ValueError
        If ``models`` is empty (this used to yield an all-NaN result from a
        division by zero).
    """
    total_models = len(models)
    if total_models == 0:
        raise ValueError('infer_lgb needs at least one fitted model')

    y_out = np.zeros((data.shape[0],2))
    for model in models:
        y_out = y_out + model.predict_proba(data)
    y_out = y_out/total_models
    # Column 1 holds the positive-class probability.
    return y_out[:, 1]

In [13]:
def train_catboost(X, y, 
                   feat_importance = None, 
                   feat_importance_args = None, feat_importances_plot = None, cv_scoring = None,
                   catboost_params = {
                       'n_estimators': 1000,
                       'verbose': -1,
                   },
                   target_fold = -1,
                   random_state = None
                  ):
    """Train CatBoost classifiers on stratified folds.

    Parameters
    ----------
    X, y : feature frame and target series (any index; rows are selected
        positionally). Every column whose dtype is not float64 is passed to
        CatBoost as a categorical feature.
    feat_importance : optional callable ``(model, X_valid, y_valid, **args)``
        returning a DataFrame; called once per trained fold.
    feat_importance_args : optional dict of keyword arguments for
        ``feat_importance``; ``None`` means no extra arguments.
    feat_importances_plot : optional callable receiving the combined
        importance frame.
    cv_scoring : optional callable ``(y_true, y_prob)``; its result is
        collected per trained fold.
    catboost_params : keyword arguments for ``CatBoostClassifier``; never
        modified here.
    target_fold : -1 trains every fold, otherwise only the fold with this
        0-based number.
    random_state : seed for the shuffled ``StratifiedKFold``. With the default
        ``None`` every call draws a different partition, so fold numbers are
        not comparable between separate calls.

    Returns
    -------
    (models, feat_importances, scores): the fitted models, the per-fold
    importance columns concatenated side by side (column names suffixed with
    the fold number), and the list of ``cv_scoring`` results.
    """
    if feat_importance_args is None:
        # `**None` would raise; treat the default as "no extra arguments".
        feat_importance_args = {}
    s_kfold = StratifiedKFold(n_splits=Config_catboost.n_splits,shuffle=True,random_state=random_state)
    catcols = [name for name, dtype in X.dtypes.items() if dtype != 'float64']
    models = []
    scores = []
    fold_importances = []
    for fi, (train_idx, valid_idx) in enumerate(s_kfold.split(X, y)):
        if target_fold != -1 and fi != target_fold:
            continue
        print(f'Fold {fi+1}/{Config_catboost.n_splits} ...')
        # split() yields positional indices, so every selection uses .iloc;
        # label-based .loc is only equivalent for a 0..n-1 index.
        X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]
        model = CatBoostClassifier(**catboost_params)
        train_data = Pool(X.iloc[train_idx], y.iloc[train_idx],cat_features = catcols)
        valid_data = Pool(X_valid, y_valid,cat_features = catcols)
        model.fit(
            train_data,
            eval_set=valid_data,
            early_stopping_rounds=Config_catboost.early_stop,
            plot=Config_catboost.plot_metrics,
        )
        models.append(model)
        if cv_scoring is not None:
            scores.append(cv_scoring(y_valid,model.predict_proba(X_valid)[:,1]))
        if feat_importance is not None:
            next_feat_imp = feat_importance(model, X_valid, y_valid, **feat_importance_args)
            next_feat_imp.columns = [col + str(fi) for col in next_feat_imp.columns]
            fold_importances.append(next_feat_imp)
    # Concatenate once instead of growing a frame inside the loop.
    feat_importances = pd.concat(fold_importances,axis = 1) if fold_importances else pd.DataFrame()
    if feat_importances_plot is not None:
        feat_importances_plot(feat_importances)
    return models, feat_importances, scores

def infer_catboost(data, models):
    """Average the positive-class probability over several fitted models.

    Parameters
    ----------
    data : feature matrix accepted by each model's ``predict_proba``; its
        ``shape[0]`` gives the number of rows.
    models : non-empty sequence of fitted binary classifiers.

    Returns
    -------
    1-D array with one averaged class-1 probability per row of ``data``.

    Raises
    ------
    ValueError
        If ``models`` is empty (this used to yield an all-NaN result from a
        division by zero).
    """
    total_models = len(models)
    if total_models == 0:
        raise ValueError('infer_catboost needs at least one fitted model')

    y_out = np.zeros((data.shape[0],2))
    for model in models:
        y_out = y_out + model.predict_proba(data)
    y_out = y_out/total_models
    # Column 1 holds the positive-class probability.
    return y_out[:, 1]

In [14]:
class Config:
    """Settings read by train_lgb (early_stop, n_splits) plus ready-made
    argument dictionaries for LightGBM and the permutation-importance helper."""
    #lgb parameters
    # Passed to lgb.early_stopping in train_lgb.
    early_stop = 200
    # Number of StratifiedKFold splits in train_lgb.
    n_splits = 5
    # NOTE(review): use_splits and split_agent_features are not read by any
    # visible cell -- confirm whether they are still needed.
    use_splits = []
    split_agent_features = True
    
    # Keyword arguments intended for lgb.LGBMClassifier (see the commented-out
    # train_lgb call after this class).
    lgbm_params = {
        'verbose' : 1,
        'n_estimators': 10000,
        'colsample_bytree': 0.3507040407981956, 
        'reg_lambda': 1.596123927471322, 
        'reg_alpha': 0.2723801571566109, 
        'learning_rate': 0.02168731247549655
    }
    
        
    # Same keys and values that feat_importance_func passes to permutation_importance.
    feat_importance_dict = {
        'n_repeats':10,
        'random_state':42,
        'n_jobs':2,        
    }
#if not is_submission:
    #models, feat_imp, cv_scoring = train_lgb(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,lgbm_params = Config.lgbm_params)

In [15]:
class Config_catboost:
    """Settings read by train_catboost plus the CatBoostClassifier arguments
    used by the training call in this cell. Later cells redefine this class
    with different parameter values."""
    #Catboost parameters
    # Passed as early_stopping_rounds to model.fit in train_catboost.
    early_stop = 200
    # Number of StratifiedKFold splits in train_catboost.
    n_splits = 5
    # NOTE(review): use_splits is not read by any visible cell.
    use_splits = []
    # Passed as plot= to model.fit in train_catboost.
    plot_metrics = True

    # Keyword arguments for CatBoostClassifier.
    catboost_params = {
        'verbose' : 1,
        'n_estimators': 40000,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'metric_period': 100,
        'colsample_bylevel': 0.8238464331915234,
        'l2_leaf_reg': 0.7581899740973308,
        'model_size_reg': 0.02203622528287333, 
        'learning_rate': 0.07,
        'random_seed': 73,
        # Disabled entries below; presumably left over from an Optuna search space -- TODO confirm.
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        #'max_depth': trial.suggest_int('max_depth', 12, 16),
        #'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
# Train only fold number 2 (0-based) with the parameters above, pickle the
# results under the 'catboost_01' prefix and add the model to the ensemble.
# NOTE(review): train_catboost shuffles its folds without a seed, so the fold
# used here comes from a partition that differs from the other training cells.
# NOTE(review): `cv_scoring` receives the list of fold scores, and re-running
# this cell appends another copy of the model to `aggr_model`.
if not is_submission:
    models, feat_imp, cv_scoring = train_catboost(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,catboost_params = Config_catboost.catboost_params,target_fold=2)
    savedumping([(models,'_models'),(testproc.transformer.test_transform,'_test_transform'),(infer_catboost,'_infer'),(Config_catboost.catboost_params,'_params')],'catboost_01')
    aggr_model = aggr_model + models

Fold 3/5 ...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	test: 0.9253052	best: 0.9253052 (0)	total: 177ms	remaining: 1h 58m 6s
100:	test: 0.9638296	best: 0.9638296 (100)	total: 9.78s	remaining: 1h 4m 22s
200:	test: 0.9657338	best: 0.9657338 (200)	total: 19.1s	remaining: 1h 3m 4s
300:	test: 0.9663715	best: 0.9663782 (299)	total: 28.9s	remaining: 1h 3m 28s
400:	test: 0.9668500	best: 0.9668500 (400)	total: 39.5s	remaining: 1h 5m 4s
500:	test: 0.9672136	best: 0.9672136 (500)	total: 49.3s	remaining: 1h 4m 46s
600:	test: 0.9674481	best: 0.9674765 (590)	total: 59.3s	remaining: 1h 4m 45s
700:	test: 0.9675099	best: 0.9675178 (670)	total: 1m 9s	remaining: 1h 5m 10s
800:	test: 0.9676803	best: 0.9676828 (798)	total: 1m 19s	remaining: 1h 5m 12s
900:	test: 0.9677892	best: 0.9677939 (899)	total: 1m 30s	remaining: 1h 5m 10s
1000:	test: 0.9678545	best: 0.9678627 (990)	total: 1m 40s	remaining: 1h 5m 31s
1100:	test: 0.9677761	best: 0.9678627 (990)	total: 1m 51s	remaining: 1h 5m 27s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9678627

In [16]:
class Config_catboost:
    """Redefinition of the CatBoost settings for this cell's training call;
    it replaces any earlier Config_catboost in the kernel."""
    #Catboost parameters
    # Passed as early_stopping_rounds to model.fit in train_catboost.
    early_stop = 200
    # Number of StratifiedKFold splits in train_catboost.
    n_splits = 5
    # NOTE(review): use_splits is not read by any visible cell.
    use_splits = []
    # Passed as plot= to model.fit in train_catboost.
    plot_metrics = True

    # Keyword arguments for CatBoostClassifier.
    catboost_params = {
        'verbose' : 1,
        'n_estimators': 40000,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'metric_period': 100,
        'colsample_bylevel': 0.4262529055102354,
        'l2_leaf_reg': 0.6968608710804043,
        'model_size_reg': 0.04518026487545242, 
        'learning_rate': 0.07,
        'random_seed': 73,
        # Disabled entries below; presumably left over from an Optuna search space -- TODO confirm.
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        #'max_depth': trial.suggest_int('max_depth', 12, 16),
        #'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
# Train only fold number 1 (0-based) with the parameters above, pickle the
# results under the 'catboost_02' prefix and add the model to the ensemble.
# NOTE(review): train_catboost shuffles its folds without a seed, so the fold
# used here comes from a partition that differs from the other training cells.
# NOTE(review): re-running this cell appends another copy of the model to `aggr_model`.
if not is_submission:
    models, feat_imp, cv_scoring = train_catboost(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,catboost_params = Config_catboost.catboost_params,target_fold=1)
    savedumping([(models,'_models'),(testproc.transformer.test_transform,'_test_transform'),(infer_catboost,'_infer'),(Config_catboost.catboost_params,'_params')],'catboost_02')
    aggr_model = aggr_model + models

Fold 2/5 ...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	test: 0.8621959	best: 0.8621959 (0)	total: 85ms	remaining: 56m 39s
100:	test: 0.9604924	best: 0.9604924 (100)	total: 8.17s	remaining: 53m 46s
200:	test: 0.9626410	best: 0.9626410 (200)	total: 15.5s	remaining: 51m 13s
300:	test: 0.9639223	best: 0.9639339 (295)	total: 23.2s	remaining: 50m 54s
400:	test: 0.9644591	best: 0.9644597 (399)	total: 30.8s	remaining: 50m 42s
500:	test: 0.9646884	best: 0.9647272 (496)	total: 39.4s	remaining: 51m 44s
600:	test: 0.9649078	best: 0.9649078 (600)	total: 47.3s	remaining: 51m 40s
700:	test: 0.9649662	best: 0.9649861 (697)	total: 55.3s	remaining: 51m 40s
800:	test: 0.9651320	best: 0.9651381 (798)	total: 1m 3s	remaining: 51m 34s
900:	test: 0.9651149	best: 0.9651562 (857)	total: 1m 11s	remaining: 52m 2s
1000:	test: 0.9652056	best: 0.9652082 (997)	total: 1m 19s	remaining: 51m 52s
1100:	test: 0.9653125	best: 0.9653152 (1096)	total: 1m 28s	remaining: 51m 57s
1200:	test: 0.9653161	best: 0.9653686 (1114)	total: 1m 36s	remaining: 51m 46s
1300:	test: 0.9652590	

In [17]:
class Config_catboost:
    """Redefinition of the CatBoost settings for this cell's training call;
    it replaces any earlier Config_catboost in the kernel."""
    #Catboost parameters
    # Passed as early_stopping_rounds to model.fit in train_catboost.
    early_stop = 200
    # Number of StratifiedKFold splits in train_catboost.
    n_splits = 5
    # NOTE(review): use_splits is not read by any visible cell.
    use_splits = []
    # Passed as plot= to model.fit in train_catboost.
    plot_metrics = True

    # Keyword arguments for CatBoostClassifier.
    catboost_params = {
        'verbose' : 1,
        'n_estimators': 40000,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'metric_period': 100,
        'colsample_bylevel': 0.25936353011971747,
        'l2_leaf_reg': 0.7757480352487609,
        'model_size_reg': 0.03732471214030866, 
        'learning_rate': 0.07,
        'random_seed': 73,
        # Disabled entries below; presumably left over from an Optuna search space -- TODO confirm.
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        #'max_depth': trial.suggest_int('max_depth', 12, 16),
        #'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
# Train only fold number 3 (0-based) with the parameters above, pickle the
# results under the 'catboost_03' prefix and add the model to the ensemble.
# NOTE(review): train_catboost shuffles its folds without a seed, so the fold
# used here comes from a partition that differs from the other training cells.
# NOTE(review): re-running this cell appends another copy of the model to `aggr_model`.
if not is_submission:
    models, feat_imp, cv_scoring = train_catboost(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,catboost_params = Config_catboost.catboost_params,target_fold=3)
    savedumping([(models,'_models'),(testproc.transformer.test_transform,'_test_transform'),(infer_catboost,'_infer'),(Config_catboost.catboost_params,'_params')],'catboost_03')
    aggr_model = aggr_model + models

Fold 4/5 ...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	test: 0.8659996	best: 0.8659996 (0)	total: 54.9ms	remaining: 36m 34s
100:	test: 0.9646399	best: 0.9646399 (100)	total: 6.18s	remaining: 40m 40s
200:	test: 0.9668596	best: 0.9668596 (200)	total: 12.7s	remaining: 41m 46s
300:	test: 0.9674769	best: 0.9674769 (300)	total: 19s	remaining: 41m 44s
400:	test: 0.9677808	best: 0.9677939 (393)	total: 26.1s	remaining: 42m 55s
500:	test: 0.9680299	best: 0.9680352 (499)	total: 32.8s	remaining: 43m 2s
600:	test: 0.9680936	best: 0.9681563 (556)	total: 39.5s	remaining: 43m 12s
700:	test: 0.9682460	best: 0.9682460 (700)	total: 46.1s	remaining: 43m 4s
800:	test: 0.9683336	best: 0.9683672 (765)	total: 53.1s	remaining: 43m 18s
900:	test: 0.9683992	best: 0.9683992 (900)	total: 1m	remaining: 43m 33s
1000:	test: 0.9684291	best: 0.9684591 (917)	total: 1m 7s	remaining: 43m 31s
1100:	test: 0.9684034	best: 0.9684687 (1050)	total: 1m 13s	remaining: 43m 32s
1200:	test: 0.9684548	best: 0.9684891 (1184)	total: 1m 20s	remaining: 43m 26s
1300:	test: 0.9685272	best: 

In [18]:
class Config_catboost:
    """Redefinition of the CatBoost settings for this cell's training call;
    it replaces any earlier Config_catboost in the kernel."""
    #Catboost parameters
    # Passed as early_stopping_rounds to model.fit in train_catboost.
    early_stop = 200
    # Number of StratifiedKFold splits in train_catboost.
    n_splits = 5
    # NOTE(review): use_splits is not read by any visible cell.
    use_splits = []
    # Passed as plot= to model.fit in train_catboost.
    plot_metrics = True

    # Keyword arguments for CatBoostClassifier.
    catboost_params = {
        'verbose' : 1,
        'n_estimators': 40000,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'metric_period': 100,
        'colsample_bylevel': 0.853899465738587,
        'l2_leaf_reg': 0.7999193591030588,
        'model_size_reg': 0.0849983257003327,
        'learning_rate': 0.07,
        'random_seed': 73,
        # Disabled entries below; presumably left over from an Optuna search space -- TODO confirm.
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        #'max_depth': trial.suggest_int('max_depth', 12, 16),
        #'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
# Train only fold number 4 (0-based) with the parameters above, pickle the
# results under the 'catboost_04' prefix and add the model to the ensemble.
# NOTE(review): train_catboost shuffles its folds without a seed, so the fold
# used here comes from a partition that differs from the other training cells.
# NOTE(review): re-running this cell appends another copy of the model to `aggr_model`.
if not is_submission:
    models, feat_imp, cv_scoring = train_catboost(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,catboost_params = Config_catboost.catboost_params,target_fold=4)
    savedumping([(models,'_models'),(testproc.transformer.test_transform,'_test_transform'),(infer_catboost,'_infer'),(Config_catboost.catboost_params,'_params')],'catboost_04')
    aggr_model = aggr_model + models

Fold 5/5 ...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	test: 0.8672852	best: 0.8672852 (0)	total: 98.7ms	remaining: 1h 5m 49s
100:	test: 0.9625075	best: 0.9625075 (100)	total: 9.79s	remaining: 1h 4m 28s
200:	test: 0.9649247	best: 0.9649247 (200)	total: 19.9s	remaining: 1h 5m 32s
300:	test: 0.9659302	best: 0.9659302 (300)	total: 29.6s	remaining: 1h 5m 10s
400:	test: 0.9665176	best: 0.9665176 (400)	total: 39.5s	remaining: 1h 4m 59s
500:	test: 0.9667800	best: 0.9668100 (489)	total: 50.2s	remaining: 1h 5m 54s
600:	test: 0.9670543	best: 0.9670581 (589)	total: 1m	remaining: 1h 5m 47s
700:	test: 0.9672759	best: 0.9672759 (700)	total: 1m 10s	remaining: 1h 5m 48s
800:	test: 0.9673237	best: 0.9673281 (715)	total: 1m 21s	remaining: 1h 6m 27s
900:	test: 0.9673978	best: 0.9674100 (877)	total: 1m 31s	remaining: 1h 6m 23s
1000:	test: 0.9675165	best: 0.9675348 (990)	total: 1m 42s	remaining: 1h 6m 24s
1100:	test: 0.9674889	best: 0.9675454 (1055)	total: 1m 53s	remaining: 1h 6m 45s
1200:	test: 0.9674943	best: 0.9675454 (1055)	total: 2m 3s	remaining: 1h 6m

In [19]:
class Config_catboost:
    """Redefinition of the CatBoost settings for this cell's training call;
    it replaces any earlier Config_catboost in the kernel."""
    #Catboost parameters
    # Passed as early_stopping_rounds to model.fit in train_catboost.
    early_stop = 200
    # Number of StratifiedKFold splits in train_catboost.
    n_splits = 5
    # NOTE(review): use_splits is not read by any visible cell.
    use_splits = []
    # Passed as plot= to model.fit in train_catboost.
    plot_metrics = True

    # Keyword arguments for CatBoostClassifier.
    catboost_params = {
        'verbose' : 1,
        'n_estimators': 40000,
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'metric_period': 100,
        'colsample_bylevel': 0.7626854248140648,
        'l2_leaf_reg': 0.8533528033396921,
        'model_size_reg': 0.06941554875957238, 
        'learning_rate': 0.07,
        'random_seed': 73,
        # Disabled entries below; presumably left over from an Optuna search space -- TODO confirm.
        #'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        #'max_depth': trial.suggest_int('max_depth', 12, 16),
        #'num_leaves': trial.suggest_int('num_leaves', 20, 150),
        #'bagging_freq': trial.suggest_int('bagging_freq', 1, 5),
    }
# Train only fold number 0 with the parameters above, pickle the results under
# the 'catboost_05' prefix and add the model to the ensemble.
# NOTE(review): train_catboost shuffles its folds without a seed, so the fold
# used here comes from a partition that differs from the other training cells.
# NOTE(review): re-running this cell appends another copy of the model to `aggr_model`.
if not is_submission:
    models, feat_imp, cv_scoring = train_catboost(testproc.X_processed,testproc.y_targ,cv_scoring=roc_auc_score,catboost_params = Config_catboost.catboost_params,target_fold=0)
    savedumping([(models,'_models'),(testproc.transformer.test_transform,'_test_transform'),(infer_catboost,'_infer'),(Config_catboost.catboost_params,'_params')],'catboost_05')
    aggr_model = aggr_model + models

Fold 1/5 ...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))



0:	test: 0.8738845	best: 0.8738845 (0)	total: 96.1ms	remaining: 1h 4m 5s
100:	test: 0.9604826	best: 0.9604826 (100)	total: 9.52s	remaining: 1h 2m 42s
200:	test: 0.9627854	best: 0.9627854 (200)	total: 18.8s	remaining: 1h 2m 12s
300:	test: 0.9638429	best: 0.9638432 (299)	total: 28.2s	remaining: 1h 2m 3s
400:	test: 0.9644565	best: 0.9644583 (398)	total: 37.9s	remaining: 1h 2m 27s
500:	test: 0.9648114	best: 0.9648114 (500)	total: 48.5s	remaining: 1h 3m 41s
600:	test: 0.9650402	best: 0.9650402 (600)	total: 58s	remaining: 1h 3m 22s
700:	test: 0.9653417	best: 0.9653467 (699)	total: 1m 7s	remaining: 1h 3m 21s
800:	test: 0.9654144	best: 0.9654221 (797)	total: 1m 18s	remaining: 1h 3m 50s
900:	test: 0.9654110	best: 0.9654506 (886)	total: 1m 28s	remaining: 1h 3m 42s
1000:	test: 0.9654590	best: 0.9654857 (927)	total: 1m 37s	remaining: 1h 3m 36s
1100:	test: 0.9655445	best: 0.9655555 (1099)	total: 1m 48s	remaining: 1h 3m 53s
1200:	test: 0.9655109	best: 0.9655670 (1110)	total: 1m 58s	remaining: 1h 3m 

In [20]:
# Build the submission file from the models collected in `aggr_model`.
# This cell always runs, but `testproc` and the contents of `aggr_model` are
# only created by earlier cells guarded with `not is_submission`; with
# is_submission=True (and both OPTUNING flags False) `testproc` is undefined here.
if True:
    print('submission')
    # Disabled: restore models and helpers from pickles under `loadingpath`.
    # NOTE(review): pickle.load executes arbitrary code -- only enable this for trusted files.
    #f = open(loadingpath,'rb')
    #models = pickle.load(f)
    #f.close()
    #f = open(loadingpath+'_config','rb')
    #Config_catboost = pickle.load(f)
    #f.close()
    #f = open(loadingpath+'_infer','rb')
    #infer_catboost = pickle.load(f)
    #f.close()
    #f = open(loadingpath+'_transform','rb')
    #test_transform = pickle.load(f)
    #f.close()
    
    # Apply the preprocessing fitted on the training data to the test set.
    X_test_raw = pd.read_csv('/kaggle/input/playground-series-s4e10/test.csv')
    X_test = testproc.transformer.test_transform(X_test_raw)
    
    # Averaged class-1 probability per row, written next to the original id column.
    y_test_pred = pd.DataFrame(infer_catboost(X_test,aggr_model),index = X_test.index,columns = ['loan_status'])
    y_test_pred = pd.concat([y_test_pred,X_test_raw['id']],axis=1)
    y_test_pred.to_csv('/kaggle/working/submission.csv',index=False)

submission
