# Importing some packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import r_regression, SelectKBest
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

from imblearn.combine import SMOTEENN

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from scipy import stats

from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy.special import softmax

from boruta import BorutaPy

# from BorutaShap import BorutaShap

from collections import Counter

import shap

import os
from pathlib import Path

from bisect import bisect

import pickle

import re

import gc

from tqdm.autonotebook import tqdm

import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Loading data

In [2]:
data = pd.read_csv('train_data.csv')

test_data = pd.read_csv('test_data.csv')

# Getting features' names

In [3]:
nominal_features = [' Liability-Assets Flag'] # Net Income Flag is removed

numerical_features = [col for col in data.columns if col not in nominal_features and col!='Bankrupt?']

# Creating folds

In [4]:
skfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
skfold_list = []
for train_idxs, valid_idxs in skfold.split(data, y=data['Bankrupt?']):
    skfold_list.append((train_idxs, valid_idxs))

# Shap value importance function

In [5]:
def get_feature_importances_shap_values(shap_values, features, topk=10):
    '''
    Prints the feature importances based on SHAP values in an ordered way
    shap_values -> The SHAP values calculated from a shap.Explainer object
    features -> The name of the features, on the order presented to the explainer
    '''
    # Calculates the feature importance (mean absolute shap value) for each feature
    importances = []
    for i in range(shap_values.values.shape[1]):
        importances.append(np.mean(np.abs(shap_values.values[:, i])))
    # Calculates the normalized version
    importances_norm = softmax(importances)
    # Organize the importances and columns in a dictionary
    feature_importances = {fea: imp for imp, fea in zip(importances, features)}
    feature_importances_norm = {fea: imp for imp, fea in zip(importances_norm, features)}
    # Sorts the dictionary
    feature_importances = {k: v for k, v in sorted(feature_importances.items(), key=lambda item: item[1], reverse = True)}
    feature_importances_norm= {k: v for k, v in sorted(feature_importances_norm.items(), key=lambda item: item[1], reverse = True)}
    # Prints the feature importances
    selected_topk_feats = []
    
    for idx, (k, v) in enumerate(feature_importances.items()):
        # print(f"{k} -> {v:.4f} (softmax = {feature_importances_norm[k]:.4f})")
        if idx <=topk:
            selected_topk_feats.append(k)

    return selected_topk_feats

# Function for feature selection

In [6]:
class FSelector():

    def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, is_target_cat=True, select_n_feats=15):

        self.X = X
        self.y = y
        self.num_feats = num_feats
        self.ordinal_feats = ordinal_feats
        self.nominal_feats = nominal_feats
        self.model = model
        self.is_target_cat = is_target_cat
        self.select_n_feats = select_n_feats

    def calculate_vif(self, X):
    
        vif = pd.DataFrame()
        vif["features"] = X.columns
        vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

        return vif

    def select_feats_via_vif(self):

        num_features = self.num_feats.copy()

        vif_df = self.calculate_vif(self.X[num_features])

        while vif_df[vif_df['VIF']>=10].shape[0] != 0:
            vif_df.sort_values('VIF', ascending=False, inplace=True)
            vif_df.reset_index(drop=True, inplace=True)
            # print(vif_df)
            elimination_candidate = vif_df.iloc[0]['features']
            # print(elimination_candidate)
            num_features = [i for i in num_features if i!=elimination_candidate]
            new_X = self.X[num_features]
            vif_df = self.calculate_vif(new_X)

        return list(vif_df['features'].values)
    
    def get_spearmanr(self, X, y):
        # return np.array([stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])])
        spearman_values = [stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        temp_sp_df = pd.DataFrame({'spearman': spearman_values, 'feats': list(X.columns)})
        temp_sp_df['abs_spearman'] = np.abs(temp_sp_df['spearman'])
        temp_sp_df.sort_values('abs_spearman', ascending=False, inplace=True)
        temp_sp_df.reset_index(drop=True, inplace=True)
        return temp_sp_df.iloc[:15]['feats'].to_list()
    
    def get_kendalltau(self, X, y):
        # return [stats.kendalltau(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        kendall_values = [stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        temp_ken_df = pd.DataFrame({'kendall': kendall_values, 'feats': list(X.columns)})
        temp_ken_df['abs_kendall'] = np.abs(temp_ken_df['kendall'])
        temp_ken_df.sort_values('abs_kendall', ascending=False, inplace=True)
        temp_ken_df.reset_index(drop=True, inplace=True)
        return temp_ken_df.iloc[:15]['feats'].to_list()
    
    def get_pointbiserialr(self, X, y):
        return [stats.pointbiserialr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
    
    def get_boruto_feats(self):
        feat_selector = BorutaPy(self.model, n_estimators='auto', verbose=2, random_state=1)
        feat_selector.fit(np.array(self.X), np.array(self.y))
        boruta_selected_features = list(self.X.iloc[:, feat_selector.support_].columns)
        return boruta_selected_features
    
    def get_kbest(self, X, feats_list, metric):
        selector = SelectKBest(metric, k=self.select_n_feats)
        selector.fit_transform(X[feats_list], self.y)
        selected_feats_idxs_list = list(selector.get_support(indices=True))
        column_names = [feats_list[i] for i in selected_feats_idxs_list]
        return column_names
    
    def get_rfe_feats(self):
        model_rfe = RFE(self.model, n_features_to_select=self.select_n_feats)
        model_rfe.fit(self.X, self.y)
        model_rfe_feats = list(self.X.iloc[:, list(model_rfe.support_)].columns)
        return model_rfe_feats
    
    def get_shap_feats(self, feats_list, topk=10):
        model = self.model
        X = self.X[feats_list]
        model.fit(self.X, self.y)
        explainer = shap.Explainer(model.predict, X, max_evals = int(2 * X.shape[1] + 1), verbose=0)
        shap_values = explainer(X)
        selected_shap_features = get_feature_importances_shap_values(
            shap_values, features=list(X.columns), topk=topk
        )
        return selected_shap_features
    
    def get_votes(self):

        if self.num_feats is not None:

            if self.is_target_cat:

                temp_n_feats =  self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                self.num_f_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=f_classif)
                self.num_mi_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=mutual_info_classif)

                self.select_n_feats = temp_n_feats

                self.selected_num_feats = []
                # self.selected_num_feats.extend(self.num_kendalltau_feats)
                self.selected_num_feats.extend(self.num_f_feats)
                self.selected_num_feats.extend(self.num_mi_feats)

            else:

                self.vif_feats = self.select_feats_via_vif()

                temp_n_feats =  self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.pearson_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=r_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats
                # self.num_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.num_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.kendalltau, k=self.select_n_feats)
                self.num_spearmanr_feats = self.get_spearmanr(self.X[self.num_feats], self.y)
                self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                # self.num_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
                # self.num_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)

                self.selected_num_feats = []
                self.selected_num_feats.extend(self.pearson_feats)
                self.selected_num_feats.extend(self.num_spearmanr_feats)
                self.selected_num_feats.extend(self.num_kendalltau_feats)
                # self.selected_num_feats = list(set(self.selected_num_feats))

        else:

            self.selected_num_feats = []

        if self.ordinal_feats is not None:

            if self.is_target_cat:

                temp_n_feats =  self.select_n_feats
                if len(self.ordinal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.ordinal_mi_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=mutual_info_classif)
                self.ordinal_chi2_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=chi2)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_mi_feats)
                self.selected_ordinal_feats.extend(self.ordinal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                self.ordinal_spearmanr_feats = self.get_spearmanr(self.X[self.ordinal_feats], self.y)
                self.ordinal_kendalltau_feats = self.get_kendalltau(self.X[self.ordinal_feats], self.y)

                # self.ordinal_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.ordinal_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.kendalltau, k=self.select_n_feats)

                # self.ordinal_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
                # self.ordinal_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_spearmanr_feats)
                self.selected_ordinal_feats.extend(self.ordinal_kendalltau_feats)
                # self.selected_ordinal_feats = list(set(self.selected_ordinal_feats))
                
        else:
            self.selected_ordinal_feats = []

        if self.nominal_feats is not None:

            if self.is_target_cat:
                
                temp_n_feats =  self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.nominal_mi_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=mutual_info_classif)
                self.nominal_chi2_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=chi2)

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.nominal_mi_feats)
                self.selected_nominal_feats.extend(self.nominal_chi2_feats)
                
                self.select_n_feats = temp_n_feats

            else:

                temp_n_feats =  self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.f_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=f_classif, k=self.select_n_feats)
                self.mi_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=mutual_info_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats

                # # self.f_feats = f_classif(self.X[self.nominal_feats], self.y)[0]
                # self.f_feats = SelectKBest(f_classif, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns
                
                # # self.mi_feats = mutual_info_regression(self.X[self.nominal_feats], self.y)
                # self.mi_feats = SelectKBest(mutual_info_regression, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.f_feats)
                self.selected_nominal_feats.extend(self.mi_feats)
                # self.selected_nominal_feats = list(set(self.selected_nominal_feats))

        else:

            self.selected_nominal_feats = []

        if self.model is not None:
            # np.int = np.int32
            # np.float = np.float64
            # np.bool = np.bool_
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.boruto_feats =  self.get_boruto_feats()
            if not isinstance(self.model, SVC):
                self.rfe_feats = self.get_rfe_feats()
        else:
            self.boruto_feats = []
            self.rfe_feats = []

            
        if len(self.selected_num_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_num_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_num_feats.extend(self.rfe_feats)
            num_feats_dict = dict(Counter(self.selected_num_feats))
            self.selected_num_feats = [i for i in num_feats_dict if num_feats_dict[i] >= 2]


        if len(self.selected_ordinal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_ordinal_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_ordinal_feats.extend(self.rfe_feats)
            ordinal_feats_dict = dict(Counter(self.selected_ordinal_feats))
            self.selected_ordinal_feats = [i for i in ordinal_feats_dict if ordinal_feats_dict[i] >= 2]

        if len(self.selected_nominal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_nominal_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_nominal_feats.extend(self.rfe_feats)
            nominal_feats_dict = dict(Counter(self.selected_nominal_feats))
            self.selected_nominal_feats = [i for i in nominal_feats_dict if nominal_feats_dict[i] >= 2]

        self.selected_feats = []
        self.selected_feats.extend(self.selected_num_feats)
        self.selected_feats.extend(self.selected_ordinal_feats)
        self.selected_feats.extend(self.selected_nominal_feats)
        if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
            self.selected_feats.extend(self.boruto_feats)
        self.selected_feats = list(set(self.selected_feats))

        # self.selected_feats = self.get_shap_feats(self.selected_feats)

        return self.selected_feats

# Selecting features

In [7]:
selected_features_dict = {}

for idx in tqdm(range(1)):

    X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    new_numerical_features = []
    for feat in numerical_features:
        X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
        X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']
        new_numerical_features.append(f"feat{numerical_features.index(feat)}")

    numerical_features.extend(new_numerical_features)

    # getting categorical features
    categorical_features = nominal_features.copy()

    #getting all features
    all_features = []
    all_features.extend(categorical_features)
    all_features.extend(numerical_features)

    X_train = X_train[all_features]
    X_valid = X_valid[all_features]

    models_list = [RandomForestClassifier(), XGBClassifier(), LogisticRegression(), SVC(probability=True)]
    model_names_list = ['RandomForestClassifier', 'XGBClassifier', 'LogisticRegression', 'SVC']

    for model_idx in tqdm(range(len(model_names_list))):

        model_name = model_names_list[model_idx]

        selected_features_dict[model_name] = {}

        # feature selection
        model = models_list[model_idx]

        if isinstance(model, LogisticRegression) or isinstance(model, SVC):

            scaler = StandardScaler()

            X_train2 = scaler.fit_transform(X_train[numerical_features])
            X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
            X_train2 = pd.concat([X_train2, X_train[categorical_features]], axis=1)

            fselector = FSelector(
                X=X_train2, 
                y=y_train, 
                num_feats=numerical_features, 
                ordinal_feats=None, 
                nominal_feats=nominal_features, 
                model=model
            )

        else:

            fselector = FSelector(
                X=X_train, 
                y=y_train, 
                num_feats=numerical_features, 
                ordinal_feats=None, 
                nominal_feats=nominal_features, 
                model=model
            )

        selected_features = fselector.get_votes()

        if len(selected_features) == 0:
            continue

        # model training
        model.fit(X_train[selected_features], y_train)

        # metric calculation
        y_train_pred = model.predict(X_train[selected_features])
        y_train_pred_prob = model.predict_proba(X_train[selected_features])[:, 1]

        if isinstance(model, LogisticRegression) or isinstance(model, SVC):
            X_valid2 = scaler.transform(X_valid[numerical_features])
            X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
            X_valid2 = pd.concat([X_valid2, X_valid[categorical_features]], axis=1)
            y_valid_pred = model.predict(X_valid2[selected_features])
        else:
            y_valid_pred = model.predict(X_valid[selected_features])
        y_valid_pred_prob = model.predict_proba(X_valid[selected_features])[:, 1]

        train_acc = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)
        train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)

        valid_acc = accuracy_score(y_valid, y_valid_pred)
        valid_f1 = f1_score(y_valid, y_valid_pred)
        valid_roc_auc = roc_auc_score(y_valid, y_valid_pred_prob)

        selected_features_dict[model_name][idx+1] = {}
        selected_features_dict[model_name][idx+1]['selected_feats'] = selected_features
        selected_features_dict[model_name][idx+1]['train_acc'] = train_acc
        selected_features_dict[model_name][idx+1]['train_f1'] = train_f1
        selected_features_dict[model_name][idx+1]['train_roc_auc'] = train_roc_auc
        selected_features_dict[model_name][idx+1]['valid_acc'] = valid_acc
        selected_features_dict[model_name][idx+1]['valid_f1'] = valid_f1
        selected_features_dict[model_name][idx+1]['valid_roc_auc'] = valid_roc_auc
        selected_features_dict[model_name][idx+1]['model'] = model
        if isinstance(model, LogisticRegression) or isinstance(model, SVC):
            selected_features_dict[model_name][idx+1]['scaler'] = scaler

        print(f"##### {model_name} #####")
        print(f"Selected features: {selected_features}")
        print("Train:")
        print(f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
        print("Validation:")
        print(f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

    del X_train, y_train, X_valid, y_valid
    gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]



Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	9 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	10 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	11 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	12 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	13 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	14 / 100
Confirmed: 	30
Tentative: 	17
Rejected: 	142
Iteration: 	15 / 100
Confirmed: 	30
Tentative: 	17
Rejected: 	142
Iteration: 	16 / 100
Confirmed: 	



##### RandomForestClassifier #####
Selected features: [' Liability-Assets Flag', ' Working Capital to Total Assets', ' Cash/Current Liability', ' Interest Expense Ratio', ' ROA(A) before interest and % after tax', " Net Income to Stockholder's Equity", ' After-tax net Interest Rate', ' Net Income to Total Assets', ' Persistent EPS in the Last Four Seasons', ' Total debt/Total net worth', ' Non-industry income and expenditure/revenue', ' Equity to Liability', ' Borrowing dependency', ' ROA(C) before interest and depreciation before interest', ' Per Share Net profit before tax (Yuan ¥)', ' Working Capital/Equity', ' ROA(B) before interest and depreciation after tax', ' Net Value Growth Rate', ' Debt ratio %', ' Interest Coverage Ratio (Interest expense to EBIT)', ' Net worth/Assets', ' Quick Ratio', ' Continuous interest rate (after tax)', ' Net profit before tax/Paid-in capital', ' Net Value Per Share (C)', ' Degree of Financial Leverage (DFL)', ' Net Value Per Share (B)', ' Interest-be



##### XGBClassifier #####
Selected features: [' Liability-Assets Flag', ' ROA(A) before interest and % after tax', " Net Income to Stockholder's Equity", ' Net Income to Total Assets', ' Persistent EPS in the Last Four Seasons', ' Total debt/Total net worth', ' Non-industry income and expenditure/revenue', ' Borrowing dependency', ' ROA(C) before interest and depreciation before interest', ' Per Share Net profit before tax (Yuan ¥)', ' ROA(B) before interest and depreciation after tax', ' Net Value Growth Rate', ' Debt ratio %', ' Pre-tax net Interest Rate', ' Interest Coverage Ratio (Interest expense to EBIT)', ' Net worth/Assets', ' Continuous interest rate (after tax)', ' Net profit before tax/Paid-in capital', ' Net Value Per Share (B)', ' Liability to Equity', ' Inventory/Working Capital']
Train:
Accuracy: 1.00000, F1: 1.00000, ROC-AUC: 1.00000
Validation:
Accuracy: 0.97231, F1: 0.51429, ROC-AUC: 0.91557




##### LogisticRegression #####
Selected features: [' Net profit before tax/Paid-in capital', ' ROA(B) before interest and depreciation after tax', ' Persistent EPS in the Last Four Seasons', ' Net Income to Total Assets', ' Liability-Assets Flag', ' Liability to Equity', ' ROA(A) before interest and % after tax', ' Debt ratio %', ' Net worth/Assets', " Net Income to Stockholder's Equity", ' Borrowing dependency', ' Per Share Net profit before tax (Yuan ¥)']
Train:
Accuracy: 0.96922, F1: 0.15842, ROC-AUC: 0.92356
Validation:
Accuracy: 0.38599, F1: 0.09592, ROC-AUC: 0.92997


100%|██████████| 4/4 [28:33<00:00, 428.33s/it]
100%|██████████| 1/1 [28:33<00:00, 1713.54s/it]

##### SVC #####
Selected features: [' Net profit before tax/Paid-in capital', ' Persistent EPS in the Last Four Seasons', ' Liability-Assets Flag', ' Liability to Equity', ' ROA(A) before interest and % after tax', ' Debt ratio %', ' Net worth/Assets', " Net Income to Stockholder's Equity", ' Borrowing dependency', ' Per Share Net profit before tax (Yuan ¥)']
Train:
Accuracy: 0.96868, F1: 0.06486, ROC-AUC: 0.80992
Validation:
Accuracy: 0.96743, F1: 0.00000, ROC-AUC: 0.89588





In [8]:
selected_features_dict['RandomForestClassifier'][1]['selected_feats']

[' Liability-Assets Flag',
 ' Working Capital to Total Assets',
 ' Cash/Current Liability',
 ' Interest Expense Ratio',
 ' ROA(A) before interest and % after tax',
 " Net Income to Stockholder's Equity",
 ' After-tax net Interest Rate',
 ' Net Income to Total Assets',
 ' Persistent EPS in the Last Four Seasons',
 ' Total debt/Total net worth',
 ' Non-industry income and expenditure/revenue',
 ' Equity to Liability',
 ' Borrowing dependency',
 ' ROA(C) before interest and depreciation before interest',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Working Capital/Equity',
 ' ROA(B) before interest and depreciation after tax',
 ' Net Value Growth Rate',
 ' Debt ratio %',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Net worth/Assets',
 ' Quick Ratio',
 ' Continuous interest rate (after tax)',
 ' Net profit before tax/Paid-in capital',
 ' Net Value Per Share (C)',
 ' Degree of Financial Leverage (DFL)',
 ' Net Value Per Share (B)',
 ' Interest-bearing debt interest rate',


In [9]:
selected_features_dict['XGBClassifier'][1]['selected_feats']

[' Liability-Assets Flag',
 ' ROA(A) before interest and % after tax',
 " Net Income to Stockholder's Equity",
 ' Net Income to Total Assets',
 ' Persistent EPS in the Last Four Seasons',
 ' Total debt/Total net worth',
 ' Non-industry income and expenditure/revenue',
 ' Borrowing dependency',
 ' ROA(C) before interest and depreciation before interest',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' ROA(B) before interest and depreciation after tax',
 ' Net Value Growth Rate',
 ' Debt ratio %',
 ' Pre-tax net Interest Rate',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Net worth/Assets',
 ' Continuous interest rate (after tax)',
 ' Net profit before tax/Paid-in capital',
 ' Net Value Per Share (B)',
 ' Liability to Equity',
 ' Inventory/Working Capital']

In [10]:
selected_features_dict['LogisticRegression'][1]['selected_feats']

[' Net profit before tax/Paid-in capital',
 ' ROA(B) before interest and depreciation after tax',
 ' Persistent EPS in the Last Four Seasons',
 ' Net Income to Total Assets',
 ' Liability-Assets Flag',
 ' Liability to Equity',
 ' ROA(A) before interest and % after tax',
 ' Debt ratio %',
 ' Net worth/Assets',
 " Net Income to Stockholder's Equity",
 ' Borrowing dependency',
 ' Per Share Net profit before tax (Yuan ¥)']

In [11]:
selected_features_dict['SVC'][1]['selected_feats']

[' Net profit before tax/Paid-in capital',
 ' Persistent EPS in the Last Four Seasons',
 ' Liability-Assets Flag',
 ' Liability to Equity',
 ' ROA(A) before interest and % after tax',
 ' Debt ratio %',
 ' Net worth/Assets',
 " Net Income to Stockholder's Equity",
 ' Borrowing dependency',
 ' Per Share Net profit before tax (Yuan ¥)']

# Model selection

## RandomForest

In [12]:
for feat in numerical_features:
    data[f"feat{numerical_features.index(feat)}"] = data[feat] * data[' Liability-Assets Flag']

idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['RandomForestClassifier'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

PermutationExplainer explainer: 5524it [04:31, 20.04it/s]                          


[' Borrowing dependency',
 ' Degree of Financial Leverage (DFL)',
 ' Interest Expense Ratio',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Cash/Total Assets',
 ' Quick Ratio',
 ' Persistent EPS in the Last Four Seasons',
 ' Working Capital to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Working Capital/Equity',
 ' Equity to Liability']

In [13]:
n_estimators_list = [5, 10, 15, 25, 50, 100, 120, 300, 500]#, 800, 1200]

max_depth_list = [2, 3, 5, 8, 15, 25, 30, None]

# min_samples_split_list = [2, 5, 10, 15, 100]

# min_samples_leaf_list = [2, 5, 10]

# max_features_list = ['log2', 'sqrt', None]

params_dict={
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list,
    # 'min_samples_split': min_samples_split_list,
    # 'min_samples_leaf': min_samples_leaf_list,
    # 'max_features': max_features_list
}

rf_gscv = GridSearchCV(
    RandomForestClassifier(),
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
rf_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(rf_gscv.best_params_)
print(rf_gscv.best_score_)

for feat in numerical_features:
    test_data[f"feat{numerical_features.index(feat)}"] = test_data[feat] * test_data[' Liability-Assets Flag']

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

X_train = data[selected_shap_features]
# y_train = data['Bankrupt?'].to_frame()
rf_y_train_pred_prob = rf_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = rf_gscv.predict(X_test[selected_features])
rf_y_test_pred_prob = rf_gscv.predict_proba(X_test)[:, 1]

pickle.dump(rf_gscv, open('rf_gscv.pkl', 'wb'))

rf_test_roc_auc = roc_auc_score(y_test, rf_y_test_pred_prob)
rf_test_roc_auc

Fitting 10 folds for each of 72 candidates, totalling 720 fits
{'max_depth': 8, 'n_estimators': 500}
0.9360679679251616


0.9544765840220386

## XGBoost

In [14]:
idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

selected_features = selected_features_dict['XGBClassifier'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['XGBClassifier'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['XGBClassifier'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

PermutationExplainer explainer: 5524it [01:00, 77.22it/s]                           


[' Borrowing dependency',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' ROA(C) before interest and depreciation before interest',
 ' Continuous interest rate (after tax)',
 ' Inventory/Working Capital',
 ' Non-industry income and expenditure/revenue',
 ' Net Value Per Share (B)',
 ' Net Value Growth Rate',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Total debt/Total net worth',
 ' Net Income to Total Assets']

In [15]:
eta_list = [0.01, 0.015, 0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]

# gamma_list = [0, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]

max_depth_list = [3, 5, 6, 7, 9, 12, 15, 17, 25]

# min_child_weight_list = [1, 3, 5, 7]

# subsample_list = [0.6, 0.7, 0.8, 0.9, 1.0]

# colsample_bytree_list = [0.6, 0.7, 0.8, 0.9, 1.0]

# lambda_list = [0.01, 0.03, 0.1, 1.0]

# alpha_list = [0, 0.1, 0.5, 1.0]

n_estimators_list = [50, 100, 150, 200, 500, 1000]


params_dict={
    'eta': eta_list,
    # 'gamma': gamma_list,
    'max_depth': max_depth_list,
    # 'min_child_weight': min_child_weight_list,
    # 'subsample': subsample_list,
    # 'colsample_bytree': colsample_bytree_list,
    # 'lambda': lambda_list,
    # 'alpha': alpha_list
    'n_estimators': n_estimators_list,
    # 'device': ['cuda']
}

xgb_gscv = GridSearchCV(
    XGBClassifier(),
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
xgb_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(xgb_gscv.best_params_)
print(xgb_gscv.best_score_)

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

X_train = data[selected_shap_features]
# y_train = data['Bankrupt?'].to_frame()
xgb_y_train_pred_prob = xgb_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = xgb_gscv.predict(X_test[selected_features])
xgb_y_test_pred_prob = xgb_gscv.predict_proba(X_test)[:, 1]

pickle.dump(xgb_gscv, open('xgb_gscv.pkl', 'wb'))

xgb_test_roc_auc = roc_auc_score(y_test, xgb_y_test_pred_prob)
xgb_test_roc_auc

Fitting 10 folds for each of 594 candidates, totalling 5940 fits
{'eta': 0.1, 'max_depth': 3, 'n_estimators': 50}
0.9371915753045379


0.9488980716253443

## LogisticRegression

In [16]:
idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

scaler = selected_features_dict['LogisticRegression'][1]['scaler']

X_train2 = scaler.transform(X_train[numerical_features])
X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
X_train = pd.concat([X_train2, X_train[categorical_features]], axis=1)

X_valid2 = scaler.transform(X_valid[numerical_features])
X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
X_valid = pd.concat([X_valid2, X_valid[categorical_features]], axis=1)

selected_features = selected_features_dict['LogisticRegression'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['LogisticRegression'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['LogisticRegression'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

PermutationExplainer explainer: 5524it [00:24, 145.21it/s]                          


[' Debt ratio %',
 ' Net worth/Assets',
 ' ROA(B) before interest and depreciation after tax',
 ' ROA(A) before interest and % after tax',
 ' Net Income to Total Assets',
 ' Persistent EPS in the Last Four Seasons',
 ' Net profit before tax/Paid-in capital',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Borrowing dependency',
 ' Liability to Equity',
 " Net Income to Stockholder's Equity"]

In [17]:
num_feat = [col for col in selected_shap_features if col in numerical_features]
num_trans = Pipeline([('scale', StandardScaler())])
preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
pipe = Pipeline(
    [
        ('preproc', preprocessor),
        ('lr', LogisticRegression())
    ]
)

params_dict = {'lr__penalty': ['l1','l2'], 'lr__C': [0.001,0.01,0.1,1,10,100,1000]}

lr_gscv = GridSearchCV(
    pipe,
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
lr_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(lr_gscv.best_params_)
print(lr_gscv.best_score_)

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

X_train = data[selected_shap_features]
# y_train = data['Bankrupt?'].to_frame()
lr_y_train_pred_prob = lr_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = lr_gscv.predict(X_test[selected_features])
lr_y_test_pred_prob = lr_gscv.predict_proba(X_test)[:, 1]

pickle.dump(lr_gscv, open('lr_gscv.pkl', 'wb'))

lr_test_roc_auc = roc_auc_score(y_test, lr_y_test_pred_prob)
lr_test_roc_auc

Fitting 10 folds for each of 14 candidates, totalling 140 fits
{'lr__C': 0.01, 'lr__penalty': 'l2'}
0.9268905699998715


0.9464187327823691

## SVC

In [21]:
idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

scaler = selected_features_dict['SVC'][1]['scaler']

X_train2 = scaler.transform(X_train[numerical_features])
X_train2 = pd.DataFrame(X_train2, columns=numerical_features)
X_train = pd.concat([X_train2, X_train[categorical_features]], axis=1)

X_valid2 = scaler.transform(X_valid[numerical_features])
X_valid2 = pd.DataFrame(X_valid2, columns=numerical_features)
X_valid = pd.concat([X_valid2, X_valid[categorical_features]], axis=1)

selected_features = selected_features_dict['SVC'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['SVC'][1]['model']
explainer = shap.Explainer(model.predict, X_train)#max_evals = int(2 * X_train.shape[1] + 1), verbose=0
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['SVC'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

ExactExplainer explainer: 5524it [1:23:58,  1.09it/s]                            


[' Liability to Equity',
 ' Borrowing dependency',
 ' Debt ratio %',
 ' Net worth/Assets',
 ' ROA(A) before interest and % after tax',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Persistent EPS in the Last Four Seasons',
 ' Net profit before tax/Paid-in capital',
 " Net Income to Stockholder's Equity",
 ' Liability-Assets Flag']

In [22]:
num_feat = [col for col in selected_shap_features if col in numerical_features]
num_trans = Pipeline([('scale', StandardScaler())])
preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
pipe = Pipeline(
    [
        ('preproc', preprocessor),
        ('svc', SVC(probability=True))
    ]
)

params_dict = {'svc__C': [1,10,100,1000], 'svc__gamma': [1,0.1,0.001,0.0001], 'svc__kernel': ['linear','rbf']}

svc_gscv = GridSearchCV(
    pipe,
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
svc_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(svc_gscv.best_params_)
print(svc_gscv.best_score_)

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

X_train = data[selected_shap_features]
# y_train = data['Bankrupt?'].to_frame()
svc_y_train_pred_prob = svc_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = svc_gscv.predict(X_test[selected_features])
svc_y_test_pred_prob = svc_gscv.predict_proba(X_test)[:, 1]

pickle.dump(svc_gscv, open('svc_gscv.pkl', 'wb'))

svc_test_roc_auc = roc_auc_score(y_test, svc_y_test_pred_prob)
svc_test_roc_auc

Fitting 10 folds for each of 32 candidates, totalling 320 fits
{'svc__C': 1000, 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}
0.8998720616418318


0.9287878787878787

Saving selected_features_dict

In [23]:
pickle.dump(selected_features_dict, open('selected_features_dict.pkl', 'wb'))

# Ensemble

Loading the models and the selected features to avoid running the all cells above.

In [21]:
rf_gscv = pickle.load(open('rf_gscv.pkl', 'rb'))
xgb_gscv = pickle.load(open('xgb_gscv.pkl', 'rb'))
lr_gscv = pickle.load(open('lr_gscv.pkl', 'rb'))
svc_gscv = pickle.load(open('svc_gscv.pkl', 'rb'))
selected_features_dict = pickle.load(open('selected_features_dict.pkl', 'rb'))

for feat in numerical_features:
    test_data[f"feat{numerical_features.index(feat)}"] = test_data[feat] * test_data[' Liability-Assets Flag']

# RandomForest calculations
X_test = test_data[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
y_test = test_data['Bankrupt?'].to_frame()

X_train = data[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
# y_train = data['Bankrupt?'].to_frame()
rf_y_train_pred_prob = rf_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = rf_gscv.predict(X_test[selected_features])
rf_y_test_pred_prob = rf_gscv.predict_proba(X_test)[:, 1]

# XGBoost calculations
X_test = test_data[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]

X_train = data[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]
# y_train = data['Bankrupt?'].to_frame()
xgb_y_train_pred_prob = xgb_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = xgb_gscv.predict(X_test[selected_features])
xgb_y_test_pred_prob = xgb_gscv.predict_proba(X_test)[:, 1]

# LogisticRegression calculations
X_test = test_data[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]

X_train = data[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]
# y_train = data['Bankrupt?'].to_frame()
lr_y_train_pred_prob = lr_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = lr_gscv.predict(X_test[selected_features])
lr_y_test_pred_prob = lr_gscv.predict_proba(X_test)[:, 1]

# SVC calculations
X_test = test_data[selected_features_dict['SVC'][1]['selected_shap_feats']]

X_train = data[selected_features_dict['SVC'][1]['selected_shap_feats']]
# y_train = data['Bankrupt?'].to_frame()
svc_y_train_pred_prob = svc_gscv.predict_proba(X_train)[:, 1]

# y_test_pred = svc_gscv.predict(X_test[selected_features])
svc_y_test_pred_prob = svc_gscv.predict_proba(X_test)[:, 1]


### Average

In [23]:
def get_mean_ensemble_prediction(prob_list):
    prob_array = np.vstack(prob_list).T
    return np.mean(prob_array, axis=1)

prob_list = [rf_y_test_pred_prob, xgb_y_test_pred_prob, lr_y_test_pred_prob, svc_y_test_pred_prob]
avg_ens_y_test_pred_prob = get_mean_ensemble_prediction(prob_list)
avg_ens_test_roc_auc = roc_auc_score(y_test, avg_ens_y_test_pred_prob)
avg_ens_test_roc_auc

0.9542011019283747

### Rank

In [34]:
# from scipy import stats

# ranked = []
# for i in range(len(prob_list)):
#     rank_data = stats.rankdata(prob_list[i])
#     ranked.append(rank_data)

# ranked = np.column_stack(ranked)

# np.mean(ranked, axis=1)

# Repespective AUC of test set: 0.9544765840220386, 0.9488980716253443, 0.9464187327823691, 0.9287878787878787

rank_ens_y_test_pred_prob = (4*rf_y_test_pred_prob + 3*xgb_y_test_pred_prob + 2*lr_y_test_pred_prob + svc_y_test_pred_prob) / (1+2+3+4)
rank_ens_test_roc_auc = roc_auc_score(y_test, rank_ens_y_test_pred_prob)
rank_ens_test_roc_auc

0.9553719008264463

### Optimized

In [37]:
from functools import partial
from scipy.optimize import fmin

class OptimizeAUC:
    def __init__(self):
        self.coef_ = 0

    def _auc(self, coef, X, y):
        X_coef = X * coef
        preds = np.sum(X_coef, axis=1)
        auc_score = roc_auc_score(y, preds)
        return -1*auc_score
    
    def fit(self, X, y):
        loss_partial = partial(self._auc, X=X, y=y)
        initial_coef = np.random.dirichlet(np.ones(X.shape[1]), size=1)
        self.coef_ = fmin(loss_partial, initial_coef, disp=True)

    def predict(self, X):
        X_coef = X * self.coef_
        preds = np.sum(X_coef, axis=1)
        return preds

In [54]:
coef_dict = {}

test_preds_list = []

for feat in numerical_features:
    test_data[f"feat{numerical_features.index(feat)}"] = test_data[feat] * test_data[' Liability-Assets Flag']

X_test_rf = test_data[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
X_test_xgb = test_data[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]
X_test_lr = test_data[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]
X_test_svc = test_data[selected_features_dict['SVC'][1]['selected_shap_feats']]

for idx in range(10):

    X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    for feat in numerical_features:
        X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
        X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

    # RandomForest
    rf_selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']
    X_train_rf = X_train[rf_selected_features]
    X_valid_rf = X_valid[rf_selected_features]

    rfm = RandomForestClassifier(**rf_gscv.best_params_)
    rfm.fit(X_train_rf, y_train)
    rfm_valid_probs = rfm.predict_proba(X_valid_rf)[:, 1]

    rfm_test_probs = rfm.predict_proba(X_test_rf)[:, 1]
    # if idx == 0:
    #     test_preds_dict['RandomForestClassifier'] = [rfm_test_probs]
    # else:
    #     test_preds_dict['RandomForestClassifier'].append(rfm_test_probs)

    # XGBoost
    xgb_selected_features = selected_features_dict['XGBClassifier'][1]['selected_shap_feats']
    X_train_xgb = X_train[xgb_selected_features]
    X_valid_xgb = X_valid[xgb_selected_features]

    xgbm = XGBClassifier(**xgb_gscv.best_params_)
    xgbm.fit(X_train_xgb, y_train)
    xgbm_valid_probs = xgbm.predict_proba(X_valid_xgb)[:, 1]
    xgbm_test_probs = xgbm.predict_proba(X_test_xgb)[:, 1]

    # LogisticRegression
    lr_selected_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
    X_train_lr = X_train[lr_selected_features]
    X_valid_lr = X_valid[lr_selected_features]

    lr_params = {k.replace('lr__', ''): v for k, v in lr_gscv.best_params_.items()}
    selected_shap_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
    num_feat = [col for col in selected_shap_features if col in numerical_features]
    num_trans = Pipeline([('scale', StandardScaler())])
    preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
    lrm = Pipeline(
        [
            ('preproc', preprocessor),
            ('lr', LogisticRegression(**lr_params))
        ]
    )
    lrm.fit(X_train_lr, y_train)
    lrm_valid_probs = lrm.predict_proba(X_valid_lr)[:, 1]
    lrm_test_probs = lrm.predict_proba(X_test_lr)[:, 1]

    # SVC
    svc_selected_features = selected_features_dict['SVC'][1]['selected_shap_feats']
    X_train_svc = X_train[svc_selected_features]
    X_valid_svc = X_valid[svc_selected_features]

    svc_params = {k.replace('svc__', ''): v for k, v in svc_gscv.best_params_.items()}
    selected_shap_features = selected_features_dict['SVC'][1]['selected_shap_feats']
    num_feat = [col for col in selected_shap_features if col in numerical_features]
    num_trans = Pipeline([('scale', StandardScaler())])
    preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
    svcm = Pipeline(
        [
            ('preproc', preprocessor),
            ('svc', SVC(probability=True, **svc_params))
        ]
    )
    svcm.fit(X_train_svc, y_train)
    svcm_valid_probs = svcm.predict_proba(X_valid_svc)[:, 1]
    svcm_test_probs = svcm.predict_proba(X_test_svc)[:, 1]

    fold_preds = np.column_stack([
        rfm_valid_probs,
        xgbm_valid_probs,
        lrm_valid_probs,
        svcm_valid_probs
    ])

    opt = OptimizeAUC()
    opt.fit(fold_preds, y_valid)
    coef_dict[idx] = opt.coef_

    test_preds = np.column_stack([
        rfm_test_probs,
        xgbm_test_probs,
        lrm_test_probs,
        svcm_test_probs
    ])

    test_preds_list.append(opt.predict(test_preds))

Optimization terminated successfully.
         Current function value: -0.932744
         Iterations: 10
         Function evaluations: 59
Optimization terminated successfully.
         Current function value: -0.946128
         Iterations: 10
         Function evaluations: 55
Optimization terminated successfully.
         Current function value: -0.954882
         Iterations: 43
         Function evaluations: 103
Optimization terminated successfully.
         Current function value: -0.934764
         Iterations: 42
         Function evaluations: 107
Optimization terminated successfully.
         Current function value: -0.921296
         Iterations: 34
         Function evaluations: 93
Optimization terminated successfully.
         Current function value: -0.932155
         Iterations: 24
         Function evaluations: 72
Optimization terminated successfully.
         Current function value: -0.937626
         Iterations: 14
         Function evaluations: 60
Optimization terminated s

In [56]:
opt_y_test_pred_prob = np.mean(np.column_stack(test_preds_list), axis=1)
opt_test_roc_auc = roc_auc_score(y_test, opt_y_test_pred_prob)
opt_test_roc_auc

0.9567493112947658

### MLP

In [25]:
mlp_model = MLPClassifier(hidden_layer_sizes=(1, ))

test_meta_feats_array = np.vstack([rf_y_test_pred_prob, xgb_y_test_pred_prob, lr_y_test_pred_prob, svc_y_test_pred_prob]).T
train_meta_feats_array = np.vstack([rf_y_train_pred_prob, xgb_y_train_pred_prob, lr_y_train_pred_prob, svc_y_train_pred_prob]).T
# train_meta_feats_array = np.vstack([rf_y_test_pred_prob, lr_y_test_pred_prob, svc_y_test_pred_prob]).T

y_train = data['Bankrupt?'].to_frame()
# y_train = test_data['Bankrupt?'].to_frame()
mlp_model.fit(train_meta_feats_array, y_train)

y_test = test_data['Bankrupt?'].to_frame()

# test_meta_feats_array = np.vstack([rf_y_test_pred_prob, lr_y_test_pred_prob, svc_y_test_pred_prob]).T
# test_meta_feats_array.shape

mlp_y_test_pred_prob = mlp_model.predict_proba(test_meta_feats_array)[:, 1]
mlp_test_roc_auc = roc_auc_score(y_test, mlp_y_test_pred_prob)
mlp_test_roc_auc

0.5

### Stacking

In [60]:
test_preds_dict = {}

for feat in numerical_features:
    test_data[f"feat{numerical_features.index(feat)}"] = test_data[feat] * test_data[' Liability-Assets Flag']

X_test_rf = test_data[selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']]
X_test_xgb = test_data[selected_features_dict['XGBClassifier'][1]['selected_shap_feats']]
X_test_lr = test_data[selected_features_dict['LogisticRegression'][1]['selected_shap_feats']]
X_test_svc = test_data[selected_features_dict['SVC'][1]['selected_shap_feats']]

X_stack = np.zeros((data.shape[0], 4))

for idx in range(10):

    X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    for feat in numerical_features:
        X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
        X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

    # RandomForest
    rf_selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats']
    X_train_rf = X_train[rf_selected_features]
    X_valid_rf = X_valid[rf_selected_features]

    rfm = RandomForestClassifier(**rf_gscv.best_params_)
    rfm.fit(X_train_rf, y_train)
    rfm_valid_probs = rfm.predict_proba(X_valid_rf)[:, 1]
    rfm_test_probs = rfm.predict_proba(X_test_rf)[:, 1]
    X_stack[skfold_list[idx][1], 0] = rfm_valid_probs
    # if idx == 0:
    #     test_preds_dict['RandomForestClassifier'] = [rfm_test_probs]
    # else:
    #     test_preds_dict['RandomForestClassifier'].append(rfm_test_probs)

    # XGBoost
    xgb_selected_features = selected_features_dict['XGBClassifier'][1]['selected_shap_feats']
    X_train_xgb = X_train[xgb_selected_features]
    X_valid_xgb = X_valid[xgb_selected_features]

    xgbm = XGBClassifier(**xgb_gscv.best_params_)
    xgbm.fit(X_train_xgb, y_train)
    xgbm_valid_probs = xgbm.predict_proba(X_valid_xgb)[:, 1]
    xgbm_test_probs = xgbm.predict_proba(X_test_xgb)[:, 1]
    X_stack[skfold_list[idx][1], 1] = xgbm_valid_probs

    # LogisticRegression
    lr_selected_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
    X_train_lr = X_train[lr_selected_features]
    X_valid_lr = X_valid[lr_selected_features]

    lr_params = {k.replace('lr__', ''): v for k, v in lr_gscv.best_params_.items()}
    selected_shap_features = selected_features_dict['LogisticRegression'][1]['selected_shap_feats']
    num_feat = [col for col in selected_shap_features if col in numerical_features]
    num_trans = Pipeline([('scale', StandardScaler())])
    preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
    lrm = Pipeline(
        [
            ('preproc', preprocessor),
            ('lr', LogisticRegression(**lr_params))
        ]
    )
    lrm.fit(X_train_lr, y_train)
    lrm_valid_probs = lrm.predict_proba(X_valid_lr)[:, 1]
    lrm_test_probs = lrm.predict_proba(X_test_lr)[:, 1]
    X_stack[skfold_list[idx][1], 2] = lrm_valid_probs

    # SVC
    svc_selected_features = selected_features_dict['SVC'][1]['selected_shap_feats']
    X_train_svc = X_train[svc_selected_features]
    X_valid_svc = X_valid[svc_selected_features]

    svc_params = {k.replace('svc__', ''): v for k, v in svc_gscv.best_params_.items()}
    selected_shap_features = selected_features_dict['SVC'][1]['selected_shap_feats']
    num_feat = [col for col in selected_shap_features if col in numerical_features]
    num_trans = Pipeline([('scale', StandardScaler())])
    preprocessor = ColumnTransformer(transformers = [('num', num_trans, num_feat)], remainder='passthrough')
    svcm = Pipeline(
        [
            ('preproc', preprocessor),
            ('svc', SVC(probability=True, **svc_params))
        ]
    )
    svcm.fit(X_train_svc, y_train)
    svcm_valid_probs = svcm.predict_proba(X_valid_svc)[:, 1]
    svcm_test_probs = svcm.predict_proba(X_test_svc)[:, 1]
    X_stack[skfold_list[idx][1], 3] = svcm_valid_probs

    test_preds = np.column_stack([
        rfm_test_probs,
        xgbm_test_probs,
        lrm_test_probs,
        svcm_test_probs
    ])

    test_preds_dict[idx] = test_preds

In [67]:
svc_test_probs_list = []

for idx in range(10):
    temp_svc = SVC(probability=True)

    X_train_stack = X_stack[skfold_list[idx][0], :]
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid_stack = X_stack[skfold_list[idx][1], :]
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    temp_svc.fit(X_train_stack, y_train)
    temp_test_probs = temp_svc.predict_proba(test_preds_dict[idx])[:, 1]
    svc_test_probs_list.append(temp_test_probs)

stack_y_test_pred_prob = np.mean(np.column_stack(svc_test_probs_list), axis=1)
stack_test_roc_auc = roc_auc_score(y_test, stack_y_test_pred_prob)
stack_test_roc_auc

0.6741735537190082

In [71]:
lr_test_probs_list = []

for idx in range(10):
    temp_lr = XGBClassifier()

    X_train_stack = X_stack[skfold_list[idx][0], :]
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid_stack = X_stack[skfold_list[idx][1], :]
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    temp_lr.fit(X_train_stack, y_train)
    temp_test_probs = temp_lr.predict_proba(test_preds_dict[idx])[:, 1]
    lr_test_probs_list.append(temp_test_probs)

stack_y_test_pred_prob = np.mean(np.column_stack(lr_test_probs_list), axis=1)
stack_test_roc_auc = roc_auc_score(y_test, stack_y_test_pred_prob)
stack_test_roc_auc

0.9462809917355371

In [68]:
rf_test_probs_list = []

for idx in range(10):
    temp_rf = RandomForestClassifier()

    X_train_stack = X_stack[skfold_list[idx][0], :]
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid_stack = X_stack[skfold_list[idx][1], :]
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    temp_rf.fit(X_train_stack, y_train)
    temp_test_probs = temp_rf.predict_proba(test_preds_dict[idx])[:, 1]
    rf_test_probs_list.append(temp_test_probs)

stack_y_test_pred_prob = np.mean(np.column_stack(rf_test_probs_list), axis=1)
stack_test_roc_auc = roc_auc_score(y_test, stack_y_test_pred_prob)
stack_test_roc_auc

0.9526859504132231

In [70]:
xgb_test_probs_list = []

for idx in range(10):
    temp_xgb = XGBClassifier()

    X_train_stack = X_stack[skfold_list[idx][0], :]
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid_stack = X_stack[skfold_list[idx][1], :]
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    temp_xgb.fit(X_train_stack, y_train)
    temp_test_probs = temp_xgb.predict_proba(test_preds_dict[idx])[:, 1]
    xgb_test_probs_list.append(temp_test_probs)

stack_y_test_pred_prob = np.mean(np.column_stack(xgb_test_probs_list), axis=1)
stack_test_roc_auc = roc_auc_score(y_test, stack_y_test_pred_prob)
stack_test_roc_auc

0.9462809917355371

In [None]:
# test_main_path = '/kaggle/input/home-credit-credit-risk-model-stability/csv_files/test'

# # base data
# test_df = pd.read_csv(f'{test_main_path}/test_base.csv')
# test_df['date_decision'] = pd.to_datetime(test_df['date_decision'])
# print(f"test_df shape: {test_df.shape}")

# # static data
# test_static_df = pd.DataFrame()
# for file in ['test_static_0_0.csv', 'test_static_0_1.csv', 'test_static_0_2.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_static_df = pd.concat([test_static_df, temp_df], axis=0)

# test_static_df.drop(columns=static_remove_cols_list, inplace=True)

# print(f"test_static_df shape: {test_static_df.shape}")
# test_df = pd.merge(test_df, test_static_df, on='case_id', how='left')
# print(f"test_df shape: {test_df.shape}")

# for col in static_D_feats_list:
#     test_df[col] = pd.to_datetime(test_df[col])
#     test_df[f"{col}_tdiff"] = (test_df['date_decision'] - test_df[col]).dt.days
#     test_df.drop(columns=col, inplace=True)
# print(f"test_df shape: {test_df.shape}")

# del test_static_df

# # external
# test_static_cb_df = pd.read_csv(f'{test_main_path}/test_static_cb_0.csv')
# test_static_cb_df.drop(columns=external_remove_cols_list, inplace=True)

# print(f"test_static_cb_df shape: {test_static_cb_df.shape}")
# test_df = pd.merge(test_df, test_static_cb_df, on='case_id', how='left')
# print(f"test_df shape: {test_df.shape}")

# for col in external_D_feats_list:
#     test_df[col] = pd.to_datetime(test_df[col])
#     test_df[f"{col}_tdiff"] = (test_df['date_decision'] - test_df[col]).dt.days
#     test_df.drop(columns=col, inplace=True)
    
# print(f"test_df shape: {test_df.shape}")

# del test_static_cb_df

# # applprev depth=1
# test_ddate_map = test_df.set_index('case_id')['date_decision'].to_dict()

# test_applprev_df = pd.DataFrame()
# for file in ['test_applprev_1_0.csv', 'test_applprev_1_1.csv', 'test_applprev_1_2.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_applprev_df = pd.concat([test_applprev_df, temp_df], axis=0)
# test_applprev_df['date_decision'] = test_applprev_df['case_id'].map(test_ddate_map)

# test_applprev_df.drop(columns=applprev1_remove_cols_list, inplace=True)

# for col in applprev_D_feats_list:
#     test_applprev_df[col] = pd.to_datetime(test_applprev_df[col])
#     test_applprev_df[f"{col}_tdiff"] = (test_applprev_df['date_decision'] - test_applprev_df[col]).dt.days
#     test_applprev_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_applprev_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_applprev_df[col].dtype == 'object':
#         test_applprev_df[col] = test_applprev_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = applprev1_obj_map_dict[col]
#         test_applprev_df[col] = test_applprev_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_applprev_df[test_applprev_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_applprev_df[test_applprev_df['num_group1']==0][col].median()
#         temp_map_dict = test_applprev_df[test_applprev_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_applprev_df
# gc.collect()

# # other depth=1
# test_other_df = pd.DataFrame()
# for file in ['test_other_1.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_other_df = pd.concat([test_other_df, temp_df], axis=0)
    
# temp_feats_list = [
#     col for col in test_other_df.columns 
#     if col not in ['case_id', 'num_group1']
# ]

# test_other_df.drop(columns=other_remove_cols_list, inplace=True)

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_other_df[col].dtype == 'object':
#         test_other_df[col] = test_other_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict =  other_obj_map_dict[col]
#         test_other_df[col] = test_other_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_other_df[test_other_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_other_df[test_other_df['num_group1']==0][col].median()
#         temp_map_dict = test_other_df[test_other_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_other_df
# gc.collect()

# # tax registry A
# test_tax_registry_a_df = pd.DataFrame()
# for file in ['test_tax_registry_a_1.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_tax_registry_a_df = pd.concat([test_tax_registry_a_df, temp_df], axis=0)
# test_tax_registry_a_df['date_decision'] = test_tax_registry_a_df['case_id'].map(test_ddate_map)
# test_tax_registry_a_df.drop(columns=tax_registry_a_remove_cols_list, inplace=True)

# for col in tax_registry_a_D_feats_list:
#     test_tax_registry_a_df[col] = pd.to_datetime(test_tax_registry_a_df[col])
#     test_tax_registry_a_df[f"{col}_tdiff"] = (test_tax_registry_a_df['date_decision'] - test_tax_registry_a_df[col]).dt.days
#     test_tax_registry_a_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_tax_registry_a_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_tax_registry_a_df[col].dtype == 'object':
#         test_tax_registry_a_df[col] = test_tax_registry_a_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = tax_registry_a_obj_map_dict[col]
#         test_tax_registry_a_df[col] = test_tax_registry_a_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_tax_registry_a_df[test_tax_registry_a_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_tax_registry_a_df[test_tax_registry_a_df['num_group1']==0][col].median()
#         temp_map_dict = test_tax_registry_a_df[test_tax_registry_a_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_tax_registry_a_df
# gc.collect()

# # tax registry B
# test_tax_registry_b_df = pd.DataFrame()
# for file in ['test_tax_registry_b_1.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_tax_registry_b_df = pd.concat([test_tax_registry_b_df, temp_df], axis=0)
# test_tax_registry_b_df['date_decision'] = test_tax_registry_b_df['case_id'].map(test_ddate_map)
# test_tax_registry_b_df.drop(columns=tax_registry_b_remove_cols_list, inplace=True)

# for col in tax_registry_b_D_feats_list:
#     test_tax_registry_b_df[col] = pd.to_datetime(test_tax_registry_b_df[col])
#     test_tax_registry_b_df[f"{col}_tdiff"] = (test_tax_registry_b_df['date_decision'] - test_tax_registry_b_df[col]).dt.days
#     test_tax_registry_b_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_tax_registry_b_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_tax_registry_b_df[col].dtype == 'object':
#         test_tax_registry_b_df[col] = test_tax_registry_b_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = tax_registry_b_obj_map_dict[col]
#         test_tax_registry_b_df[col] = test_tax_registry_b_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_tax_registry_b_df[test_tax_registry_b_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_tax_registry_b_df[test_tax_registry_b_df['num_group1']==0][col].median()
#         temp_map_dict = test_tax_registry_b_df[test_tax_registry_b_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_tax_registry_b_df
# gc.collect()

# # tax registry C
# test_tax_registry_c_df = pd.DataFrame()
# for file in ['test_tax_registry_c_1.csv']:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_tax_registry_c_df = pd.concat([test_tax_registry_c_df, temp_df], axis=0)
# test_tax_registry_c_df['date_decision'] = test_tax_registry_c_df['case_id'].map(test_ddate_map)
# test_tax_registry_c_df.drop(columns=tax_registry_c_remove_cols_list, inplace=True)

# for col in tax_registry_c_D_feats_list:
#     test_tax_registry_c_df[col] = pd.to_datetime(test_tax_registry_c_df[col])
#     test_tax_registry_c_df[f"{col}_tdiff"] = (test_tax_registry_c_df['date_decision'] - test_tax_registry_c_df[col]).dt.days
#     test_tax_registry_c_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_tax_registry_c_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_tax_registry_c_df[col].dtype == 'object':
#         test_tax_registry_c_df[col] = test_tax_registry_c_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = tax_registry_c_obj_map_dict[col]
#         test_tax_registry_c_df[col] = test_tax_registry_c_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_tax_registry_c_df[test_tax_registry_c_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_tax_registry_c_df[test_tax_registry_c_df['num_group1']==0][col].median()
#         temp_map_dict = test_tax_registry_c_df[test_tax_registry_c_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_tax_registry_c_df
# gc.collect()

# # credit bureau A
# test_credit_bureau_a_df = pd.DataFrame()
# for file in [
#     'test_credit_bureau_a_1_0.csv', 'test_credit_bureau_a_1_1.csv', 
#     'test_credit_bureau_a_1_2.csv', 'test_credit_bureau_a_1_3.csv',
#     'test_credit_bureau_a_1_4.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_credit_bureau_a_df = pd.concat([test_credit_bureau_a_df, temp_df], axis=0)
# test_credit_bureau_a_df['date_decision'] = test_credit_bureau_a_df['case_id'].map(test_ddate_map)
# test_credit_bureau_a_df.drop(columns=credit_bureau_a_remove_cols_list, inplace=True)

# for col in credit_bureau_a_D_feats_list:
#     test_credit_bureau_a_df[col] = pd.to_datetime(test_credit_bureau_a_df[col])
#     test_credit_bureau_a_df[f"{col}_tdiff"] = (test_credit_bureau_a_df['date_decision'] - test_credit_bureau_a_df[col]).dt.days
#     test_credit_bureau_a_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_credit_bureau_a_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_credit_bureau_a_df[col].dtype == 'object':
#         test_credit_bureau_a_df[col] = test_credit_bureau_a_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = credit_bureau_a_obj_map_dict[col]
#         test_credit_bureau_a_df[col] = test_credit_bureau_a_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_credit_bureau_a_df[test_credit_bureau_a_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_credit_bureau_a_df[test_credit_bureau_a_df['num_group1']==0][col].median()
#         temp_map_dict = test_credit_bureau_a_df[test_credit_bureau_a_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_credit_bureau_a_df
# gc.collect()

# # credit bureau B
# test_credit_bureau_b_df = pd.DataFrame()
# for file in [
#     'test_credit_bureau_b_1.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_credit_bureau_b_df = pd.concat([test_credit_bureau_b_df, temp_df], axis=0)
# test_credit_bureau_b_df['date_decision'] = test_credit_bureau_b_df['case_id'].map(test_ddate_map)
# test_credit_bureau_b_df.drop(columns=credit_bureau_b_remove_cols_list, inplace=True)

# for col in credit_bureau_b_D_feats_list:
#     test_credit_bureau_b_df[col] = pd.to_datetime(test_credit_bureau_b_df[col])
#     test_credit_bureau_b_df[f"{col}_tdiff"] = (test_credit_bureau_b_df['date_decision'] - test_credit_bureau_b_df[col]).dt.days
#     test_credit_bureau_b_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_credit_bureau_b_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_credit_bureau_b_df[col].dtype == 'object':
#         test_credit_bureau_b_df[col] = test_credit_bureau_b_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = credit_bureau_b_obj_map_dict[col]
#         test_credit_bureau_b_df[col] = test_credit_bureau_b_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_credit_bureau_b_df[test_credit_bureau_b_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_credit_bureau_b_df[test_credit_bureau_b_df['num_group1']==0][col].median()
#         temp_map_dict = test_credit_bureau_b_df[test_credit_bureau_b_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_credit_bureau_b_df
# gc.collect()

# # deposit depth=1
# test_deposit_df = pd.DataFrame()
# for file in [
#     'test_deposit_1.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_deposit_df = pd.concat([test_deposit_df, temp_df], axis=0)
# test_deposit_df['date_decision'] = test_deposit_df['case_id'].map(test_ddate_map)
# test_deposit_df.drop(columns=deposit_remove_cols_list, inplace=True)

# for col in deposit_D_feats_list:
#     test_deposit_df[col] = pd.to_datetime(test_deposit_df[col])
#     test_deposit_df[f"{col}_tdiff"] = (test_deposit_df['date_decision'] - test_deposit_df[col]).dt.days
#     test_deposit_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_deposit_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_deposit_df[col].dtype == 'object':
#         test_deposit_df[col] = test_deposit_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = deposit_obj_map_dict[col]
#         test_deposit_df[col] = test_deposit_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_deposit_df[test_deposit_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_deposit_df[test_deposit_df['num_group1']==0][col].median()
#         temp_map_dict = test_deposit_df[test_deposit_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_deposit_df
# gc.collect()

# # person depth=1
# test_person_df = pd.DataFrame()
# for file in [
#     'test_person_1.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_person_df = pd.concat([test_person_df, temp_df], axis=0)
# test_person_df['date_decision'] = test_person_df['case_id'].map(test_ddate_map)
# test_person_df.drop(columns=person_remove_cols_list, inplace=True)

# for col in person_D_feats_list:
#     test_person_df[col] = pd.to_datetime(test_person_df[col])
#     test_person_df[f"{col}_tdiff"] = (test_person_df['date_decision'] - test_person_df[col]).dt.days
#     test_person_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_person_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_person_df[col].dtype == 'object':
#         test_person_df[col] = test_person_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = person_obj_map_dict[col]
#         test_person_df[col] = test_person_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_person_df[test_person_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_person_df[test_person_df['num_group1']==0][col].median()
#         temp_map_dict = test_person_df[test_person_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_person_df
# gc.collect()

# # debitcard depth=1
# test_debitcard_df = pd.DataFrame()
# for file in [
#     'test_debitcard_1.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_debitcard_df = pd.concat([test_debitcard_df, temp_df], axis=0)
# test_debitcard_df['date_decision'] = test_debitcard_df['case_id'].map(test_ddate_map)
# test_debitcard_df.drop(columns=debitcard_remove_cols_list, inplace=True)

# for col in debitcard_D_feats_list:
#     test_debitcard_df[col] = pd.to_datetime(test_debitcard_df[col])
#     test_debitcard_df[f"{col}_tdiff"] = (test_debitcard_df['date_decision'] - test_debitcard_df[col]).dt.days
#     test_debitcard_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_debitcard_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_debitcard_df[col].dtype == 'object':
#         test_debitcard_df[col] = test_debitcard_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = debitcard_obj_map_dict[col]
#         test_debitcard_df[col] = test_debitcard_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_debitcard_df[test_debitcard_df['num_group1']==0].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_debitcard_df[test_debitcard_df['num_group1']==0][col].median()
#         temp_map_dict = test_debitcard_df[test_debitcard_df['num_group1']==0].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_debitcard_df
# gc.collect()

# # applprev depth=2
# test_applprev2_df = pd.DataFrame()
# for file in [
#     'test_applprev_2.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_applprev2_df = pd.concat([test_applprev2_df, temp_df], axis=0)
# test_applprev2_df['date_decision'] = test_applprev2_df['case_id'].map(test_ddate_map)
# test_applprev2_df.drop(columns=applprev_2_remove_cols_list, inplace=True)

# for col in applprev_2_D_feats_list:
#     test_applprev2_df[col] = pd.to_datetime(test_applprev2_df[col])
#     test_applprev2_df[f"{col}_tdiff"] = (test_applprev2_df['date_decision'] - test_applprev2_df[col]).dt.days
#     test_applprev2_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_applprev2_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_applprev2_df[col].dtype == 'object':
#         test_applprev2_df[col] = test_applprev2_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = applprev_2_obj_map_dict[col]
#         test_applprev2_df[col] = test_applprev2_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_applprev2_df[(test_applprev2_df['num_group1']==0) & (test_applprev2_df['num_group2']==0)].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_applprev2_df[(test_applprev2_df['num_group1']==0) & (test_applprev2_df['num_group2']==0)][col].median()
#         temp_map_dict = test_applprev2_df[(test_applprev2_df['num_group1']==0) & (test_applprev2_df['num_group2']==0)].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_applprev2_df
# gc.collect()

# # person depth=2
# test_person2_df = pd.DataFrame()
# for file in [
#     'test_person_2.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_person2_df = pd.concat([test_person2_df, temp_df], axis=0)
# test_person2_df['date_decision'] = test_person2_df['case_id'].map(test_ddate_map)
# test_person2_df.drop(columns=person2_remove_cols_list, inplace=True)

# for col in person2_D_feats_list:
#     test_person2_df[col] = pd.to_datetime(test_person2_df[col])
#     test_person2_df[f"{col}_tdiff"] = (test_person2_df['date_decision'] - test_person2_df[col]).dt.days
#     test_person2_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_person2_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_person2_df[col].dtype == 'object':
#         test_person2_df[col] = test_person2_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = person2_obj_map_dict[col]
#         test_person2_df[col] = test_person2_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_person2_df[(test_person2_df['num_group1']==0) & (test_person2_df['num_group2']==0)].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_person2_df[(test_person2_df['num_group1']==0) & (test_person2_df['num_group2']==0)][col].median()
#         temp_map_dict = test_person2_df[(test_person2_df['num_group1']==0) & (test_person2_df['num_group2']==0)].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_person2_df
# gc.collect()

# # credit bureau A depth=2
# test_credit_bureau_a2_df = pd.DataFrame()
# for file in [
#     'test_credit_bureau_a_2_0.csv',
#     'test_credit_bureau_a_2_1.csv',
#     'test_credit_bureau_a_2_2.csv',
#     'test_credit_bureau_a_2_3.csv',
#     'test_credit_bureau_a_2_4.csv',
#     'test_credit_bureau_a_2_5.csv',
#     'test_credit_bureau_a_2_6.csv',
#     'test_credit_bureau_a_2_7.csv',
#     'test_credit_bureau_a_2_8.csv',
#     'test_credit_bureau_a_2_9.csv',
#     'test_credit_bureau_a_2_10.csv',
#     'test_credit_bureau_a_2_11.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_credit_bureau_a2_df = pd.concat([test_credit_bureau_a2_df, temp_df], axis=0)
# test_credit_bureau_a2_df['date_decision'] = test_credit_bureau_a2_df['case_id'].map(test_ddate_map)
# test_credit_bureau_a2_df.drop(columns=credit_bureau_a2_remove_cols_list, inplace=True)

# for col in credit_bureau_a2_D_feats_list:
#     test_credit_bureau_a2_df[col] = pd.to_datetime(test_credit_bureau_a2_df[col])
#     test_credit_bureau_a2_df[f"{col}_tdiff"] = (test_credit_bureau_a2_df['date_decision'] - test_credit_bureau_a2_df[col]).dt.days
#     test_credit_bureau_a2_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_credit_bureau_a2_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_credit_bureau_a2_df[col].dtype == 'object':
#         test_credit_bureau_a2_df[col] = test_credit_bureau_a2_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = credit_bureau_a2_obj_map_dict[col]
#         test_credit_bureau_a2_df[col] = test_credit_bureau_a2_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_credit_bureau_a2_df[(test_credit_bureau_a2_df['num_group1']==0) & (test_credit_bureau_a2_df['num_group2']==0)].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_credit_bureau_a2_df[(test_credit_bureau_a2_df['num_group1']==0) & (test_credit_bureau_a2_df['num_group2']==0)][col].median()
#         temp_map_dict = test_credit_bureau_a2_df[(test_credit_bureau_a2_df['num_group1']==0) & (test_credit_bureau_a2_df['num_group2']==0)].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_credit_bureau_a2_df
# gc.collect()

# # credit bureau B depth=2
# test_credit_bureau_b2_df = pd.DataFrame()
# for file in [
#     'test_credit_bureau_b_2.csv'
# ]:
#     temp_df = pd.read_csv(f'{test_main_path}/{file}')
#     test_credit_bureau_b2_df = pd.concat([test_credit_bureau_b2_df, temp_df], axis=0)
# test_credit_bureau_b2_df['date_decision'] = test_credit_bureau_b2_df['case_id'].map(test_ddate_map)
# test_credit_bureau_b2_df.drop(columns=credit_bureau_b2_remove_cols_list, inplace=True)

# for col in credit_bureau_b2_D_feats_list:
#     test_credit_bureau_b2_df[col] = pd.to_datetime(test_credit_bureau_b2_df[col])
#     test_credit_bureau_b2_df[f"{col}_tdiff"] = (test_credit_bureau_b2_df['date_decision'] - test_credit_bureau_b2_df[col]).dt.days
#     test_credit_bureau_b2_df.drop(columns=col, inplace=True)
    
# temp_feats_list = [
#     col for col in test_credit_bureau_b2_df.columns 
#     if col not in ['case_id', 'num_group1', 'date_decision']
# ]

# print(f"test_df shape: {test_df.shape}")

# for col in tqdm(temp_feats_list):
#     if test_credit_bureau_b2_df[col].dtype == 'object':
#         test_credit_bureau_b2_df[col] = test_credit_bureau_b2_df[col].fillna('Unknown') #, inplace=True
#         temp_map_dict = credit_bureau_b2_obj_map_dict[col]
#         test_credit_bureau_b2_df[col] = test_credit_bureau_b2_df[col].apply(lambda x: temp_map_dict.get(x, len(temp_map_dict)))
#         temp_map_dict2 = test_credit_bureau_b2_df[(test_credit_bureau_b2_df['num_group1']==0) & (test_credit_bureau_b2_df['num_group2']==0)].\
#             groupby('case_id')[col].agg(pd.Series.mode).to_dict()
#         test_df[col] = test_df['case_id'].map(temp_map_dict2)
#     else:
#         temp_median = test_credit_bureau_b2_df[(test_credit_bureau_b2_df['num_group1']==0) & (test_credit_bureau_b2_df['num_group2']==0)][col].median()
#         temp_map_dict = test_credit_bureau_b2_df[(test_credit_bureau_b2_df['num_group1']==0) & (test_credit_bureau_b2_df['num_group2']==0)].fillna(temp_median).\
#             groupby('case_id')[col].\
#             agg(['mean', 'median', 'std', 'max', 'min', 'skew']).to_dict()
#         for k in temp_map_dict:
#             test_df[f"{col}_{k}"] = test_df['case_id'].map(temp_map_dict[k])
            
# print(f"test_df shape: {test_df.shape}")

# del test_credit_bureau_b2_df
# gc.collect()

In [None]:
# def convert_dpd2bins(x):
#     if x >= 0 and x <= 30:
#         return 0
#     elif x >= 31 and x <= 60:
#         return 1
#     elif x >= 61 and x <= 90:
#         return 2
#     elif x >= 91:
#         return 3
#     elif pd.isna(x):
#         return 4

# A_feats_list = [col for col in train_df.columns if col.endswith('A')]
# P_feats_list = [col for col in train_df.columns if col.endswith('P')]
# L_feats_list = [col for col in train_df.columns if col.endswith('L')]
# D_feats_list = [col for col in train_df.columns if col.endswith('D')]
# M_feats_list = [col for col in train_df.columns if col.endswith('M')]
# T_feats_list = [col for col in train_df.columns if col.endswith('T')]

# feats_map_dict = {}

# exceptional_columns = ['date_decision', 'MONTH', 'WEEK_NUM', 'target']
# feats_list = [col for col in train_df.columns if col not in exceptional_columns and col not in D_feats_list]

# drop_cols_list = []

# for col in feats_list:

#     if train_df[col].isnull().sum() / train_df.shape[0] >= 0.9:
#         drop_cols_list.append(col)
#         continue

#     if train_df[col].dtype == 'object':
#         train_df[col].fillna('Unknown', inplace=True)
#         temp_unique_list = list(train_df[col].unique())
#         temp_map_dict = {temp_unique_list[i]: i for i in range(len(temp_unique_list))}
#         train_df[col] = train_df[col].map(temp_map_dict)
#         feats_map_dict[col] = temp_map_dict
#     else:
#         train_df[col].fillna(train_df[col].median(), inplace=True)

#     if col.endswith('P'):
#         train_df[f'{col}_bin'] = train_df[col].apply(convert_dpd2bins)

# # for col in P_feats_list:
# #     train_df[col].fillna(train_df[col].median(), inplace=True)
# #     train_df[f'{col}_bin'] = train_df[col].apply(convert_dpd2bins)

# # for col in L_feats_list:
# #     train_df[col].fillna(train_df[col].median(), inplace=True)

# datefirstoffer_1144D: Date of first customer relationship management (CRM) offer. dtype=object missing=560713 (55.86%)
# datelastinstal40dpd_247D: Date of last instalment that was more than 40 days past due (DPD). dtype=object missing=932264 (92.88%)
# datelastunpaid_3546854D: Date of the last unpaid instalment. dtype=object missing=606750 (60.45%)
# dtlastpmtallstes_4499206D: Date of last payment made by the applicant. dtype=object missing=758904 (75.61%)
# firstclxcampaign_1125D: Date of the client's first campaign. dtype=object missing=571655 (56.95%)
# firstdatedue_489D: Date of the first due date. dtype=object missing=339687 (33.84%)
# lastactivateddate_801D: Contract activation date for previous applications. dtype=object missing=320842 (31.96%)
# lastapplicationdate_877D: Date of previous customer's application. dtype=object missing=220760 (21.99%)
# lastapprdate_640D: Date of approval on client's most recent previous application. dtype=object missing=313123 (31.20%)
# lastdelinqdate_224D: Date of the last delinquency occurrence. dtype=object missing=668680 (66.62%)
# lastrejectdate_50D: Date of most recent rejected application by the applicant. dtype=object missing=530899 (52.89%)
# lastrepayingdate_696D: Date of the last payment made by the applicant. dtype=object missing=1002152 (99.84%)
# maxdpdinstldate_3546855D: Date of instalment on which client was most days past due. dtype=object missing=567808 (56.57%)
# payvacationpostpone_4187118D: Date of last payment holiday instalment. dtype=object missing=1002290 (99.85%)
# validfrom_1069D: Date since the client has an active campaign. dtype=object missing=884352 (88.10%)

# assignmentdate_238D: Tax authority data - date of assignment. dtype=object missing=1363480 (90.87%)
# assignmentdate_4527235D: Tax authority data - Date of assignment. dtype=object missing=1385498 (92.34%)
# assignmentdate_4955616D: Tax authority assignment date. dtype=object missing=1428843 (95.23%)
# birthdate_574D: Client's date of birth (credit bureau data). dtype=object missing=892605 (59.49%)
# dateofbirth_337D: Client's date of birth. dtype=object missing=114785 (7.65%)
# dateofbirth_342D: Client's date of birth. dtype=object missing=1463976 (97.57%)
# responsedate_1012D: Tax authority's response date. dtype=object missing=780476 (52.02%)
# responsedate_4527233D: Tax authority's response date. dtype=object missing=840149 (55.99%)
# responsedate_4917613D: Tax authority's response date. dtype=object missing=1275564 (85.01%)

# approvaldate_319D: Approval Date of Previous Application dtype=object missing=1766021 (45.43%)
# creationdate_885D: Date when previous application was created. dtype=object missing=35 (0.00%)
# dateactivated_425D: Contract activation date of the applicant's previous application. dtype=object missing=1844702 (47.45%)
# dtlastpmt_581D: Date of last payment made by the applicant. dtype=object missing=2860375 (73.58%)
# dtlastpmtallstes_3545839D: Date of the applicant's last payment. dtype=object missing=2434155 (62.61%)
# employedfrom_700D: Employment start date from the previous application. dtype=object missing=2180869 (56.10%)
# firstnonzeroinstldate_307D: Date of first instalment in the previous application. dtype=object missing=365175 (9.39%)

# for3years_504L: Client's credit history data over the last three years. dtype=float64 missing=1463962 (97.57%)
# formonth_535L: Credit history for the last month. dtype=float64 missing=1463962 (97.57%)
# forquarter_634L: Credit history for the last quarter. dtype=float64 missing=1463962 (97.57%)
# fortoday_1092L: Client's credit history for today. dtype=float64 missing=1463962 (97.57%)
# forweek_528L: Credit history for the last week. dtype=float64 missing=1463962 (97.57%)
# foryear_850L: Credit history for the last year. dtype=float64 missing=1463962 (97.57%)

# recorddate_4527225D: Date of tax deduction record. dtype=object missing=0 (0.00%)

# deductiondate_4917603D: Tax deduction date. dtype=object missing=0 (0.00%)

# processingdate_168D: Date when the tax deduction is processed. dtype=object missing=0 (0.00%)

# dateofcredend_289D: End date of an active credit contract. dtype=object missing=3449815 (83.97%)
# dateofcredend_353D: End date of a closed credit contract. dtype=object missing=3651494 (88.88%)
# dateofcredstart_181D: Date when the credit contract was closed. dtype=object missing=3651491 (88.88%)
# dateofcredstart_739D: Start date of a closed credit contract. dtype=object missing=3449815 (83.97%)
# dateofrealrepmt_138D: Date of credit's closure (contract termination date). dtype=object missing=3654366 (88.95%)
# numberofoverdueinstlmaxdat_148D: Date of maximum number of past due instalments for the closed contract. dtype=object missing=1701590 (81.83%)
# numberofoverdueinstlmaxdat_641D: Date of maximum number of past due instalments for the active contract. dtype=object missing=1990461 (95.73%)
# overdueamountmax2date_1002D: Date of maximal past due amount for a closed contract dtype=object missing=1705299 (82.01%)
# overdueamountmax2date_1142D: Date of maximal past due amount for an active contract. dtype=object missing=1989828 (95.70%)
# refreshdate_3813885D: Date when the credit bureau's public sources have been last updated. dtype=object missing=666459 (32.05%)

# contractdate_551D: Contract date of the active contract dtype=object missing=3892 (4.54%)
# contractmaturitydate_151D: End date of active contract. dtype=object missing=4079 (4.75%)
# lastupdate_260D: Last update date for the active contracts. dtype=object missing=3892 (4.54%)

# contractenddate_991D: End date of deposit contract. dtype=object missing=79682 (54.92%)
# openingdate_313D: Deposit account opening date. dtype=object missing=0 (0.00%)

# birth_259D: Date of birth of the person. dtype=object missing=1447332 (48.67%)
# birthdate_87D: Birth date of the person. dtype=object missing=2949075 (99.16%)
# empl_employedfrom_271D: Start date of employment. dtype=object missing=2407290 (80.94%)

# openingdate_857D: Debit card opening date. dtype=object missing=12711 (8.08%)

# empls_employedfrom_796D: Start of employment (num_group1 - person, num_group2 - employment). dtype=object missing=1637653 (99.65%)

# pmts_date_1107D: Payment date for an active contract according to credit bureau (num_group1 - contract, num_group2 - payment). dtype=object missing=0 (0.00%)