# Importing some packages

In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.feature_selection import RFE
from sklearn.feature_selection import r_regression, SelectKBest
from sklearn.feature_selection import mutual_info_regression, mutual_info_classif
from sklearn.feature_selection import f_classif, chi2
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, confusion_matrix

from xgboost import XGBClassifier

from lightgbm import LGBMClassifier

from scipy import stats

from statsmodels.stats.outliers_influence import variance_inflation_factor

from scipy.special import softmax

from boruta import BorutaPy

# from BorutaShap import BorutaShap

from collections import Counter

import shap

import os
from pathlib import Path

from bisect import bisect

import re

import gc

from tqdm.autonotebook import tqdm

import warnings 
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Loading data

In [2]:
data = pd.read_csv('train_data.csv')

test_data = pd.read_csv('test_data.csv')

# Getting features' names

In [3]:
nominal_features = [' Liability-Assets Flag'] # Net Income Flag is removed

numerical_features = [col for col in data.columns if col not in nominal_features and col!='Bankrupt?']

# Creating folds

In [4]:
skfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
skfold_list = []
for train_idxs, valid_idxs in skfold.split(data, y=data['Bankrupt?']):
    skfold_list.append((train_idxs, valid_idxs))

# Shap value importance function

In [5]:
def get_feature_importances_shap_values(shap_values, features, topk=10):
    '''
    Prints the feature importances based on SHAP values in an ordered way
    shap_values -> The SHAP values calculated from a shap.Explainer object
    features -> The name of the features, on the order presented to the explainer
    '''
    # Calculates the feature importance (mean absolute shap value) for each feature
    importances = []
    for i in range(shap_values.values.shape[1]):
        importances.append(np.mean(np.abs(shap_values.values[:, i])))
    # Calculates the normalized version
    importances_norm = softmax(importances)
    # Organize the importances and columns in a dictionary
    feature_importances = {fea: imp for imp, fea in zip(importances, features)}
    feature_importances_norm = {fea: imp for imp, fea in zip(importances_norm, features)}
    # Sorts the dictionary
    feature_importances = {k: v for k, v in sorted(feature_importances.items(), key=lambda item: item[1], reverse = True)}
    feature_importances_norm= {k: v for k, v in sorted(feature_importances_norm.items(), key=lambda item: item[1], reverse = True)}
    # Prints the feature importances
    selected_topk_feats = []
    
    for idx, (k, v) in enumerate(feature_importances.items()):
        # print(f"{k} -> {v:.4f} (softmax = {feature_importances_norm[k]:.4f})")
        if idx <=topk:
            selected_topk_feats.append(k)

    return selected_topk_feats

# Function for feature selection

In [6]:
class FSelector():

    def __init__(self, X, y, num_feats, ordinal_feats, nominal_feats, model, is_target_cat=True, select_n_feats=15):

        self.X = X
        self.y = y
        self.num_feats = num_feats
        self.ordinal_feats = ordinal_feats
        self.nominal_feats = nominal_feats
        self.model = model
        self.is_target_cat = is_target_cat
        self.select_n_feats = select_n_feats

    def calculate_vif(self, X):
    
        vif = pd.DataFrame()
        vif["features"] = X.columns
        vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]

        return vif

    def select_feats_via_vif(self):

        num_features = self.num_feats.copy()

        vif_df = self.calculate_vif(self.X[num_features])

        while vif_df[vif_df['VIF']>=10].shape[0] != 0:
            vif_df.sort_values('VIF', ascending=False, inplace=True)
            vif_df.reset_index(drop=True, inplace=True)
            # print(vif_df)
            elimination_candidate = vif_df.iloc[0]['features']
            # print(elimination_candidate)
            num_features = [i for i in num_features if i!=elimination_candidate]
            new_X = self.X[num_features]
            vif_df = self.calculate_vif(new_X)

        return list(vif_df['features'].values)
    
    def get_spearmanr(self, X, y):
        # return np.array([stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])])
        spearman_values = [stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        temp_sp_df = pd.DataFrame({'spearman': spearman_values, 'feats': list(X.columns)})
        temp_sp_df['abs_spearman'] = np.abs(temp_sp_df['spearman'])
        temp_sp_df.sort_values('abs_spearman', ascending=False, inplace=True)
        temp_sp_df.reset_index(drop=True, inplace=True)
        return temp_sp_df.iloc[:15]['feats'].to_list()
    
    def get_kendalltau(self, X, y):
        # return [stats.kendalltau(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        kendall_values = [stats.spearmanr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
        temp_ken_df = pd.DataFrame({'kendall': kendall_values, 'feats': list(X.columns)})
        temp_ken_df['abs_kendall'] = np.abs(temp_ken_df['kendall'])
        temp_ken_df.sort_values('abs_kendall', ascending=False, inplace=True)
        temp_ken_df.reset_index(drop=True, inplace=True)
        return temp_ken_df.iloc[:15]['feats'].to_list()
    
    def get_pointbiserialr(self, X, y):
        return [stats.pointbiserialr(X.values[:, f], y.values).correlation for f in range(X.shape[1])]
    
    def get_boruto_feats(self):
        feat_selector = BorutaPy(self.model, n_estimators='auto', verbose=2, random_state=1)
        feat_selector.fit(np.array(self.X), np.array(self.y))
        boruta_selected_features = list(self.X.iloc[:, feat_selector.support_].columns)
        return boruta_selected_features
    
    def get_kbest(self, X, feats_list, metric):
        selector = SelectKBest(metric, k=self.select_n_feats)
        selector.fit_transform(X[feats_list], self.y)
        selected_feats_idxs_list = list(selector.get_support(indices=True))
        column_names = [feats_list[i] for i in selected_feats_idxs_list]
        return column_names
    
    def get_rfe_feats(self):
        model_rfe = RFE(self.model, n_features_to_select=self.select_n_feats)
        model_rfe.fit(self.X, self.y)
        model_rfe_feats = list(self.X.iloc[:, list(model_rfe.support_)].columns)
        return model_rfe_feats
    
    def get_shap_feats(self, feats_list, topk=10):
        model = self.model
        X = self.X[feats_list]
        model.fit(self.X, self.y)
        explainer = shap.Explainer(model.predict, X, max_evals = int(2 * X.shape[1] + 1), verbose=0)
        shap_values = explainer(X)
        selected_shap_features = get_feature_importances_shap_values(
            shap_values, features=list(X.columns), topk=topk
        )
        return selected_shap_features
    
    def get_votes(self):

        if self.num_feats is not None:

            if self.is_target_cat:

                temp_n_feats =  self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                # self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                self.num_f_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=f_classif)
                self.num_mi_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=mutual_info_classif)

                self.select_n_feats = temp_n_feats

                self.selected_num_feats = []
                # self.selected_num_feats.extend(self.num_kendalltau_feats)
                self.selected_num_feats.extend(self.num_f_feats)
                self.selected_num_feats.extend(self.num_mi_feats)

            else:

                self.vif_feats = self.select_feats_via_vif()

                temp_n_feats =  self.select_n_feats
                if len(self.num_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.pearson_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=r_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats
                # self.num_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.num_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.num_feats, metric=stats.kendalltau, k=self.select_n_feats)
                self.num_spearmanr_feats = self.get_spearmanr(self.X[self.num_feats], self.y)
                self.num_kendalltau_feats = self.get_kendalltau(self.X[self.num_feats], self.y)
                # self.num_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)
                # self.num_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.num_feats], self.y)

                self.selected_num_feats = []
                self.selected_num_feats.extend(self.pearson_feats)
                self.selected_num_feats.extend(self.num_spearmanr_feats)
                self.selected_num_feats.extend(self.num_kendalltau_feats)
                # self.selected_num_feats = list(set(self.selected_num_feats))

        else:

            self.selected_num_feats = []

        if self.ordinal_feats is not None:

            if self.is_target_cat:

                temp_n_feats =  self.select_n_feats
                if len(self.ordinal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.ordinal_mi_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=mutual_info_classif)
                self.ordinal_chi2_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=chi2)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_mi_feats)
                self.selected_ordinal_feats.extend(self.ordinal_chi2_feats)

                self.select_n_feats = temp_n_feats

            else:

                self.ordinal_spearmanr_feats = self.get_spearmanr(self.X[self.ordinal_feats], self.y)
                self.ordinal_kendalltau_feats = self.get_kendalltau(self.X[self.ordinal_feats], self.y)

                # self.ordinal_spearmanr_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.spearmanr, k=self.select_n_feats)
                # self.ordinal_kendalltau_feats = self.get_kbest(X=self.X, feats_list=self.ordinal_feats, metric=stats.kendalltau, k=self.select_n_feats)

                # self.ordinal_spearmanr_feats = SelectKBest(self.get_spearmanr, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)
                # self.ordinal_kendalltau_feats = SelectKBest(self.get_kendalltau, k=self.select_n_feats).fit_transform(self.X[self.ordinal_feats], self.y)

                self.selected_ordinal_feats = []
                self.selected_ordinal_feats.extend(self.ordinal_spearmanr_feats)
                self.selected_ordinal_feats.extend(self.ordinal_kendalltau_feats)
                # self.selected_ordinal_feats = list(set(self.selected_ordinal_feats))
                
        else:
            self.selected_ordinal_feats = []

        if self.nominal_feats is not None:

            if self.is_target_cat:
                
                temp_n_feats =  self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.nominal_mi_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=mutual_info_classif)
                self.nominal_chi2_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=chi2)

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.nominal_mi_feats)
                self.selected_nominal_feats.extend(self.nominal_chi2_feats)
                
                self.select_n_feats = temp_n_feats

            else:

                temp_n_feats =  self.select_n_feats
                if len(self.nominal_feats) < self.select_n_feats:
                    self.select_n_feats = 'all'

                self.f_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=f_classif, k=self.select_n_feats)
                self.mi_feats = self.get_kbest(X=self.X, feats_list=self.nominal_feats, metric=mutual_info_regression, k=self.select_n_feats)

                self.select_n_feats = temp_n_feats

                # # self.f_feats = f_classif(self.X[self.nominal_feats], self.y)[0]
                # self.f_feats = SelectKBest(f_classif, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns
                
                # # self.mi_feats = mutual_info_regression(self.X[self.nominal_feats], self.y)
                # self.mi_feats = SelectKBest(mutual_info_regression, k=self.select_n_feats).fit_transform(self.X[self.nominal_feats], self.y).columns

                self.selected_nominal_feats = []
                self.selected_nominal_feats.extend(self.f_feats)
                self.selected_nominal_feats.extend(self.mi_feats)
                # self.selected_nominal_feats = list(set(self.selected_nominal_feats))

        else:

            self.selected_nominal_feats = []

        if self.model is not None:
            # np.int = np.int32
            # np.float = np.float64
            # np.bool = np.bool_
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.boruto_feats =  self.get_boruto_feats()
            if not isinstance(self.model, SVC):
                self.rfe_feats = self.get_rfe_feats()
        else:
            self.boruto_feats = []
            self.rfe_feats = []

            
        if len(self.selected_num_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_num_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_num_feats.extend(self.rfe_feats)
            num_feats_dict = dict(Counter(self.selected_num_feats))
            self.selected_num_feats = [i for i in num_feats_dict if num_feats_dict[i] >= 2]


        if len(self.selected_ordinal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_ordinal_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_ordinal_feats.extend(self.rfe_feats)
            ordinal_feats_dict = dict(Counter(self.selected_ordinal_feats))
            self.selected_ordinal_feats = [i for i in ordinal_feats_dict if ordinal_feats_dict[i] >= 2]

        if len(self.selected_nominal_feats) != 0:
            if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
                self.selected_nominal_feats.extend(self.boruto_feats)
            if not isinstance(self.model, SVC):
                self.selected_nominal_feats.extend(self.rfe_feats)
            nominal_feats_dict = dict(Counter(self.selected_nominal_feats))
            self.selected_nominal_feats = [i for i in nominal_feats_dict if nominal_feats_dict[i] >= 2]

        self.selected_feats = []
        self.selected_feats.extend(self.selected_num_feats)
        self.selected_feats.extend(self.selected_ordinal_feats)
        self.selected_feats.extend(self.selected_nominal_feats)
        if isinstance(self.model, RandomForestClassifier) or isinstance(self.model, XGBClassifier):
            self.selected_feats.extend(self.boruto_feats)
        self.selected_feats = list(set(self.selected_feats))

        # self.selected_feats = self.get_shap_feats(self.selected_feats)

        return self.selected_feats

# Selecting features

In [7]:
selected_features_dict = {}

for idx in tqdm(range(1)):

    X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
    y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

    X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
    y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

    new_numerical_features = []
    for feat in numerical_features:
        X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
        X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']
        new_numerical_features.append(f"feat{numerical_features.index(feat)}")

    numerical_features.extend(new_numerical_features)

    # getting categorical features
    categorical_features = nominal_features.copy()

    #getting all features
    all_features = []
    all_features.extend(categorical_features)
    all_features.extend(numerical_features)

    X_train = X_train[all_features]
    X_valid = X_valid[all_features]

    models_list = [RandomForestClassifier(), XGBClassifier(), LogisticRegression(), SVC(probability=True)]
    model_names_list = ['RandomForestClassifier', 'XGBClassifier', 'LogisticRegression', 'SVC']

    for model_idx in tqdm(range(len(model_names_list))):

        model_name = model_names_list[model_idx]

        selected_features_dict[model_name] = {}

        # feature selection
        model = models_list[model_idx]

        fselector = FSelector(
            X=X_train, 
            y=y_train, 
            num_feats=numerical_features, 
            ordinal_feats=None, 
            nominal_feats=nominal_features, 
            model=model
        )

        selected_features = fselector.get_votes()

        if len(selected_features) == 0:
            continue

        # model training
        model.fit(X_train[selected_features], y_train)

        # metric calculation
        y_train_pred = model.predict(X_train[selected_features])
        y_train_pred_prob = model.predict_proba(X_train[selected_features])[:, 1]

        y_valid_pred = model.predict(X_valid[selected_features])
        y_valid_pred_prob = model.predict_proba(X_valid[selected_features])[:, 1]

        train_acc = accuracy_score(y_train, y_train_pred)
        train_f1 = f1_score(y_train, y_train_pred)
        train_roc_auc = roc_auc_score(y_train, y_train_pred_prob)

        valid_acc = accuracy_score(y_valid, y_valid_pred)
        valid_f1 = f1_score(y_valid, y_valid_pred)
        valid_roc_auc = roc_auc_score(y_valid, y_valid_pred_prob)

        selected_features_dict[model_name][idx+1] = {}
        selected_features_dict[model_name][idx+1]['selected_feats'] = selected_features
        selected_features_dict[model_name][idx+1]['train_acc'] = train_acc
        selected_features_dict[model_name][idx+1]['train_f1'] = train_f1
        selected_features_dict[model_name][idx+1]['train_roc_auc'] = train_roc_auc
        selected_features_dict[model_name][idx+1]['valid_acc'] = valid_acc
        selected_features_dict[model_name][idx+1]['valid_f1'] = valid_f1
        selected_features_dict[model_name][idx+1]['valid_roc_auc'] = valid_roc_auc
        selected_features_dict[model_name][idx+1]['model'] = model

        print(f"##### {model_name} #####")
        print(f"Selected features: {selected_features}")
        print("Train:")
        print(f"Accuracy: {train_acc:.5f}, F1: {train_f1:.5f}, ROC-AUC: {train_roc_auc:.5f}")
        print("Validation:")
        print(f"Accuracy: {valid_acc:.5f}, F1: {valid_f1:.5f}, ROC-AUC: {valid_roc_auc:.5f}")

    del X_train, y_train, X_valid, y_valid
    gc.collect()

  0%|          | 0/1 [00:00<?, ?it/s]

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	189
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	9 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	10 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	11 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	12 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	13 / 100
Confirmed: 	30
Tentative: 	18
Rejected: 	141
Iteration: 	14 / 100
Confirmed: 	30
Tentative: 	17
Rejected: 	142
Iteration: 	15 / 100
Confirmed: 	30
Tentative: 	17
Rejected: 	142
Iteration: 	16 / 100
Confirmed: 	



##### RandomForestClassifier #####
Selected features: [' After-tax net Interest Rate', ' Net Value Per Share (A)', ' ROA(A) before interest and % after tax', ' Net Income to Total Assets', " Net Income to Stockholder's Equity", ' Quick Ratio', ' Working Capital to Total Assets', ' Per Share Net profit before tax (Yuan ¥)', ' Non-industry income and expenditure/revenue', ' Borrowing dependency', ' Net Value Per Share (B)', ' Continuous interest rate (after tax)', ' ROA(C) before interest and depreciation before interest', ' Inventory/Working Capital', ' Interest Expense Ratio', ' Liability to Equity', ' Interest Coverage Ratio (Interest expense to EBIT)', ' Net Value Growth Rate', ' Cash/Total Assets', ' Net Value Per Share (C)', ' Liability-Assets Flag', ' Cash/Current Liability', ' Working Capital/Equity', ' Equity to Liability', ' ROA(B) before interest and depreciation after tax', ' Net worth/Assets', ' Net profit before tax/Paid-in capital', ' Debt ratio %', ' Persistent EPS in the



##### XGBClassifier #####
Selected features: [' ROA(A) before interest and % after tax', ' Net Income to Total Assets', " Net Income to Stockholder's Equity", ' Per Share Net profit before tax (Yuan ¥)', ' Non-industry income and expenditure/revenue', ' Borrowing dependency', ' Net Value Per Share (B)', ' Continuous interest rate (after tax)', ' ROA(C) before interest and depreciation before interest', ' Inventory/Working Capital', ' Liability to Equity', ' Interest Coverage Ratio (Interest expense to EBIT)', ' Net Value Growth Rate', ' Liability-Assets Flag', ' ROA(B) before interest and depreciation after tax', ' Net worth/Assets', ' Pre-tax net Interest Rate', ' Net profit before tax/Paid-in capital', ' Debt ratio %', ' Persistent EPS in the Last Four Seasons', ' Total debt/Total net worth']
Train:
Accuracy: 1.00000, F1: 1.00000, ROC-AUC: 1.00000
Validation:
Accuracy: 0.97394, F1: 0.52941, ROC-AUC: 0.92609




##### LogisticRegression #####
Selected features: [' ROA(A) before interest and % after tax', ' Borrowing dependency', ' Net Income to Total Assets', " Net Income to Stockholder's Equity", ' Liability-Assets Flag', ' Net profit before tax/Paid-in capital', ' Debt ratio %', ' Persistent EPS in the Last Four Seasons', ' Net worth/Assets', ' Liability to Equity', ' Per Share Net profit before tax (Yuan ¥)']
Train:
Accuracy: 0.96904, F1: 0.13198, ROC-AUC: 0.92358
Validation:
Accuracy: 0.97231, F1: 0.32000, ROC-AUC: 0.92786


100%|██████████| 4/4 [22:59<00:00, 344.96s/it]
100%|██████████| 1/1 [23:00<00:00, 1380.02s/it]

##### SVC #####
Selected features: [' ROA(A) before interest and % after tax', ' Borrowing dependency', ' Net Income to Total Assets', " Net Income to Stockholder's Equity", ' Liability-Assets Flag', ' Net profit before tax/Paid-in capital', ' Debt ratio %', ' Persistent EPS in the Last Four Seasons', ' Net worth/Assets', ' Liability to Equity', ' Per Share Net profit before tax (Yuan ¥)']
Train:
Accuracy: 0.96868, F1: 0.06486, ROC-AUC: 0.82417
Validation:
Accuracy: 0.97068, F1: 0.18182, ROC-AUC: 0.91136





In [8]:
selected_features_dict['RandomForestClassifier'][1]['selected_feats']

[' After-tax net Interest Rate',
 ' Net Value Per Share (A)',
 ' ROA(A) before interest and % after tax',
 ' Net Income to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Quick Ratio',
 ' Working Capital to Total Assets',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Non-industry income and expenditure/revenue',
 ' Borrowing dependency',
 ' Net Value Per Share (B)',
 ' Continuous interest rate (after tax)',
 ' ROA(C) before interest and depreciation before interest',
 ' Inventory/Working Capital',
 ' Interest Expense Ratio',
 ' Liability to Equity',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Net Value Growth Rate',
 ' Cash/Total Assets',
 ' Net Value Per Share (C)',
 ' Liability-Assets Flag',
 ' Cash/Current Liability',
 ' Working Capital/Equity',
 ' Equity to Liability',
 ' ROA(B) before interest and depreciation after tax',
 ' Net worth/Assets',
 ' Net profit before tax/Paid-in capital',
 ' Debt ratio %',
 ' Persistent EPS in the Last Four Seasons',
 ' To

In [9]:
selected_features_dict['XGBClassifier'][1]['selected_feats']

[' ROA(A) before interest and % after tax',
 ' Net Income to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Non-industry income and expenditure/revenue',
 ' Borrowing dependency',
 ' Net Value Per Share (B)',
 ' Continuous interest rate (after tax)',
 ' ROA(C) before interest and depreciation before interest',
 ' Inventory/Working Capital',
 ' Liability to Equity',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Net Value Growth Rate',
 ' Liability-Assets Flag',
 ' ROA(B) before interest and depreciation after tax',
 ' Net worth/Assets',
 ' Pre-tax net Interest Rate',
 ' Net profit before tax/Paid-in capital',
 ' Debt ratio %',
 ' Persistent EPS in the Last Four Seasons',
 ' Total debt/Total net worth']

In [10]:
selected_features_dict['LogisticRegression'][1]['selected_feats']

[' ROA(A) before interest and % after tax',
 ' Borrowing dependency',
 ' Net Income to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Liability-Assets Flag',
 ' Net profit before tax/Paid-in capital',
 ' Debt ratio %',
 ' Persistent EPS in the Last Four Seasons',
 ' Net worth/Assets',
 ' Liability to Equity',
 ' Per Share Net profit before tax (Yuan ¥)']

In [11]:
selected_features_dict['SVC'][1]['selected_feats']

[' ROA(A) before interest and % after tax',
 ' Borrowing dependency',
 ' Net Income to Total Assets',
 " Net Income to Stockholder's Equity",
 ' Liability-Assets Flag',
 ' Net profit before tax/Paid-in capital',
 ' Debt ratio %',
 ' Persistent EPS in the Last Four Seasons',
 ' Net worth/Assets',
 ' Liability to Equity',
 ' Per Share Net profit before tax (Yuan ¥)']

# Model selection

## RandomForest

In [12]:
for feat in numerical_features:
    data[f"feat{numerical_features.index(feat)}"] = data[feat] * data[' Liability-Assets Flag']

idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

selected_features = selected_features_dict['RandomForestClassifier'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['RandomForestClassifier'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['RandomForestClassifier'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

PermutationExplainer explainer: 5524it [02:24, 36.21it/s]                          


[' Borrowing dependency',
 ' Interest Expense Ratio',
 ' Degree of Financial Leverage (DFL)',
 ' Quick Ratio',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Persistent EPS in the Last Four Seasons',
 ' Working Capital to Total Assets',
 ' Working Capital/Equity',
 ' Cash/Total Assets',
 ' Non-industry income and expenditure/revenue',
 " Net Income to Stockholder's Equity"]

In [14]:
n_estimators_list = [5, 10, 15, 25, 50, 100, 120, 300, 500]#, 800, 1200]

max_depth_list = [2, 3, 5, 8, 15, 25, 30, None]

# min_samples_split_list = [2, 5, 10, 15, 100]

# min_samples_leaf_list = [2, 5, 10]

# max_features_list = ['log2', 'sqrt', None]

params_dict={
    'n_estimators': n_estimators_list,
    'max_depth': max_depth_list,
    # 'min_samples_split': min_samples_split_list,
    # 'min_samples_leaf': min_samples_leaf_list,
    # 'max_features': max_features_list
}

rf_gscv = GridSearchCV(
    RandomForestClassifier(),
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
rf_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(rf_gscv.best_params_)
print(rf_gscv.best_score_)

for feat in numerical_features:
    test_data[f"feat{numerical_features.index(feat)}"] = test_data[feat] * test_data[' Liability-Assets Flag']

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

# y_test_pred = rf_gscv.predict(X_test[selected_features])
rf_y_test_pred_prob = rf_gscv.predict_proba(X_test)[:, 1]

rf_test_roc_auc = roc_auc_score(y_test, rf_y_test_pred_prob)
rf_test_roc_auc

Fitting 10 folds for each of 72 candidates, totalling 720 fits
{'max_depth': 8, 'n_estimators': 500}
0.93551773392037


0.9627410468319559

## XGBoost

In [16]:
idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

selected_features = selected_features_dict['XGBClassifier'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['XGBClassifier'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['XGBClassifier'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

PermutationExplainer explainer: 5524it [00:46, 92.95it/s]                           


[' Borrowing dependency',
 ' ROA(C) before interest and depreciation before interest',
 ' Continuous interest rate (after tax)',
 ' Interest Coverage Ratio (Interest expense to EBIT)',
 ' Inventory/Working Capital',
 ' Non-industry income and expenditure/revenue',
 ' Net Value Per Share (B)',
 ' Net Value Growth Rate',
 ' Per Share Net profit before tax (Yuan ¥)',
 ' Total debt/Total net worth',
 ' Net Income to Total Assets']

In [17]:
eta_list = [0.01, 0.015, 0.025, 0.05, 0.1, 0.3, 0.4, 0.5, 0.6, 0.7, 0.9]

# gamma_list = [0, 0.05, 0.07, 0.09, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]

max_depth_list = [3, 5, 6, 7, 9, 12, 15, 17, 25]

# min_child_weight_list = [1, 3, 5, 7]

# subsample_list = [0.6, 0.7, 0.8, 0.9, 1.0]

# colsample_bytree_list = [0.6, 0.7, 0.8, 0.9, 1.0]

# lambda_list = [0.01, 0.03, 0.1, 1.0]

# alpha_list = [0, 0.1, 0.5, 1.0]

n_estimators_list = [50, 100, 150, 200, 500, 1000]


params_dict={
    'eta': eta_list,
    # 'gamma': gamma_list,
    'max_depth': max_depth_list,
    # 'min_child_weight': min_child_weight_list,
    # 'subsample': subsample_list,
    # 'colsample_bytree': colsample_bytree_list,
    # 'lambda': lambda_list,
    # 'alpha': alpha_list
    'n_estimators': n_estimators_list,
    'device': ['cuda']
}

xgb_gscv = GridSearchCV(
    XGBClassifier(),
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
xgb_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(xgb_gscv.best_params_)
print(xgb_gscv.best_score_)

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

# y_test_pred = xgb_gscv.predict(X_test[selected_features])
xgb_y_test_pred_prob = xgb_gscv.predict_proba(X_test)[:, 1]

xgb_test_roc_auc = roc_auc_score(y_test, xgb_y_test_pred_prob)
xgb_test_roc_auc

Fitting 10 folds for each of 594 candidates, totalling 5940 fits


## LogisticRegression

In [None]:
idx = 0

X_train = data.iloc[skfold_list[idx][0]].reset_index(drop=True)
y_train = data.iloc[skfold_list[idx][0]]['Bankrupt?'].to_frame().reset_index(drop=True)

X_valid = data.iloc[skfold_list[idx][1]].reset_index(drop=True)
y_valid = data.iloc[skfold_list[idx][1]]['Bankrupt?'].to_frame().reset_index(drop=True)

for feat in numerical_features:
    X_train[f"feat{numerical_features.index(feat)}"] = X_train[feat] * X_train[' Liability-Assets Flag']
    X_valid[f"feat{numerical_features.index(feat)}"] = X_valid[feat] * X_valid[' Liability-Assets Flag']

selected_features = selected_features_dict['LogisticRegression'][1]['selected_feats']

X_train = X_train[selected_features]
X_valid = X_valid[selected_features]

model = selected_features_dict['LogisticRegression'][1]['model']
explainer = shap.Explainer(model.predict, X_train, max_evals = int(2 * X_train.shape[1] + 1), verbose=0)
shap_values = explainer(X_train)
selected_shap_features = get_feature_importances_shap_values(
    shap_values, features=list(X_train.columns), topk=10
)
selected_features_dict['LogisticRegression'][1]['selected_shap_feats'] = selected_shap_features
selected_shap_features

In [None]:
params_dict = {'penalty': ['l1','l2'], 'C': [0.001,0.01,0.1,1,10,100,1000]}

lr_gscv = GridSearchCV(
    LogisticRegression(),
    param_grid=params_dict,
    scoring='roc_auc',
    cv=skfold_list,
    n_jobs=-1,
    verbose=4
)

# model training
lr_gscv.fit(data[selected_shap_features], data['Bankrupt?'])

print(lr_gscv.best_params_)
print(lr_gscv.best_score_)

X_test = test_data[selected_shap_features]
y_test = test_data['Bankrupt?'].to_frame()

# y_test_pred = lr_gscv.predict(X_test[selected_features])
lr_y_test_pred_prob = lr_gscv.predict_proba(X_test)[:, 1]

lr_test_roc_auc = roc_auc_score(y_test, lr_y_test_pred_prob)
lr_test_roc_auc