In [1]:
import os
import sys
from pathlib import Path
sys.path.append(str(Path(os.path.abspath('')).resolve().parents[0]))

In [2]:
from information_noise_reduction.subset_generator import reverse_all_subsets_generator
from information_noise_reduction.evaluate_model import evaluate_subsets
from information_noise_reduction.interpretation import compute_variable_contributions, top_k_variables
from information_noise_reduction.pre_processing import select_important_features_with_lasso

import numpy as np
import pandas as pd
import kagglehub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2024-11-12 12:52:16.872968: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load / Transform

In [3]:
path = os.path.join(kagglehub.dataset_download("fedesoriano/company-bankruptcy-prediction"), "data.csv")

In [4]:
df = pd.read_csv(path)
print(len(df.columns))
df.columns

96


Index(['Bankrupt?', ' ROA(C) before interest and depreciation before interest',
       ' ROA(A) before interest and % after tax',
       ' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Realized Sales Gross Margin',
       ' Operating Profit Rate', ' Pre-tax net Interest Rate',
       ' After-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Continuous interest rate (after tax)', ' Operating Expense Rate',
       ' Research and development expense rate', ' Cash flow rate',
       ' Interest-bearing debt interest rate', ' Tax rate (A)',
       ' Net Value Per Share (B)', ' Net Value Per Share (A)',
       ' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons',
       ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Per Share Net profit before tax (Yuan ¥)',
       ' Realized Sales Gross Profit Growth Rate',
       ' Operating Profit

## Preprocess

In [10]:
target_column = "Bankrupt?"
# feature_columns = [
#     ' ROA(B) before interest and depreciation after tax',
#     ' Operating Gross Margin', ' Realized Sales Gross Margin',
#     ' Operating Profit Rate', ' Pre-tax net Interest Rate',
#     ' After-tax net Interest Rate',
#     ' Non-industry income and expenditure/revenue',
#     ' Continuous interest rate (after tax)', ' Operating Expense Rate',
#     ' Research and development expense rate', ' Cash flow rate'
# ]

feature_columns = select_important_features_with_lasso(df, target_column, 20)



In [12]:
df = df[feature_columns.copy() + [target_column]]
df

Unnamed: 0,Debt ratio %,Borrowing dependency,Cash/Current Liability,Working Capital/Equity,Fixed Assets to Assets,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Bankrupt?
0,0.207576,0.390284,1.473360e-04,0.721275,0.424206,0.118250,0,0.716845,1
1,0.171176,0.376760,1.383910e-03,0.731975,0.468828,0.047775,0,0.795297,1
2,0.207516,0.379093,5.340000e+09,0.742729,0.276179,0.025346,0,0.774670,1
3,0.151465,0.379743,1.010646e-03,0.729825,0.559144,0.067250,0,0.739555,1
4,0.106509,0.375025,6.804636e-04,0.732000,0.309555,0.047725,0,0.795016,1
...,...,...,...,...,...,...,...,...,...
6814,0.124618,0.373823,5.071548e-03,0.736716,0.400338,0.027951,0,0.799927,0
6815,0.099253,0.372505,4.727181e-03,0.734584,0.096136,0.031470,0,0.799748,0
6816,0.038939,0.369637,8.821248e-02,0.737432,0.055509,0.007542,0,0.797778,0
6817,0.086979,0.369649,7.133218e-03,0.736713,0.246805,0.022916,0,0.811808,0


## Class of models

In [21]:
def model_generator(input_dim):
    model = Sequential([
            Dense(16, activation='relu', input_shape=(input_dim,)),
            Dense(16, activation='relu', input_shape=(input_dim,)),
            Dense(8, activation='relu', input_shape=(input_dim,)),
            Dense(1, activation='sigmoid')
        ])
    model.compile(optimizer='adam', loss='binary_crossentropy')

    return model

## Analysis

In [22]:
model_gen = model_generator
subset_losses, subset_weights = evaluate_subsets(df, target_col=target_column, model_generator=model_gen, max_subsets=20, target_max_variables=4, epochs=10)


Init
---

Evaluated subset (' Working Capital/Equity', ' Fixed Assets to Assets', ' Current Liability to Current Assets', ' Debt ratio %') with loss: 0.1155
Evaluated subset (' Current Liability to Current Assets', ' Working Capital/Equity', ' Liability-Assets Flag', ' Borrowing dependency') with loss: 0.1270
Evaluated subset (' Borrowing dependency', ' Current Liability to Current Assets', ' Debt ratio %', ' Cash/Current Liability') with loss: 50482.8555
Evaluated subset (' Cash/Current Liability', ' Debt ratio %', ' Current Liability to Current Assets', ' Working Capital/Equity') with loss: 212279.9062
Evaluated subset (' Borrowing dependency', ' Working Capital/Equity', ' Cash/Current Liability', ' Fixed Assets to Assets') with loss: 97427.2109
Evaluated subset (' Cash/Current Liability', ' Debt ratio %', ' Net Income to Total Assets', ' Current Liability to Current Assets') with loss: 23942.7207
Evaluated subset (' Borrowing dependency', ' Net Income to Total Assets', ' Fixed Asse

In [23]:
variable_contributions = compute_variable_contributions(subset_losses)
variable_contributions

{'average_losses': {' Working Capital/Equity': 28155.296543256463,
  ' Fixed Assets to Assets': 36383.2268149741,
  ' Current Liability to Current Assets': 28670.636695116013,
  ' Debt ratio %': 47784.30309485396,
  ' Liability-Assets Flag': 0.1308424338698387,
  ' Borrowing dependency': 11377.809394449569,
  ' Cash/Current Liability': 96295.10780115922,
  ' Net Income to Total Assets': 24175.71775708182},
 'normalized_scores': {' Working Capital/Equity': 0.12050306411696889,
  ' Fixed Assets to Assets': 0.12526544147490032,
  ' Current Liability to Current Assets': 0.12079595824823548,
  ' Debt ratio %': 0.13217711397976165,
  ' Liability-Assets Flag': 0.10553502028765974,
  ' Borrowing dependency': 0.11134576956066998,
  ' Cash/Current Liability': 0.16611257842808375,
  ' Net Income to Total Assets': 0.1182650539037203}}

In [24]:
top_k_variables(variable_contributions['normalized_scores'], 5)

{' Cash/Current Liability': 0.16611257842808375,
 ' Debt ratio %': 0.13217711397976165,
 ' Fixed Assets to Assets': 0.12526544147490032,
 ' Current Liability to Current Assets': 0.12079595824823548,
 ' Working Capital/Equity': 0.12050306411696889}