In [1]:
import os
import sys
from pathlib import Path
# Make the parent of the current working directory importable (the project
# package imported by later cells is presumably located there -- verify for
# your checkout).
# `.parent` is used instead of `.parents[0]` so that a working directory at the
# filesystem root does not raise IndexError, and the membership check keeps a
# re-run of this cell from appending duplicate entries to sys.path.
PROJECT_ROOT = str(Path(os.path.abspath('')).resolve().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
from information_noise_reduction.subset_generator import reverse_all_subsets_generator
from information_noise_reduction.evaluate_model import evaluate_subsets
from information_noise_reduction.interpretation import compute_variable_contributions, top_k_variables

import numpy as np
import pandas as pd
import kagglehub
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

2024-11-11 16:43:18.851417: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Load / Transform

In [3]:
# Fetch the Kaggle dataset; the location returned by kagglehub is used as the
# directory that contains data.csv.
dataset_dir = kagglehub.dataset_download("fedesoriano/company-bankruptcy-prediction")
path = os.path.join(dataset_dir, "data.csv")

In [4]:
# Read the raw table, report how many columns it has, then list their names.
df = pd.read_csv(path)
print(df.shape[1])
df.columns

96


Index(['Bankrupt?', ' ROA(C) before interest and depreciation before interest',
       ' ROA(A) before interest and % after tax',
       ' ROA(B) before interest and depreciation after tax',
       ' Operating Gross Margin', ' Realized Sales Gross Margin',
       ' Operating Profit Rate', ' Pre-tax net Interest Rate',
       ' After-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Continuous interest rate (after tax)', ' Operating Expense Rate',
       ' Research and development expense rate', ' Cash flow rate',
       ' Interest-bearing debt interest rate', ' Tax rate (A)',
       ' Net Value Per Share (B)', ' Net Value Per Share (A)',
       ' Net Value Per Share (C)', ' Persistent EPS in the Last Four Seasons',
       ' Cash Flow Per Share', ' Revenue Per Share (Yuan ¥)',
       ' Operating Profit Per Share (Yuan ¥)',
       ' Per Share Net profit before tax (Yuan ¥)',
       ' Realized Sales Gross Profit Growth Rate',
       ' Operating Profit

In [5]:
# Predictor columns kept for the analysis. The leading space in every name is
# part of the column label as it appears in the loaded frame and must be kept.
feature_columns = [
    ' ROA(B) before interest and depreciation after tax',
    ' Operating Gross Margin',
    ' Realized Sales Gross Margin',
    ' Operating Profit Rate',
    ' Pre-tax net Interest Rate',
    ' After-tax net Interest Rate',
    ' Non-industry income and expenditure/revenue',
    ' Continuous interest rate (after tax)',
    ' Operating Expense Rate',
    ' Research and development expense rate',
    ' Cash flow rate',
]

# Label column to predict.
target_column = 'Bankrupt?'

In [6]:
# Narrow the frame to the chosen predictors followed by the label column.
selected_columns = [*feature_columns, target_column]
df = df[selected_columns]
df

Unnamed: 0,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Bankrupt?
0,0.405750,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,0.780985,1.256969e-04,0.000000e+00,0.458143,1
1,0.516730,0.610235,0.610235,0.998946,0.797380,0.809301,0.303556,0.781506,2.897851e-04,0.000000e+00,0.461867,1
2,0.472295,0.601450,0.601364,0.998857,0.796403,0.808388,0.302035,0.780284,2.361297e-04,2.550000e+07,0.458521,1
3,0.457733,0.583541,0.583541,0.998700,0.796967,0.808966,0.303350,0.781241,1.078888e-04,0.000000e+00,0.465705,1
4,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,0.781550,7.890000e+09,0.000000e+00,0.462746,1
...,...,...,...,...,...,...,...,...,...,...,...,...
6814,0.543230,0.604455,0.604462,0.998992,0.797409,0.809331,0.303510,0.781588,1.510213e-04,4.500000e+09,0.463734,0
6815,0.524172,0.598308,0.598308,0.998992,0.797414,0.809327,0.303520,0.781586,5.220000e+09,1.440000e+09,0.461978,0
6816,0.520638,0.610444,0.610213,0.998984,0.797401,0.809317,0.303512,0.781546,2.509312e-04,1.039086e-04,0.472189,0
6817,0.554045,0.607850,0.607850,0.999074,0.797500,0.809399,0.303498,0.781663,1.236154e-04,2.510000e+09,0.476123,0


## Class of models

In [7]:
def model_generator(input_dim):
    """Build and compile a small feed-forward binary classifier.

    The network has two hidden ReLU layers of 32 units each followed by a
    single sigmoid output unit. It is compiled with the Adam optimizer and
    binary cross-entropy loss.

    Args:
        input_dim: Number of input features; sets the input shape
            ``(input_dim,)`` of the first layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential([
            # Only the first layer declares the input shape.
            Dense(32, activation='relu', input_shape=(input_dim,)),
            # No ``input_shape`` here: a hidden layer takes its input size
            # from the previous layer, so repeating the argument was a
            # redundant (and misleading) copy-paste leftover.
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid')
        ])
    model.compile(optimizer='adam', loss='binary_crossentropy')

    return model

## Analysis

In [8]:
# Evaluate subsets of the columns of `df` with models built by
# `model_generator`; the call returns per-subset losses and weights (their
# exact semantics are defined in information_noise_reduction.evaluate_model).
# NOTE(review): the displayed frame shows ' Operating Expense Rate' and
# ' Research and development expense rate' mixing values around 1e-4 with
# values around 1e9, and the subsets containing them report losses in the 1e5
# range -- consider scaling the features before this step; confirm with the
# library's expectations.
model_gen = model_generator
subset_losses, subset_weights = evaluate_subsets(
    df,
    target_col=target_column,
    model_generator=model_gen,
    max_subsets=10,
    target_max_variables=4,
    epochs=2,
)


Init
---



2024-11-11 16:43:24.993943: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Evaluated subset (' Realized Sales Gross Margin', ' Continuous interest rate (after tax)', ' Operating Profit Rate', ' ROA(B) before interest and depreciation after tax') with loss: 0.1350
Evaluated subset (' Pre-tax net Interest Rate', ' Non-industry income and expenditure/revenue', ' ROA(B) before interest and depreciation after tax', ' Operating Gross Margin') with loss: 0.1377
Evaluated subset (' Pre-tax net Interest Rate', ' ROA(B) before interest and depreciation after tax', ' Research and development expense rate', ' Realized Sales Gross Margin') with loss: 129871.6094
Evaluated subset (' Operating Gross Margin', ' Cash flow rate', ' Operating Expense Rate', ' Operating Profit Rate') with loss: 904429.5625
Evaluated subset (' After-tax net Interest Rate', ' Continuous interest rate (after tax)', ' Non-industry income and expenditure/revenue', ' Pre-tax net Interest Rate') with loss: 0.1439
Evaluated subset (' Non-industry income and expenditure/revenue', ' Operating Gross Margin

In [9]:
# Aggregate the per-subset losses into per-variable statistics. The displayed
# result is a dict with (at least) the keys 'average_losses' and
# 'normalized_scores', each mapping a column name to a float.
variable_contributions = compute_variable_contributions(subset_losses)
variable_contributions

{'average_losses': {' Realized Sales Gross Margin': 25974.43333711773,
  ' Continuous interest rate (after tax)': 398186.58824418485,
  ' Operating Profit Rate': 301476.6120493288,
  ' ROA(B) before interest and depreciation after tax': 360419.9431567035,
  ' Pre-tax net Interest Rate': 409475.90321570885,
  ' Non-industry income and expenditure/revenue': 207275.91311085437,
  ' Operating Gross Margin': 113053.81655579247,
  ' Research and development expense rate': 816253.328125,
  ' Cash flow rate': 537219.7272205353,
  ' Operating Expense Rate': 1170585.875,
  ' After-tax net Interest Rate': 488361.14832192066},
 'normalized_scores': {' Realized Sales Gross Margin': 0.07542972936283453,
  ' Continuous interest rate (after tax)': 0.08851360785528724,
  ' Operating Profit Rate': 0.08491035573480316,
  ' ROA(B) before interest and depreciation after tax': 0.0870886378868344,
  ' Pre-tax net Interest Rate': 0.0889440733176453,
  ' Non-industry income and expenditure/revenue': 0.08154166

In [12]:
# Show the 4 variables with the highest normalized score.
# NOTE(review): in the displayed outputs a higher normalized score coincides
# with a higher average loss, so these are the variables associated with the
# largest losses -- confirm this is the intended ranking direction.
# NOTE(review): this cell's execution count skips from 9 to 12; re-run with
# "Restart Kernel -> Run All" to confirm it does not rely on deleted cells.
top_k_variables(variable_contributions['normalized_scores'], 4)

{' Operating Expense Rate': 0.1233576581643923,
 ' Research and development expense rate': 0.10593404789291914,
 ' Cash flow rate': 0.09396332422705667,
 ' After-tax net Interest Rate': 0.09201098801619377}