In [1]:
# Step 1: Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# Step 2: Load the dataset
dataset_path = 'C:/Users/Vaishnavi/Documents/Python Project/taiwanese+bankruptcy+prediction/data.csv'
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,Bankrupt?,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,1,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,...,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,...,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,1,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,...,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,1,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,...,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,1,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,...,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [5]:
missing_data = df.isnull().sum()
print("Columns with Missing Data:")
print(missing_data[missing_data > 0])

Columns with Missing Data:
Series([], dtype: int64)


In [8]:
# Set display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [9]:
print(df.head())

   Bankrupt?   ROA(C) before interest and depreciation before interest  \
0          1                                           0.370594          
1          1                                           0.464291          
2          1                                           0.426071          
3          1                                           0.399844          
4          1                                           0.465022          

    ROA(A) before interest and % after tax  \
0                                 0.424389   
1                                 0.538214   
2                                 0.499019   
3                                 0.451265   
4                                 0.538432   

    ROA(B) before interest and depreciation after tax  \
0                                           0.405750    
1                                           0.516730    
2                                           0.472295    
3                                           0.4577

In [10]:
#Correlation Analysis
correlation_matrix = df.corr()
target_correlation = correlation_matrix['Bankrupt?'].abs().sort_values(ascending=False)
print(target_correlation)

Bankrupt?                                                   1.000000
 Net Income to Total Assets                                 0.315457
 ROA(A) before interest and % after tax                     0.282941
 ROA(B) before interest and depreciation after tax          0.273051
 ROA(C) before interest and depreciation before interest    0.260807
 Net worth/Assets                                           0.250161
 Debt ratio %                                               0.250161
 Persistent EPS in the Last Four Seasons                    0.219560
 Retained Earnings to Total Assets                          0.217779
 Net profit before tax/Paid-in capital                      0.207857
 Per Share Net profit before tax (Yuan ¥)                   0.201395
 Current Liability to Assets                                0.194494
 Working Capital to Total Assets                            0.193083
 Net Income to Stockholder's Equity                         0.180987
 Borrowing dependency             

In [14]:
#'Bankrupt?' is your target variable
X = df.drop('Bankrupt?', axis=1)
y = df['Bankrupt?']

In [15]:
# Step 1: Split the Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
X.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Cash Flow Per Share,Revenue Per Share (Yuan ¥),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Quick Ratio,Interest Expense Ratio,Total debt/Total net worth,Debt ratio %,Net worth/Assets,Long-term fund suitability ratio (A),Borrowing dependency,Contingent liabilities/Net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Inventory and accounts receivable/Net value,Total Asset Turnover,Accounts Receivable Turnover,Average Collection Days,Inventory Turnover Rate (times),Fixed Assets Turnover Frequency,Net Worth Turnover Rate (times),Revenue per person,Operating profit per person,Allocation rate per person,Working Capital to Total Assets,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Operating Funds to Liability,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Working Capital/Equity,Current Liabilities/Equity,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Current Liability to Liability,Current Liability to Equity,Equity to Long-term Liability,Cash Flow to Total Assets,Cash Flow to Liability,CFO to Assets,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.370594,0.424389,0.40575,0.601457,0.601457,0.998969,0.796887,0.808809,0.302646,0.780985,0.0001256969,0.0,0.458143,0.000725,0.0,0.14795,0.14795,0.14795,0.169141,0.311664,0.01756,0.095921,0.138736,0.022102,0.848195,0.688979,0.688979,0.217535,4980000000.0,0.000327,0.2631,0.363725,0.002259,0.001208,0.629951,0.021266,0.207576,0.792424,0.005024,0.390284,0.006479,0.095885,0.137757,0.398036,0.086957,0.001814,0.003487,0.0001820926,0.0001165007,0.032903,0.034164,0.392913,0.037135,0.672775,0.166673,0.190643,0.004094,0.001997,0.000147336,0.147308,0.334015,0.27692,0.001036,0.676269,0.721275,0.339077,0.025592,0.903225,0.002022,0.064856,701000000.0,6550000000.0,0.593831,458000000.0,0.671568,0.424206,0.676269,0.339077,0.126549,0.637555,0.458609,0.520382,0.312905,0.11825,0,0.716845,0.009219,0.622879,0.601453,0.82789,0.290202,0.026601,0.56405,1,0.016469
1,0.464291,0.538214,0.51673,0.610235,0.610235,0.998946,0.79738,0.809301,0.303556,0.781506,0.0002897851,0.0,0.461867,0.000647,0.0,0.182251,0.182251,0.182251,0.208944,0.318137,0.021144,0.093722,0.169918,0.02208,0.848088,0.689693,0.689702,0.21762,6110000000.0,0.000443,0.264516,0.376709,0.006016,0.004039,0.635172,0.012502,0.171176,0.828824,0.005059,0.37676,0.005835,0.093743,0.168962,0.397725,0.064468,0.001286,0.004917,9360000000.0,719000000.0,0.025484,0.006889,0.39159,0.012335,0.751111,0.127236,0.182419,0.014948,0.004136,0.00138391,0.056963,0.341106,0.289642,0.00521,0.308589,0.731975,0.32974,0.023947,0.931065,0.002226,0.025516,0.0001065198,7700000000.0,0.593916,2490000000.0,0.67157,0.468828,0.308589,0.32974,0.120916,0.6411,0.459001,0.567101,0.314163,0.047775,0,0.795297,0.008323,0.623652,0.610237,0.839969,0.283846,0.264577,0.570175,1,0.020794
2,0.426071,0.499019,0.472295,0.60145,0.601364,0.998857,0.796403,0.808388,0.302035,0.780284,0.0002361297,25500000.0,0.458521,0.00079,0.0,0.177911,0.177911,0.193713,0.180581,0.307102,0.005944,0.092338,0.142803,0.02276,0.848094,0.689463,0.68947,0.217601,7280000000.0,0.000396,0.264184,0.368913,0.011543,0.005348,0.629631,0.021248,0.207516,0.792484,0.0051,0.379093,0.006562,0.092318,0.148036,0.40658,0.014993,0.001495,0.004227,65000000.0,2650000000.0,0.013387,0.028997,0.381968,0.141016,0.829502,0.340201,0.602806,0.000991,0.006302,5340000000.0,0.098162,0.336731,0.277456,0.013879,0.446027,0.742729,0.334777,0.003715,0.909903,0.00206,0.021387,0.001791094,0.001022676,0.594502,761000000.0,0.671571,0.276179,0.446027,0.334777,0.117922,0.642765,0.459254,0.538491,0.314515,0.025346,0,0.77467,0.040003,0.623841,0.601449,0.836774,0.290189,0.026555,0.563706,1,0.016474
3,0.399844,0.451265,0.457733,0.583541,0.583541,0.9987,0.796967,0.808966,0.30335,0.781241,0.0001078888,0.0,0.465705,0.000449,0.0,0.154187,0.154187,0.154187,0.193722,0.321674,0.014368,0.077762,0.148603,0.022046,0.848005,0.68911,0.68911,0.217568,4880000000.0,0.000382,0.263371,0.384077,0.004194,0.002896,0.630228,0.009572,0.151465,0.848535,0.005047,0.379743,0.005366,0.077727,0.147561,0.397925,0.089955,0.001966,0.003215,7130000000.0,9150000000.0,0.028065,0.015463,0.378497,0.02132,0.725754,0.161575,0.225815,0.018851,0.002961,0.001010646,0.098715,0.348716,0.27658,0.00354,0.615848,0.729825,0.331509,0.022165,0.906902,0.001831,0.024161,8140000000.0,6050000000.0,0.593889,2030000000.0,0.671519,0.559144,0.615848,0.331509,0.12076,0.579039,0.448518,0.604105,0.302382,0.06725,0,0.739555,0.003252,0.622929,0.583538,0.834697,0.281721,0.026697,0.564663,1,0.023982
4,0.465022,0.538432,0.522298,0.598783,0.598783,0.998973,0.797366,0.809304,0.303475,0.78155,7890000000.0,0.0,0.462746,0.000686,0.0,0.167502,0.167502,0.167502,0.212537,0.319162,0.02969,0.096898,0.168412,0.022096,0.848258,0.689697,0.689697,0.217626,5510000000.0,0.000439,0.265218,0.37969,0.006022,0.003727,0.636055,0.00515,0.106509,0.893491,0.005303,0.375025,0.006624,0.096927,0.167461,0.400079,0.175412,0.001449,0.004367,0.0001633674,0.0002935211,0.040161,0.058111,0.394371,0.023988,0.751822,0.26033,0.35838,0.014161,0.004275,0.0006804636,0.110195,0.344639,0.287913,0.004869,0.975007,0.732,0.330726,0.0,0.91385,0.002224,0.026385,6680000000.0,5050000000.0,0.593915,824000000.0,0.671563,0.309555,0.975007,0.330726,0.110933,0.622374,0.454411,0.578469,0.311567,0.047725,0,0.795016,0.003878,0.623521,0.598782,0.839973,0.278514,0.024752,0.575617,1,0.03549


In [17]:
X_train.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),Operating Expense Rate,Research and development expense rate,Cash flow rate,Interest-bearing debt interest rate,Tax rate (A),Net Value Per Share (B),Net Value Per Share (A),Net Value Per Share (C),Persistent EPS in the Last Four Seasons,Cash Flow Per Share,Revenue Per Share (Yuan ¥),Operating Profit Per Share (Yuan ¥),Per Share Net profit before tax (Yuan ¥),Realized Sales Gross Profit Growth Rate,Operating Profit Growth Rate,After-tax Net Profit Growth Rate,Regular Net Profit Growth Rate,Continuous Net Profit Growth Rate,Total Asset Growth Rate,Net Value Growth Rate,Total Asset Return Growth Rate Ratio,Cash Reinvestment %,Current Ratio,Quick Ratio,Interest Expense Ratio,Total debt/Total net worth,Debt ratio %,Net worth/Assets,Long-term fund suitability ratio (A),Borrowing dependency,Contingent liabilities/Net worth,Operating profit/Paid-in capital,Net profit before tax/Paid-in capital,Inventory and accounts receivable/Net value,Total Asset Turnover,Accounts Receivable Turnover,Average Collection Days,Inventory Turnover Rate (times),Fixed Assets Turnover Frequency,Net Worth Turnover Rate (times),Revenue per person,Operating profit per person,Allocation rate per person,Working Capital to Total Assets,Quick Assets/Total Assets,Current Assets/Total Assets,Cash/Total Assets,Quick Assets/Current Liability,Cash/Current Liability,Current Liability to Assets,Operating Funds to Liability,Inventory/Working Capital,Inventory/Current Liability,Current Liabilities/Liability,Working Capital/Equity,Current Liabilities/Equity,Long-term Liability to Current Assets,Retained Earnings to Total Assets,Total income/Total expense,Total expense/Assets,Current Asset Turnover Rate,Quick Asset Turnover Rate,Working capitcal Turnover Rate,Cash Turnover Rate,Cash Flow to Sales,Fixed Assets to Assets,Current Liability to Liability,Current Liability to Equity,Equity to Long-term Liability,Cash Flow to Total Assets,Cash Flow to Liability,CFO to Assets,Cash Flow to Equity,Current Liability to Current Assets,Liability-Assets Flag,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
3759,0.498513,0.542848,0.544622,0.599194,0.599036,0.998986,0.797412,0.80933,0.303528,0.781593,7000000000.0,2700000000.0,0.467994,0.000281,0.219962,0.197337,0.197337,0.197337,0.221046,0.329844,0.052286,0.101376,0.175943,0.022024,0.847952,0.689188,0.689188,0.21757,5560000000.0,0.000433,0.2636,0.379885,0.009253,0.00578,0.631983,0.010735,0.15999,0.84001,0.005262,0.378194,0.007238,0.101324,0.174911,0.402892,0.130435,0.00082,0.00771,0.0001119688,0.000191,0.040645,0.020366,0.393659,0.013654,0.785009,0.235713,0.356457,0.032719,0.005957,0.002388,0.072399,0.349242,0.277494,0.008863,0.424038,0.73501,0.330333,0.022557,0.940112,0.002299,0.013353,9460000000.0,6390000000.0,0.593941,2560000000.0,0.671571,0.373442,0.424038,0.330333,0.127509,0.640097,0.458822,0.608985,0.313993,0.03149,0,0.801313,0.005821,0.623649,0.599196,0.84058,0.282564,0.027239,0.566658,1,0.022512
1782,0.506606,0.562309,0.558863,0.609334,0.609334,0.999027,0.79745,0.809375,0.303508,0.781637,0.0001938068,0.0001353189,0.460732,0.000164,0.0,0.173697,0.173697,0.173697,0.223882,0.316156,0.029297,0.104307,0.177299,0.022159,0.848046,0.689407,0.689407,0.217595,0.0001039851,0.000469,0.264168,0.373151,0.011845,0.007728,0.63077,0.007986,0.137981,0.862019,0.016014,0.374378,0.005366,0.104304,0.176314,0.409016,0.16042,0.000625,0.010125,5870000000.0,0.005545,0.037419,0.026597,0.397854,0.000664,0.871387,0.647349,0.90084,0.207601,0.008227,0.00771,0.142719,0.338721,0.277293,0.009691,0.982763,0.741265,0.33323,0.0,0.937921,0.002312,0.031074,0.0002681255,0.0001951601,0.594009,0.0001680341,0.671609,0.01345,0.982763,0.33323,0.110933,0.702293,0.470215,0.555936,0.324933,0.024709,0,0.810914,0.000481,0.623932,0.609332,0.841339,0.28057,0.026843,0.565395,1,0.02667
5013,0.508799,0.561001,0.554687,0.614242,0.614055,0.999094,0.797533,0.809424,0.303514,0.781692,0.0002028264,0.0001005129,0.469213,0.000285,0.188683,0.225149,0.225149,0.225149,0.228609,0.329349,0.025107,0.110822,0.183776,0.02218,0.848151,0.689289,0.689289,0.21758,0.0001077029,0.000536,0.26367,0.383415,0.008609,0.005343,0.630922,0.004126,0.091877,0.908123,0.006202,0.373293,0.008672,0.110794,0.182814,0.39778,0.086957,0.000889,0.007109,7850000000.0,0.000315,0.021935,0.006357,0.394756,0.003322,0.774787,0.197801,0.289225,0.061019,0.005747,0.005095,0.063215,0.355845,0.277537,0.007742,0.640898,0.733372,0.328602,0.010628,0.943937,0.002361,0.01792,0.0001576788,0.000110311,0.59395,923000000.0,0.671584,0.150635,0.640898,0.328602,0.115194,0.65291,0.461995,0.610546,0.315919,0.033789,0,0.80974,0.001397,0.623714,0.614241,0.840969,0.277772,0.026864,0.565484,1,0.041556
5412,0.499976,0.562527,0.546764,0.597825,0.597825,0.999004,0.797411,0.809329,0.30349,0.78159,328000000.0,0.0,0.459505,0.000404,0.25539,0.180692,0.180692,0.180692,0.230973,0.309931,0.146891,0.124746,0.186563,0.022176,0.848171,0.68968,0.68968,0.217614,9200000000.0,0.000478,0.264225,0.365271,0.00935,0.007319,0.631214,0.016674,0.191583,0.808417,0.005473,0.379327,0.005366,0.124774,0.185574,0.42212,0.446777,0.000996,0.006349,0.000334951,0.001986,0.135645,0.141551,0.412327,0.016025,0.828304,0.665978,0.796687,0.018261,0.007583,0.000609,0.159037,0.336159,0.277255,0.004918,0.792277,0.740967,0.338004,0.004666,0.93605,0.002341,0.021787,7100000000.0,6270000000.0,0.593935,5080000.0,0.671565,0.177679,0.792277,0.338004,0.120768,0.609031,0.454735,0.53692,0.305663,0.03117,0,0.810082,0.000998,0.623986,0.597824,0.841885,0.286871,0.026951,0.56582,1,0.018173
3066,0.477892,0.5477,0.52915,0.600362,0.600362,0.998975,0.797412,0.809333,0.303551,0.781584,0.000102107,0.0,0.462998,0.000405,0.149307,0.194261,0.194261,0.194261,0.220573,0.322204,0.071313,0.100073,0.178203,0.022094,0.848064,0.689344,0.689344,0.21759,6520000000.0,0.000442,0.263924,0.377529,0.008758,0.007495,0.631171,0.010057,0.155153,0.844847,0.006144,0.372543,0.005366,0.100046,0.177257,0.408148,0.205397,0.000692,0.009149,0.000240752,0.000875,0.055,0.046103,0.394179,0.006717,0.799793,0.494416,0.571511,0.014452,0.007363,0.000628,0.121859,0.344278,0.277247,0.003955,0.745419,0.736186,0.332926,0.006831,0.936134,0.002289,0.024994,0.0001033323,9330000000.0,0.59394,768000000.0,0.671572,0.119679,0.745419,0.332926,0.118651,0.643043,0.459293,0.582778,0.314589,0.033229,0,0.804638,0.002826,0.623845,0.600363,0.840885,0.282073,0.026959,0.565848,1,0.023328


In [19]:
y_test.head()

239     0
2850    0
2687    0
6500    1
2684    0
Name: Bankrupt?, dtype: int64

In [20]:
# Step 2: Feature Selection using Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(random_state=42)

In [21]:
# Get feature importances
feature_importances = rf_model.feature_importances_

In [22]:
# Choose a threshold for feature selection (you can experiment with different values)
threshold = 0.01
selected_features = X.columns[feature_importances > threshold]

In [23]:
# Display selected features
print("Selected Features:")
print(selected_features)

Selected Features:
Index([' ROA(C) before interest and depreciation before interest',
       ' ROA(A) before interest and % after tax',
       ' ROA(B) before interest and depreciation after tax',
       ' After-tax net Interest Rate',
       ' Non-industry income and expenditure/revenue',
       ' Continuous interest rate (after tax)',
       ' Interest-bearing debt interest rate', ' Net Value Per Share (B)',
       ' Net Value Per Share (A)', ' Net Value Per Share (C)',
       ' Persistent EPS in the Last Four Seasons',
       ' Per Share Net profit before tax (Yuan ¥)', ' Net Value Growth Rate',
       ' Total Asset Return Growth Rate Ratio', ' Quick Ratio',
       ' Interest Expense Ratio', ' Total debt/Total net worth',
       ' Debt ratio %', ' Net worth/Assets', ' Borrowing dependency',
       ' Net profit before tax/Paid-in capital',
       ' Accounts Receivable Turnover', ' Average Collection Days',
       ' Inventory Turnover Rate (times)', ' Fixed Assets Turnover Frequency',

In [24]:
#Correlation Analysis
correlation_matrix = df.corr()
target_correlation = correlation_matrix['Bankrupt?'].abs().sort_values(ascending=False)
print(target_correlation)

Bankrupt?                                                   1.000000
 Net Income to Total Assets                                 0.315457
 ROA(A) before interest and % after tax                     0.282941
 ROA(B) before interest and depreciation after tax          0.273051
 ROA(C) before interest and depreciation before interest    0.260807
 Net worth/Assets                                           0.250161
 Debt ratio %                                               0.250161
 Persistent EPS in the Last Four Seasons                    0.219560
 Retained Earnings to Total Assets                          0.217779
 Net profit before tax/Paid-in capital                      0.207857
 Per Share Net profit before tax (Yuan ¥)                   0.201395
 Current Liability to Assets                                0.194494
 Working Capital to Total Assets                            0.193083
 Net Income to Stockholder's Equity                         0.180987
 Borrowing dependency             