In [11]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, PassiveAggressiveClassifier, LogisticRegression


### Loading CSVs (3 differents with: initial, with correlation less that 0,8, With Variance > 0,02)

In [12]:
data = pd.read_csv('data.csv')
data.columns = data.columns.str.strip()
data.rename(columns={'Debt ratio %': 'Debt_ratio'}, inplace=True)

In [25]:
lc_data = pd.read_csv('low_correl_data.csv')
hv_data = pd.read_csv('low_var_data.csv')
pca_data = pd.read_csv('pca_data.csv')


In [16]:
def split_and_smote(df, test_ratio, random_state, random_state_smote):
    X = df.drop('Bankrupt?', axis=1)
    y = df['Bankrupt?']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=7986)
    smote = SMOTE(random_state=random_state_smote)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test

In [52]:
Xdata_train, Xdata_test, ydata_train, ydata_test = split_and_smote(data , 0.2 , 456 , 123)


(10556, 95)

In [21]:
Xlc_train, Xlc_test, ylc_train, ylc_test = split_and_smote(lc_data , 0.2 , 456 , 123)

In [22]:
Xhv_train, Xhv_test, yhv_train, yhv_test = split_and_smote(hv_data , 0.2 , 456 , 123)

### Model : Logistic Regression

In [48]:
LR_model = LogisticRegression(max_iter=1000)
LR_model.fit(Xdata_train, ydata_train)

In [49]:
LR_model.fit(Xlc_train, ylc_train)

In [50]:
LR_model.fit(Xhv_train, yhv_train)

In [55]:
ydata_LRpred = LR_model.predict(Xdata_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- After-tax Net Profit Growth Rate
- After-tax net Interest Rate
- Borrowing dependency
- CFO to Assets
- Cash Flow Per Share
- ...


In [56]:
Xdata_train.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
0,0.569444,0.586186,0.628406,0.61252,0.61252,0.999165,0.797647,0.809541,0.303565,0.781811,...,0.823108,0.006469,0.623595,0.612519,0.841918,0.278031,0.02683,0.565337,1,0.039134
1,0.468142,0.534562,0.520959,0.598495,0.598538,0.998963,0.797381,0.809307,0.303521,0.781541,...,0.796095,0.00978,0.62414,0.598496,0.840054,0.281694,0.047109,0.569948,1,0.024036
2,0.432896,0.501526,0.482628,0.590741,0.59077,0.998864,0.797032,0.808984,0.30312,0.781172,...,0.77874,0.007182,0.623217,0.590737,0.838851,0.278031,0.026702,0.564693,1,0.039132
3,0.489787,0.551679,0.536485,0.598048,0.598048,0.998987,0.797382,0.809308,0.303475,0.781565,...,0.796879,0.001452,0.62352,0.598046,0.840241,0.298939,0.037504,0.569745,1,0.014222
4,0.620192,0.702846,0.691632,0.620908,0.620908,0.999207,0.797683,0.809604,0.30354,0.781885,...,0.879568,0.001569,0.624056,0.620907,0.845733,0.277877,0.026791,0.565159,1,0.040519


In [57]:
Xdata_test.head()

Unnamed: 0,ROA(C) before interest and depreciation before interest,ROA(A) before interest and % after tax,ROA(B) before interest and depreciation after tax,Operating Gross Margin,Realized Sales Gross Margin,Operating Profit Rate,Pre-tax net Interest Rate,After-tax net Interest Rate,Non-industry income and expenditure/revenue,Continuous interest rate (after tax),...,Net Income to Total Assets,Total assets to GNP price,No-credit Interval,Gross Profit to Sales,Net Income to Stockholder's Equity,Liability to Equity,Degree of Financial Leverage (DFL),Interest Coverage Ratio (Interest expense to EBIT),Net Income Flag,Equity to Liability
2829,0.457661,0.539195,0.508325,0.59967,0.59967,0.99888,0.797412,0.809326,0.30375,0.78142,...,0.79723,0.024271,0.622684,0.599668,0.840182,0.283932,0.029567,0.568808,1,0.020696
1911,0.548774,0.575992,0.601906,0.625218,0.625218,0.999056,0.797548,0.809502,0.303619,0.781754,...,0.817849,0.000545,0.62515,0.625218,0.841178,0.275552,0.026791,0.565158,1,0.121426
5950,0.476966,0.525131,0.526527,0.597616,0.597169,0.998937,0.797365,0.809269,0.30355,0.781611,...,0.792243,0.00076,0.625029,0.597615,0.839814,0.276879,0.0262,0.558753,1,0.05427
6257,0.478575,0.541321,0.527705,0.606394,0.606394,0.999027,0.797449,0.809354,0.303506,0.781625,...,0.802114,0.001682,0.623796,0.606391,0.840385,0.276116,0.026794,0.565174,1,0.077683
1517,0.334325,0.393044,0.386905,0.571275,0.571275,0.998378,0.796688,0.808731,0.303536,0.780934,...,0.730177,0.000821,0.62379,0.571274,0.834077,0.281133,0.026745,0.564929,1,0.025237


In [58]:
Xdata_train.columns == Xdata_test.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True])

In [59]:
LR_model = LogisticRegression(max_iter=1000)
LR_model.fit(Xdata_train, ydata_train)

print("Convergence status:", LR_model.converged_)

AttributeError: 'LogisticRegression' object has no attribute 'converged_'

In [60]:
import sklearn
print(sklearn.__version__)


1.2.2


In [61]:
ydata_train

0        0
1        0
2        0
3        0
4        0
        ..
10551    1
10552    1
10553    1
10554    1
10555    1
Name: Bankrupt?, Length: 10556, dtype: int64