# Bankruptcy Dataset - Logistic Regression

Response Variable: D (0 = Bankrupt, 1 = Healthy)
Predictors: R1 - R24 (Financial Ratios)

Source of Data: Darden Business Publishing

In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.simplefilter('ignore')
# Location of the bankruptcy case data. NOTE(review): this is an absolute,
# machine-specific path; the notebook only runs where this drive layout exists.
csv_path = 'G:/Statistics (Python)/Cases/Bankruptcy/Bankruptcy.csv'

# Read the whole file into a DataFrame and preview the first five rows.
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,NO,D,YR,R1,R2,R3,R4,R5,R6,R7,...,R15,R16,R17,R18,R19,R20,R21,R22,R23,R24
0,1,0,78,0.23,0.08,0.02,0.03,0.46,0.12,0.19,...,0.05,0.57,0.15,0.23,3.56,0.26,1.55,0.43,0.11,0.17
1,2,0,77,0.19,0.07,0.09,0.12,0.02,0.02,0.03,...,0.09,0.12,0.16,0.22,3.78,1.29,1.4,0.06,0.07,0.1
2,3,0,72,0.07,0.02,0.03,0.05,0.06,0.1,0.14,...,-0.03,0.02,0.02,0.04,13.29,1.61,1.43,0.03,0.05,0.07
3,4,0,80,0.07,0.03,0.04,0.04,0.04,0.06,0.06,...,-0.02,0.01,0.02,0.02,5.36,1.3,1.12,-0.06,-0.08,-0.09
4,5,0,81,0.09,0.02,0.03,0.04,0.06,0.08,0.11,...,0.02,0.07,0.1,0.14,7.74,1.48,1.41,0.03,0.04,0.06


In [2]:
# Predictors: every column from position 3 onward
# (expected to be the financial ratios R1-R24 described in the notebook header).
X = df.iloc[:,3:]
# Response: the column at position 1
# (expected to be D, where 0 = bankrupt and 1 = healthy).
y = df.iloc[:,1]

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression

# Pin the solver explicitly so the fit does not depend on the installed
# scikit-learn's default (liblinear before 0.22, lbfgs afterwards). The outputs
# recorded in this notebook appear to come from a liblinear-default version.
logreg = LogisticRegression(solver='liblinear')

# No shuffling: the stratified folds are deterministic, so no seed is needed.
# Passing random_state without shuffle=True has no effect on older scikit-learn
# and raises ValueError on newer releases. To get shuffled, seeded folds instead,
# use StratifiedKFold(n_splits=5, shuffle=True, random_state=42).
kfold = StratifiedKFold(n_splits=5)

# Per-fold classification accuracy of the logistic regression.
results = cross_val_score(logreg, X, y, cv=kfold, scoring='accuracy')
print(results)

[0.85714286 0.92307692 0.80769231 0.76923077 0.80769231]


In [3]:
# Summarise the per-fold scores held in `results` as mean and standard
# deviation, both expressed in percent.
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("Accuracy: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

Accuracy: 83.30% (5.30%)


In [4]:
# Score the same estimator on the same folds again, this time by area under
# the ROC curve. This overwrites the previous contents of `results`.
results = cross_val_score(estimator=logreg, X=X, y=y, scoring='roc_auc', cv=kfold)
print(results)

[0.8877551  0.95266272 0.85207101 0.85207101 0.86390533]


In [5]:
# Summarise the per-fold scores held in `results` as mean and standard
# deviation, both expressed in percent.
mean_pct = results.mean() * 100
std_pct = results.std() * 100
print("ROC AUC: %.2f%% (%.2f%%)" % (mean_pct, std_pct))

ROC AUC: 88.17% (3.78%)


## L1 

### Accuracy optimization for C

In [6]:
from sklearn.model_selection import GridSearchCV

# Candidate values for C (the inverse of the regularisation strength in
# scikit-learn's LogisticRegression) to be compared by the grid search.
c_space = np.array([0.01,0.1,0.25,0.3,0.5,0.6,0.7,0.9,1,1.1,1.2,1.5])
param_grid = {'C': c_space}

# The L1 penalty needs a solver that supports it. The current default solver
# (lbfgs) does not, so every fit would fail; liblinear handles L1 and was the
# default in older scikit-learn releases.
logreg = LogisticRegression(penalty='l1', solver='liblinear')

# 5-fold grid search over C; with no `scoring` given, candidates are ranked by
# the estimator's own score (classification accuracy).
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit every candidate on the full predictor matrix and response.
logreg_cv.fit(X,y)

print("The Best Accuracy:" , logreg_cv.best_score_)

The Best Accuracy: 0.8409090909090909




In [7]:
# Report the parameter setting selected by the most recently fitted grid search.
best_params = logreg_cv.best_params_
print("The Best Parameter:", best_params)

The Best Parameter: {'C': 0.9}


### AUC optimization for C 

In [8]:
# Repeat the 5-fold grid search over `param_grid` for the `logreg` estimator
# defined in an earlier cell, ranking candidates by ROC AUC.
# This rebinds `logreg_cv`, replacing any previously fitted search.
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, scoring='roc_auc', cv=5)

# Run the search on the full predictor matrix and response.
logreg_cv.fit(X, y)

best_auc = logreg_cv.best_score_
print("The Best AUC:", best_auc)

The Best AUC: 0.8682983682983683




In [9]:
# Report the parameter setting selected by the most recently fitted grid search.
best_params = logreg_cv.best_params_
print("The Best Parameter:", best_params)

The Best Parameter: {'C': 1.2}


## L2

###  Accuracy optimization for C

In [10]:
# L2-penalised logistic regression. The solver is pinned to liblinear so the
# result does not depend on the installed scikit-learn's default solver, and so
# the L2 search uses a solver that can also fit an L1 penalty, allowing a
# like-for-like comparison between the two penalties.
logreg = LogisticRegression(penalty='l2', solver='liblinear')

# 5-fold grid search over `param_grid` (defined in an earlier cell); with no
# `scoring` given, candidates are ranked by classification accuracy.
logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# Fit every candidate on the full predictor matrix and response.
logreg_cv.fit(X,y)

print("The Best Accuracy:",logreg_cv.best_score_)

The Best Accuracy: 0.8333333333333334




In [11]:
# Report the parameter setting selected by the most recently fitted grid search.
best_params = logreg_cv.best_params_
print("The Best Parameter:", best_params)

The Best Parameter: {'C': 0.5}


### AUC Optimization for C 

In [12]:
# Repeat the 5-fold grid search over `param_grid` for the `logreg` estimator
# defined in an earlier cell, ranking candidates by ROC AUC.
# This rebinds `logreg_cv`, replacing any previously fitted search.
logreg_cv = GridSearchCV(estimator=logreg, param_grid=param_grid, scoring='roc_auc', cv=5)

# Run the search on the full predictor matrix and response.
logreg_cv.fit(X, y)

best_auc = logreg_cv.best_score_
print("The Best AUC:", best_auc)

The Best AUC: 0.8886113886113887




In [14]:
# Report the parameter setting selected by the most recently fitted grid search.
best_params = logreg_cv.best_params_
print("The Best Parameter:", best_params)

The Best Parameter: {'C': 0.25}


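Hence, since the L2-penalized model achieves the highest cross-validated AUC (about 0.889 at C = 0.25, compared with about 0.868 for L1 at C = 1.2), we choose L2-regularized logistic regression with C = 0.25 as the best model.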