In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df=pd.read_csv('Data.csv')

In [3]:
X=df.drop('Bankruptcy',axis='columns')
y=df['Bankruptcy']

In [4]:
df.shape

(6819, 96)

In [19]:
y.value_counts()

0    6599
1     220
Name: Bankruptcy, dtype: int64

Clearly We can see there is a data imbalance,So to counter it we used SMOTE **oversampling**

In [5]:
from imblearn.over_sampling import SMOTE

In [6]:
smote=SMOTE(sampling_strategy='minority')
X_sm,y_sm = smote.fit_resample(X,y)
y_sm.value_counts()

1    6599
0    6599
Name: Bankruptcy, dtype: int64

Now we will standardize the data,and split it into **Train and Test** 

In [7]:
from sklearn.preprocessing import StandardScaler
sc=StandardScaler()

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=15, stratify=y_sm)
y_train.value_counts()
y_test.value_counts()

1    1320
0    1320
Name: Bankruptcy, dtype: int64

In [9]:
X_train=sc.fit_transform(X_train)
X_test= sc.transform(X_test)

here we used **PCA** for dimentionality Reduction, basically it enables us to preserve most of the information in a lower subspace,and reduces the complexity of the data.

In [10]:
from sklearn.decomposition import PCA
pca = PCA(n_components = 5)
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

In [12]:
X_train.shape

(10558, 5)

We used logistic Regression as A **Classification** Technique to see how accurately does the Model predict whether a company will go **Bankrupt or not**

In [13]:
from sklearn.linear_model import LogisticRegression
classifier= LogisticRegression(max_iter= 10000, random_state= 0)
classifier.fit(X_train,y_train)

In [14]:
y_pred=classifier.predict(X_test)
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report
print("Classsification Report: \n", classification_report(y_test,y_pred))

Classsification Report: 
               precision    recall  f1-score   support

           0       0.88      0.87      0.88      1320
           1       0.87      0.88      0.88      1320

    accuracy                           0.88      2640
   macro avg       0.88      0.88      0.88      2640
weighted avg       0.88      0.88      0.88      2640



In [15]:
accuracy_score(y_test,y_pred)

0.8787878787878788

the Model Acuracy is 87.78%

In [16]:
confusion_matrix(y_test,y_pred)

array([[1153,  167],
       [ 153, 1167]], dtype=int64)

we also used **Cross-Validation** 

In [17]:
from sklearn.model_selection import cross_val_score
accuracies= cross_val_score(estimator=classifier,X= X_train, y =y_train, cv = 10)
print("Accuracy : {:.2f}%".format(accuracies.mean()*100))
print("Standard_Deviation : {:.2f}%".format(accuracies.std()*100))


Accuracy : 87.01%
Standard_Deviation : 0.82%
