In [20]:
from imblearn.over_sampling import SMOTE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, PassiveAggressiveClassifier, LogisticRegression
from xgboost import XGBClassifier
from sklearn.datasets import load_iris
from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, roc_curve, confusion_matrix, f1_score, ConfusionMatrixDisplay


### Loading CSVs (3 differents with: initial, with correlation less that 0,8, With Variance > 0,02)

In [21]:
data = pd.read_csv('data.csv')
data.columns = data.columns.str.strip()
data.rename(columns={'Debt ratio %': 'Debt_ratio'}, inplace=True)

In [46]:
lc_data = pd.read_csv('low_correl_data.csv')
hv_data = pd.read_csv('low_var_data.csv')


In [23]:
def split_and_smote(df, test_ratio, random_state, random_state_smote):
    X = df.drop('Bankrupt?', axis=1)
    y = df['Bankrupt?']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=7986)
    smote = SMOTE(random_state=random_state_smote)
    X_train, y_train = smote.fit_resample(X_train, y_train)
    return X_train, X_test, y_train, y_test

In [24]:
Xdata_train, Xdata_test, ydata_train, ydata_test = split_and_smote(data , 0.2 , 456 , 123)


In [25]:
Xlc_train, Xlc_test, ylc_train, ylc_test = split_and_smote(lc_data , 0.2 , 456 , 123)

In [26]:
Xhv_train, Xhv_test, yhv_train, yhv_test = split_and_smote(hv_data , 0.2 , 456 , 123)

### Function Definition for storing multiple evaluations of models

In [27]:
def evaluate_model(note, model, X_test, y_test, results):
    pred = model.predict(X_test)
    score = model.score(X_test, y_test)
    precision = precision_score(y_test, pred, pos_label=1)
    recall = recall_score(y_test, pred, pos_label=1)
    f1 = f1_score(y_test, pred, pos_label=1)
    false_negatives = confusion_matrix(y_test, pred)[1][0]
    false_positives = confusion_matrix(y_test, pred)[0][1]
    new_result = pd.Series([note, score, precision, recall, f1, false_negatives, false_positives], index=results.columns)

    results.loc[len(results)] = new_result
    return results

results = pd.DataFrame(columns=['note', 'accuracy', 'precision', 'recall', 'f1_score', 'false_negatives', 'false_positives'])


### Model : Logistic Regression

In [28]:
LR_model = LogisticRegression(max_iter=1000)
LR_model.fit(Xdata_train, ydata_train)

In [29]:
evaluate_model('Logistic Regression data', LR_model, Xdata_test, ydata_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381


In [30]:
LR_model_lc = LogisticRegression(max_iter=1000)
LR_model_lc.fit(Xlc_train, ylc_train)

In [31]:
evaluate_model('Logistic Regression Low Correlation', LR_model_lc, Xlc_test, ylc_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381


In [32]:
LR_model_hv = LogisticRegression(max_iter=1000)
LR_model_hv.fit(Xhv_train, yhv_train)

In [33]:
evaluate_model('Logistic Regression High Variance', LR_model_hv, Xhv_test, yhv_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381


### Model: XG_Boost

In [34]:
XG_model = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
XG_model.fit(Xdata_train, ydata_train)

In [35]:
evaluate_model('XG_model data', XG_model, Xdata_test, ydata_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269


In [36]:
XG_model_lc = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
XG_model_lc.fit(Xlc_train, ylc_train)

In [37]:
evaluate_model('XG_model low correlation', XG_model_lc, Xlc_test, ylc_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269
4,XG_model low correlation,0.862903,0.166667,0.837209,0.277992,7,180


In [38]:
XG_model_hv = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
XG_model_hv.fit(Xhv_train, yhv_train)

In [39]:
evaluate_model('XG_model high variance', XG_model_hv, Xhv_test, yhv_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269
4,XG_model low correlation,0.862903,0.166667,0.837209,0.277992,7,180
5,XG_model high variance,0.828446,0.136882,0.837209,0.235294,7,227


### Model: Passive Aggressive Classifier

In [40]:
PA_model = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
PA_model.fit(Xdata_train, ydata_train)

In [41]:
evaluate_model('Passive Aggressive Classifier data', PA_model, Xdata_test, ydata_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269
4,XG_model low correlation,0.862903,0.166667,0.837209,0.277992,7,180
5,XG_model high variance,0.828446,0.136882,0.837209,0.235294,7,227
6,Passive Aggressive Classifier data,0.879032,0.044776,0.139535,0.067797,37,128


In [42]:
PA_model_lc = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
PA_model_lc.fit(Xlc_train, ylc_train)

In [43]:
evaluate_model('Passive Aggressive Classifier low correlation', PA_model_lc, Xlc_test, ylc_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269
4,XG_model low correlation,0.862903,0.166667,0.837209,0.277992,7,180
5,XG_model high variance,0.828446,0.136882,0.837209,0.235294,7,227
6,Passive Aggressive Classifier data,0.879032,0.044776,0.139535,0.067797,37,128
7,Passive Aggressive Classifier low correlation,0.922287,0.056338,0.093023,0.070175,39,67


In [44]:
PA_model_hv = PassiveAggressiveClassifier(max_iter=1000, random_state=0,tol=1e-3)
PA_model_hv.fit(Xhv_train, yhv_train)

In [45]:
evaluate_model('Passive Aggressive Classifier high variance', PA_model_hv, Xhv_test, yhv_test, results)

Unnamed: 0,note,accuracy,precision,recall,f1_score,false_negatives,false_positives
0,Logistic Regression data,0.699413,0.035443,0.325581,0.063927,29,381
1,Logistic Regression Low Correlation,0.699413,0.035443,0.325581,0.063927,29,381
2,Logistic Regression High Variance,0.699413,0.035443,0.325581,0.063927,29,381
3,XG_model data,0.799853,0.126623,0.906977,0.222222,4,269
4,XG_model low correlation,0.862903,0.166667,0.837209,0.277992,7,180
5,XG_model high variance,0.828446,0.136882,0.837209,0.235294,7,227
6,Passive Aggressive Classifier data,0.879032,0.044776,0.139535,0.067797,37,128
7,Passive Aggressive Classifier low correlation,0.922287,0.056338,0.093023,0.070175,39,67
8,Passive Aggressive Classifier high variance,0.494868,0.031884,0.511628,0.060027,21,668


### Tests with PCA