In [1]:
from sklearn.metrics import fbeta_score, roc_auc_score, confusion_matrix, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import ADASYN, BorderlineSMOTE
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


In [10]:
df = pd.read_csv(r'C:\Users\Syrym\Downloads\taiwanese+bankruptcy+prediction\data.csv')

In [11]:
# Check for constant values in each column
constant_columns = df.columns[df.nunique() == 1]

# Print columns with constant values
print("Columns with constant values:", constant_columns)
df = df.drop(columns=constant_columns)

Columns with constant values: Index([' Net Income Flag'], dtype='object')


In [12]:
X = df.drop('Bankrupt?', axis=1) 
y = df['Bankrupt?'] 

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [37]:
# F2-score scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

# Define the parameter grid for Decision Tree
param_grid = {
    'max_depth': [5, 10, 15, 20, 25,30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Create classifier
dt_classifier = DecisionTreeClassifier(random_state=42)

# Results in a dictionary
results_dict = {}
name = 'Decision Tree Base Model'

# Grid search
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring=f2_scorer,n_jobs=-1)
grid_search.fit(X_train, y_train)
y_test_pred = grid_search.predict(X_test)

# Calculate confusion matrix,type I error,type II error
conf_matrix = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)

# Results in the dictionary
results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for Decision Tree Base Model
best_params: {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 3}
test_f2_score: 0.40425531914893614
auc_score: 0.7007575757575758
type_i_error: 0.030303030303030304
type_ii_error: 0.5681818181818182


In [38]:
param_grid = {
    'max_depth': [5, 10, 15, 20, 25,30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Create a SMOTEENN sampler and apply
samplerSMOTEENN = SMOTEENN(random_state=42)
X_resampled_SMOTEENN, y_resampled_SMOTEENN = samplerSMOTEENN.fit_resample(X_train, y_train)

# Results in a dictionary
results_dict = {}
name = 'SMOTE-EEN DT'


# Grid search on the resampled data
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring=f2_scorer,n_jobs=-1)
grid_search.fit(X_resampled_SMOTEENN, y_resampled_SMOTEENN)
y_test_pred_SMOTEENN = grid_search.predict(X_test)

# Calculate confusion matrix, type I error,type II error
conf_matrix_SMOTEENN = confusion_matrix(y_test, y_test_pred_SMOTEENN)
tn, fp, fn, tp = conf_matrix_SMOTEENN.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)



results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTEENN, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTEENN),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}

# Print the results
print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTE-EEN DT
best_params: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3}
test_f2_score: 0.49833887043189373
auc_score: 0.8049242424242423
type_i_error: 0.07196969696969698
type_ii_error: 0.3181818181818182


In [None]:

#parameter grid for Decision Tree with feature selection
param_grid = {
    'dt_classifier__max_depth': [5, 10, 15, 20, 25,30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'selectkbest__k': [10,20, 30, 40],  # Number of features to select
}

# Pipeline with SMOTEENN, feature selection, and Decision Tree classifier
pipeline = Pipeline([
    ('selectkbest', SelectKBest(mutual_info_classif)),
    ('dt_classifier', dt_classifier)
])

#Results in a dictionary
results_dict = {}

# Name for DT model
name = 'SMOTE-EEN DT with MI'

#Grid search on the resampled data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer)
grid_search.fit(X_resampled_SMOTEENN, y_resampled_SMOTEENN)
y_test_pred = grid_search.predict(X_test)

#confusion matrix,type I error,type II error
conf_matrix = confusion_matrix(y_test, y_test_pred)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)

# Results in the dictionary
results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")




Results for SMOTE-EEN DT with MI
best_params: {'dt_classifier__criterion': 'entropy', 'dt_classifier__max_depth': 10, 'dt_classifier__max_features': 'sqrt', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 3, 'selectkbest__k': 30}
test_f2_score: 0.5045871559633027
auc_score: 0.8303030303030304
type_i_error: 0.0893939393939394
type_ii_error: 0.25


In [42]:

# parameter grid for Decision Tree with RFE
param_grid = {
    'dt_classifier__max_depth': [5, 10, 15, 20, 25,30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'rfe__n_features_to_select': [10, 20, 30, 40],  # Number of features to select
}



rfe = RFE(estimator=dt_classifier, step=5)

# Results in a dictionary
results_dict = {}
name = 'SMOTE-EEN DT with RFE'

# Pipeline with SMOTEENN, RFE
pipeline = Pipeline([
    ('rfe', rfe),
    ('dt_classifier', dt_classifier)
])

# Grid search on the resampled data 
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTEENN, y_resampled_SMOTEENN)
y_test_pred_SMOTEENN_RFE = grid_search.predict(X_test)

#confusion matrix,type I error,type II error
conf_matrix_SMOTEENN_RFE = confusion_matrix(y_test, y_test_pred_SMOTEENN_RFE)
tn, fp, fn, tp = conf_matrix_SMOTEENN_RFE.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)

# Results in the dictionary
results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTEENN_RFE, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTEENN_RFE),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTE-EEN DT with RFE
best_params: {'dt_classifier__criterion': 'entropy', 'dt_classifier__max_depth': 10, 'dt_classifier__max_features': 'sqrt', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 3, 'rfe__n_features_to_select': 30}
test_f2_score: 0.4817275747508306
auc_score: 0.7931818181818182
type_i_error: 0.0893939393939394
type_ii_error: 0.25


In [24]:
param_grid = {
    'max_depth': [5, 10, 15, 20, 25,30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Create a SMOTETomek sampler and apply
samplerSMOTETomek = SMOTETomek(random_state=42)
X_resampled_SMOTETomek, y_resampled_SMOTETomek = samplerSMOTETomek.fit_resample(X_train, y_train)

# Results in a dictionary
results_dict = {}
name = 'SMOTETomek Decision Tree'

# Grid search on the resampled data
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring=f2_scorer)
grid_search.fit(X_resampled_SMOTETomek, y_resampled_SMOTETomek)
y_test_pred_SMOTETomek = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix = confusion_matrix(y_test, y_test_pred_SMOTETomek)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)



# Results in the dictionary
results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTETomek, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTETomek),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Decision Tree
best_params: {'criterion': 'entropy', 'max_depth': 15, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3}
test_f2_score: 0.4038461538461539
auc_score: 0.7147727272727273
type_i_error: 0.04772727272727273
type_ii_error: 0.5227272727272727


In [33]:

param_grid = {
    'dt_classifier__max_depth': [5, 10, 15, 20, 25,30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'selectkbest__k': [10,20,30,40],  # Number of features to select
}

pipeline = Pipeline([
    ('selectkbest', SelectKBest(mutual_info_classif)),
    ('dt_classifier', dt_classifier)
])

# Results in a dictionary
results_dict = {}
name = 'SMOTETomek Decision Tree with MI'

# Grid search on the resampled data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTETomek, y_resampled_SMOTETomek)
y_test_pred_SMOTETomek = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix = confusion_matrix(y_test, y_test_pred_SMOTETomek)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTETomek, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTETomek),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}

print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Decision Tree with MI
best_params: {'dt_classifier__criterion': 'gini', 'dt_classifier__max_depth': 10, 'dt_classifier__max_features': 'sqrt', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 3, 'selectkbest__k': 30}
test_f2_score: 0.5345911949685535
auc_score: 0.8454545454545456
type_i_error: 0.08181818181818182
type_ii_error: 0.22727272727272727


In [46]:

param_grid = {
    'dt_classifier__max_depth': [5, 10, 15, 20, 25,30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'rfe__n_features_to_select': [10,20,30, 40],  # Number of features to select
}


pipeline = Pipeline([
    ('rfe', RFE(estimator=dt_classifier, step=5)),
    ('dt_classifier', dt_classifier)
])

# Results in a dictionary
results_dict = {}
name = 'SMOTETomek Decision Tree with RFE'

# Grid search on the resampled data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTETomek, y_resampled_SMOTETomek)
y_test_pred_SMOTETomek_RFE = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix_SMOTETomek_RFE = confusion_matrix(y_test, y_test_pred_SMOTETomek_RFE)
tn, fp, fn, tp = conf_matrix_SMOTETomek_RFE.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTETomek_RFE, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTETomek_RFE),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Decision Tree with RFE
best_params: {'dt_classifier__criterion': 'entropy', 'dt_classifier__max_depth': 10, 'dt_classifier__max_features': 'sqrt', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 3, 'rfe__n_features_to_select': 40}
test_f2_score: 0.44964028776978415
auc_score: 0.7549242424242425
type_i_error: 0.058333333333333334
type_ii_error: 0.4318181818181818


In [26]:

# Define the parameter grid for Decision Tree
param_grid = {
    'max_depth': [5, 10, 15, 20, 25,30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Create a ADASYN( sampler and apply
samplerADASYN = ADASYN(random_state=42)
X_resampled_ADASYN, y_resampled_ADASYN = samplerADASYN.fit_resample(X_train, y_train)

# Results in a dictionary
results_dict = {}
name = 'ADASYN Decision Tree'


# Grid search on the resampled data
grid_search = GridSearchCV(dt_classifier, param_grid, cv=5, scoring=f2_scorer)
grid_search.fit(X_resampled_ADASYN, y_resampled_ADASYN)
y_test_pred_ADASYN = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix = confusion_matrix(y_test, y_test_pred_ADASYN)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_SMOTETomek, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_SMOTETomek),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for ADASYN Decision Tree
best_params: {'criterion': 'entropy', 'max_depth': 10, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 3}
test_f2_score: 0.4038461538461539
auc_score: 0.7147727272727273
type_i_error: 0.06439393939393939
type_ii_error: 0.3409090909090909


In [34]:

param_grid = {
    'dt_classifier__max_depth': [10, 15, 20, 25, 30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'selectkbest__k': [10, 20, 30, 40],  # Number of features to select
}



pipeline = Pipeline([
    ('selectkbest', SelectKBest(mutual_info_classif)),
    ('dt_classifier', dt_classifier)
])

# Results in a dictionary
results_dict = {}
name = 'ADASYN Decision Tree with MI'

# Perform grid search on the resampled data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_ADASYN, y_resampled_ADASYN)
y_test_pred_ADASYN = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix = confusion_matrix(y_test, y_test_pred_ADASYN)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_ADASYN, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_ADASYN),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for ADASYN Decision Tree with MI
best_params: {'dt_classifier__criterion': 'entropy', 'dt_classifier__max_depth': 10, 'dt_classifier__max_features': 'sqrt', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 9, 'selectkbest__k': 30}
test_f2_score: 0.5434782608695652
auc_score: 0.8556818181818181
type_i_error: 0.08409090909090909
type_ii_error: 0.20454545454545456


In [52]:

param_grid = {
    'dt_classifier__max_depth': [5, 10, 15, 20, 25, 30],
    'dt_classifier__min_samples_split': range(3, 12, 3),
    'dt_classifier__min_samples_leaf': range(3, 12, 3),
    'dt_classifier__max_features': ['sqrt', 'log2'],
    'dt_classifier__criterion': ['gini', 'entropy'],
    'rfe__n_features_to_select': [10,20,30,40],  # Number of features to select
}


pipeline = Pipeline([
    ('rfe', RFE(estimator=dt_classifier, step=5)),
    ('dt_classifier', dt_classifier)
])

# Results in a dictionary
results_dict = {}
name = 'ADASYN Decision Tree with RFE'

# Grid search on the resampled data
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_ADASYN, y_resampled_ADASYN)
y_test_pred_ADASYN_RFE = grid_search.predict(X_test)

# Calculate confusion matrix, type I error, type II error
conf_matrix_ADASYN_RFE = confusion_matrix(y_test, y_test_pred_ADASYN_RFE)
tn, fp, fn, tp = conf_matrix_ADASYN_RFE.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_ADASYN_RFE, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_ADASYN_RFE),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for ADASYN Decision Tree with RFE
best_params: {'dt_classifier__criterion': 'entropy', 'dt_classifier__max_depth': 15, 'dt_classifier__max_features': 'log2', 'dt_classifier__min_samples_leaf': 3, 'dt_classifier__min_samples_split': 3, 'rfe__n_features_to_select': 30}
test_f2_score: 0.4891304347826087
auc_score: 0.7791666666666667
type_i_error: 0.055303030303030305
type_ii_error: 0.38636363636363635
