In [8]:
%pip install mlxtend
%pip install pycfs
%pip install h5py
%pip install tqdm


Collecting mlxtend
  Downloading mlxtend-0.23.0-py3-none-any.whl (1.4 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.23.0
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Syrym\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


Collecting tqdm
  Downloading tqdm-4.66.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.1
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\Syrym\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [1]:
from sklearn.metrics import fbeta_score, roc_auc_score, confusion_matrix, make_scorer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import ADASYN
from sklearn.feature_selection import SelectKBest, mutual_info_classif
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from imblearn.combine import SMOTEENN, SMOTETomek
from sklearn.feature_selection import SelectKBest, mutual_info_classif, RFE

In [7]:
df = pd.read_csv(r'C:\Users\Syrym\Downloads\taiwanese+bankruptcy+prediction\data.csv')
# Check for constant values in each column
constant_columns = df.columns[df.nunique() == 1]

# Print columns with constant values
print("Columns with constant values:", constant_columns)
df = df.drop(columns=constant_columns)

Columns with constant values: Index([' Net Income Flag'], dtype='object')


In [2]:
# Calculating the percentage difference
initial_value = 0.2276
final_value = 0.1439

percentage_change = ((final_value - initial_value) / initial_value) * 100
percentage_change


-36.77504393673111

In [3]:
X = df.drop('Bankrupt?', axis=1) 
y = df['Bankrupt?'] 

# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# F2-score scorer
f2_scorer = make_scorer(fbeta_score, beta=2)

param_grid = {
    'n_estimators': [50, 100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, 20, 25, 30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}

# Create classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Results in a dictionary
results_dict = {}
name = 'Random Forest Base Model'

# Perform grid search
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_train, y_train)
y_test_pred_baseModel_RF = grid_search.predict(X_test)

# Calculate confusion matrix, type I error,type II error
conf_matrix = confusion_matrix(y_test, y_test_pred_baseModel_RF)
tn, fp, fn, tp = conf_matrix.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)

# Results in the dictionary
results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_baseModel_RF, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_baseModel_RF),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for Random Forest Base Model
best_params: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 50}
test_f2_score: 0.1871657754010695
auc_score: 0.578030303030303
type_i_error: 0.0030303030303030303
type_ii_error: 0.8409090909090909


In [11]:

param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [5, 10, 15, 20, 25,30],
    'min_samples_split': range(3, 12, 3),
    'min_samples_leaf': range(3, 12, 3),
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy']
}
# Create a SMOTETomek sampler and apply
samplerSMOTETomek = SMOTETomek(random_state=42)
X_resampled_SMOTETomek, y_resampled_SMOTETomek = samplerSMOTETomek.fit_resample(X_train, y_train)


# results in a dictionary
results_dict = {}
name = 'SMOTETomek Random Forest'

# Perform grid search
grid_search = GridSearchCV(rf_classifier, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTETomek,y_resampled_SMOTETomek)
y_test_pred_resampled_SMOTETomek = grid_search.predict(X_test)

# Calculate confusion matrix, type I error,type II error
conf_matrix_resampled_SMOTETomek = confusion_matrix(y_test, y_test_pred_resampled_SMOTETomek)
tn, fp, fn, tp = conf_matrix_resampled_SMOTETomek.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_resampled_SMOTETomek, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_resampled_SMOTETomek),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Random Forest
best_params: {'criterion': 'entropy', 'max_depth': 25, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 3, 'n_estimators': 200}
test_f2_score: 0.6390977443609023
auc_score: 0.865151515151515
type_i_error: 0.04242424242424243
type_ii_error: 0.22727272727272727


In [17]:
param_grid = {
    'rf_classifier__n_estimators': [100,200, 300, 400, 500],
    'rf_classifier__max_depth': [5, 10, 15, 20, 25, 30],
    'rf_classifier__min_samples_split':  range(3, 12, 3),
    'rf_classifier__min_samples_leaf': range(3, 12, 3),
    'rf_classifier__max_features': ['sqrt', 'log2'],
    'rf_classifier__criterion': ['gini', 'entropy'],
    'selectkbest__k': [10,20, 30, 40,]  # Number of features to select
}


pipeline = Pipeline([
    ('selectkbest', SelectKBest(mutual_info_classif)),
    ('rf_classifier', rf_classifier)
])

# Results in a dictionary
results_dict = {}
name = 'SMOTETomek Random Forest with MI'

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTETomek, y_resampled_SMOTETomek)
y_test_pred_resampled_SMOTETomek_MI = grid_search.predict(X_test)

# Calculate confusion matrix, type I error,type II error
conf_matrix_resampled_SMOTETomek_MI = confusion_matrix(y_test, y_test_pred_resampled_SMOTETomek_MI)
tn, fp, fn, tp = conf_matrix_resampled_SMOTETomek_MI.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_resampled_SMOTETomek_MI, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_resampled_SMOTETomek_MI),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Random Forest with MI
best_params: {'rf_classifier__criterion': 'entropy', 'rf_classifier__max_depth': 20, 'rf_classifier__max_features': 'sqrt', 'rf_classifier__min_samples_leaf': 3, 'rf_classifier__min_samples_split': 3, 'rf_classifier__n_estimators': 300, 'selectkbest__k': 40}
test_f2_score: 0.6133828996282529
auc_score: 0.8522727272727273
type_i_error: 0.045454545454545456
type_ii_error: 0.25


In [16]:

param_grid = {
    'rf_classifier__n_estimators': [100, 200, 300, 400, 500],
    'rf_classifier__max_depth': [5, 10, 15, 20, 25, 30],
    'rf_classifier__min_samples_split': range(3, 12, 3),
    'rf_classifier__min_samples_leaf':range(3, 12, 3),
    'rf_classifier__max_features': ['sqrt', 'log2'],
    'rf_classifier__criterion': ['gini', 'entropy'],
    'rfe__n_features_to_select': [10, 20, 30, 40],  # Number of features to select
}


pipeline = Pipeline([
    ('rfe', RFE(estimator=rf_classifier, step=5)),
    ('rf_classifier', rf_classifier)
])

# 4esults in a dictionary
results_dict = {}
name = 'SMOTETomek Random Forest with RFE'

# grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring=f2_scorer, n_jobs=-1)
grid_search.fit(X_resampled_SMOTETomek, y_resampled_SMOTETomek)
y_test_pred_resampled_SMOTETomek_RFE = grid_search.predict(X_test)

# Calculate confusion matrix, type I error,type II error
conf_matrix_resampled_SMOTETomek_RFE = confusion_matrix(y_test, y_test_pred_resampled_SMOTETomek_RFE)
tn, fp, fn, tp = conf_matrix_resampled_SMOTETomek_RFE.ravel()
type_i_error = fp / (fp + tn)
type_ii_error = fn / (fn + tp)


results_dict[name] = {
    'best_params': grid_search.best_params_,
    'test_f2_score': fbeta_score(y_test, y_test_pred_resampled_SMOTETomek_RFE, beta=2),
    'auc_score': roc_auc_score(y_test, y_test_pred_resampled_SMOTETomek_RFE),
    'type_i_error': type_i_error,
    'type_ii_error': type_ii_error
}


print("Results for", name)
for metric, value in results_dict[name].items():
    print(f"{metric}: {value}")


Results for SMOTETomek Random Forest with RFE
best_params: {'rf_classifier__criterion': 'entropy', 'rf_classifier__max_depth': 15, 'rf_classifier__max_features': 'sqrt', 'rf_classifier__min_samples_leaf': 3, 'rf_classifier__min_samples_split': 3, 'rf_classifier__n_estimators': 200, 'rfe__n_features_to_select': 40}
test_f2_score: 0.6106870229007633
auc_score: 0.8431818181818181
type_i_error: 0.04090909090909091
type_ii_error: 0.2727272727272727
