# **Modelling**

In [None]:
# Libraries
import pandas as pd
import numpy as np

from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from xgboost import XGBClassifier

In [2]:
# Reading csv file that contains the prepared data for modelling process
credit_data = pd.read_csv("tables/credit_data_modelling.csv", index_col=0)

In [15]:
# Classification result tables

# Classification report
class_report_results = pd.DataFrame({"Type": ["Baseline", "Baseline", "Advanced", "Advanced", "Advanced", "Advanced"],
                                     "Model": ["Majority Class Model", "Shallow Decision Tree", "Decision Tree", "Random Forest", "AdaBoost", "GradientBoost"],
                                     "Accuracy": np.nan,
                                     "Precision": np.nan,
                                     "Recall": np.nan,
                                     "ROC-AUC": np.nan})

# Confusion matrix
conf_matrix = {"Majority Class Model": np.nan,
                "Shallow Decision Tree": np.nan,
                "Decision Tree": np.nan,
                "Random Forest": np.nan,
                "AdaBoost": np.nan,
                "GradientBoost": np.nan}

# Best parameters
best_parameters = pd.DataFrame({"Model": ["Decision Tree", "Random Forest", "AdaBoost", "GradientBoost"]})

# Misclassification matrix
misclass_matrix = {"Majority Class Model": np.nan,
                    "Shallow Decision Tree": np.nan,
                    "Decision Tree": np.nan,
                    "Random Forest": np.nan,
                    "AdaBoost": np.nan,
                    "GradientBoost": np.nan}

# Misclassification costs
misclass_cost = pd.DataFrame({"Type": ["Baseline", "Baseline", "Advanced", "Advanced", "Advanced", "Advanced"],
                                "Model": ["Majority Class Model", "Shallow Decision Tree", "Decision Tree", "Random Forest", "AdaBoost", "GradientBoost"],
                                "Maximum cost": [5000, 5000, 1000, 1000, 1000, 1000],
                                "Model cost": np.nan,
                                "Relative cost": np.nan})

# Feature importance
feature_importance = {"Majority Class Model": np.nan,
                    "Shallow Decision Tree": np.nan,
                    "Decision Tree": np.nan,
                    "Random Forest": np.nan,
                    "AdaBoost": np.nan,
                    "GradientBoost": np.nan}

## **Baseline Model**

### **Majority Class Model**

In [16]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Define the Majority Class Model
majority_model = DummyClassifier(strategy="most_frequent")

# Train the model
majority_model.fit(X, y)

# Make predictions (in-sample) of classes and probabilities
y_pred = majority_model.predict(X)
y_prob = majority_model.predict_proba(X)

# Evaluate the model -----
# Confusion matrix and related metrics
conf_matrix["Majority Class Model"] = confusion_matrix(y, y_pred)
class_report_results.loc[class_report_results["Model"] == "Majority Class Model", "Accuracy"] = round(accuracy_score(y, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "Majority Class Model", "Precision"] = round(precision_score(y, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "Majority Class Model", "Recall"] = round(recall_score(y, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "Majority Class Model", "ROC-AUC"] = round(roc_auc_score(y, y_pred), 4)

fpr, tpr, thresholds = roc_curve(y, y_prob[:, 1]) # False Positive Rate, True Positive Rate, and Tresholds
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_majority_model.csv", index=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "Majority Class Model"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5


### **Shallow Random Forest**

In [24]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Define the Shallow Decision Tree Model (max depth = 2 to keep it shallow)
shallow_tree_model = DecisionTreeClassifier(max_depth=2, criterion="gini")

# Train the model
shallow_tree_model.fit(X, y)

# Make predictions (in-sample) of classes and probabilities
y_pred = shallow_tree_model.predict(X)
y_prob = shallow_tree_model.predict_proba(X)

# Evaluate the model -----
# Confusion matrix and related metrics
conf_matrix["Shallow Decision Tree"] = confusion_matrix(y, y_pred)
class_report_results.loc[class_report_results["Model"] == "Shallow Decision Tree", "Accuracy"] = round(accuracy_score(y, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "Shallow Decision Tree", "Precision"] = round(precision_score(y, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "Shallow Decision Tree", "Recall"] = round(recall_score(y, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "Shallow Decision Tree", "ROC-AUC"] = round(roc_auc_score(y, y_pred), 4)

fpr, tpr, thresholds = roc_curve(y, y_prob[:, 1]) # False Positive Rate, True Positive Rate, and Tresholds
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_shallow_decision_tree.csv", index=False)

# Feature importance -----
feature_importance["Shallow Decision Tree"] = pd.DataFrame({"Feature": shallow_tree_model.feature_names_in_,
                                                            "Importance": shallow_tree_model.feature_importances_
                                                            }).sort_values(by="Importance", ascending=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "Shallow Decision Tree"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5
1,Baseline,Shallow Decision Tree,0.719,0.7085,0.719,0.645


In [62]:
# Misclassification costs of the baseline models

# Define cost matrix (FN = 5, FP = 1)
cost_matrix = np.array([[0, 1],  # Cost of predicting Bad (1) when True is Good (0) = 1
                        [5, 0]]) # Cost of predicting Good (0) when True is Bad (1) = 5

# Misclassification costs of the Majority Class Model
misclass_matrix["Majority Class Model"] = conf_matrix["Majority Class Model"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "Majority Class Model", "Model cost"] = (conf_matrix["Majority Class Model"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "Majority Class Model", "Relative cost"] = (conf_matrix["Majority Class Model"] * cost_matrix).sum()/(cost_matrix.max() * len(X))

# Misclassification costs of the Shallow Decision Tree
misclass_matrix["Shallow Decision Tree"] = conf_matrix["Shallow Decision Tree"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "Shallow Decision Tree", "Model cost"] = (conf_matrix["Shallow Decision Tree"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "Shallow Decision Tree", "Relative cost"] = (conf_matrix["Shallow Decision Tree"] * cost_matrix).sum()/(cost_matrix.max() * len(X))

# Display costs
display(misclass_cost.iloc[0:(misclass_cost.loc[misclass_cost["Model"] == "Shallow Decision Tree"].index[0] + 1)])

Unnamed: 0,Type,Model,Maximum cost,Model cost,Relative cost
0,Baseline,Majority Class Model,5000,1500.0,0.3
1,Baseline,Shallow Decision Tree,5000,929.0,0.1858


## **Advanced Machine Learning Models**

### **Decision Tree**

In [27]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 

# Define the Decision Tree Model with class weights
decision_tree_model = DecisionTreeClassifier(criterion="gini", class_weight={0: 1, 1: 5}, random_state=7)

# Define the hyperparameter grid
param_grid = {
    "max_depth": [1, 2, 3, 4, 5, 6, 7, 8, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 3, 4]
}

# Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=decision_tree_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5
    )

# Fit the model
grid_search.fit(X_train, y_train)

# Make predictions with the best model
best_dt_model = grid_search.best_estimator_
y_pred = best_dt_model.predict(X_test)
y_prob = best_dt_model.predict_proba(X_test)

# Save best parameters
best_parameters.loc[best_parameters["Model"] == "Decision Tree", list(grid_search.best_params_.keys())] = list(grid_search.best_params_.values())

# Evaluate the final model -----
# Confusion matrix and related metrics
conf_matrix["Decision Tree"] = confusion_matrix(y_test, y_pred)
class_report_results.loc[class_report_results["Model"] == "Decision Tree", "Accuracy"] = round(accuracy_score(y_test, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "Decision Tree", "Precision"] = round(precision_score(y_test, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "Decision Tree", "Recall"] = round(recall_score(y_test, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "Decision Tree", "ROC-AUC"] = round(roc_auc_score(y_test, y_pred), 4)
# False Positive Rate, True Positive Rate, and Tresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_decision_tree.csv", index=False)

# Misclassification costs -----
misclass_matrix["Decision Tree"] = conf_matrix["Decision Tree"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "Decision Tree", "Model cost"] = (conf_matrix["Decision Tree"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "Decision Tree", "Relative cost"] = (conf_matrix["Decision Tree"] * cost_matrix).sum()/(cost_matrix.max() * len(X_test))

# Feature importance -----
feature_importance["Decision Tree"] = pd.DataFrame({"Feature": best_dt_model.feature_names_in_,
                                                    "Importance": best_dt_model.feature_importances_
                                                    }).sort_values(by="Importance", ascending=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "Decision Tree"].index[0] + 1)])
display(misclass_cost.iloc[0:(misclass_cost.loc[misclass_cost["Model"] == "Decision Tree"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5
1,Baseline,Shallow Decision Tree,0.719,0.7085,0.719,0.645
2,Advanced,Decision Tree,0.71,0.7547,0.71,0.6893


Unnamed: 0,Type,Model,Maximum cost,Model cost,Relative cost
0,Baseline,Majority Class Model,5000,,
1,Baseline,Shallow Decision Tree,5000,,
2,Advanced,Decision Tree,1000,130.0,0.13


### **Random Forest**

In [28]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 

# Define the Random Forest Model with class weights
random_forest_model = RandomForestClassifier(criterion="gini", class_weight={0: 1, 1: 5}, random_state=7)

# Define the hyperparameter grid
param_grid = {
    "max_depth": [1, 2, 3, 4, 6, 8, None],
    "n_estimators": [100],
    "min_samples_split": [2, 5, 10],
    "max_features": ["sqrt", "log2", None]
}

# Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=random_forest_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=0
    )

# Fit the model
grid_search.fit(X_train, y_train)

# Make predictions with the best model
best_rf_model = grid_search.best_estimator_
y_pred = best_rf_model.predict(X_test)
y_prob = best_rf_model.predict_proba(X_test)

# Save best parameters
best_parameters.loc[best_parameters["Model"] == "Random Forest", list(grid_search.best_params_.keys())] = list(grid_search.best_params_.values())

# Evaluate the final model -----
# Confusion matrix and related metrics
conf_matrix["Random Forest"] = confusion_matrix(y_test, y_pred)
class_report_results.loc[class_report_results["Model"] == "Random Forest", "Accuracy"] = round(accuracy_score(y_test, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "Random Forest", "Precision"] = round(precision_score(y_test, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "Random Forest", "Recall"] = round(recall_score(y_test, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "Random Forest", "ROC-AUC"] = round(roc_auc_score(y_test, y_pred), 4)
# False Positive Rate, True Positive Rate, and Tresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_random_forest.csv", index=False)

# Misclassification costs -----
misclass_matrix["Random Forest"] = conf_matrix["Random Forest"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "Random Forest", "Model cost"] = (conf_matrix["Random Forest"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "Random Forest", "Relative cost"] = (conf_matrix["Random Forest"] * cost_matrix).sum()/(cost_matrix.max() * len(X_test))

# Feature importance -----
feature_importance["Random Forest"] = pd.DataFrame({"Feature": best_rf_model.feature_names_in_,
                                                    "Importance": best_rf_model.feature_importances_
                                                    }).sort_values(by="Importance", ascending=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "Random Forest"].index[0] + 1)])
display(misclass_cost.iloc[0:(misclass_cost.loc[misclass_cost["Model"] == "Random Forest"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5
1,Baseline,Shallow Decision Tree,0.719,0.7085,0.719,0.645
2,Advanced,Decision Tree,0.71,0.7547,0.71,0.6893
3,Advanced,Random Forest,0.8,0.7856,0.8,0.6788


Unnamed: 0,Type,Model,Maximum cost,Model cost,Relative cost
0,Baseline,Majority Class Model,5000,,
1,Baseline,Shallow Decision Tree,5000,,
2,Advanced,Decision Tree,1000,130.0,0.13
3,Advanced,Random Forest,1000,156.0,0.156


### **AdaBoost**

In [29]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 

# Define AdaBoost Model with cost-sensitive base estimator
ada_boost_model = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1, class_weight={0: 1, 1: 5}, random_state=7),
    algorithm="SAMME",
    random_state=7
)

# Define the hyperparameter grid
param_grid = {
    "n_estimators": [50, 100, 200],
    "learning_rate": [0.1, 0.3, 1, 5]
    }

# Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=ada_boost_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=0
    )

# Fit the model
grid_search.fit(X_train, y_train)

# Make predictions with the best model
best_ab_model = grid_search.best_estimator_
y_pred = best_ab_model.predict(X_test)
y_prob = best_ab_model.predict_proba(X_test)

# Save best parameters
best_parameters.loc[best_parameters["Model"] == "AdaBoost", list(grid_search.best_params_.keys())] = list(grid_search.best_params_.values())

# Evaluate the final model -----
# Confusion matrix and related metrics
conf_matrix["AdaBoost"] = confusion_matrix(y_test, y_pred)
class_report_results.loc[class_report_results["Model"] == "AdaBoost", "Accuracy"] = round(accuracy_score(y_test, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "AdaBoost", "Precision"] = round(precision_score(y_test, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "AdaBoost", "Recall"] = round(recall_score(y_test, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "AdaBoost", "ROC-AUC"] = round(roc_auc_score(y_test, y_pred), 4)
# False Positive Rate, True Positive Rate, and Tresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_ada_boost.csv", index=False)

# Misclassification costs -----
misclass_matrix["AdaBoost"] = conf_matrix["AdaBoost"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "AdaBoost", "Model cost"] = (conf_matrix["AdaBoost"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "AdaBoost", "Relative cost"] = (conf_matrix["AdaBoost"] * cost_matrix).sum()/(cost_matrix.max() * len(X_test))

# Feature importance -----
feature_importance["AdaBoost"] = pd.DataFrame({"Feature": best_ab_model.feature_names_in_,
                                               "Importance": best_ab_model.feature_importances_
                                               }).sort_values(by="Importance", ascending=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "AdaBoost"].index[0] + 1)])
display(misclass_cost.iloc[0:(misclass_cost.loc[misclass_cost["Model"] == "AdaBoost"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5
1,Baseline,Shallow Decision Tree,0.719,0.7085,0.719,0.645
2,Advanced,Decision Tree,0.71,0.7547,0.71,0.6893
3,Advanced,Random Forest,0.8,0.7856,0.8,0.6788
4,Advanced,AdaBoost,0.6,0.7948,0.6,0.6993


Unnamed: 0,Type,Model,Maximum cost,Model cost,Relative cost
0,Baseline,Majority Class Model,5000,,
1,Baseline,Shallow Decision Tree,5000,,
2,Advanced,Decision Tree,1000,130.0,0.13
3,Advanced,Random Forest,1000,156.0,0.156
4,Advanced,AdaBoost,1000,100.0,0.1


### **GradientBoost**

In [30]:
# Data preparation
X = credit_data.drop(columns=["DEBTOR_STATUS"])
y = credit_data["DEBTOR_STATUS"]

# Train-test split 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7) 

# Define the (Extreme) Gradient Boost Model with class weights
gradient_boost_model = XGBClassifier(
    objective="binary:logistic",
    scale_pos_weight=5,  # cost of misclassifying class 1 (bad debtor) relative to class 0
    eval_metric="logloss",
    random_state=7
)

# Define the hyperparameter grid
param_grid = {
    "max_depth": [2, 4, 6, 8, None],
    "learning_rate": [0.1, 0.3, 1]
    }

# Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(
    estimator=gradient_boost_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=5,
    verbose=0
    )

# Fit the model
grid_search.fit(X_train, y_train)

# Make predictions with the best model
best_gb_model = grid_search.best_estimator_
y_pred = best_gb_model.predict(X_test)
y_prob = best_gb_model.predict_proba(X_test)

# Save best parameters
best_parameters.loc[best_parameters["Model"] == "GradientBoost", list(grid_search.best_params_.keys())] = list(grid_search.best_params_.values())

# Evaluate the final model -----
# Confusion matrix and related metrics
conf_matrix["GradientBoost"] = confusion_matrix(y_test, y_pred)
class_report_results.loc[class_report_results["Model"] == "GradientBoost", "Accuracy"] = round(accuracy_score(y_test, y_pred), 4)
class_report_results.loc[class_report_results["Model"] == "GradientBoost", "Precision"] = round(precision_score(y_test, y_pred, average="weighted", zero_division=0), 4)
class_report_results.loc[class_report_results["Model"] == "GradientBoost", "Recall"] = round(recall_score(y_test, y_pred, average="weighted"), 4)
# ROC-AUC
class_report_results.loc[class_report_results["Model"] == "GradientBoost", "ROC-AUC"] = round(roc_auc_score(y_test, y_pred), 4)
# False Positive Rate, True Positive Rate, and Tresholds
fpr, tpr, thresholds = roc_curve(y_test, y_prob[:, 1])
roc_auc_data = pd.DataFrame({
    "False Positive Rate": fpr,
    "True Positive Rate": tpr,
    "Threshold": thresholds
})
roc_auc_data.to_csv("tables/roc_auc_data_gradient_boost.csv", index=False)

# Misclassification costs -----
misclass_matrix["GradientBoost"] = conf_matrix["GradientBoost"] * cost_matrix
misclass_cost.loc[class_report_results["Model"] == "GradientBoost", "Model cost"] = (conf_matrix["GradientBoost"] * cost_matrix).sum()
misclass_cost.loc[class_report_results["Model"] == "GradientBoost", "Relative cost"] = (conf_matrix["GradientBoost"] * cost_matrix).sum()/(cost_matrix.max() * len(X_test))

# Feature importance -----
feature_importance["GradientBoost"] = pd.DataFrame({"Feature": best_gb_model.feature_names_in_,
                                                    "Importance": best_gb_model.feature_importances_
                                                    }).sort_values(by="Importance", ascending=False)

# Display metrics -----
display(class_report_results.iloc[0:(class_report_results.loc[class_report_results["Model"] == "GradientBoost"].index[0] + 1)])
display(misclass_cost.iloc[0:(misclass_cost.loc[misclass_cost["Model"] == "GradientBoost"].index[0] + 1)])

Unnamed: 0,Type,Model,Accuracy,Precision,Recall,ROC-AUC
0,Baseline,Majority Class Model,0.7,0.49,0.7,0.5
1,Baseline,Shallow Decision Tree,0.719,0.7085,0.719,0.645
2,Advanced,Decision Tree,0.71,0.7547,0.71,0.6893
3,Advanced,Random Forest,0.8,0.7856,0.8,0.6788
4,Advanced,AdaBoost,0.6,0.7948,0.6,0.6993
5,Advanced,GradientBoost,0.795,0.8086,0.795,0.7592


Unnamed: 0,Type,Model,Maximum cost,Model cost,Relative cost
0,Baseline,Majority Class Model,5000,,
1,Baseline,Shallow Decision Tree,5000,,
2,Advanced,Decision Tree,1000,130.0,0.13
3,Advanced,Random Forest,1000,156.0,0.156
4,Advanced,AdaBoost,1000,100.0,0.1
5,Advanced,GradientBoost,1000,105.0,0.105


## **Export Data Tables**

In [61]:
# Classification report
class_report_results.to_csv("tables/class_report_results.csv", index=False)

# Feature importance
for model_name in feature_importance.keys():
    if feature_importance[model_name] is np.nan:
        continue
    feature_importance[model_name] = feature_importance[model_name].round(4)
    feature_importance[model_name].to_csv(f'tables/feature_importance_{model_name.replace(" ", "_").lower()}.csv', index=False)