In [1]:
# Modules
import inspect
import numpy as np
import pandas as pd
import seaborn as sns
import inspect
import warnings

from sklearn import ensemble, metrics, model_selection, preprocessing, tree
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, explained_variance_score, max_error, median_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from matplotlib import pyplot
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from xgboost import XGBClassifier
from sklearn.inspection import PartialDependenceDisplay

# Suppress all warnings
warnings.filterwarnings('ignore')

In [2]:
def display_roc(fp_rates:np.ndarray, tp_rates:np.ndarray, thresholds:np.ndarray, best_index:int=None) -> None:
    '''Displays ROC curve and AUC'''
    auc     = metrics.auc(fp_rates, tp_rates)
    fig, ax = pyplot.subplots(1, figsize=(5, 5))
    ax.plot(fp_rates, tp_rates, color='blue', label=f'AUC: {auc:0.4f}')
    ax.plot([0, 1], [0, 1], color='red', linestyle='dashed')
    if best_index is not None:
        ax.scatter(fp_rates[best_index], tp_rates[best_index], marker='o', c='lightgreen', s=7.5**2, edgecolor='black', zorder=2, label=f'Threshold: {thresholds[best_index]:0.4f}')
    ax.set_title('Receiver operating characteristic')
    ax.set_xlabel('False positive rate')
    ax.set_ylabel('True positive rate')
    ax.legend(loc='lower right', frameon=False)
    pyplot.tight_layout()
    pyplot.show()

def display_history(history:dict, stat:str='loss', validation:bool=False) -> None:
    '''Displays training history for a statistic'''
    fig, ax = pyplot.subplots(1, figsize=(5, 5))
    ax.plot(history[stat], label='Training sample')
    if validation:
        ax.plot(history[f'val_{stat}'], label='Validation sample')
    ax.set_title(f'Model training', fontsize=15)
    ax.set_ylabel(stat.title())
    ax.set_xlabel('Epoch')
    ax.legend(frameon=False)
    pyplot.tight_layout()
    pyplot.show()
    
def f1_score(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    
    f1 = 2 * ((precision * recall) / (precision + recall + K.epsilon()))
    return f1

## Gradient Boosted Tree

In [5]:
# Import the data
df_selected = pd.read_csv('Data_selection.csv')

##### 1.1 Initial Fitting on data set with preliminary feature selection

In [6]:
df = df_selected

# Randomly split the data set into training and testing and deal with the imbalanced dependent variable using SMOTE
y = df['TARGET']
X = df.drop('TARGET', axis=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, train_size = 0.75, shuffle = True, random_state = 480)

undersampler = RandomUnderSampler(random_state=480)
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)
X_test_resampled, y_test_resampled = undersampler.fit_resample(X_test, y_test)

X_train_resampled.columns = [col.replace('[', '').replace(']', '').replace('<', '') for col in X_train_resampled.columns]
X_test.columns = [col.replace('[', '').replace(']', '').replace('<', '') for col in X_test.columns]

# Fit an XGBoost model with 25 trees to the training data
XGBModel = GradientBoostingClassifier(n_estimators=25, random_state=480)
XGBModel.fit(X_train_resampled, y_train_resampled)

# make predictions on the training and test sets
y_pred_train = XGBModel.predict(X_train_resampled)
y_pred_test = XGBModel.predict(X_test)

# compute in-sample performance metrics
accuracy_train = metrics.accuracy_score(y_train_resampled, y_pred_train)
precision_train = metrics.precision_score(y_train_resampled, y_pred_train)
recall_train = metrics.recall_score(y_train_resampled, y_pred_train)
f1_score_train = metrics.f1_score(y_train_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_train_resampled, y_pred_train)

# compute out-of-sample performance metrics
accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
precision_test = metrics.precision_score(y_test, y_pred_test)
recall_test = metrics.recall_score(y_test, y_pred_test)
f1_score_test = metrics.f1_score(y_test, y_pred_test)
roc_auc_test = metrics.roc_auc_score(y_test, y_pred_test)

# assuming you have already computed the performance metrics
data = {'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC'],
        'Training': [accuracy_train, precision_train, recall_train, f1_score_train, roc_auc_train],
        'Test': [accuracy_test, precision_test, recall_test, f1_score_test, roc_auc_test]}
df = pd.DataFrame(data)
df

Unnamed: 0,Metric,Training,Test
0,Accuracy,0.651623,0.658654
1,Precision,0.655066,0.139907
2,Recall,0.640523,0.62494
3,F1-score,0.647713,0.22863
4,ROC AUC,0.651623,0.643282


In [7]:
rmse_train = mean_squared_error(y_train_resampled, y_pred_train, squared = False)
mae_train = mean_absolute_error(y_train_resampled, y_pred_train)

rmse_test = mean_squared_error(y_test, y_pred_test, squared = False)
mae_test = mean_absolute_error(y_test, y_pred_test)

data = {'Metric': ['RMSE', 'MAE'],
        'Training': [rmse_train, mae_train],
        'Test': [rmse_test, mae_test]}
df = pd.DataFrame(data)
df

Unnamed: 0,Metric,Training,Test
0,RMSE,0.590234,0.584248
1,MAE,0.348377,0.341346


##### 1.2 Hyperparameter Tuning using Cross-validation on data set with preliminary feature selection

In [None]:
# define the parameter grid
param_grid = {
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 300],
    'subsample': [0.5, 0.8, 1], 
    'max_features': [None, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10]
}

# create a gradient boosted tree model
gbt = GradientBoostingClassifier(random_state=480)

# create a grid search object
grid_search = GridSearchCV(estimator=gbt, param_grid=param_grid, cv=5)

# fit the grid search object to the data
grid_search.fit(X_train_resampled, y_train_resampled)

print(grid_search.best_params_)

##### 1.3 Refit the model using optimal parameters found through hyperparameter tuning on data set with preliminary feature selection

In [8]:
rf = GradientBoostingClassifier(random_state=480, max_depth = 3, learning_rate = 0.2, n_estimators = 100, subsample = 1, max_features = None, min_samples_split = 10, min_samples_leaf = 1)

# fit the model to the training data
rf.fit(X_train_resampled, y_train_resampled)

# make predictions on the training and test sets
y_pred_train = rf.predict(X_train_resampled)
y_pred_test = rf.predict(X_test)

# compute in-sample performance metrics
accuracy_train = metrics.accuracy_score(y_train_resampled, y_pred_train)
precision_train = metrics.precision_score(y_train_resampled, y_pred_train)
recall_train = metrics.recall_score(y_train_resampled, y_pred_train)
f1_score_train = metrics.f1_score(y_train_resampled, y_pred_train)
roc_auc_train = metrics.roc_auc_score(y_train_resampled, y_pred_train)

# compute out-of-sample performance metrics
accuracy_test = metrics.accuracy_score(y_test, y_pred_test)
precision_test = metrics.precision_score(y_test, y_pred_test)
recall_test = metrics.recall_score(y_test, y_pred_test)
f1_score_test = metrics.f1_score(y_test, y_pred_test)
roc_auc_test = metrics.roc_auc_score(y_test, y_pred_test)

# assuming you have already computed the performance metrics
data = {'Metric': ['Accuracy', 'Precision', 'Recall', 'F1-score', 'ROC AUC'],
        'Training': [accuracy_train, precision_train, recall_train, f1_score_train, roc_auc_train],
        'Test': [accuracy_test, precision_test, recall_test, f1_score_test, roc_auc_test]}
df = pd.DataFrame(data)
df

Unnamed: 0,Metric,Training,Test
0,Accuracy,0.685491,0.674003
1,Precision,0.687497,0.15122
2,Recall,0.680142,0.656275
3,F1-score,0.683799,0.245802
4,ROC AUC,0.685491,0.66592


In [9]:
rmse_train = mean_squared_error(y_train_resampled, y_pred_train, squared = False)
mae_train = mean_absolute_error(y_train_resampled, y_pred_train)

rmse_test = mean_squared_error(y_test, y_pred_test, squared = False)
mae_test = mean_absolute_error(y_test, y_pred_test)

data = {'Metric': ['RMSE', 'MAE'],
        'Training': [rmse_train, mae_train],
        'Test': [rmse_test, mae_test]}
df = pd.DataFrame(data)
df

Unnamed: 0,Metric,Training,Test
0,RMSE,0.560811,0.570961
1,MAE,0.314509,0.325997


In [10]:
# ROC curve
yh_test = np.squeeze(rf.predict(X_test_resampled))
fp_rates, tp_rates, thresholds = metrics.roc_curve(y_test_resampled, yh_test)
youden = np.argmax(tp_rates - fp_rates)
display_roc(fp_rates, tp_rates, thresholds, youden)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- AMT_CREDIT_MAX_OVERDUE_(-0.001, 861.375]
- AMT_CREDIT_SUM_(-0.001, 142204.5]
- AMT_CREDIT_SUM_(142204.5, 315000.0]
- AMT_CREDIT_SUM_(4226032.71, 1017957917.0]
- AMT_CREDIT_SUM_DEBT_(-6981558.211, 0.0]
- ...
Feature names seen at fit time, yet now missing:
- AMT_CREDIT_MAX_OVERDUE_(-0.001, 861.375
- AMT_CREDIT_SUM_(-0.001, 142204.5
- AMT_CREDIT_SUM_(142204.5, 315000.0
- AMT_CREDIT_SUM_(4226032.71, 1017957917.0
- AMT_CREDIT_SUM_DEBT_(-6981558.211, 0.0
- ...


In [None]:
# Get and reshape confusion matrix data
matrix = confusion_matrix(y_test, y_pred_test)
matrix = matrix.astype('float') / matrix.sum(axis=1)[:, np.newaxis]

# Build the plot
pyplot.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=pyplot.cm.Greens, linewidths=0.2)

# Add labels to the plot
class_names = ['TARGET = 0', 'TARGET = 1']
tick_marks = np.arange(len(class_names))
tick_marks2 = tick_marks + 0.5
pyplot.xticks(tick_marks, class_names, rotation=25)
pyplot.yticks(tick_marks2, class_names, rotation=0)
pyplot.xlabel('Predicted label')
pyplot.ylabel('True label')
pyplot.title('Confusion Matrix for Random Forest Model')
pyplot.show()

In [None]:
# Calculate feature importance
importances = rf.feature_importances_

# Create a DataFrame to store the results
importance_df = pd.DataFrame({'Feature': X_train_resampled.columns, 'Importance': importances})

# Sort the DataFrame by importance and select the top 10 features
importance_df = importance_df.sort_values(by='Importance', ascending=False).head(10)

# Display the results
importance_df

In [None]:
# get feature importances
importances = rf.feature_importances_

# get indices of top 10 features
indices = np.argsort(importances)[::-1][:10]

# calculate partial dependence
fig, ax = pyplot.subplots(2, 5, figsize=(20, 5))
pdp_results = PartialDependenceDisplay.from_estimator(rf, X_train_resampled, indices,
                                                      feature_names=X_train_resampled.columns,
                                                      ax=ax.ravel())
pyplot.tight_layout()