## Load Packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
##
import pandas as pd
import numpy as np
import dalex as dx
import math
import matplotlib.pyplot as plt
from pprint import pprint
##
from sklearn import preprocessing
##
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
##
from sklearn.decomposition import PCA
##
from sklearn.tree import DecisionTreeClassifier  
from sklearn import tree
##
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifierCV
##
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
##
from xgboost import XGBClassifier
##
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
##
from sklearn.model_selection import cross_val_score

## Function

In [None]:
def evaluate(model, test_features, test_labels):
    """Score a fitted binary classifier on held-out data and print the results.

    Accuracy is computed from the hard class predictions.  The ROC-based
    metrics are computed from continuous scores whenever the model exposes
    them (``predict_proba`` first, then ``decision_function``): feeding 0/1
    predictions to a ROC curve reduces it to a single operating point and
    under-reports the ranking quality.  Models that expose neither fall back
    to the hard predictions.

    Parameters
    ----------
    model
        Fitted estimator offering ``predict`` (and optionally
        ``predict_proba`` / ``decision_function``).
    test_features
        Feature matrix to score.
    test_labels
        True labels; assumed binary (same assumption ``roc_curve`` makes).

    Returns
    -------
    tuple
        ``(accuracy, auc, roc_auc, roc_auc_score)``, each rounded to two
        decimals.  The last two entries are the same number; both are kept
        so callers indexing positions 0-3 keep working.
    """
    predictions = model.predict(test_features)
    accuracy = round(metrics.accuracy_score(y_true=test_labels, y_pred=predictions), 2)

    # Prefer continuous scores for the ranking metrics.
    if hasattr(model, "predict_proba"):
        # Column 1 is the second class of a two-class problem.
        scores = model.predict_proba(test_features)[:, 1]
    elif hasattr(model, "decision_function"):
        scores = model.decision_function(test_features)
    else:
        scores = predictions

    fpr, tpr, _ = metrics.roc_curve(test_labels, scores)
    auc = round(metrics.auc(fpr, tpr), 2)
    roc_auc = round(metrics.roc_auc_score(y_true=test_labels, y_score=scores), 2)
    # Historically computed twice with identical arguments; reuse the value.
    roc_auc_score = roc_auc

    print(f"Accuracy = {accuracy}\n")
    print(f"AUC = {auc}\n")
    print(f"ROC AUC = {roc_auc}\n")
    print(f"ROC AUC SCORE = {roc_auc_score}\n")
    return (accuracy, auc, roc_auc, roc_auc_score)

In [None]:
def get_important_features(transformed_features, components_, columns):
    """Rank the original columns by their loading on the first two components.

    The first two rows of ``components_`` are each stretched by the largest
    value found in the matching column of ``transformed_features``.  Every
    original column is then scored with the Euclidean length of its
    (x, y) pair of stretched loadings.

    Returns a list of ``(score, column_name)`` tuples sorted from the
    highest score to the lowest.
    """
    # Stretch factors: the maximum projected value along each of the two axes.
    x_stretch = max(transformed_features[:, 0])
    y_stretch = max(transformed_features[:, 1])
    x_loadings = components_[0] * x_stretch
    y_loadings = components_[1] * y_stretch

    # One score per *original* column (not per principal component).
    score_by_column = {}
    for position, column_name in enumerate(columns):
        score_by_column[column_name] = math.sqrt(
            x_loadings[position]**2 + y_loadings[position]**2
        )

    ranking = [(score, column_name) for column_name, score in score_by_column.items()]
    ranking.sort(reverse=True)
    return ranking

## Load Data

In [None]:
# Read the Lending Club export, keep every column whose (case-insensitive)
# name does not end in "_cats", strip the "_nums" marker from the remaining
# column names and discard the `id` column.
data_lending_club_default = pd.read_csv('data_lending_club_default.csv', low_memory=False)
cols = [c for c in data_lending_club_default.columns if not c.lower().endswith('_cats')]
data_lending_club_default = data_lending_club_default[cols]
data_lending_club_default.columns = data_lending_club_default.columns.str.replace("_nums", "")
data_lending_club_default = data_lending_club_default.drop(columns='id')

## Data Info

In [None]:
# Print the column summary, then preview the first rows.
data_lending_club_default.info()
data_lending_club_default.head()

In [None]:
# Number of missing values per column.
data_lending_club_default.isnull().sum()

In [None]:
# Drop every row containing at least one missing value and re-check the counts.
data_lending_club_default = data_lending_club_default.dropna()
data_lending_club_default.isnull().sum()

In [None]:
# Frequency of each value of the target column.
data_lending_club_default['loan_status'].value_counts()

In [None]:
# Share of the remaining rows represented by the hard-coded count 223424,
# printed with two decimals.
# NOTE(review): 223424 presumably was copied from the value_counts() output
# above — confirm, and consider deriving it from the data so it cannot go stale.
print("%.2f" % round(223424/len(data_lending_club_default.index), 2))

## Graph

## Split Data

In [None]:
# Target is the `loan_status` column; every other column is a predictor.
y = data_lending_club_default['loan_status']
X = data_lending_club_default.drop(columns='loan_status')

## Correlation

In [None]:
# Pairwise correlation matrix of the predictors.
corr = X.corr()
corr

In [None]:
# Boolean flag per feature: True when the feature has no absolute correlation
# above 0.3 with any *other* feature (the diagonal is masked out first).
features_corr = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.3).any() # 0.3 or 0.4
features_corr

In [None]:
# Restrict the correlation matrix to the flagged features, collect their names
# and select the matching columns of X.
# NOTE(review): np.intersect1d returns the common names sorted, so the column
# order of X_corr may differ from the order in X — confirm this is acceptable.
X_good = corr.loc[features_corr, features_corr]
lst_variable_corr = X_good.columns.values.tolist()
X_corr = X[np.intersect1d(X.columns, lst_variable_corr)]

In [None]:
# Same column selection, taken again from X using X_corr's column names.
df_corr = X[X_corr.columns]

In [None]:
# Display the low-correlation feature set.
df_corr

## PCA

In [None]:
# Fit a 14-component PCA on the predictors with svd_solver='full'.
# NOTE(review): X is passed as-is; no scaling step is visible in this
# notebook before the fit — confirm that is intended.
pca = PCA(n_components=14, svd_solver='full')
pca.fit(X)

In [None]:
# Project the predictors onto the fitted components and show the result shape.
T = pca.transform(X)
T.shape

In [None]:
# Explained-variance ratio of each fitted component.
pca.explained_variance_ratio_

In [None]:
# Loadings table: one row per principal component (numbered from 1), one
# column per original feature.  The row labels are derived from the fitted
# component matrix so the table stays valid if `n_components` is changed.
components = pd.DataFrame(pca.components_,
                          columns=X.columns,
                          index=range(1, len(pca.components_) + 1))
components

In [None]:
# Rank the original columns by their loading on the first two components.
pca_result = get_important_features(T, pca.components_, X.columns.values)
pca_result = pd.DataFrame(pca_result, columns=['PCA_Value', 'Variable'])
# Keep only the columns whose score reaches the threshold.  The cut-off that
# was actually applied has always been 75 (the former `threshold = 3` was
# never read), so the variable now carries the effective value.
threshold = 75
pca_result = pca_result[pca_result["PCA_Value"] >= threshold]
pca_result

In [None]:
# Names of the columns that survived the PCA score filter, and the matching
# subset of the predictors.
X_pca = pca_result['Variable']
df_pca = X[X_pca]

## Split Dataset 

In [None]:
# 70/30 train/test split with a fixed seed.  The feature set in use is
# df_corr; the trailing comment lists the alternatives (df_pca, df_corr).
X_train, X_test, y_train, y_test = train_test_split(df_corr, y, test_size=0.3, random_state=1) # df_pca, df_corr

## Decision Tree Classifier

In [None]:
# Baseline decision tree with default settings; accuracy on the test split,
# rounded to two decimals.
clf_tree = DecisionTreeClassifier()
clf_tree = clf_tree.fit(X_train,y_train)
y_pred_tree = clf_tree.predict(X_test)
score_dtc = round(metrics.accuracy_score(y_test, y_pred_tree),2)
score_dtc

In [None]:
# Wrap the fitted tree in a dalex Explainer and compute classification
# performance measures.
# NOTE(review): the explainer receives the *training* split, so the figures
# describe performance on training data — confirm this is intended.
exp_tree = dx.Explainer(clf_tree, X_train, y_train)
mp_tree = exp_tree.model_performance(model_type = 'classification')
mp_tree.result

In [None]:
# Plot the performance object computed above.
mp_tree.plot()

In [None]:
# Model-level variable attributions from dalex `model_parts()`.
vi_tree = exp_tree.model_parts()
vi_tree.result

In [None]:
# Plot at most 10 variables.
vi_tree.plot(max_vars=10)

In [None]:
# 'partial' and 'accumulated' model profiles, drawn together on one plot.
# `pdp_num` / `ale_num` are reused by the other model sections.
pdp_num = exp_tree.model_profile(type = 'partial', label="pdp")
ale_num = exp_tree.model_profile(type = 'accumulated', label="ale")
pdp_num.plot(ale_num)

In [None]:
# 5-fold cross-validation scores computed on (X_test, y_test).
# NOTE(review): confirm that cross-validating on the held-out split rather
# than on the training data is intended.
score_val_dtc = cross_val_score(clf_tree, X_test, y_test, cv=5)
score_val_dtc

In [None]:
# Confusion matrix of the test-split predictions.
conf_m_dtc = metrics.confusion_matrix(y_test, y_pred_tree)
conf_m_dtc

In [None]:
# NOTE(review): `plot_confusion_matrix` is not available in recent
# scikit-learn releases — confirm against the pinned version.
plot_confusion_matrix(clf_tree, X_test, y_test) 
plt.show()  

### Random Search Cross Validation 

In [None]:
# Show the hyper-parameters of the baseline decision tree.
print('Parameters currently in use:\n')
pprint(clf_tree.get_params())

In [None]:
# Create the random grid
# Candidate values for the decision-tree randomized search.
# Removed relative to the earlier grid:
#   * 'min_impurity_split': [2, 4] — a node is split only when its impurity
#     exceeds this threshold, and Gini/entropy impurity of a binary target
#     never reaches 2, so every candidate degenerated to a single leaf.
#     The option has also been dropped from scikit-learn.
#   * 'presort': ['deprecated'] — placeholder of a removed option.
#   * 'auto' in 'max_features' — for classifiers it is a synonym of 'sqrt'
#     and is rejected by newer scikit-learn releases.
random_grid = {'ccp_alpha': [0.0],
               'criterion': ['gini', 'entropy'],
               'max_depth': [5, 10, 15, 25],
               'max_features': ['sqrt', 'log2'],
               'max_leaf_nodes': [2, 4, 6, 8],
               'min_impurity_decrease': [0.0],
               'min_samples_leaf': [1, 3, 5],
               'min_samples_split': [2, 4],
               'min_weight_fraction_leaf': [0.0],
               'splitter': ['best', 'random']}
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search around the baseline tree:
# 100 sampled candidates, 3-fold CV, fixed seed, n_jobs=-1.
tree_random = RandomizedSearchCV(
    estimator=clf_tree,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training split.
tree_random.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the search.
tree_random.best_params_

In [None]:
# Reference model: default decision tree with a fixed seed, scored with
# evaluate() on the test split.
base_model_tree = DecisionTreeClassifier(random_state = 1)
base_model_tree.fit(X_train,y_train)
base_accuracy = evaluate(base_model_tree, X_test, y_test)

In [None]:
# Score the estimator selected by the search the same way.
# `base_accuracy`, `best_random` and `random_accuracy` are overwritten by
# each of the other model sections.
best_random = tree_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

## Logistic Regression

In [None]:
# Baseline logistic regression with default settings; accuracy on the test
# split, rounded to two decimals.
clf_logit = LogisticRegression()
clf_logit.fit(X_train,y_train)
y_pred_logit = clf_logit.predict(X_test)
score_logistics_regression = round(metrics.accuracy_score(y_true=y_test, y_pred=y_pred_logit),2)
score_logistics_regression

In [None]:
# Wrap the fitted model in a dalex Explainer and compute classification
# performance measures.
# NOTE(review): the explainer receives the *training* split, so the figures
# describe performance on training data — confirm this is intended.
exp_logit = dx.Explainer(clf_logit, X_train, y_train)
mp_logit = exp_logit.model_performance(model_type = 'classification')
mp_logit.result

In [None]:
# Plot the performance object computed above.
mp_logit.plot()

In [None]:
# Model-level variable attributions from dalex `model_parts()`.
vi_logit = exp_logit.model_parts()
vi_logit.result

In [None]:
# Plot at most 10 variables.
vi_logit.plot(max_vars=10)

In [None]:
# 'partial' and 'accumulated' model profiles, drawn together on one plot.
# `pdp_num` / `ale_num` are reused by the other model sections.
pdp_num = exp_logit.model_profile(type = 'partial', label="pdp")
ale_num = exp_logit.model_profile(type = 'accumulated', label="ale")
pdp_num.plot(ale_num)

In [None]:
# 5-fold cross-validation scores computed on (X_test, y_test).
# NOTE(review): confirm that cross-validating on the held-out split rather
# than on the training data is intended.
score_val_logit = cross_val_score(clf_logit, X_test, y_test, cv=5)
score_val_logit

In [None]:
# Confusion matrix of the test-split predictions.
conf_m_logit = metrics.confusion_matrix(y_test, y_pred_logit)
conf_m_logit

In [None]:
# NOTE(review): `plot_confusion_matrix` is not available in recent
# scikit-learn releases — confirm against the pinned version.
plot_confusion_matrix(clf_logit, X_test, y_test) 
plt.show()  

### Random Search Cross Validation 

In [None]:
# Show the hyper-parameters of the baseline logistic regression.
print('Parameters currently in use:\n')
pprint(clf_logit.get_params())

In [None]:
# Create the random grid
# Candidate values for the logistic-regression randomized search.
#
# The search space is a *list* of grids so that only valid penalty/solver
# pairs are ever sampled: the 'elasticnet' penalty is supported by the
# 'saga' solver only, and 'l1_ratio' is meaningful only with 'elasticnet'.
# A single flat grid mixes them freely, and the invalid candidates fail
# during fitting (silently here, because warnings are suppressed).
# Total size: 100 ('l2') + 60 ('elasticnet') combinations.
logit_shared_grid = {'C': [1.0],
                     'class_weight': [None],
                     'dual': [False],
                     'fit_intercept': [True, False],
                     'intercept_scaling': [1],
                     'max_iter': [50, 100, 150, 200, 250],
                     'random_state': [0],
                     'tol': [0.0001],
                     'verbose': [0],
                     'warm_start': [True, False],
                     'n_jobs': [-1]}
random_grid = [
    dict(logit_shared_grid,
         penalty=['l2'],
         solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
    dict(logit_shared_grid,
         penalty=['elasticnet'],
         solver=['saga'],
         l1_ratio=[0.1, 0.5, 0.7]),
]
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search around the baseline logistic regression:
# 100 sampled candidates, 3-fold CV, fixed seed, n_jobs=-1.
logit_random = RandomizedSearchCV(
    estimator=clf_logit,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training split.
logit_random.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the search.
logit_random.best_params_

In [None]:
# Reference model: default logistic regression, scored with evaluate() on
# the test split.
base_model_logit = LogisticRegression()
base_model_logit.fit(X_train,y_train)
base_accuracy = evaluate(base_model_logit, X_test, y_test)

In [None]:
# Score the estimator selected by the search the same way.
# `base_accuracy`, `best_random` and `random_accuracy` are overwritten by
# each of the other model sections.
best_random = logit_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

## Random Forest 

In [None]:
# Baseline random forest with default settings; accuracy on the test split
# (not rounded here).
clf_rrf = RandomForestClassifier()
clf_rrf.fit(X_train, y_train)
y_pred_rfe = clf_rrf.predict(X_test)
score_rrf = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_rfe)
score_rrf

In [None]:
# Wrap the fitted forest in a dalex Explainer and compute classification
# performance measures.
# NOTE(review): the explainer receives the *training* split, so the figures
# describe performance on training data — confirm this is intended.
exp_rrf = dx.Explainer(clf_rrf, X_train, y_train)
mp_rrf = exp_rrf.model_performance(model_type = 'classification')
mp_rrf.result

In [None]:
# Plot the performance object computed above.
mp_rrf.plot()

In [None]:
# Model-level variable attributions from dalex `model_parts()`.
vi_rrf = exp_rrf.model_parts()
vi_rrf.result

In [None]:
# Plot at most 10 variables.
vi_rrf.plot(max_vars=10)

In [None]:
# 'partial' and 'accumulated' model profiles, drawn together on one plot.
# `pdp_num` / `ale_num` are reused by the other model sections.
pdp_num = exp_rrf.model_profile(type = 'partial', label="pdp")
ale_num = exp_rrf.model_profile(type = 'accumulated', label="ale")
pdp_num.plot(ale_num)

In [None]:
# 5-fold cross-validation scores computed on (X_test, y_test).
# NOTE(review): confirm that cross-validating on the held-out split rather
# than on the training data is intended.
score_val_rrf = cross_val_score(clf_rrf, X_test, y_test, cv=5)
score_val_rrf

In [None]:
# Confusion matrix of the test-split predictions.
conf_m_rrf = metrics.confusion_matrix(y_test, y_pred_rfe)
conf_m_rrf

In [None]:
# NOTE(review): `plot_confusion_matrix` is not available in recent
# scikit-learn releases — confirm against the pinned version.
plot_confusion_matrix(clf_rrf, X_test, y_test) 
plt.show()  

### Random Search Cross Validation 

In [None]:
# Show the hyper-parameters of the baseline random forest.
print('Parameters currently in use:\n')
pprint(clf_rrf.get_params())

In [None]:
# Candidate values for the random-forest randomized search.
# Fixes relative to the earlier grid:
#   * the missing comma after the 'max_leaf_nodes' entry (a SyntaxError);
#   * 'auto' dropped from 'max_features' — for classifiers it is a synonym of
#     'sqrt' and is rejected by newer scikit-learn releases.
# NOTE(review): integer 'max_samples' values are sample *counts*, so 2 and 4
# train each tree on a handful of rows — confirm whether fractions were meant.
# NOTE(review): 'max_samples' and 'oob_score': True presumably require
# 'bootstrap': True; candidates pairing them with bootstrap=False are
# expected to fail during fitting — confirm and split the grid if so.
random_grid = {'bootstrap': [True, False],
               'ccp_alpha': [0.0],
               'criterion': ["gini", "entropy"],
               'max_depth': [2, 4, 6, 10],
               'max_features': ['sqrt', 'log2'],
               'max_leaf_nodes': [2, 4, 6, 8],
               'max_samples': [2, 4],
               'min_impurity_decrease': [0.0],
               'min_samples_leaf': [1, 5, 7],
               'min_weight_fraction_leaf': [0.0],
               'n_estimators': [100, 500, 1000],
               'n_jobs': [-1],
               'oob_score': [True, False],
               'verbose': [0],
               'warm_start': [True, False]}
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search around the baseline random forest:
# 100 sampled candidates, 3-fold CV, fixed seed, n_jobs=-1.
rrf_random = RandomizedSearchCV(
    estimator=clf_rrf,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training split.
rrf_random.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the search.
rrf_random.best_params_

In [None]:
# Reference model: default random forest, scored with evaluate() on the
# test split.
base_model_rrf = RandomForestClassifier()
base_model_rrf.fit(X_train,y_train)
base_accuracy = evaluate(base_model_rrf, X_test, y_test)

In [None]:
# Score the estimator selected by the search the same way.
# `base_accuracy`, `best_random` and `random_accuracy` are overwritten by
# each of the other model sections.
best_random = rrf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

## XGBoost

In [None]:
# Baseline XGBoost classifier with default settings; accuracy on the test
# split (not rounded here).
clf_xgboost = XGBClassifier()
clf_xgboost.fit(X_train, y_train)
y_pred_xgboost = clf_xgboost.predict(X_test)
score_xgboost = metrics.accuracy_score(y_true=y_test, y_pred=y_pred_xgboost)
score_xgboost

In [None]:
# Wrap the fitted model in a dalex Explainer and compute classification
# performance measures.
# NOTE(review): the explainer receives the *training* split, so the figures
# describe performance on training data — confirm this is intended.
exp_xgboost = dx.Explainer(clf_xgboost, X_train, y_train)
mp_xgboost = exp_xgboost.model_performance(model_type = 'classification')
mp_xgboost.result

In [None]:
# Plot the performance object computed above.
mp_xgboost.plot()

In [None]:
# Model-level variable attributions from dalex `model_parts()`.
vi_xgboost = exp_xgboost.model_parts()
vi_xgboost.result

In [None]:
# Plot at most 10 variables.
vi_xgboost.plot(max_vars=10)

In [None]:
# 'partial' and 'accumulated' model profiles, drawn together on one plot.
# `pdp_num` / `ale_num` are reused by the other model sections.
pdp_num = exp_xgboost.model_profile(type = 'partial', label="pdp")
ale_num = exp_xgboost.model_profile(type = 'accumulated', label="ale")
pdp_num.plot(ale_num)

In [None]:
# 5-fold cross-validation scores computed on (X_test, y_test).
# NOTE(review): confirm that cross-validating on the held-out split rather
# than on the training data is intended.
score_val_xgboost = cross_val_score(clf_xgboost, X_test, y_test, cv=5)
score_val_xgboost

In [None]:
# Confusion matrix of the test-split predictions.
conf_m_xgboost = metrics.confusion_matrix(y_test, y_pred_xgboost)
conf_m_xgboost

In [None]:
# NOTE(review): `plot_confusion_matrix` is not available in recent
# scikit-learn releases — confirm against the pinned version.
plot_confusion_matrix(clf_xgboost, X_test, y_test) 
plt.show() 

### Random Search Cross Validation 

In [None]:
# Show the hyper-parameters of the baseline XGBoost classifier.
print('Parameters currently in use:\n')
pprint(clf_xgboost.get_params())

In [None]:
# Create the random grid
# Candidate values for the XGBoost randomized search.
# 'max_leaf_nodes' is a scikit-learn tree option, not an XGBoost one; the
# XGBoost name for the leaf-count limit is 'max_leaves', so the key is
# renamed to make the candidates actually constrain the trees.
# NOTE(review): 'max_leaves' may only take effect with some tree
# methods / grow policies — confirm against the installed XGBoost version.
# NOTE(review): 'use_label_encoder' is presumably only recognised by older
# XGBoost releases — confirm it is still wanted.
random_grid = {'importance_type': ['gain'],
               'max_delta_step': [1, 5],
               'max_depth': [5, 10, 15, 25],
               'min_child_weight': [1, 5, 10],
               'n_estimators': [10, 50, 75, 100],
               'n_jobs': [-1],
               'max_leaves': [5, 10, 15],
               'objective': ['binary:logistic'],
               'use_label_encoder': [True, False]}
pprint(random_grid)

In [None]:
# Randomized hyper-parameter search around the baseline XGBoost classifier:
# 100 sampled candidates, 3-fold CV, fixed seed, n_jobs=-1.
xgboost_random = RandomizedSearchCV(
    estimator=clf_xgboost,
    param_distributions=random_grid,
    n_iter=100,
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1,
)

In [None]:
# Run the randomized search on the training split.
xgboost_random.fit(X_train,y_train)

In [None]:
# Hyper-parameter combination selected by the search.
xgboost_random.best_params_

In [None]:
# Reference model: default XGBoost classifier, scored with evaluate() on the
# test split.
base_model_xgboost = XGBClassifier()
base_model_xgboost.fit(X_train,y_train)
base_accuracy = evaluate(base_model_xgboost, X_test, y_test)

In [None]:
# Score the estimator selected by the search the same way.
# This is the last section assigning `base_accuracy` / `random_accuracy`, so
# these are the values any later cell reads.
best_random = xgboost_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

## Benchmark Models

In [None]:
# Side-by-side test-split accuracy of the four baseline models, each rounded
# to two decimals and printed on its own line.
accuracy_by_model = (
    ("Decision Tree", score_dtc),
    ("Logistics Regression", score_logistics_regression),
    ("Random Forest", score_rrf),
    ("XGBoost", score_xgboost),
)
accuracy_report = "".join(
    f"{model_name} Accuracy Score: {round(model_score,2)}\n"
    for model_name, model_score in accuracy_by_model
)
print(accuracy_report)

In [None]:
# Per-fold cross-validation scores of the four baseline models.
# The XGBoost scores are a numpy array, which the built-in round() cannot
# handle (ndarray defines no __round__); np.round keeps the intended
# two-decimal rounding without raising.
print(f"Decision Tree Accuracy Score: {score_val_dtc}\n"
      f"Logistics Regression Accuracy Score: {score_val_logit}\n"
      f"Random Forest Accuracy Score: {score_val_rrf}\n"
      f"XGBoost Accuracy Score: {np.round(score_val_xgboost, 2)}\n")

In [None]:
# Confusion matrices of the four baseline models on the test split.
# Each line now prints its own model's matrix (previously the random-forest
# matrix was repeated and an undefined name was referenced), and the labels
# say what is actually shown.
print(f"Decision Tree Confusion Matrix: {conf_m_dtc}\n"
      f"Logistics Regression Confusion Matrix: {conf_m_logit}\n"
      f"Random Forest Confusion Matrix: {conf_m_rrf}\n"
      f"XGBoost Confusion Matrix: {conf_m_xgboost}\n")

In [None]:
# Relative change (in percent) of each evaluate() metric between the tuned
# and the reference model.  `random_accuracy` / `base_accuracy` hold whatever
# model was evaluated last.  Each label carries its own trailing spacing.
improvement_labels = (
    "Improvement of Accuracy ",
    "Improvement of AUC ",
    "Improvement of ROC AUC ",
    "Improvement of ROC AUC SCORE  ",
)
for metric_position, metric_label in enumerate(improvement_labels):
    tuned_value = random_accuracy[metric_position]
    reference_value = base_accuracy[metric_position]
    print(f"{metric_label}{100 * ((tuned_value - reference_value) / reference_value)}\n")