In [0]:
import pandas as pd

# Switch for the pickle cell below: True loads `modeling_dummy_df` from the cached
# pickle; False writes the `modeling_dummy_df` already held in the kernel to that pickle.
start_from_pickle = True

In [0]:
import pickle

# Single location of the cached modeling frame, shared by the read and write branches.
modeling_pkl_path = "/Workspace/Users/wilson.kan@neofinancial.com/ApplicationV2/pkls/appl_tim_v2_modeling_ready.pkl"

if start_from_pickle:
    # Restore the cached frame (binary read).
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(modeling_pkl_path, "rb") as pkl_file:
        modeling_dummy_df = pickle.load(pkl_file)
else:
    # Persist the frame currently held in the kernel (binary write).
    with open(modeling_pkl_path, "wb") as pkl_file:
        pickle.dump(modeling_dummy_df, pkl_file)

In [0]:
import numpy as np

# Out-of-time (OOT) holdout: the 2023-06-30 and 2023-07-31 month-ends.
oot_month_ends = [np.datetime64("2023-06-30"), np.datetime64("2023-07-31")]
modeling_oot = modeling_dummy_df[modeling_dummy_df["month_end"].isin(oot_month_ends)]

# In-time window: strictly after 2022-12-31 AND strictly before the first OOT month-end.
# Both bounds are applied in one mask; assigning the two filters one after the other
# let the second overwrite the first, which left the OOT months inside the in-time set.
in_time_mask = (modeling_dummy_df["month_end"] > np.datetime64("2022-12-31")) & (
    modeling_dummy_df["month_end"] < np.datetime64("2023-06-30")
)
modeling_intime = modeling_dummy_df[in_time_mask]

In [0]:
# Model only on the thin segment: keep rows where thin == 1 in both samples.
oot_is_thin = modeling_oot["thin"] == 1
intime_is_thin = modeling_intime["thin"] == 1

modeling_oot = modeling_oot.loc[oot_is_thin]
modeling_intime = modeling_intime.loc[intime_is_thin]

In [0]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Split the in-time sample 80/20. The target and the date key are removed from the
# feature matrix, together with the duplicated features originalCreditScore and GO17.
X_train, X_test, y_train, y_test = train_test_split(
    modeling_intime.drop(
        ["month_end", "isdefault_1y", "originalCreditScore", "GO17"], axis=1
    ),
    modeling_intime["isdefault_1y"],
    test_size=0.2,
    random_state=459339,
)

# Create the model instance. eval_metric and early_stopping_rounds are set on the
# estimator: passing them to fit() is deprecated since xgboost 1.6 and is rejected by
# current releases.
bst = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=2,
    objective="binary:logistic",
    random_state=939180,
    eval_metric="auc",
    early_stopping_rounds=10,
)

# Fit on the training split; the 20% split is the evaluation set that drives early stopping.
bst.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True,
)
xgb.plot_importance(bst)

In [0]:
# (feature name, importance) pairs for the fitted model, highest importance first.
feature_imp = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_imp

In [0]:
# Keep the (up to) 20 most important features of the full model. Slicing instead of
# indexing 0..19 avoids an IndexError when fewer than 20 features are available.
top_set = [name for name, _ in feature_imp[:20]]
print(top_set)

# Create the model instance. eval_metric and early_stopping_rounds are set on the
# estimator: passing them to fit() is deprecated since xgboost 1.6 and is rejected by
# current releases.
bst = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=2,
    random_state=4916,
    eval_metric="auc",
    early_stopping_rounds=10,
)

# Refit on the reduced feature set; the held-out split drives early stopping.
bst.fit(
    X_train[top_set],
    y_train,
    eval_set=[(X_test[top_set], y_test)],
)
xgb.plot_importance(bst)

In [0]:
from sklearn.metrics import roc_auc_score

# Class probabilities from the top-set model on the train, test and OOT samples.
x_train_scr, x_test_scr, x_oot_scr = (
    bst.predict_proba(sample[top_set]) for sample in (X_train, X_test, modeling_oot)
)

# Report 2 * AUC - 1 per sample, using the second probability column as the score.
for label, actual, scores in (
    ("train", y_train, x_train_scr),
    ("test", y_test, x_test_scr),
    ("oot", modeling_oot["isdefault_1y"], x_oot_scr),
):
    print(label, 2 * roc_auc_score(actual, scores[:, 1]) - 1)

In [0]:
# Score every thin == 1 row of the full frame (all month-ends) and report 2 * AUC - 1.
thin = modeling_dummy_df.loc[modeling_dummy_df["thin"] == 1]
x_thin_scr = bst.predict_proba(thin[top_set])
thin_auc = roc_auc_score(thin["isdefault_1y"], x_thin_scr[:, 1])
print("thin", 2 * thin_auc - 1)

In [0]:
# Rebuild the importance ranking for the current (top-set) model and print it
# as tab-separated "name<TAB>importance" lines, highest importance first.
feature_imp = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for name, importance in feature_imp:
    print(f"{name}\t{importance}")

## Sensitivity Testing to check for direction and impact

In [0]:
def sensitivity(model, dat, var_set, var_type, effect):
    """Mean predicted probability after perturbing one feature at a time.

    Each variable in ``var_set`` is perturbed on its own fresh copy of ``dat``, so
    the result for one variable never includes the shift applied to another, and
    the caller's frame is left untouched.

    Parameters
    ----------
    model : object with ``predict_proba(frame)`` returning a 2-D array; column 1
        is used as the score.
    dat : pandas.DataFrame
        Feature frame passed to ``model.predict_proba``.
    var_set : iterable of str
        Column names of ``dat`` to perturb, one at a time.
    var_type : {"int", "cont", "bin", "orig"}
        ``"int"`` adds ``effect``; ``"cont"`` multiplies by ``effect``;
        ``"bin"`` sets the column to ``effect``; ``"orig"`` leaves it unchanged.
    effect : number
        Size of the perturbation (ignored for ``"orig"``).

    Returns
    -------
    dict
        Maps each variable name to the mean of ``predict_proba(...)[:, 1]`` on the
        frame with only that variable perturbed.

    Raises
    ------
    ValueError
        If ``var_type`` is not one of the supported values (a subclass of the
        ``Exception`` raised previously, so existing handlers still catch it).
    """
    res = {}
    for v in var_set:
        # Fresh copy per variable: avoids accumulating earlier perturbations and
        # avoids writing into (a slice of) the caller's DataFrame.
        perturbed = dat.copy()
        if var_type == "int":
            perturbed[v] = dat[v] + effect
        elif var_type == "cont":
            perturbed[v] = dat[v] * effect
        elif var_type == "bin":
            perturbed[v] = effect
        elif var_type == "orig":
            # No change; the lookup still fails loudly on an unknown column.
            perturbed[v] = dat[v]
        else:
            raise ValueError("Invalid variable type")
        res[v] = model.predict_proba(perturbed)[:, 1].mean()
    return res

In [0]:
# Feature groups for the sensitivity test; the group decides how a feature is perturbed.
# "int" features are shifted additively.
int_var = ["GO151", "GO15", "GO21", "GO149", "GO06", "AT29", "RE06", "AM167", "AM29", "AT07"]
# "cont" features are scaled multiplicatively.
cont_var = ["creditScore", "IN60", "RE336", "BR60", "AT60", "BC147", "AT21", "BC34", "BC62"]
# "bin" features are forced to a fixed value.
bin_var = ["houseStat_RENT"]

In [0]:
# Baseline: mean OOT score with no feature changed ("orig" leaves the column as-is).
baseline_var = int_var[0]
vani = sensitivity(bst, modeling_oot[top_set], [baseline_var], "orig", 1)[baseline_var]

# Integer features: shift by +3 / -3.
int_inc = sensitivity(bst, modeling_oot[top_set], int_var, "int", 3)
int_des = sensitivity(bst, modeling_oot[top_set], int_var, "int", -3)

# Continuous features: scale by 1.3 / 0.7.
cont_inc = sensitivity(bst, modeling_oot[top_set], cont_var, "cont", 1.3)
cont_des = sensitivity(bst, modeling_oot[top_set], cont_var, "cont", 0.7)

# Binary features: force to 1 / 0.
bin_inc = sensitivity(bst, modeling_oot[top_set], bin_var, "bin", 1)
bin_des = sensitivity(bst, modeling_oot[top_set], bin_var, "bin", 0)

# Merge the per-group results into one lookup per direction.
inc = {**int_inc, **cont_inc, **bin_inc}
des = {**int_des, **cont_des, **bin_des}

In [0]:
# One row per feature (in importance order): baseline score next to the score after
# increasing / decreasing that feature.
final_top_set = [name for name, _ in feature_imp]
sens_table = {
    "vanilla": [vani for _ in final_top_set],
    "feat_inc": [inc[name] for name in final_top_set],
    "feat_des": [des[name] for name in final_top_set],
}
sens_out = pd.DataFrame(sens_table, index=final_top_set)
sens_out

## Test a new model with the strange-direction features removed (iterative process)

In [0]:
# Features excluded in this iteration; everything else in top_set is kept, in order.
excluded_features = {
    "BC147",
    "AT60",
    "BC34",
    "BC62",
    "BR60",
    "RE336",
    "IN60",
    "houseStat_RENT",
    "AT29",
}
top_set_refined = [name for name in top_set if name not in excluded_features]

In [0]:
# Create the model instance. eval_metric and early_stopping_rounds are set on the
# estimator: passing them to fit() is deprecated since xgboost 1.6 and is rejected by
# current releases.
bst = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=2,
    random_state=868128,
    eval_metric="auc",
    early_stopping_rounds=10,
)

# Fit on the refined feature set; the held-out split drives early stopping.
bst.fit(
    X_train[top_set_refined],
    y_train,
    eval_set=[(X_test[top_set_refined], y_test)],
)
xgb.plot_importance(bst)

In [0]:
# Class probabilities from the refined model on the train, test and OOT samples.
x_train_scr, x_test_scr, x_oot_scr = (
    bst.predict_proba(sample[top_set_refined])
    for sample in (X_train, X_test, modeling_oot)
)

# Report 2 * AUC - 1 per sample, using the second probability column as the score.
for label, actual, scores in (
    ("train", y_train, x_train_scr),
    ("test", y_test, x_test_scr),
    ("oot", modeling_oot["isdefault_1y"], x_oot_scr),
):
    print(label, 2 * roc_auc_score(actual, scores[:, 1]) - 1)

# Same metric on every thin == 1 row of the full frame.
thin = modeling_dummy_df.loc[modeling_dummy_df["thin"] == 1]
x_thin_scr = bst.predict_proba(thin[top_set_refined])
thin_auc = roc_auc_score(thin["isdefault_1y"], x_thin_scr[:, 1])
print("thin", 2 * thin_auc - 1)

# Importance ranking of the refined model, highest importance first.
feature_imp_refined = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

In [0]:
# Feature groups for the refined model. houseStat_RENT is no longer in the feature
# set, so there is no binary group in this round.
int_var = [
    "GO151",
    "GO15",
    "GO21",
    "GO149",
    "GO06",
    "RE06",
    "AM167",
    "AM29",
    "AT07",
]
cont_var = ["creditScore", "AT21"]

# Baseline: mean OOT score with no feature changed.
baseline_var = int_var[0]
vani = sensitivity(bst, modeling_oot[top_set_refined], [baseline_var], "orig", 1)[
    baseline_var
]

# Integer features: shift by +1 / -1. Continuous features: scale by 1.3 / 0.7.
int_inc = sensitivity(bst, modeling_oot[top_set_refined], int_var, "int", 1)
int_des = sensitivity(bst, modeling_oot[top_set_refined], int_var, "int", -1)
cont_inc = sensitivity(bst, modeling_oot[top_set_refined], cont_var, "cont", 1.3)
cont_des = sensitivity(bst, modeling_oot[top_set_refined], cont_var, "cont", 0.7)
inc = {**int_inc, **cont_inc}
des = {**int_des, **cont_des}

# One row per feature (in importance order): baseline next to the perturbed scores.
final_top_set = [name for name, _ in feature_imp_refined]
sens_table = {
    "vanilla": [vani for _ in final_top_set],
    "feat_inc": [inc[name] for name in final_top_set],
    "feat_des": [des[name] for name in final_top_set],
}
sens_out = pd.DataFrame(sens_table, index=final_top_set)
sens_out

## Save model that can be loaded later with load_model

In [0]:
# Write the refined model to JSON at the path below.
model_out_path = "/Workspace/Users/wilson.kan@neofinancial.com/ApplicationV2/SaveModels/tims_thin.json"
bst.save_model(model_out_path)