In [0]:
!pip install tarandm_analytics==0.1.5
!pip install xgboost==2.0.3

In [0]:
import pandas as pd

# Switch for the pickle cell that follows: True loads `modeling_dummy_df` from the
# cached pickle, False writes the in-memory `modeling_dummy_df` out to that pickle.
start_from_pickle = True

In [0]:
import pickle

# Location of the cached, modeling-ready frame (shared by the save and load branches).
MODELING_PKL_PATH = "/Workspace/Users/wilson.kan@neofinancial.com/ApplicationV2/pkls/appl_neo_v2_modeling_ready.pkl"

if not start_from_pickle:
    # Check *before* opening: mode "wb" truncates the file on open, so a NameError
    # raised by pickle.dump on a fresh kernel would wipe the existing cache.
    if "modeling_dummy_df" not in globals():
        raise NameError(
            "modeling_dummy_df is not defined; build it first or set start_from_pickle = True"
        )
    with open(MODELING_PKL_PATH, "wb") as f:  # binary write mode
        pickle.dump(modeling_dummy_df, f)
else:
    # NOTE(security): pickle.load can execute arbitrary code embedded in the file;
    # only load pickles written by a trusted source.
    with open(MODELING_PKL_PATH, "rb") as f:  # binary read mode
        modeling_dummy_df = pickle.load(f)

In [0]:
import numpy as np

# Due to data availability, 5-fold validation is used instead of out-of-time testing.
# Keep only the rows whose month_end falls on or before the cutoff date.
intime_cutoff = np.datetime64("2024-03-31")
is_intime = modeling_dummy_df["month_end"] <= intime_cutoff
modeling_intime = modeling_dummy_df[is_intime]

In [0]:
# Restrict the modeling population to rows flagged subprime == 1.
is_subprime = modeling_intime["subprime"] == 1
modeling_intime = modeling_intime[is_subprime]

In [0]:
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import train_test_split

# Stratified 66/34 split. The feature matrix excludes the date key, the target,
# and two columns the author marked as duplicated features.
X_train, X_test, y_train, y_test = train_test_split(
    modeling_intime.drop(
        ["month_end", "isdefault_1y", "originalCreditScore", "GO17"], axis=1
    ),
    modeling_intime["isdefault_1y"],
    test_size=0.34,
    random_state=651677,
    stratify=modeling_intime["isdefault_1y"],
)

# Create the model instance. eval_metric and early_stopping_rounds are set on
# the estimator: passing them to fit() is deprecated in xgboost 2.0.x and no
# longer accepted by later releases.
bst = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=2,
    objective="binary:logistic",
    random_state=723056,
    eval_metric="auc",
    early_stopping_rounds=10,
)

# Fit the model.
# NOTE(review): the hold-out split is also the early-stopping set, so metrics
# computed on X_test afterwards are not fully out-of-sample.
bst.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=True,
)
xgb.plot_importance(bst)

In [0]:
# Pair each input feature name with its importance from the fitted model and
# rank the pairs from most to least important.
feature_imp = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
feature_imp

In [0]:
# Names of the 20 highest-ranked features. Slicing (instead of indexing 0..19)
# also works when fewer than 20 features are available.
top_set = [name for name, _ in feature_imp[:20]]
print(top_set)

# Create the model instance. eval_metric and early_stopping_rounds are set on
# the estimator: passing them to fit() is deprecated in xgboost 2.0.x and no
# longer accepted by later releases.
bst = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=3,
    random_state=445287,
    eval_metric="auc",
    early_stopping_rounds=10,
)

# Fit on the reduced feature set; the hold-out split drives early stopping.
bst.fit(
    X_train[top_set],
    y_train,
    eval_set=[(X_test[top_set], y_test)],
)
xgb.plot_importance(bst)

In [0]:
from sklearn.metrics import roc_auc_score

# Class probabilities on both splits, then Gini = 2 * AUC - 1 from the
# positive-class column.
x_train_scr = bst.predict_proba(X_train[top_set])
x_test_scr = bst.predict_proba(X_test[top_set])
for split_name, split_target, split_scr in (
    ("train", y_train, x_train_scr),
    ("test", y_test, x_test_scr),
):
    print(split_name, 2 * roc_auc_score(split_target, split_scr[:, 1]) - 1)

In [0]:
# Score every subprime row of the full frame (no month_end cutoff applied here)
# and report the Gini coefficient.
subprime_mask = modeling_dummy_df["subprime"] == 1
sp = modeling_dummy_df[subprime_mask]
x_subprime_scr = bst.predict_proba(sp[top_set])
subprime_auc = roc_auc_score(sp["isdefault_1y"], x_subprime_scr[:, 1])
print("subprime", 2 * subprime_auc - 1)

In [0]:
# Re-rank the features of the current model by importance (descending) and
# print one tab-separated "name<TAB>importance" line per feature.
feature_imp = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)
for feat_name, feat_weight in feature_imp:
    print(f"{feat_name}\t{feat_weight}")

## Sensitivity Testing to check for direction and impact

In [0]:
def sensitivity(model, dat, var_set, var_type, effect):
    """Mean positive-class probability after shocking one feature at a time.

    Each variable in ``var_set`` is perturbed on its own fresh copy of ``dat``,
    so every result reflects a single shocked feature and the caller's frame
    is never modified. (Previously the shocks were written into ``dat`` itself
    and accumulated from one variable to the next.)

    Parameters
    ----------
    model : object exposing ``predict_proba(frame)`` that returns a 2-D array
        whose column 1 is the positive-class probability.
    dat : pandas.DataFrame
        Feature frame passed to ``model.predict_proba``.
    var_set : iterable of str
        Column names to perturb, one at a time.
    var_type : {"int", "cont", "bin", "orig"}
        "int" adds ``effect``, "cont" multiplies by ``effect``, "bin" sets the
        column to ``effect``, "orig" leaves the column unchanged.
    effect : number
        Size of the shock, interpreted according to ``var_type``.

    Returns
    -------
    dict
        Maps each variable name to the mean predicted probability.

    Raises
    ------
    Exception
        If ``var_type`` is not one of the supported values.
    """
    res = {}
    for v in var_set:
        # Fresh copy per variable keeps the shocks independent of each other.
        shocked = dat.copy()
        if var_type == "int":
            shocked[v] = dat[v] + effect
        elif var_type == "cont":
            shocked[v] = dat[v] * effect
        elif var_type == "bin":
            shocked[v] = effect
        elif var_type == "orig":
            pass  # baseline: score the data as-is
        else:
            raise Exception("Invalid variable type")
        res[v] = model.predict_proba(shocked)[:, 1].mean()
    return res

In [0]:
# Features shocked additively (var_type "int") in the sensitivity runs.
int_var = ["AM167", "GO06", "AM07", "RE06", "GO148", "AM41", "AM57", "RE41"]

# Features shocked multiplicatively (var_type "cont") in the sensitivity runs.
cont_var = [
    "AT21", "BC85", "AT33", "BC147", "RE60", "AT60",
    "AM21", "RE28", "BC148", "RE33", "RE61", "BR60",
]

# No binary features in this feature set.
bin_var = list()

In [0]:
# Baseline ("vanilla") mean score: an "orig" run leaves the data unchanged.
baseline_var = int_var[0]
vani = sensitivity(bst, modeling_intime[top_set], [baseline_var], "orig", 1)[baseline_var]

# Up/down shocks: +/-3 for the additive features, x1.3 / x0.7 for the
# multiplicative ones. Every call receives its own freshly sliced frame.
int_inc = sensitivity(bst, modeling_intime[top_set], int_var, "int", 3)
int_des = sensitivity(bst, modeling_intime[top_set], int_var, "int", -3)
cont_inc = sensitivity(bst, modeling_intime[top_set], cont_var, "cont", 1.3)
cont_des = sensitivity(bst, modeling_intime[top_set], cont_var, "cont", 0.7)

# Binary-feature runs are skipped in this cell; only the additive and
# multiplicative results are merged into the per-feature lookups.
inc = {**int_inc, **cont_inc}
des = {**int_des, **cont_des}

In [0]:
# One row per feature (in importance order): the shared baseline score next to
# the mean score after the upward and downward shock of that feature.
final_top_set = [name for name, _ in feature_imp]
sens_table = {
    "vanilla": [vani for _ in final_top_set],
    "feat_inc": [inc[name] for name in final_top_set],
    "feat_des": [des[name] for name in final_top_set],
}
sens_out = pd.DataFrame(sens_table, index=final_top_set)
sens_out

## Test a new model with the counter-intuitive-direction features removed (iterative process)

In [0]:
# Features excluded from the refined set (presumably the ones whose sensitivity
# direction looked wrong - confirm against the sensitivity table).
excluded_features = {
    "AM167", "GO06", "AT21", "BC85",
    "AM21", "RE28", "AM07", "GO148",
}
top_set_refined = [feat for feat in top_set if feat not in excluded_features]
# creditScore is appended to the remaining features.
top_set_refined.append("creditScore")
top_set_refined

In [0]:
# Hyperparameters of the refined model. Early stopping is not used here, so the
# number of trees is fixed at 25.
refined_model_kwargs = dict(
    n_estimators=25,
    max_depth=6,
    colsample_bytree=0.75,
    subsample=0.5,
    gamma=1,
    eta=0.1,
    min_child_weight=2,
    random_state=484674,
)
bst = XGBClassifier(**refined_model_kwargs)

# Fit on the refined features; the hold-out split is still passed as eval_set so
# per-round evaluation results are recorded.
X_train_refined = X_train[top_set_refined]
X_test_refined = X_test[top_set_refined]
bst.fit(X_train_refined, y_train, eval_set=[(X_test_refined, y_test)])
xgb.plot_importance(bst)

In [0]:
# Gini (2 * AUC - 1) of the refined model on the train and test splits.
x_train_scr = bst.predict_proba(X_train[top_set_refined])
x_test_scr = bst.predict_proba(X_test[top_set_refined])
train_auc = roc_auc_score(y_train, x_train_scr[:, 1])
test_auc = roc_auc_score(y_test, x_test_scr[:, 1])
print("train", 2 * train_auc - 1)
print("test", 2 * test_auc - 1)

# Same metric on the full subprime population held in `sp`.
x_subprime_scr = bst.predict_proba(sp[top_set_refined])
subprime_auc = roc_auc_score(sp["isdefault_1y"], x_subprime_scr[:, 1])
print("subprime", 2 * subprime_auc - 1)

# (feature, importance) pairs of the refined model, most important first.
feature_imp_refined = sorted(
    zip(bst.feature_names_in_, bst.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

In [0]:
# Variable groups for the refined feature set.
int_var = ["RE06", "AM41", "AM57", "RE41"]
cont_var = [
    "AT33", "BC147", "RE60", "AT60", "BC148",
    "RE33", "RE61", "BR60", "creditScore",
]
bin_var = list()

# Baseline mean score ("orig" leaves the data unchanged).
baseline_var = int_var[0]
vani = sensitivity(bst, modeling_intime[top_set_refined], [baseline_var], "orig", 1)[
    baseline_var
]

# Shocks: +/-1 for additive features, x1.3 / x0.7 for multiplicative ones and
# 1 / 0 for binary ones (bin_var is empty, so those runs yield empty dicts).
# Every call receives its own freshly sliced frame.
int_inc = sensitivity(bst, modeling_intime[top_set_refined], int_var, "int", 1)
int_des = sensitivity(bst, modeling_intime[top_set_refined], int_var, "int", -1)
cont_inc = sensitivity(bst, modeling_intime[top_set_refined], cont_var, "cont", 1.3)
cont_des = sensitivity(bst, modeling_intime[top_set_refined], cont_var, "cont", 0.7)
bin_inc = sensitivity(bst, modeling_intime[top_set_refined], bin_var, "bin", 1)
bin_des = sensitivity(bst, modeling_intime[top_set_refined], bin_var, "bin", 0)
inc = {**int_inc, **cont_inc, **bin_inc}
des = {**int_des, **cont_des, **bin_des}

# One row per feature, ordered by refined-model importance.
final_top_set = [name for name, _ in feature_imp_refined]
sens_table = {
    "vanilla": [vani for _ in final_top_set],
    "feat_inc": [inc[name] for name in final_top_set],
    "feat_des": [des[name] for name in final_top_set],
}
sens_out = pd.DataFrame(sens_table, index=final_top_set)
sens_out

# 5-fold validation

In [0]:
import random

# Label every in-time row with a pseudo-random group 1..5. A dedicated Random
# instance seeded with 374158 yields the same labels as seeding the global
# generator, without altering global RNG state.
fold_rng = random.Random(374158)
fold_labels = [fold_rng.randint(1, 5) for _ in range(len(modeling_intime))]

# .assign builds a new frame instead of writing a column into a filtered slice
# (which triggers SettingWithCopyWarning); it also keeps the cell re-runnable.
modeling_intime = modeling_intime.assign(fold=fold_labels)

# NOTE(review): the already-fitted `bst` is scored on the rows *outside* group i
# and is never refitted, so this measures score stability across overlapping
# subsets rather than held-out cross-validation performance - confirm intent.
for i in range(1, 6):
    model_cut = modeling_intime[modeling_intime["fold"] != i]
    X_cut = model_cut[top_set_refined]
    y_cut = model_cut["isdefault_1y"]
    x_cut_scr = bst.predict_proba(X_cut)
    print(f"gini cut {i}: {2 * roc_auc_score(y_cut, x_cut_scr[:, 1]) - 1}")

## Save model that can be loaded later with load_model

In [0]:
# Persist the fitted model as JSON at the path below.
saved_model_path = "/Workspace/Users/wilson.kan@neofinancial.com/ApplicationV2/SaveModels/neo_subprime.json"
bst.save_model(saved_model_path)

### Export model for TaranDM use

In [0]:
# Sample handed to the exporter: the refined training features plus the label.
# .assign returns a new frame, so the label column is not written into a slice
# of X_train (which triggers SettingWithCopyWarning).
data = X_train[top_set_refined].assign(isdefault_1y=y_train)

# Per-round evaluation history recorded while fitting `bst`.
evals_result = bst.evals_result()

# Hyperparameters reported alongside the exported model.
# NOTE(review): these repeat the literals used when `bst` was constructed -
# keep the two in sync.
params = {
    'n_estimators': 25,
    'max_depth': 6,
    'colsample_bytree': 0.75,
    'subsample': 0.5,
    'gamma': 1,
    'eta': 0.1,
    'min_child_weight': 2,
    'random_state': 484674,
}

In [0]:
from tarandm_analytics.export_predictive_model.create_predictive_model import ExportPredictiveModel
import os

# Credentials are read from the environment instead of being hardcoded in the
# notebook. Set TARANDM_PASSWORD (and optionally TARANDM_USERNAME) before
# running, e.g. from a secret store.
# NOTE(security): the password that used to be committed in this cell should be
# treated as compromised and rotated. The endpoint below is plain http -
# confirm whether an https endpoint is available.
epm = ExportPredictiveModel(
    endpoint_url="http://dm.develop.neo.tarandm.com",  # USER INPUT - please, define TaranDM endpoint to connect to
    username=os.environ.get("TARANDM_USERNAME", "neo"),
    password=os.environ["TARANDM_PASSWORD"],
)

# NOTE(review): `data` is built from the refined features plus the label only;
# the "sample", "date_decision" and "predicted_pd" column names passed below do
# not appear in it - confirm how the exporter handles that.
request_data, images = epm.prepare_predictive_model_data(
    model_name="tarandm_xgboost_neo_subprime",
    model=bst.get_booster(),
    model_type="XGB",
    attributes=top_set_refined,
    label_name="isdefault_1y",
    target_class="1",
    hyperparameters=params,
    # attribute_description=attribute_descr,
    data=data,
    column_name_sample="sample",
    column_name_date="date_decision",
    column_name_prediction="predicted_pd",
    evaluate_performance={"target": ["AUC"]},
    learning_curves_data=evals_result,
)

In [0]:
# Hand the prepared request and images to the exporter together with the target
# archive path; the return value is intentionally discarded.
export_zip_path = "/Workspace/Users/wilson.kan@neofinancial.com/ApplicationV2/TaranDM/neo_subprime.zip"
_ = epm.build_predictive_model(
    request_data=request_data,
    images=images,
    filename=export_zip_path,
)