In [None]:
# Install the notebook's third-party dependencies.
# Each command runs pip through `sys.executable`, i.e. the interpreter that
# backs the running kernel, so packages are installed into the kernel's own
# environment rather than whichever `pip` happens to be first on PATH.
# NOTE(review): no versions are pinned, so a fresh environment may resolve
# different releases on each run — consider pinning.
import sys
!{sys.executable} -m pip install xgboost
!{sys.executable} -m pip install hyperopt
!{sys.executable} -m pip install ipython-autotime
!{sys.executable} -m pip install pandas-profiling
!{sys.executable} -m pip install joblib
!{sys.executable} -m pip install pdpbox
!{sys.executable} -m pip install optuna
!{sys.executable} -m pip install lazypredict

In [None]:
from datetime import datetime
from IPython.display import display
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import re
import joblib

import warnings
from pandas.core.common import SettingWithCopyWarning

warnings.simplefilter(action = 'ignore', category = SettingWithCopyWarning)

from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.metrics import (
    accuracy_score,
    mean_absolute_error,
    precision_score,
    recall_score,
    roc_auc_score,
)
import os

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
plt.style.use('classic')
%matplotlib inline

import xgboost as xgb
from pandas_profiling import ProfileReport

from config import *
from utils import *

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean

# Do not truncate long cell values when DataFrames are displayed.
pd.options.display.max_colwidth = None
# Default figure size (width, height) applied through seaborn's rc settings.
sns.set(rc={"figure.figsize": (16, 8)})

# Load Dataset

In [None]:
# Root data folder, plus the sub-folders the train/test splits are written to.
path = "./Data/Prop_Data"
TRAIN_FILEPATH = f'{path}/train'
TEST_FILEPATH = f'{path}/test'

# Combined lead-scoring table that every split below is drawn from.
df_cleaned = pd.read_csv(f'{path}/lead_scoring_combined.csv')

# Data Sampling

In [None]:
# First split: hold out 10% of all rows as the test set.
# The full frame (LABEL column included) is split as X; y is the LABEL column alone.
X_train, X_test, y_train, y_test = train_test_split(
    df_cleaned,
    df_cleaned[LABEL],
    test_size=0.1,
    random_state=0,
)

print('\n'.join(
    f'{name} shape is {part.shape}'
    for name, part in (
        ('X_train', X_train),
        ('X_test', X_test),
        ('y_train', y_train),
        ('y_test', y_test),
    )
))

In [None]:
# Second split: carve a calibration set out of the first-split training data.
# X_train holds 90% of the rows, so 1/9 of it is 1/9 x 0.9 = 0.1 of the full dataset.
X_train2, X_calibrated, y_train2, y_calibrated = train_test_split(
    X_train,
    y_train,
    test_size=1/9,
    random_state=0,
)

print('\n'.join(
    f'{name} shape is {part.shape}'
    for name, part in (
        ('X_train2', X_train2),
        ('X_calibrated', X_calibrated),
        ('y_train2', y_train2),
        ('y_calibrated', y_calibrated),
    )
))

In [None]:
# Third split: take a validation set out of the second-split training data.
# X_train2 holds 80% of the rows, so 20% of it is 0.2 x 0.8 = 0.16 of the full dataset.
X_train3, X_val, y_train3, y_val = train_test_split(
    X_train2,
    y_train2,
    test_size=0.2,
    random_state=0,
)

print('\n'.join(
    f'{name} shape is {part.shape}'
    for name, part in (
        ('X_train3', X_train3),
        ('X_val', X_val),
        ('y_train3', y_train3),
        ('y_val', y_val),
    )
))

## Save Data Splits into Data Folder

In [None]:
# Persist every split as CSV (without the index) under the train/test folders.
# to_csv does not create missing directories, so make sure both targets exist
# before writing; otherwise a fresh checkout fails on the first write.
os.makedirs(TRAIN_FILEPATH, exist_ok=True)
os.makedirs(TEST_FILEPATH, exist_ok=True)

# First Split
X_train.to_csv('{}/train.csv'.format(TRAIN_FILEPATH), index=False)
X_test.to_csv('{}/test.csv'.format(TEST_FILEPATH), index=False)
y_train.to_csv('{}/train_labels.csv'.format(TRAIN_FILEPATH), index=False)
y_test.to_csv('{}/test_labels.csv'.format(TEST_FILEPATH), index=False)

# Second Split (the calibration set is stored next to the test data)
X_train2.to_csv('{}/train2.csv'.format(TRAIN_FILEPATH), index=False)
y_train2.to_csv('{}/train2_label.csv'.format(TRAIN_FILEPATH), index=False)
X_calibrated.to_csv('{}/calibrated.csv'.format(TEST_FILEPATH), index=False)
y_calibrated.to_csv('{}/calibrated_labels.csv'.format(TEST_FILEPATH), index=False)

# Third Split (the validation set is stored next to the test data)
X_train3.to_csv('{}/train3.csv'.format(TRAIN_FILEPATH), index=False)
X_val.to_csv('{}/validation.csv'.format(TEST_FILEPATH), index=False)
y_train3.to_csv('{}/train3_label.csv'.format(TRAIN_FILEPATH), index=False)
y_val.to_csv('{}/validation_label.csv'.format(TEST_FILEPATH), index=False)

# Create Propensity Model

### Define Initial Parameters and Features

In [None]:
# Initial XGBoost settings for the propensity classifier.
params = dict(
    # Starting prediction: the mean of the LABEL column in the training split.
    base_score=np.mean(X_train[LABEL]),
    objective='binary:logistic',
    eval_metric='auc',
    max_depth=6,
)

# Column names the model is allowed to use, grouped by their name prefix.
FEATURES = [
    # customer_request_* counts
    'customer_request_call_count',
    'customer_request_no_pick_up_count',
    'customer_request_call_attempt_count',
    # quote history
    'quote_created_before_count',
    # scv_policy_* counts
    'scv_policy_issued_nfc_count',
    'scv_policy_cancelled_count',
    'scv_policy_inforce_count',
    'scv_policy_gi_count',
    'scv_policy_li_count',
    # quote attributes
    'age_at_quote',
    'quotation_promo_code_value',
    'quote_saved_quote_indicator',
    # update_* columns
    'update_timediff',
    'update_flag',
    # since_* columns
    'since_first_quote',
    'since_latest_quote',
    'since_first_policy',
    'since_latest_policy',
    # product_type_* columns
    'product_type_Big 3 Critical Illness',
    'product_type_HDBFire',
    'product_type_HDBFire-Home',
    'product_type_Home',
    'product_type_Life',
    'product_type_MCycle',
    'product_type_Maid',
    'product_type_Motor',
    'product_type_other',
    # device_type_* columns
    'device_type_desktop',
    'device_type_mobile',
    # applicant_gender_* columns
    'applicant_gender_F',
    'applicant_gender_M',
    'applicant_gender_U',
    # nric_type_* columns
    'nric_type_FOREIGNER',
    'nric_type_SGPR',
]

## Fit XGBoost Model 

In [None]:
# Train on the first-split training rows, using only the columns of df_cleaned
# that are listed in FEATURES (kept in df_cleaned's column order).
nb_model = xgb.XGBClassifier(random_state=0, **params)
nb_model.fit(X_train.loc[:, [col for col in df_cleaned.columns if col in FEATURES]], y_train)

## Evaluate Model

In [None]:
# Score both splits with exactly the columns the fitted booster reports as its features.
model_features = nb_model.get_booster().feature_names

# Hard class predictions and the second predict_proba column, per split.
preds_train = nb_model.predict(X_train[model_features])
preds_train_proba = nb_model.predict_proba(X_train[model_features])[:, 1]

preds_test = nb_model.predict(X_test[model_features])
preds_test_proba = nb_model.predict_proba(X_test[model_features])[:, 1]

In [None]:
# Test-set evaluation metrics.
# Accuracy, precision and recall are computed from the hard class predictions
# (preds_test). ROC AUC measures ranking quality, so it is computed from the
# predicted probabilities (preds_test_proba) rather than from 0/1 labels.
accuracy = round(accuracy_score(y_test, preds_test), 3)
precision = round(precision_score(y_test, preds_test), 3)
recall = round(recall_score(y_test, preds_test), 3)
roc_auc = round(roc_auc_score(y_test, preds_test_proba), 3)
print('Model evaluation: Accuracy: {} / Precision: {} / Recall: {} / roc_auc: {}'.format(
    accuracy, precision, recall, roc_auc))

In [None]:
# Compare train and test performance side by side.
# Accuracy uses the hard predictions; ROC AUC uses the predicted probabilities.
train_accuracy, test_accuracy = (
    round(accuracy_score(y_true, y_hat), 3)
    for y_true, y_hat in ((y_train, preds_train), (y_test, preds_test))
)

train_auc, test_auc = (
    round(roc_auc_score(y_true, y_prob), 3)
    for y_true, y_prob in ((y_train, preds_train_proba), (y_test, preds_test_proba))
)

print(f'Model evaluation: Train Accuracy: {train_accuracy} / Test Accuracy: {test_accuracy} / Train roc_auc: {train_auc} / Test roc_auc: {test_auc}')

# Save Model

In [None]:
OUTPUT_PATH = './model_results/prop_model'
MODEL_NAME = 'prop_model'
DATE = '_16092021'

# Make sure the output folder exists before the model file is written into it.
os.makedirs(OUTPUT_PATH, exist_ok=True)

# One placeholder per value (folder, model name, date suffix); a mismatch
# between placeholders and arguments makes str.format raise IndexError.
model_file = '{}/{}{}'.format(OUTPUT_PATH, MODEL_NAME, DATE)

# joblib.dump returns the list of file names it wrote; show it as the cell output.
prop_model = joblib.dump(nb_model, model_file)
prop_model

# Model Analysis

## Feature Importance

In [None]:
# Importance definitions to visualise; one figure is drawn per entry.
importance_types = ['gain', 'total_gain']

for typ in importance_types:
    fig, ax = plt.subplots(figsize=(16, 20))
    # The importance type is shown as the plot title, so no extra title call is needed.
    xgb.plot_importance(nb_model, importance_type=typ, height=0.8, max_num_features=20,
                        show_values=False, title=typ, ax=ax)

## PDP Plot

In [None]:
from matplotlib.pyplot import figure
from pdpbox import pdp, get_dataset, info_plots
def plot_pdp(model, df, feature):
    """Draw the partial dependence plot of one feature.

    model: fitted model handed to pdpbox.
    df: DataFrame used as the PDP dataset; all of its columns are passed to
        pdpbox as the model's features.
    feature: name of the column whose partial dependence is plotted.
    """
    isolated = pdp.pdp_isolate(
        model=model,
        dataset=df,
        model_features=list(df.columns),
        feature=feature,
    )
    pdp.pdp_plot(
        isolated,
        feature,
        cluster=False,
        n_cluster_centers=None,
        plot_lines=False,
        figsize=(5, 5),
    )
    plt.show()
    
# Rank the booster's features by 'gain' importance, highest first.
feature_important = nb_model.get_booster().get_score(importance_type='gain')
top_feature = sorted(feature_important, key=feature_important.get, reverse=True)
print(top_feature[0:20])

# Build the PDP dataset once instead of re-filtering X_train on every loop
# iteration: rows with update_timediff below 20000, restricted to the
# columns the booster was trained on.
pdp_frame = X_train.loc[X_train['update_timediff'] < 20000, nb_model.get_booster().feature_names]

for feature in top_feature[0:20]:
    plot_pdp(nb_model, pdp_frame, feature)


In [None]:
# NOTE(review): this cell repeats the PDP loop of the preceding cell — consider
# keeping only one of them.
# Rank the booster's features by 'gain' importance, highest first.
feature_important = nb_model.get_booster().get_score(importance_type='gain')
top_feature = sorted(feature_important, key=feature_important.get, reverse=True)
print(top_feature[0:20])

# Filter once, outside the loop: rows with update_timediff below 20000,
# restricted to the columns the booster was trained on.
pdp_input = X_train.loc[X_train['update_timediff'] < 20000, nb_model.get_booster().feature_names]

for feature in top_feature[0:20]:
    plot_pdp(nb_model, pdp_input, feature)


## Histogram

In [None]:
# Overlay the update_timediff distribution (scaled by 1/60) for rows with
# LABEL == 1 and rows with LABEL == 0.
sns.distplot(X_train.loc[X_train[LABEL] == 1, 'update_timediff'] / 60, label='converted')
sns.distplot(X_train.loc[X_train[LABEL] == 0, 'update_timediff'] / 60, label='not_converted')

plt.legend()

In [None]:
# Overlay the age_at_quote distribution for rows with LABEL == 1 and rows
# with LABEL == 0.
sns.distplot(X_train.loc[X_train[LABEL] == 1, 'age_at_quote'], label='converted')
sns.distplot(X_train.loc[X_train[LABEL] == 0, 'age_at_quote'], label='not_converted')

plt.legend()

## Confusion Matrix

In [None]:
# plot confusion matrix
from sklearn.metrics import confusion_matrix, classification_report

# Keep the result under its own name so the imported confusion_matrix()
# function is not overwritten by its own output.
conf_matrix = confusion_matrix(y_test, preds_test)

# Annotated heatmap: rows are the true labels, columns the predicted labels.
fig, ax = plt.subplots(figsize=(12, 10))
confusion_matrix_df = pd.DataFrame(conf_matrix)
heatmap = sns.heatmap(confusion_matrix_df, annot=True, annot_kws={"size": 20}, fmt="d", ax=ax)
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=14)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=14)
plt.ylabel('True label', fontsize=20, weight='heavy')
plt.xlabel('Predicted label', fontsize=20, weight='heavy')

## Predicted Probabilities

In [None]:
# store the predicted probabilities for score 1
y_score = nb_model.predict_proba( X_test[nb_model.get_booster().feature_names] )
y_pred_prob = y_score[:, 1]

# histogram of predicted probabilities
sns.distplot(y_pred_prob, bins=10, kde = True)
plt.xlim(0, 1)
plt.title('Histogram of predicted probabilities')
plt.xlabel('Predicted probability')
plt.ylabel('Frequency')

## PP Plot (Predicted vs. Actual by Decile)

In [None]:
# pp plot function
def decile_plot(df, prediction_col, actual_col, what, q=10):
    '''
    Plot the average prediction against the average actual value per
    prediction quantile bin.

    df: The dataframe with all relevant columns. A 'preds_decile' column
        holding each row's bin number (0-based) is written to it.
    prediction_col: Column of predictions.
    actual_col: The truth value of the prediction column
    what: Column name of whatever you're plotting (used in the axis labels)
    q: Number of quantile bins to cut the predictions into (default 10).

    Returns None; the two lines are drawn on the current matplotlib axes.
    '''
    # duplicates='drop' merges identical bin edges instead of raising
    # ValueError, which happens when many rows share the same prediction.
    # In that case fewer than q bins are produced.
    df['preds_decile'] = pd.qcut(x=df[prediction_col], q=q, labels=False, duplicates='drop')

    # One pass per column over the bins; rows without a bin (NaN prediction)
    # are left out by groupby.
    by_bin = df.groupby('preds_decile')
    avg_preds = by_bin[prediction_col].mean().sort_index()
    avg_actuals = by_bin[actual_col].mean().sort_index()

    # The x axis is 1-based and follows the number of bins actually produced.
    bin_positions = np.arange(1, len(avg_preds) + 1)
    plt.plot(bin_positions, avg_preds.values, label='predicted')
    plt.plot(bin_positions, avg_actuals.values, label='actual')
    plt.xlabel('Predicted {} decile'.format(what))
    plt.ylabel('Average {}'.format(what))
    plt.legend(loc='upper left')

## Tree output

In [None]:
# print out 1 tree
tree_list = nb_model.get_booster().get_dump()
num_trees = len(tree_list)
print(num_trees)
print (tree_list [0])