In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for input_file in (
    os.path.join(folder, fname)
    for folder, _subdirs, fnames in os.walk('/kaggle/input')
    for fname in fnames
):
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Load the training split, the test split and the sample submission, in that order.
train, test, submit = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/health-insurance-cross-sell-prediction/train.csv',
        '/kaggle/input/health-insurance-cross-sell-prediction/test.csv',
        '/kaggle/input/health-insurance-cross-sell-prediction/sample_submission.csv',
    )
)

In [None]:
# Summary statistics restricted to the object-dtype columns ('O' selects object dtype).
train.describe(include='O')

In [None]:
# Summary statistics using describe()'s default column selection.
train.describe()

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Encode the three string-valued categorical columns as integer codes.
# `.astype('int64')` makes the numeric dtype explicit instead of relying on
# `replace` to downcast the object-dtype result, which pandas has deprecated;
# without it the columns can remain `object` dtype rather than numeric.
# Bracket assignment is used because it always targets the column (attribute
# assignment silently creates a plain attribute if the name is not a column).
train['Gender'] = train['Gender'].replace({'Male':1,'Female':0}).astype('int64')
# Vehicle age is ordered, so older vehicles receive larger codes.
train['Vehicle_Age'] = train['Vehicle_Age'].replace({'> 2 Years':3, '1-2 Year':2,'< 1 Year':1}).astype('int64')
train['Vehicle_Damage'] = train['Vehicle_Damage'].replace({'Yes':1,'No':0}).astype('int64')

In [None]:
# Preview the frame again after the categorical columns were replaced by codes.
train.head()

## Target-Guided Ordinal Encoding for Region Code and Policy Sales Channel

In [None]:
from sklearn.model_selection import train_test_split
# Carve out 70% of the rows (fixed seed) to learn the category orderings from;
# only `X_tr` is read by the encoding cells that follow.
X_tr, X_test, y_tr, y_test = train_test_split(
    train[['Region_Code', 'Response', 'Policy_Sales_Channel']],
    train.Response,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Region codes ordered from lowest to highest mean Response.
(
    X_tr
    .groupby('Region_Code')['Response']
    .mean()
    .sort_values()
    .index
)

In [None]:
# Sales channels ordered from lowest to highest mean Response.
pc_ordered = (
    X_tr
    .groupby('Policy_Sales_Channel')['Response']
    .mean()
    .sort_values()
    .index
)
pc_ordered

In [None]:
# Region codes ordered from lowest to highest mean Response (kept for the mapping below).
ordered_labels = (
    X_tr
    .groupby('Region_Code')['Response']
    .mean()
    .sort_values()
    .index
)

In [None]:
# Map each sales channel to its 0-based position in `pc_ordered`.
pc_labels = dict(zip(pc_ordered, range(len(pc_ordered))))
pc_labels

In [None]:
# Map each region code to its 0-based position in `ordered_labels`.
ordinal_label = dict(zip(ordered_labels, range(len(ordered_labels))))
ordinal_label

In [None]:
# Encoded region column; codes missing from `ordinal_label` become NaN.
train['Region_Code_TE'] = train['Region_Code'].map(ordinal_label)

In [None]:
# Encoded sales-channel column; channels missing from `pc_labels` become NaN.
train['Policy_Sales_Channel_TE'] = train['Policy_Sales_Channel'].map(pc_labels)

In [None]:
# Preview the frame with the two new *_TE columns.
train.head()

In [None]:
# Number of rows whose sales channel could not be mapped to a code.
train['Policy_Sales_Channel_TE'].isna().sum()

In [None]:
# Show every row that contains at least one missing value.
train.loc[train.isna().any(axis='columns')]

In [None]:
# Discard every row that has a missing value in any column.
train = train.dropna(axis='index', how='any')

In [None]:
# Per-column count of missing values remaining after the drop.
train.isna().sum()

In [None]:
# Features: everything except the identifier, the target, and the two raw
# categorical columns (their *_TE counterparts stay in).
X = train.drop(columns=['id', 'Response', 'Policy_Sales_Channel', 'Region_Code'])
# Target vector.
y = train['Response']

In [None]:
# Print the installed imbalanced-learn version.
import imblearn
print(imblearn.__version__)
from imblearn.under_sampling import NearMiss

In [None]:
undersample=NearMiss(version=3, n_neighbors=5)
X,y=undersample.fit_resample(X,y)

In [None]:
y.value_counts()

In [None]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=21)

In [None]:
# Running XGBoost
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import xgboost
import sklearn
# Candidate values for each XGBoost hyperparameter explored by the search.
params = {
    "learning_rate": [0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    "max_depth": [3, 4, 5, 6, 8, 10, 12, 15],
    "min_child_weight": [1, 3, 5, 7],
    "gamma": [0.0, 0.1, 0.2, 0.3, 0.4],
    "colsample_bytree": [0.3, 0.4, 0.5, 0.7],
}

In [None]:
# Seed the estimator so its column subsampling is repeatable.
clf=xgboost.XGBClassifier(objective='binary:logistic',random_state=42)
# Try 5 randomly drawn combinations from `params` with 5-fold CV, ranked by
# negated Brier score. `random_state` fixes which combinations are drawn;
# without it every "Run All" would tune a different candidate set.
random_search=RandomizedSearchCV(clf,param_distributions=params,n_iter=5,scoring='neg_brier_score',n_jobs=-1,cv=5,verbose=3,random_state=42)
random_search.fit(X_train,y_train )
# NOTE: despite its name this holds the best *classifier*; the name is kept
# because later cells refer to it.
best_regressor3 = random_search.best_estimator_

In [None]:
# Display the estimator selected by the randomized search.
random_search.best_estimator_

In [None]:
# Column 1 of predict_proba for each held-out row (the Response == 1 class,
# assuming the usual sorted class order).
pred_prob=best_regressor3.predict_proba(X_test)[:,1]
pred_prob

In [None]:
# Kernel density estimate of the predicted probabilities in `pred_prob`.
import seaborn as sns
sns.kdeplot(pred_prob, label='prob density plot')

In [None]:
from sklearn import metrics


plt.figure(figsize=(8,6))
# Dashed diagonal: reference line where TPR equals FPR.
plt.plot([0, 1], [0, 1],'r--')

# `pred` holds probabilities here; a later cell rebinds it to hard class labels.
pred = pred_prob
label = y_test
fpr, tpr, thresh = metrics.roc_curve(label, pred)
auc = metrics.roc_auc_score(label, pred)
plt.plot(fpr, tpr, label=f'XGB, auc = {round(auc,3)}')


plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
# Only one model is plotted, so the title names it instead of claiming two.
plt.title("ROC curve (XGB)")
plt.legend()
plt.show()

## Model calibration

In [None]:

from sklearn.calibration import calibration_curve


def plot_calibration_curve(name, fig_index, probs, y_true=None, n_bins=10):
    """Plot a calibration curve and a histogram of the predicted probabilities.

    The upper panel shows the fraction of positives against the mean predicted
    value per bin, next to the diagonal of a perfectly calibrated model. The
    lower panel shows how the predicted probabilities are distributed.

    Parameters
    ----------
    name : str
        Label used in the legend and in the title.
    fig_index : int or str
        Identifier of the matplotlib figure to draw into.
    probs : array-like
        Predicted probabilities of the positive class.
    y_true : array-like, optional
        True labels aligned with ``probs``. When omitted, the notebook-level
        ``y_test`` is used, which is what earlier calls implicitly relied on.
    n_bins : int, default 10
        Number of bins for both the calibration curve and the histogram.
    """
    if y_true is None:
        # Backward-compatible fallback to the global hold-out labels.
        y_true = y_test

    plt.figure(fig_index, figsize=(10, 10))
    ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
    ax2 = plt.subplot2grid((3, 1), (2, 0))

    ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")

    frac_of_pos, mean_pred_value = calibration_curve(y_true, probs, n_bins=n_bins)

    ax1.plot(mean_pred_value, frac_of_pos, "s-", label=f'{name}')
    ax1.set_ylabel("Fraction of positives")
    ax1.set_ylim([-0.05, 1.05])
    ax1.legend(loc="lower right")
    ax1.set_title(f'Calibration plot ({name})')

    ax2.hist(probs, range=(0, 1), bins=n_bins, label=name, histtype="step", lw=2)
    ax2.set_xlabel("Mean predicted value")
    ax2.set_ylabel("Count")

In [None]:
# Calibration plot for the probabilities in `pred_prob`, drawn into figure 1.
plot_calibration_curve("XGB", 1, pred_prob)

Accuracy is a totally different conversation than calibration. We can have a perfectly accurate model that is not calibrated at all and, on the other hand, a model that is no better than random but is perfectly calibrated nonetheless. Here, the calibration seems pretty decent.

#### Calibrating the model
The two most popular methods of calibrating a machine learning model are the isotonic method and Platt's method.
Scikit-learn provides a base estimator for calibrating models through the `CalibratedClassifierCV` class. Platt's method is equivalent to setting the `method` argument in the constructor of the class to `sigmoid`; if you want to use the isotonic method, you can pass `isotonic` instead. Since the calibration already looks fine, we are not proceeding with these methods here.


In [None]:
# Hard class predictions for the held-out rows. This rebinds `pred`, which
# held probabilities in the ROC cell above.
pred=best_regressor3.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix
# Text report of the classification metrics for `pred` against `y_test`.
print(classification_report(y_test, pred))

In [None]:
# Confusion matrix of `pred` against `y_test` (rows: true class, columns: predicted class).
confusion_matrix(y_test, pred)

Also, keep in mind that the accuracy of the model might be lower after calibration. Thus, in some cases there is a trade-off between accuracy and calibration to consider.

## Constructing Gain Charts

In [None]:
import matplotlib.pyplot as plt
import scikitplot as skplt
# Full predict_proba output for the held-out rows (later cells take column 1 from it).
predicted_prob=best_regressor3.predict_proba(X_test)
# Cumulative gains chart from the true labels and the predicted probabilities.
skplt.metrics.plot_cumulative_gain(y_test, predicted_prob)
plt.show()

#### By targeting the top 75 percent of customers (ranked by predicted propensity), we capture ~100% of the class 1 responders, i.e. a cumulative gain of ~100%

#### Also, the primary goal in our problem is to improve our customer base for vehicle insurance as well. Hence, we might want to focus on decreasing the type II error (classifying a customer who could have reacted positively to our offer as negative). By excluding the top 20% of the non-responders from the marketing campaign, we'll capture 40% of our non-responders and still be left with most of the customer base to market to.


## Decile Charts

In [None]:
!pip install kds
import kds

In [None]:
# kds report for the held-out labels and column 1 of `predicted_prob`, using the 'ggplot' plot style.
kds.metrics.report(y_test, predicted_prob[:,1],plot_style='ggplot')

In [None]:
# Decile table for the held-out labels and column 1 of `predicted_prob`.
kds.metrics.decile_table(y_test, predicted_prob[:,1])

We are “binning” our respondents correctly from most likely to respond to least likely to respond. A model exhibiting a good staircase decile analysis is one you can consider moving forward with.