### Importing libraries

In [None]:
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
import math
import re
import requests

In [None]:
from scipy import stats

In [None]:
import matplotlib.ticker as ticker
from matplotlib.ticker import NullFormatter
from sklearn import preprocessing

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.metrics import log_loss
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay,plot_confusion_matrix
from sklearn.model_selection import GridSearchCV
import itertools

In [None]:
import warnings
warnings.filterwarnings('ignore')

### Load Data :

In [None]:
# Load the raw training data into `df`.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
df = pd.read_csv(r'D:\Pritesh\Inventories\HI Lead prediction Challenge\train.csv')

In [None]:
df.head()

In [None]:
df.shape

In [None]:
# Hold out every row whose ID is above 45000 as the validation set.
df_validation = df[df['ID'] > 45000]
df_validation.head()

In [None]:
df_validation.shape

In [None]:
# The modelling base is every row of df that was not held out for validation.
model_base = df.drop(index = df_validation.index, axis=0)
model_base.head()

In [None]:
model_base.shape

### Event Rate on Base:

In [None]:
# Check the event rate: class counts of Response and their share of the base (in %).
# The column is named explicitly because value_counts() labels its result "count"
# on pandas >= 2.0 instead of reusing the source column name.
df_er = model_base['Response'].value_counts().to_frame(name='Response')
df_er['Event Rate'] = (df_er['Response']/df_er['Response'].sum())*100
df_er

In [None]:

print('Event Rate of Lead generation is {}%'.format(len(model_base[model_base['Response'] == 1])/len(model_base)*100))

In [None]:
# Conclusion 1 : Data is balanced

### Missing value treatment :

In [None]:
model_base.isnull().sum()

In [None]:
# Missing-value count per column and its share of the modelling base (in %).
# The denominator is the actual row count rather than a hard-coded 45000.
df_mv = pd.DataFrame(data = model_base.isnull().sum(), columns=['Missing Values'])
df_mv['Missing Value %'] = round(df_mv['Missing Values']/len(model_base),2)*100

df_mv

In [None]:
model_base.columns

In [None]:
len(model_base['Region_Code'].unique())

In [None]:
model_base.drop(columns = ['ID','Region_Code'], axis=1, inplace=True)

In [None]:
model_base.head()

##### Numerical variables:

In [None]:
#1 Holding_Policy_Type :

In [None]:
model_base['Holding_Policy_Type'].describe()

In [None]:
# Impute missing Holding_Policy_Type with the most frequent value. The result is
# assigned back (instead of an in-place replace on the selected column) so the
# change reaches model_base regardless of pandas' copy semantics.
model_base['Holding_Policy_Type'] = model_base['Holding_Policy_Type'].fillna(model_base['Holding_Policy_Type'].mode()[0])

In [None]:
#2 Holding_Policy_Duration

In [None]:
model_base['Holding_Policy_Duration'].value_counts()

In [None]:
# Recode the open-ended '14+' bucket as '15.0'.
# NOTE(review): Holding_Policy_Duration is dropped in the next cell, so this
# replacement has no effect on the modelling data - confirm which step is intended.
model_base['Holding_Policy_Duration'].replace('14+','15.0',inplace=True)

In [None]:
model_base.drop(columns=['Holding_Policy_Duration'],axis=1,inplace=True)

In [None]:
#3 Health Indicator

In [None]:
model_base['Health Indicator'].value_counts()

In [None]:
model_base.dropna(subset=['Health Indicator'],axis=0,inplace=True)

In [None]:
model_base.isnull().sum()

In [None]:
# All the missing values have been treated

### Outlier treatment

In [None]:
#Drop City_Code
model_base.drop(columns=['City_Code'],axis=1, inplace=True)

In [None]:
model_base.head()

In [None]:
def dist_box(data):
    """Plot a box plot stacked above a histogram for one continuous variable.

    The two panels share the x-axis so spread, central tendency, dispersion and
    outliers can be read together. The mean, the median and the first mode are
    drawn as vertical lines on the histogram and named in its legend.

    Parameters
    ----------
    data : pandas.Series
        Numeric values to plot. ``data.name`` (upper-cased) is used in the title.
    """
    Name=data.name.upper()
    fig,(ax_box,ax_dis)  =plt.subplots(nrows=2,sharex=True,gridspec_kw = {"height_ratios": (.25, .75)},figsize=(8, 5))
    mean=data.mean()
    median=data.median()
    # mode() can return several values; only the first one is marked.
    mode=data.mode().tolist()[0]
    sns.set_theme(style="white")
    fig.suptitle("SPREAD OF DATA FOR "+ Name  , fontsize=18, fontweight='bold')
    sns.boxplot(x=data,showmeans=True, orient='h',color="teal",ax=ax_box)
    ax_box.set(xlabel='')
    sns.despine(top=True,right=True,left=True) # to remove side line from graph
    # histplot replaces the deprecated distplot(kde=False).
    sns.histplot(data,kde=False,color='purple',ax=ax_dis)
    # Each reference line carries its own label so the legend does not depend on
    # the order in which artists were added to the axes.
    ax_dis.axvline(mean, color='r', linestyle='--',linewidth=2, label='Mean')
    ax_dis.axvline(median, color='g', linestyle='-',linewidth=2, label='Median')
    ax_dis.axvline(mode, color='y', linestyle='-',linewidth=2, label='Mode')
    ax_dis.legend()

In [None]:
dist_box(model_base['Reco_Policy_Premium'])

In [None]:
# Reco_Policy_Premium has outliers.

In [None]:
# 1st and 99th percentiles of Reco_Policy_Premium; outlier_imputation reads
# these module-level names as its lower and upper capping bounds.
QL = model_base['Reco_Policy_Premium'].quantile(0.01)
QH = model_base['Reco_Policy_Premium'].quantile(0.99)

In [None]:
def outlier_imputation(x):
    """Cap a single value to the range bounded by the module-level QL and QH.

    A value below QL is replaced by QL, a value above QH is replaced by QH,
    and any other value is returned unchanged.
    """
    return QL if x < QL else (QH if x > QH else x)

In [None]:
model_base['Reco_Policy_Premium'] = model_base['Reco_Policy_Premium'].apply(outlier_imputation)

In [None]:
model_base.head()

#### Processing Categorical variables:

In [None]:
num_vars = [i for i in model_base.columns if model_base[i].dtype != 'object']
cat_vars = [i for i in model_base.columns if model_base[i].dtype == 'object']

In [None]:
# Bivariates in cat vars

## Function to plot stacked bar chart
def stacked_plot(x):
    """Print and plot how Response is distributed within each level of x.

    Prints the cross-tabulation of x against model_base['Response'] (with
    margins), then draws a stacked bar chart of the row-normalised shares.

    Parameters
    ----------
    x : pandas.Series
        Categorical column, aligned with the module-level model_base frame.
    """
    sns.set_palette(sns.color_palette("nipy_spectral", 8))
    counts = pd.crosstab(x,model_base['Response'],margins=True)
    print(counts)
    print('-'*120)
    shares = pd.crosstab(x,model_base['Response'],normalize='index')
    shares.plot(kind='bar',stacked=True,figsize=(7,4))
    # Horizontal tick labels (a rotation of 360 degrees is the same as 0).
    plt.xticks(rotation=0)
    # A single legend call: the earlier 'lower left' legend was replaced by this
    # one before anything was drawn, so it never showed up.
    plt.legend(loc="upper left", labels=["No","Yes"],title="Response",bbox_to_anchor=(1,1))
    sns.despine(top=True,right=True,left=True) # to remove side line from graph
    plt.show()

In [None]:
for i, variable in enumerate(cat_vars):
       stacked_plot(model_base[variable])

### Feature Selection:

In [None]:
# Encoding the Categorical Variables

In [None]:
model_base.head()

In [None]:
model_base['Health Indicator'].value_counts()

In [None]:
# Label Encoder on Health Indicator:

In [None]:
model_base['Health Indicator'] = model_base['Health Indicator'].astype('category')

In [None]:
model_base['Health Indicator _cat'] = model_base['Health Indicator'].cat.codes

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
labelencoder = LabelEncoder()

In [None]:
model_base['Health Indicator _cat'] = labelencoder.fit_transform(model_base['Health Indicator'])

In [None]:
model_base.head()

In [None]:
model_base['Health Indicator'].value_counts()

In [None]:
model_base.groupby(['Health Indicator'])['Health Indicator _cat'].mean()

In [None]:
model_base.head(2)

In [None]:
model_base.drop(columns=['Health Indicator'], axis=1, inplace=True)

In [None]:
#One Hot Encoder:

In [None]:
cat_vars_2 = ['Accomodation_Type','Reco_Insurance_Type','Is_Spouse']

In [None]:
# One-hot encode each nominal column; dummy columns are named "<column>_<level>"
# and appended to model_base, after which the source columns are dropped.
for var in cat_vars_2:
    cat_list = pd.get_dummies(model_base[var],prefix=var)
    model_base = model_base.join(cat_list)

model_base.drop(cat_vars_2, axis=1, inplace=True)

In [None]:
model_base.head()

In [None]:
# List of final Model Variables :

vars_final = ['Upper_Age','Lower_Age','Holding_Policy_Type','Reco_Policy_Cat','Reco_Policy_Premium','Response',
              'Health Indicator','Accomodation_Type','Reco_Insurance_Type','Is_Spouse']

### Entropy and Information gain :

In [None]:
def calc_entropy(column):
    """
    Return the base-2 Shannon entropy of a column of non-negative integer labels.

    Accepts a pandas series, list, or numpy array. Labels that never occur
    contribute nothing to the result.
    """
    # Relative frequency of every integer label from 0 up to the largest one seen
    shares = np.bincount(column) / len(column)

    # p * log2(p) for each label that actually occurs, accumulated in label order
    weighted_logs = (share * math.log(share, 2) for share in shares if share > 0)

    return -sum(weighted_logs)

In [None]:
def calc_information_gain(data, split_name, target_name):
    """
    Calculate information gain given a data set, column to split on, and target.

    The gain is the entropy of the target minus the size-weighted entropy of the
    target within each subset produced by splitting on ``split_name``. Every
    distinct value of the split column forms its own subset, so columns with one,
    two or many levels are all handled.

    data: pandas DataFrame holding both columns
    split_name: name of the column to split on
    target_name: name of the target column (non-negative integer labels)
    """
    # Calculate the original entropy
    original_entropy = calc_entropy(data[target_name])
    total_rows = data.shape[0]

    # Weighted entropy over one subset per unique value of the split column
    to_subtract = 0
    for value in data[split_name].unique():
        subset = data[data[split_name] == value]
        # A missing value never equals itself, so its subset is empty; skip it.
        if subset.shape[0] == 0:
            continue
        prob = subset.shape[0] / total_rows
        to_subtract += prob * calc_entropy(subset[target_name])

    # Return information gain
    return original_entropy - to_subtract

### Processing the validation set:

In [None]:
# Keep only the modelling columns. The explicit copy detaches the frame from df
# so the in-place clean-up steps that follow operate on their own data.
df_validation = df_validation[vars_final].copy()

In [None]:
df_validation.head()

In [None]:
df_validation.isnull().sum()

In [None]:
df_validation.dropna(subset=['Health Indicator'],axis=0,inplace=True)

In [None]:
# Impute with the most frequent value of the training base (not of the validation
# rows). Assigning the result back guarantees the fill reaches df_validation.
df_validation['Holding_Policy_Type'] = df_validation['Holding_Policy_Type'].fillna(model_base['Holding_Policy_Type'].mode()[0])

In [None]:
# Encode with the mapping already fitted on the training base. Re-fitting the
# encoder here could give the same category a different code than in training.
df_validation['Health Indicator _cat'] = labelencoder.transform(df_validation['Health Indicator'])
df_validation.drop(columns=['Health Indicator'],axis=1, inplace=True)

In [None]:
# One-hot encode the same nominal columns as on the training base.
for var in cat_vars_2:
    cat_list = pd.get_dummies(df_validation[var],prefix=var)
    df_validation = df_validation.join(cat_list)

df_validation.drop(cat_vars_2, axis=1, inplace=True)

# Dummies are derived from the validation rows alone, so a level missing here
# would leave a column out. Align to the training layout (same columns, same
# order); a dummy column absent from validation is filled with 0.
df_validation = df_validation.reindex(columns=model_base.columns, fill_value=0)

### Model Development:

In [None]:
# Train Test Split:

In [None]:
# Split the processed modelling base into the target array y and the feature frame X.
base = model_base.copy()
y = base['Response'].values
X = base.drop(columns=['Response'], axis=1)

In [None]:
# Same split for the held-out validation rows: target array y_val, feature frame X_val.
valid = df_validation.copy()
y_val = valid['Response'].values
X_val = valid.drop(columns=['Response'], axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,shuffle=True)

In [None]:
print('Training set : ',(X_train.shape,y_train.shape))
print('Testing set : ',(X_test.shape,y_test.shape))

In [None]:
# Defining Functions for evaluation metrics:

In [None]:
def make_confusion_matrix(y_actual,y_predict,title):
    '''
    Plot the confusion matrix of binary predictions and show the figure.

    y_actual: true labels (0/1)
    y_predict: predicted labels (0/1)
    title: text placed above the plot

    Classes 0 and 1 are displayed as "No" and "Yes".
    '''
    fig, ax = plt.subplots(1, 1)

    cm = confusion_matrix(y_actual, y_predict, labels=[0,1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=["No","Yes"])
    disp.plot(cmap='Greens',colorbar=True,ax=ax)

    ax.set_title(title)
    ax.tick_params(axis='both', which='both', length=0)
    # The on/off switch is passed positionally: its keyword name changed from `b`
    # to `visible` across matplotlib releases, and supplying both is an error.
    ax.grid(False, which='both', axis='both')
    plt.show()


In [None]:
def _positive_class_scores(model, features, fallback):
    '''Return per-row ranking scores for ROC AUC, preferring probabilities over hard labels.'''
    if hasattr(model, "predict_proba"):
        # assumes a binary classifier whose second probability column is class 1
        return model.predict_proba(features)[:, 1]
    if hasattr(model, "decision_function"):
        return model.decision_function(features)
    return fallback


def get_metrics_score(model,X_train_df,X_test_df,y_train_pass,y_test_pass,X_val, y_val, flag=True):
    '''
    Calculate Accuracy, Recall, Precision, F1 and ROC AUC of a fitted classifier
    on the train, test and validation sets.

    model: fitted classifier used to predict
    X_train_df, X_test_df, X_val: independent features of the three sets
    y_train_pass, y_test_pass, y_val: dependent variable of the three sets
    flag: if truthy, display the scores as a one-row table and plot one confusion
          matrix per set. The default value is True.

    Returns a list of 15 scores ordered metric by metric, each as
    (train, test, validation): accuracy, recall, precision, F1, AUC.

    ROC AUC is computed from predicted probabilities (or decision scores) when
    the model exposes them, since hard 0/1 labels collapse the ROC curve to a
    single point; it falls back to the predicted labels otherwise.
    '''
    datasets = (
        ('Train', X_train_df, y_train_pass),
        ('Test', X_test_df, y_test_pass),
        ('Val', X_val, y_val),
    )
    label_metrics = (
        ('Accuracy', accuracy_score),
        ('Recall', recall_score),
        ('Precision', precision_score),
        ('F1-Score', f1_score),
    )

    # Hard predictions (rounded) and ranking scores for every set
    preds = {}
    rank_scores = {}
    for split, features, _ in datasets:
        preds[split] = np.round(model.predict(features))
        rank_scores[split] = _positive_class_scores(model, features, preds[split])

    # Scores and their names are built together so they cannot drift apart
    score_list = []
    metric_names = []
    for metric_label, metric_fn in label_metrics:
        for split, _, target in datasets:
            metric_names.append('{}_{}'.format(split, metric_label))
            score_list.append(metric_fn(target, preds[split]))
    for split, _, target in datasets:
        metric_names.append('{}_AUC_Score'.format(split))
        score_list.append(roc_auc_score(target, rank_scores[split]))

    if flag:
        cols = ['Metric', 'Score']
        records = list(zip(metric_names, score_list))
        # display is the notebook (IPython) rich-output helper
        display(pd.DataFrame.from_records(records, columns=cols, index='Metric').T)
        make_confusion_matrix(y_train_pass,preds['Train'],"Confusion Matrix for Train set")
        make_confusion_matrix(y_test_pass,preds['Test'],"Confusion Matrix for Test set")
        make_confusion_matrix(y_val,preds['Val'],"Confusion Matrix for Validation set")

    return score_list # returning the list with train, test and validation scores




In [None]:

# Module-level lists that collect each model's train, test and validation
# scores; add_score_model appends one value per list for every model.
acc_train = []
acc_test = []
acc_val = []

recall_train = []
recall_test = []
recall_val = []

precision_train = []
precision_test = []
precision_val = []

f1_train = []
f1_test = []
f1_val = []

auc_train = []
auc_test = []
auc_val = []


In [None]:

def add_score_model(score):
    '''
    Append one model's scores to the module-level result lists.

    score: sequence of 15 values ordered as accuracy, recall, precision, F1, AUC,
           each given as (train, test, validation).
    '''
    # Destination lists in the same order as the entries of `score`
    destinations = (
        acc_train, acc_test, acc_val,
        recall_train, recall_test, recall_val,
        precision_train, precision_test, precision_val,
        f1_train, f1_test, f1_val,
        auc_train, auc_test, auc_val,
    )
    for position, destination in enumerate(destinations):
        destination.append(score[position])
    

#### Logistic Regression:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,shuffle=True)

In [None]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
# Normalize the dataset

# Fit the scaler on the training split only, then apply the same transform to the
# test and validation features so the model is scored on inputs in the scale it
# was trained on. X_val is rebuilt from the unscaled `valid` frame so repeating
# this step for another model never scales it twice.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(np.asarray(X_test))
X_val = scaler.transform(np.asarray(valid.drop(columns=['Response'], axis=1)))
X_train[0:2]

In [None]:
LR = LogisticRegression(C=0.01, solver='liblinear').fit(X_train, y_train)
LR

In [None]:
y_pred_train = LR.predict(X_train)
y_proba_train = LR.predict_proba(X_train)

y_pred_test = LR.predict(X_test)
y_proba_test = LR.predict_proba(X_test)

y_pred_val = LR.predict(X_val)
y_proba_val = LR.predict_proba(X_val)

In [None]:
LR_score=get_metrics_score(LR,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(LR_score)

#### Naive Bayes:

In [None]:
GNB = GaussianNB().fit(X_train,y_train)

In [None]:
y_pred_train = GNB.predict(X_train)
y_proba_train = GNB.predict_proba(X_train)

y_pred_test = GNB.predict(X_test)
y_proba_test = GNB.predict_proba(X_test)

y_pred_val = GNB.predict(X_val)
y_proba_val = GNB.predict_proba(X_val)

In [None]:
GNB_score=get_metrics_score(GNB,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(GNB_score)

#### KNN :

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,shuffle=True)

In [None]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
# Normalize the dataset

# Fit the scaler on the training split only, then apply the same transform to the
# test and validation features so the model is scored on inputs in the scale it
# was trained on. X_val is rebuilt from the unscaled `valid` frame so repeating
# this step for another model never scales it twice.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(np.asarray(X_test))
X_val = scaler.transform(np.asarray(valid.drop(columns=['Response'], axis=1)))
X_train[0:2]

In [None]:
k = 4
KNN = KNeighborsClassifier(n_neighbors = k).fit(X_train,y_train)

In [None]:
y_pred_train = KNN.predict(X_train)
y_proba_train = KNN.predict_proba(X_train)

y_pred_test = KNN.predict(X_test)
y_proba_test = KNN.predict_proba(X_test)

y_pred_val = KNN.predict(X_val)
y_proba_val = KNN.predict_proba(X_val)

In [None]:
KNN_score=get_metrics_score(KNN,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(KNN_score)

#### SVM:

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,shuffle=True)

In [None]:
X_train = np.asarray(X_train)
y_train = np.asarray(y_train)

In [None]:
# Normalize the dataset

# Fit the scaler on the training split only, then apply the same transform to the
# test and validation features so the model is scored on inputs in the scale it
# was trained on. X_val is rebuilt from the unscaled `valid` frame so repeating
# this step for another model never scales it twice.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(np.asarray(X_test))
X_val = scaler.transform(np.asarray(valid.drop(columns=['Response'], axis=1)))
X_train[0:2]

In [None]:
SVM = svm.SVC(kernel='rbf',probability=True)

In [None]:
SVM.fit(X_train,y_train)

In [None]:
y_pred_train = SVM.predict(X_train)
y_proba_train = SVM.predict_proba(X_train)

y_pred_test = SVM.predict(X_test)
y_proba_test = SVM.predict_proba(X_test)

y_pred_val = SVM.predict(X_val)
y_proba_val = SVM.predict_proba(X_val)

In [None]:
SVM_score=get_metrics_score(SVM,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(SVM_score)

#### Decision Trees:

In [None]:
DTree = DecisionTreeClassifier(criterion="entropy", max_depth = 4)
DTree.fit(X_train,y_train)

In [None]:
y_pred_train = DTree.predict(X_train)
y_proba_train = DTree.predict_proba(X_train)

y_pred_test = DTree.predict(X_test)
y_proba_test = DTree.predict_proba(X_test)

y_pred_val = DTree.predict(X_val)
y_proba_val = DTree.predict_proba(X_val)

In [None]:
DTree_score=get_metrics_score(DTree,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(DTree_score)

#### Random Forest:

In [None]:
RFC = RandomForestClassifier(max_depth=2, random_state=0)
RFC.fit(X_train, y_train)

In [None]:
y_pred_train = RFC.predict(X_train)
y_proba_train = RFC.predict_proba(X_train)

y_pred_test = RFC.predict(X_test)
y_proba_test = RFC.predict_proba(X_test)

y_pred_val = RFC.predict(X_val)
y_proba_val = RFC.predict_proba(X_val)

In [None]:
RFC_score=get_metrics_score(RFC,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(RFC_score)

#### XG Boost:

In [None]:
XGB = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred_train = XGB.predict(X_train)
y_proba_train = XGB.predict_proba(X_train)

y_pred_test = XGB.predict(X_test)
y_proba_test = XGB.predict_proba(X_test)

y_pred_val = XGB.predict(X_val)
y_proba_val = XGB.predict_proba(X_val)

In [None]:
XGB_score=get_metrics_score(XGB,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(XGB_score)

#### Bagging Classifiers:

In [None]:
# Bag 10 SVC estimators. The wrapped estimator is passed positionally because its
# keyword name differs between scikit-learn releases (`base_estimator` was renamed
# to `estimator`); the first positional slot is the same in both.
BagC = BaggingClassifier(SVC(), n_estimators=10, random_state=0).fit(X_train, y_train)

In [None]:
y_pred_train = BagC.predict(X_train)
y_proba_train = BagC.predict_proba(X_train)

y_pred_test = BagC.predict(X_test)
y_proba_test = BagC.predict_proba(X_test)

y_pred_val = BagC.predict(X_val)
y_proba_val = BagC.predict_proba(X_val)

In [None]:
BagC_score=get_metrics_score(BagC,X_train,X_test,y_train,y_test, X_val, y_val)
add_score_model(BagC_score)