In [25]:
import os
import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import *
import seaborn as sns 
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier


warnings.filterwarnings("ignore")
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [22]:
df = pd.read_csv('home_insurance.csv')

In [3]:
df.head()

Unnamed: 0,QUOTE_DATE,COVER_START,CLAIM3YEARS,P1_EMP_STATUS,P1_PT_EMP_STATUS,BUS_USE,CLERICAL,AD_BUILDINGS,RISK_RATED_AREA_B,SUM_INSURED_BUILDINGS,NCD_GRANTED_YEARS_B,AD_CONTENTS,RISK_RATED_AREA_C,SUM_INSURED_CONTENTS,NCD_GRANTED_YEARS_C,CONTENTS_COVER,BUILDINGS_COVER,SPEC_SUM_INSURED,SPEC_ITEM_PREM,UNSPEC_HRP_PREM,P1_DOB,P1_MAR_STATUS,P1_POLICY_REFUSED,P1_SEX,APPR_ALARM,APPR_LOCKS,BEDROOMS,ROOF_CONSTRUCTION,WALL_CONSTRUCTION,FLOODING,LISTED,MAX_DAYS_UNOCC,NEIGH_WATCH,OCC_STATUS,OWNERSHIP_TYPE,PAYING_GUESTS,PROP_TYPE,SAFE_INSTALLED,SEC_DISC_REQ,SUBSIDENCE,YEARBUILT,CAMPAIGN_DESC,PAYMENT_METHOD,PAYMENT_FREQUENCY,LEGAL_ADDON_PRE_REN,LEGAL_ADDON_POST_REN,HOME_EM_ADDON_PRE_REN,HOME_EM_ADDON_POST_REN,GARDEN_ADDON_PRE_REN,GARDEN_ADDON_POST_REN,KEYCARE_ADDON_PRE_REN,KEYCARE_ADDON_POST_REN,HP1_ADDON_PRE_REN,HP1_ADDON_POST_REN,HP2_ADDON_PRE_REN,HP2_ADDON_POST_REN,HP3_ADDON_PRE_REN,HP3_ADDON_POST_REN,MTA_FLAG,MTA_FAP,MTA_APRP,MTA_DATE,LAST_ANN_PREM_GROSS,POL_STATUS,i,Police
0,11/22/2007,22/11/2007,N,R,,N,,Y,19.0,1000000.0,7.0,Y,6.0,50000.0,7.0,Y,Y,7500.0,44.42,12.45,15/06/1939,O,N,M,N,Y,3.0,11.0,15.0,Y,3.0,0.0,N,PH,8.0,0.0,10.0,Y,Y,N,1960.0,,PureDD,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,,,,274.81,Lapsed,1,P000001
1,11/22/2007,01/01/2008,N,E,,Y,N,Y,25.0,1000000.0,6.0,Y,9.0,50000.0,7.0,Y,Y,0.0,0.0,24.6,20/05/1970,M,N,M,N,N,3.0,11.0,15.0,Y,3.0,0.0,N,PH,3.0,0.0,2.0,N,N,N,1960.0,,PureDD,,Y,Y,N,N,N,N,N,N,N,N,N,N,N,N,Y,308.83,-9.27,,308.83,Live,2,P000002
2,11/23/2007,23/11/2007,N,E,,N,,N,,0.0,0.0,Y,12.0,50000.0,7.0,N,Y,0.0,0.0,0.0,10/06/1947,S,N,M,Y,Y,2.0,11.0,15.0,Y,3.0,0.0,Y,PH,8.0,0.0,9.0,N,Y,N,1946.0,,PureDD,,Y,Y,N,N,N,N,N,N,N,N,N,N,N,N,Y,52.65,52.65,03/11/2010,52.65,Live,3,P000003
3,11/23/2007,12/12/2007,N,R,,N,,N,,0.0,0.0,Y,14.0,50000.0,7.0,N,Y,0.0,0.0,0.0,16/12/1925,W,N,F,N,Y,2.0,11.0,15.0,Y,3.0,0.0,N,PH,18.0,0.0,19.0,N,Y,N,1870.0,,NonDD,,N,N,N,N,N,N,N,N,N,N,N,N,N,N,N,,,,54.23,Live,4,P000004
4,11/22/2007,15/12/2007,N,R,,N,,Y,5.0,1000000.0,7.0,Y,10.0,50000.0,7.0,Y,Y,0.0,0.0,19.82,11/02/1936,M,N,M,Y,Y,3.0,11.0,15.0,Y,3.0,0.0,N,PH,8.0,0.0,1.0,N,Y,N,1960.0,,DD-Other,,Y,Y,N,N,Y,Y,N,N,N,N,N,N,N,N,N,,,,244.58,Live,5,P000005


In [4]:
df = df[df["POL_STATUS"].notnull()]
    
#Clean the target variable
df = df[df["POL_STATUS"] != "Unknown"]
df["lapse"] = np.where(df["POL_STATUS"] == "Live", 0, 1)
    
#Create dummy variables for categorical variables
categorical_cols = ["CLAIM3YEARS", "BUS_USE", "AD_BUILDINGS",
                        "APPR_ALARM", "CONTENTS_COVER", "P1_SEX",
                        "BUILDINGS_COVER", "P1_POLICY_REFUSED", 
                        "APPR_LOCKS", "FLOODING",
                        "NEIGH_WATCH", "SAFE_INSTALLED", "SEC_DISC_REQ",
                        "SUBSIDENCE", "LEGAL_ADDON_POST_REN", 
                        "HOME_EM_ADDON_PRE_REN","HOME_EM_ADDON_POST_REN", 
                        "GARDEN_ADDON_PRE_REN", "GARDEN_ADDON_POST_REN", 
                        "KEYCARE_ADDON_PRE_REN", "KEYCARE_ADDON_POST_REN", 
                        "HP1_ADDON_PRE_REN", "HP1_ADDON_POST_REN",
                        "HP2_ADDON_PRE_REN", "HP2_ADDON_POST_REN", 
                        "HP3_ADDON_PRE_REN", "HP3_ADDON_POST_REN", 
                        "MTA_FLAG", "OCC_STATUS", "OWNERSHIP_TYPE",
                        "PROP_TYPE", "PAYMENT_METHOD", "P1_EMP_STATUS",
                        "P1_MAR_STATUS"
                        ]
    
for col in categorical_cols:
    dummies = pd.get_dummies(df[col], drop_first = True, prefix = col)
    df = pd.concat([df, dummies], 1)
    
#Create time features
df["age"] = (datetime.strptime("2013-01-01", "%Y-%m-%d") - pd.to_datetime(df["P1_DOB"])).dt.days // 365
df["property_age"] = 2013 - df["YEARBUILT"]
df["cover_length"] = 2013 - pd.to_datetime(df["COVER_START"]).dt.year


    
#Impute missing value
df["RISK_RATED_AREA_B"] = df["RISK_RATED_AREA_B"].fillna(0)
df["RISK_RATED_AREA_C"] = df["RISK_RATED_AREA_C"].fillna(0)
df["MTA_FAP"] = df["MTA_FAP"].fillna(0)
df["MTA_APRP"] = df["MTA_APRP"].fillna(0)


In [5]:
FEATS = [
         "CLAIM3YEARS_Y", "BUS_USE_Y", "AD_BUILDINGS_Y",
         "CONTENTS_COVER_Y", "P1_SEX_M", "P1_SEX_N", "BUILDINGS_COVER_Y", 
         "P1_POLICY_REFUSED_Y", "APPR_ALARM_Y", "APPR_LOCKS_Y", "FLOODING_Y", 
         "NEIGH_WATCH_Y", "SAFE_INSTALLED_Y", "SEC_DISC_REQ_Y", "SUBSIDENCE_Y", 
         "LEGAL_ADDON_POST_REN_Y", "HOME_EM_ADDON_PRE_REN_Y", 
         "HOME_EM_ADDON_POST_REN_Y", "GARDEN_ADDON_PRE_REN_Y",
         "GARDEN_ADDON_POST_REN_Y", "KEYCARE_ADDON_PRE_REN_Y", 
         "KEYCARE_ADDON_POST_REN_Y", "HP1_ADDON_PRE_REN_Y", "HP1_ADDON_POST_REN_Y", 
         "HP2_ADDON_PRE_REN_Y", "HP2_ADDON_POST_REN_Y", "HP3_ADDON_PRE_REN_Y", 
         "HP3_ADDON_POST_REN_Y", "MTA_FLAG_Y", "OCC_STATUS_LP",
         "OCC_STATUS_PH", "OCC_STATUS_UN", "OCC_STATUS_WD",
         "OWNERSHIP_TYPE_2.0", "OWNERSHIP_TYPE_3.0", "OWNERSHIP_TYPE_6.0", 
         "OWNERSHIP_TYPE_7.0", "OWNERSHIP_TYPE_8.0", "OWNERSHIP_TYPE_11.0", 
         "OWNERSHIP_TYPE_12.0", "OWNERSHIP_TYPE_13.0", "OWNERSHIP_TYPE_14.0", 
         "OWNERSHIP_TYPE_16.0", "OWNERSHIP_TYPE_17.0", 
         "OWNERSHIP_TYPE_18.0", "PROP_TYPE_2.0", "PROP_TYPE_3.0", "PROP_TYPE_4.0", 
         "PROP_TYPE_7.0", "PROP_TYPE_9.0", "PROP_TYPE_10.0", 
         "PROP_TYPE_16.0", "PROP_TYPE_17.0", "PROP_TYPE_18.0", "PROP_TYPE_19.0", 
         "PROP_TYPE_20.0", "PROP_TYPE_21.0", "PROP_TYPE_22.0", "PROP_TYPE_23.0", 
         "PROP_TYPE_24.0", "PROP_TYPE_25.0", "PROP_TYPE_26.0", "PROP_TYPE_27.0", 
         "PROP_TYPE_29.0", "PROP_TYPE_30.0", "PROP_TYPE_31.0", 
         "PROP_TYPE_32.0", "PROP_TYPE_37.0", "PROP_TYPE_39.0", 
         "PROP_TYPE_40.0", "PROP_TYPE_44.0", "PROP_TYPE_45.0", "PROP_TYPE_47.0", 
         "PROP_TYPE_48.0", "PROP_TYPE_51.0", "PROP_TYPE_52.0", "PROP_TYPE_53.0", 
         "PAYMENT_METHOD_NonDD", "PAYMENT_METHOD_PureDD", "P1_EMP_STATUS_C", 
         "P1_EMP_STATUS_E", "P1_EMP_STATUS_F", "P1_EMP_STATUS_H", "P1_EMP_STATUS_I", 
         "P1_EMP_STATUS_N", "P1_EMP_STATUS_R", "P1_EMP_STATUS_S", "P1_EMP_STATUS_U", 
         "P1_EMP_STATUS_V", "P1_MAR_STATUS_B", "P1_MAR_STATUS_C", "P1_MAR_STATUS_D", 
         "P1_MAR_STATUS_M", "P1_MAR_STATUS_N", "P1_MAR_STATUS_O", "P1_MAR_STATUS_P", 
         "P1_MAR_STATUS_S", "P1_MAR_STATUS_W",          "age", "property_age", "cover_length", "RISK_RATED_AREA_B", 
         "RISK_RATED_AREA_C", "MTA_FAP", "MTA_APRP",
         "SUM_INSURED_BUILDINGS", "NCD_GRANTED_YEARS_B", "SUM_INSURED_CONTENTS", 
         "NCD_GRANTED_YEARS_C", "SPEC_SUM_INSURED", "SPEC_ITEM_PREM", 
         "UNSPEC_HRP_PREM", "BEDROOMS", "MAX_DAYS_UNOCC", "LAST_ANN_PREM_GROSS"
        ]

TARGET=['lapse']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df[FEATS], df[TARGET], test_size = .3, random_state = 42)

In [7]:
# Standardise the dataset 

numerical_cols = ["age", "property_age", "cover_length", "RISK_RATED_AREA_B", 
        "RISK_RATED_AREA_C", "MTA_FAP", "MTA_APRP",
        "SUM_INSURED_BUILDINGS", "NCD_GRANTED_YEARS_B", "SUM_INSURED_CONTENTS", 
        "NCD_GRANTED_YEARS_C", "SPEC_SUM_INSURED", "SPEC_ITEM_PREM", 
        "UNSPEC_HRP_PREM", "BEDROOMS", "MAX_DAYS_UNOCC", "LAST_ANN_PREM_GROSS"]


for col in numerical_cols:
    scaler = StandardScaler()
    X_train[col] = scaler.fit_transform(X_train[[col]])
    X_test[col] = scaler.transform(X_test[[col]])

In [11]:
# train Catboost model
model = CatBoostClassifier(logging_level='Silent')
model.fit(X_train, y_train)
fitted_model_params = model.get_all_params()
print('got params')

y_preds = model.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('catboost', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')

got params
catboost: Recall w/all features on test data 0.3553:
              precision    recall  f1-score   support

           0       0.77      0.93      0.84     39589
           1       0.67      0.36      0.46     17113

    accuracy                           0.75     56702
   macro avg       0.72      0.64      0.65     56702
weighted avg       0.74      0.75      0.73     56702

[[36628  2961]
 [11032  6081]]
-------------------------------------------------------


In [13]:
#Random Forest
clf=RandomForestClassifier()
clf.fit(X_train,y_train)
y_preds = clf.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('RandomForest', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


RandomForest: Recall w/all features on test data 0.2983:
              precision    recall  f1-score   support

           0       0.75      0.93      0.83     39589
           1       0.65      0.30      0.41     17113

    accuracy                           0.74     56702
   macro avg       0.70      0.61      0.62     56702
weighted avg       0.72      0.74      0.71     56702

[[36868  2721]
 [12009  5104]]
-------------------------------------------------------


In [14]:
#Naive Bayes
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_preds = gnb.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('GaussianNB', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


GaussianNB: Recall w/all features on test data 0.8054:
              precision    recall  f1-score   support

           0       0.82      0.40      0.54     39589
           1       0.37      0.81      0.50     17113

    accuracy                           0.52     56702
   macro avg       0.60      0.60      0.52     56702
weighted avg       0.69      0.52      0.53     56702

[[15699 23890]
 [ 3331 13782]]
-------------------------------------------------------


In [18]:
#Adaboost
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
y_preds = ada.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('AdaBoost', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


AdaBoost: Recall w/all features on test data 0.2212:
              precision    recall  f1-score   support

           0       0.74      0.94      0.83     39589
           1       0.61      0.22      0.32     17113

    accuracy                           0.72     56702
   macro avg       0.67      0.58      0.58     56702
weighted avg       0.70      0.72      0.67     56702

[[37184  2405]
 [13327  3786]]
-------------------------------------------------------


In [20]:
#GradientBoostingClassifier
grad = GradientBoostingClassifier()
grad.fit(X_train, y_train)
y_preds = grad.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('GradientBoosting', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


GradientBoosting: Recall w/all features on test data 0.235:
              precision    recall  f1-score   support

           0       0.74      0.95      0.83     39589
           1       0.65      0.23      0.35     17113

    accuracy                           0.73     56702
   macro avg       0.70      0.59      0.59     56702
weighted avg       0.71      0.73      0.68     56702

[[37461  2128]
 [13092  4021]]
-------------------------------------------------------


In [24]:
#XGBoost
xgbc = XGBClassifier()
xgbc.fit(X_train, y_train)
y_preds = xgbc.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('xgboost', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')



xgboost: Recall w/all features on test data 0.3565:
              precision    recall  f1-score   support

           0       0.77      0.92      0.84     39589
           1       0.66      0.36      0.46     17113

    accuracy                           0.75     56702
   macro avg       0.71      0.64      0.65     56702
weighted avg       0.73      0.75      0.72     56702

[[36415  3174]
 [11013  6100]]
-------------------------------------------------------


In [26]:
#LightGBM
light = LGBMClassifier()
light.fit(X_train, y_train)
y_preds = light.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('lightgbm', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


lightgbm: Recall w/all features on test data 0.3309:
              precision    recall  f1-score   support

           0       0.76      0.93      0.84     39589
           1       0.67      0.33      0.44     17113

    accuracy                           0.75     56702
   macro avg       0.72      0.63      0.64     56702
weighted avg       0.74      0.75      0.72     56702

[[36827  2762]
 [11450  5663]]
-------------------------------------------------------


In [27]:
#Voting Ensemble for Classification
estimators = []
estimators.append(('catboost', model))
estimators.append(('randomforest', clf))
estimators.append(('lightgbm', light))
estimators.append(('adaboost', ada))
estimators.append(('xgboost', xgbc))
estimators.append(('gradientboosting', grad))
ensemble = VotingClassifier(estimators)
ensemble.fit(X_train, y_train)
y_preds = ensemble.predict(X_test)
test_recall = recall_score(y_test, y_preds, average='binary')
test_precision = precision_score(y_test, y_preds)
test_class = classification_report(y_test, y_preds)
cnf_matrix = confusion_matrix(y_test, y_preds)
print('{}: Recall w/all features on test data {}:'.format('VotingClassifier', test_recall.round(4)))
print(test_class)
print(cnf_matrix)
print('-------------------------------------------------------')


VotingClassifier: Recall w/all features on test data 0.2819:
              precision    recall  f1-score   support

           0       0.75      0.95      0.84     39589
           1       0.69      0.28      0.40     17113

    accuracy                           0.75     56702
   macro avg       0.72      0.61      0.62     56702
weighted avg       0.74      0.75      0.71     56702

[[37464  2125]
 [12289  4824]]
-------------------------------------------------------
