In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.model_selection import ShuffleSplit, cross_val_score

from scipy.stats import boxcox
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
from bayes_opt import BayesianOptimization

# Load Data

In [None]:
# Read the train and test splits; both CSVs sit in the same input directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/inclass-health-insurance-2/massp-health-insurance-prediction/train.csv',
        '../input/inclass-health-insurance-2/massp-health-insurance-prediction/test.csv',
    )
)

In [None]:
# Preview the first five rows of the training frame.
train.head(5)

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Preview the first five rows of the test frame.
test.head(5)

In [None]:
# (rows, columns) of the test frame.
test.shape

In [None]:
# Column labels present in the training frame.
train.columns

### TARGET

In [None]:
# Number of training rows per distinct value of the target column.
train['Response'].value_counts()

# EDA

### Checking for Missing Values

In [None]:
# Count of missing entries in each training column.
train.isna().sum()

In [None]:
# Count of missing entries in each test column.
test.isna().sum()

### Multivariate Analysis

In [None]:
# For every column of the test frame, report how many distinct values it has
# in each split, so mismatched cardinalities are easy to spot.
for col in test.columns:
    print(
        f"{col}",
        f"Train:{train[col].nunique()}\nTest:{test[col].nunique()}",
        "---------------------",
        sep="\n",
    )

In [None]:
# Summary of the training frame: per-column dtype and non-null count.
train.info()

### Concatenating train and test

In [None]:
# Stack train on top of test so feature engineering is applied to both at once.
# ignore_index=True gives every row a unique 0..N-1 label. Without it each half
# keeps the labels of its source frame, the labels repeat, and label-based
# operations on `data` (.drop, .loc) would touch rows in both halves.
# Row order is unchanged, so slicing by len(train) still separates the halves.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

In [None]:
# (rows, columns) of the combined train + test frame.
data.shape

# Feature Engineering

In [None]:
# Replace each categorical column with its indicator columns, named
# "<column>_<value>" and appended after the existing columns.
for feature in ["Gender", "Vehicle_Age", "Vehicle_Damage"]:
    indicators = pd.get_dummies(data[feature], prefix=feature)
    data = pd.concat([data.drop(columns=feature), indicators], axis=1)

In [None]:
# Swap spaces for underscores in every column label; later cells address the
# indicator columns by their underscore names (e.g. "Vehicle_Age_<_1_Year").
data.columns = data.columns.map(lambda label: label.replace(' ', '_'))

In [None]:
# Preview the combined frame after one-hot encoding and renaming.
data.head()

In [None]:
# (rows, columns) of the combined frame after one-hot encoding.
data.shape

Create new features

In [None]:
# Interaction flags built by multiplying indicator columns together.
# 1 where Previously_Insured == 0, else 0.
no_prior_cover = data["Previously_Insured"].eq(0).astype("int64")

# Already insured and no reported vehicle damage.
data["Insured_With_No_Damage"] = data["Previously_Insured"].mul(data["Vehicle_Damage_No"])

# Not previously insured and vehicle damage reported.
data["Not_Insured_With_Damage"] = no_prior_cover.mul(data["Vehicle_Damage_Yes"])

# The flag above, restricted to vehicles in the "< 1 Year" age bucket.
data["New_Damage_No_Insurance"] = data["Vehicle_Age_<_1_Year"].mul(data["Not_Insured_With_Damage"])

In [None]:
# The three most frequent region codes and sales channels across train + test.
# value_counts() is sorted by descending frequency, so the first three win.
top3regions = data["Region_Code"].value_counts().head(3).index.tolist()
top3channels = data["Policy_Sales_Channel"].value_counts().head(3).index.tolist()

# 1 when the row's value is one of those three, else 0.
data["Top_3_Region"] = data["Region_Code"].isin(top3regions).astype("int64")
data["Top_3_Sales_Channel"] = data["Policy_Sales_Channel"].isin(top3channels).astype("int64")



In [None]:
# Premium divided by Vintage, row by row.
# NOTE(review): the "per day" name assumes Vintage is measured in days, and a
# Vintage of 0 would give inf here and break the later scaling -- confirm both.
data["Amount_Spent_Per_Day"] = data["Annual_Premium"].div(data["Vintage"])

In [None]:
# Preview the combined frame with the engineered features added.
data.head()

In [None]:
# (rows, columns) of the combined frame after adding the engineered features.
data.shape

Outlier

In [None]:
# Skewness of Annual_Premium over the combined frame.
data["Annual_Premium"].skew()

In [None]:
# IQR fences for Annual_Premium: values further than 1.5 * IQR below the first
# quartile or above the third quartile are treated as outliers.
premium = data["Annual_Premium"]
q25 = premium.quantile(q=.25)
q75 = premium.quantile(q=.75)
IQR = q75 - q25

whisker = 1.5 * IQR
lowerBound = q25 - whisker
upperBound = q75 + whisker

print(lowerBound, upperBound, sep="\n")

In [None]:
# Cap Annual_Premium at the IQR fences instead of deleting the offending rows.
# Dropping rows is unsafe because `data` is train stacked on test:
#   * unless the concat reset the index, labels repeat across the two halves,
#     so drop(outliers.index) also removes non-outlier rows sharing a label;
#   * any removed row breaks the positional data[:len(train)] split, the
#     alignment with `y`, and the one-prediction-per-test-id submission.
is_outlier = (data["Annual_Premium"] < lowerBound) | (data["Annual_Premium"] > upperBound)
outliers = data.loc[is_outlier]
data["Annual_Premium"] = data["Annual_Premium"].clip(lower=lowerBound, upper=upperBound)
print("Capped", outliers.shape[0], "outliers.")

In [None]:
# Cap Annual_Premium to the IQR fences. `lowerBound` / `upperBound` come from
# the IQR cell; they are the only fence variables this notebook defines.
# clip() leaves in-range values untouched, so re-running the cell is harmless.
data['Annual_Premium'] = data['Annual_Premium'].clip(lower=lowerBound, upper=upperBound)

In [None]:
# (rows, columns) of the combined frame after the outlier step.
data.shape

Normalization

In [None]:
# Box-Cox power transform of the two skewed numeric columns. With no lambda
# given, scipy estimates one per column and returns it with the transformed
# values; the input must be strictly positive. Each lambda is fitted on the
# combined train + test rows.
age_transformed, age_lambda = boxcox(data["Age"])
data["Age"] = age_transformed

premium_transformed, annualprem_lambda = boxcox(data["Annual_Premium"])
data["Annual_Premium"] = premium_transformed

In [None]:
# (rows, columns) of the combined frame after the Box-Cox step.
data.shape

Scaling

In [None]:
# Standardise the continuous columns to zero mean and unit variance.
# The scaler is fitted on the combined train + test rows.
scaled_columns = ["Age", "Annual_Premium", "Vintage", "Amount_Spent_Per_Day"]
scale = StandardScaler()
data[scaled_columns] = scale.fit_transform(data[scaled_columns])

In [None]:
# (rows, columns) of the combined frame after scaling.
data.shape

In [None]:
# Preview the combined frame after transformation and scaling.
data.head()

In [None]:
# Column labels of the combined frame before the final column drop.
data.columns

In [None]:
# Remove the row identifier and the target from the feature matrix, plus two
# further columns.
# NOTE(review): Gender_Female is presumably dropped as the complement of
# Gender_Male; the reason for dropping Driving_License is not stated -- confirm.
data = data.drop(columns=['id', 'Response', 'Driving_License', 'Gender_Female'])

In [None]:
# (rows, columns) of the feature matrix after dropping unused columns.
data.shape

# Model

Separate train & test

In [None]:
# Target taken from the original training frame. It is paired with the feature
# rows by position further down, so the first len(train) rows of `data` must
# still correspond one-to-one, in order, to the rows of `train`.
y = train['Response']

In [None]:
# (rows, columns) of the feature matrix just before it is split back apart.
data.shape

In [None]:
# Split the combined frame back into its two halves by position. This is only
# valid while `data` still holds exactly one row per original train/test row in
# the original order, so fail loudly if a preprocessing step removed rows
# rather than silently pairing features with the wrong labels.
expected_rows = len(train) + len(test)
if len(data) != expected_rows:
    raise ValueError(
        f"data has {len(data)} rows but train + test have {expected_rows}; "
        "the positional train/test split would be misaligned."
    )

train_set = data.iloc[:len(train)]
test_set = data.iloc[len(train):]

In [None]:
# (rows, columns) of the training feature matrix.
train_set.shape

In [None]:
# (rows, columns) of the test feature matrix.
test_set.shape

In [None]:
# smote = SMOTE()
# X_os, y_os = smote.fit_resample(train_set, y)

In [None]:
# print('Original dataset shape {}'.format(Counter(y)))
# print('Resampled dataset shape {}'.format(Counter(y_os))) 

In [None]:
# Hold out 30% of the labelled rows for validation; the fixed seed makes the
# shuffle repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    train_set,
    y,
    test_size=0.3,
    random_state=101,
)

### LGBM

In [None]:
# LightGBM binary classifier with default hyper-parameters.
# NOTE(review): the next cell rebinds `model` before this instance is fitted.
model = LGBMClassifier(
    boosting_type='gbdt',
    objective='binary',
    random_state=101,
)

In [None]:
# LightGBM binary classifier with hand-set hyper-parameters, fitted on the
# training split.
model = LGBMClassifier(
    objective='binary',
    n_estimators=600,
    learning_rate=0.03,
    colsample_bytree=0.5,
    reg_alpha=0.1,
    reg_lambda=0.8,
    random_state=101,
)

model.fit(X_train, y_train)

In [None]:
# Mean ROC AUC over five random 70/30 re-splits.
# NOTE(review): this runs on X_val / y_val only, and cross_val_score fits fresh
# clones of `model` on each split, so it does not score the estimator that was
# fitted on X_train above -- confirm this is the intended evaluation.
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
scores = cross_val_score(
    estimator=model,
    X=X_val,
    y=y_val,
    scoring='roc_auc',
    cv=cv,
)
scores.mean()

In [None]:
# Validation scores: second column of predict_proba from the current `model`.
val_pred= model.predict_proba(X_val)[:,1]

In [None]:
# ROC AUC of those scores against the held-out validation labels.
print(roc_auc_score(y_val,val_pred))

### Random Forest

In [None]:
# Fit a random forest on the training split for comparison. This rebinds
# `model`, so every later use of `model` (including the test-set prediction)
# refers to this forest rather than the LightGBM classifier.
# random_state is pinned, as it is for the train/validation split and the
# LightGBM models, so the forest and its validation AUC are reproducible.
model = RandomForestClassifier(
    max_depth=50,
    max_leaf_nodes=500,
    n_estimators=200,
    random_state=101,
)
model.fit(X_train, y_train)

In [None]:
# Validation scores: second column of predict_proba from the current `model`.
pred_val = model.predict_proba(X_val)[:, 1]

In [None]:
# ROC AUC of those scores against the held-out validation labels.
print(roc_auc_score(y_val, pred_val))

Hyperparameter Tuning

In [None]:
# def bayes_parameter_opt_lgb(X, y, init_round=15, opt_round=25, n_folds=5, random_seed=6, n_estimators=10000, learning_rate=0.05, output_process=False):

#     train_data = lgb.Dataset(data=X, label=y)

#     def lgb_eval(num_leaves, feature_fraction, bagging_fraction, max_depth, lambda_l1, lambda_l2, min_split_gain, min_child_weight):
#         params = {'application':'binary','num_iterations': n_estimators, 'learning_rate':learning_rate, 'early_stopping_round':100, 'metric':'auc'}
#         params["num_leaves"] = int(round(num_leaves))
#         params['feature_fraction'] = max(min(feature_fraction, 1), 0)
#         params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
#         params['max_depth'] = int(round(max_depth))
#         params['lambda_l1'] = max(lambda_l1, 0)
#         params['lambda_l2'] = max(lambda_l2, 0)
#         params['min_split_gain'] = min_split_gain
#         params['min_child_weight'] = min_child_weight
#         cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed, stratified=True, verbose_eval =200, metrics=['auc'])
#         return max(cv_result['auc-mean'])

#     lgbBO = BayesianOptimization(lgb_eval, {'num_leaves': (24, 45),
#                                             'feature_fraction': (0.1, 0.9),
#                                             'bagging_fraction': (0.8, 1),
#                                             'max_depth': (5, 8.99),
#                                             'lambda_l1': (0, 5),
#                                             'lambda_l2': (0, 3),
#                                             'min_split_gain': (0.001, 0.1),
#                                             'min_child_weight': (5, 50)}, random_state=0)

#     lgbBO.maximize(init_points=init_round, n_iter=opt_round)
    
#     if output_process==True: lgbBO.points_to_csv("bayes_opt_result.csv")
    
#     return lgbBO.res['max']['max_params']

In [None]:
# opt_params = bayes_parameter_opt_lgb(X_os, y_os, init_round=5, opt_round=10, n_folds=3, random_seed=6, n_estimators=100, learning_rate=0.05)

In [None]:
# import re
# train_os = train_os.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
# lgb_train = lgb.Dataset(data = X_train, label = y_train)
# lgb_eval = lgb.Dataset(data = X_val, label = y_val)

In [None]:
# params = {'num_leaves': 45, 
#           'feature_fraction': 0.9, 
#           'bagging_fraction': 0.8, 
#           'max_depth': 9, 
#           'lambda_l1': 0.7297, 
#           'lambda_l2': 3, 
#           'min_split_gain': 0.001, 
#           'min_child_weight': 40.97,
#           'metric': 'auc',
#           'num_iteration': 500}

In [None]:
# model = lgb.train(params, lgb_train, valid_sets=lgb_eval, early_stopping_rounds=100, verbose_eval=10)

In [None]:
# pred= model.predict(test_set)

# Result

In [None]:
# Score the test rows with whichever estimator `model` currently refers to.
# NOTE(review): in a top-to-bottom run the last assignment to `model` is the
# RandomForestClassifier, not the LightGBM model -- confirm that is intended.
pred= model.predict_proba(test_set)[:,1]

In [None]:
# Submission file: one row per test id with its predicted score.
# `pred` must contain exactly one value per row of `test`.
sub = pd.DataFrame({'id': test['id'], 'Response': pred})
sub.to_csv('submission.csv', index=False)