## Health Insurance Cross Sell Prediction

### Description: 
Building a model to predict whether a customer would be interested in Vehicle Insurance is extremely helpful for the company because it can then accordingly plan its communication strategy to reach out to those customers and optimise its business model and revenue.

reference : https://www.kaggle.com/datasets/anmolkumar/health-insurance-cross-sell-prediction?select=sample_submission.csv

| Variable   |      Definition      |
|----------|:-------------|
| id |  Unique ID for the customer |
| Gender |    Gender of the customer   |
| Age | Age of the customer |
| Driving_License |   0 : Customer does not have DL, 1 : Customer already has DL  |
| Region_Code | Unique code for the region of the customer |
| Previously_Insured |   1 : Customer already has Vehicle Insurance, 0 : Customer doesn't have Vehicle Insurance |
| Vehicle_Age | Age of the Vehicle |
| Vehicle_Damage |  1 : Customer got his/her vehicle damaged in the past. 0 : Customer didn't get his/her vehicle damaged in the past.  |
| Annual_Premium | The amount customer needs to pay as premium in the year |
| Policy_Sales_Channel |    Anonymized Code for the channel of outreaching to the customer ie. Different Agents, Over Mail, Over Phone, In Person, etc.   |
| Vintage |Number of Days, Customer has been associated with the company |
| Response | 1 : Customer is interested, 0 : Customer is not interested |


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
train_data = pd.read_csv(filepath_or_buffer='Dataset/train.csv')
train_data.head(n=5)

#### Feature Engineering

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2

In [None]:
### Encoding

## Label Encoder
# Keep the fitted Gender encoder so the integer code assigned to 'Female'
# can be looked up later instead of being assumed.
gender_encoder = LabelEncoder()
train_data['Gender'] = gender_encoder.fit_transform(train_data['Gender'])

train_data['Vehicle_Damage'] = LabelEncoder().fit_transform(train_data['Vehicle_Damage'])

# Ordinal encoding of the vehicle age bucket (older vehicle -> larger code).
# The mapping is spelled out explicitly: deriving it from the order in which
# categories first appear in the file would silently change the encoding if
# the rows were reordered or the cell were re-run on already-encoded data.
lbe_dic_Veh = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
unexpected_ages = set(train_data['Vehicle_Age'].unique()) - set(lbe_dic_Veh)
if unexpected_ages:
    raise ValueError(
        f'Unexpected Vehicle_Age categories: {sorted(map(str, unexpected_ages))}'
    )
train_data['Vehicle_Age'] = train_data['Vehicle_Age'].map(lbe_dic_Veh)

## Bin counting
# Add Region's Features
# The mean of the encoded Gender column is the female share only if 'Female'
# happens to be encoded as 1, so the share is computed from an explicit
# "is female" flag using the code the fitted encoder actually assigned.
female_code = gender_encoder.transform(['Female'])[0]
region_code_bcTable = (
    train_data.assign(_is_female=train_data['Gender'] == female_code)
    .groupby('Region_Code')
    .agg(
        Region_Damage_mean=('Vehicle_Damage', 'mean'),
        Region_Age_median=('Age', 'median'),
        Female_Ratio=('_is_female', 'mean'),
    )
)  # Region's features, indexed by Region_Code
train_data = train_data.join(region_code_bcTable, on='Region_Code')

In [None]:
## Choose columns for modeling
# Column order matters: later cells address the target and the features by
# position, so 'Response' must stay first and 'id' second.
leading_columns = ['Response', 'id']
category_columns = ['Gender', 'Driving_License', 'Vehicle_Damage', 'Previously_Insured']
numeric_columns = [
    'Age',
    'Region_Damage_mean',
    'Region_Age_median',
    'Female_Ratio',
    'Vehicle_Age',
    'Annual_Premium',
    'Vintage',
]
train_data = train_data[leading_columns + category_columns + numeric_columns]
train_data.info()

In [None]:
# Oversample the positive class (Response == 1) by re-appending its rows.
# balance_number is the negative-to-positive row ratio; half of it (rounded)
# is used as the number of oversampling passes.
balance_number = len(train_data.query('Response == 0')) / len(train_data.query('Response == 1'))
for _ in range(round(balance_number * 0.5)):
    # DataFrame.append was removed in pandas 2.0; pd.concat produces the same
    # result (rows added at the end, original index labels kept).
    # NOTE(review): the positives are re-queried from the already-grown frame,
    # so their count doubles on every pass rather than growing linearly --
    # confirm this is the intended amount of oversampling.
    train_data = pd.concat([train_data, train_data.query('Response == 1')])
# NOTE(review): the duplicated rows are created before the train/validation
# split, so copies of the same customer can end up on both sides of it.
train_data.head()

In [None]:
### Split training & validation data
## Avoid training data signal appearing in validation data
dataset = train_data.copy()

# Column 0 is used as the target, column 1 is left out, and every column
# from position 2 onwards is used as a feature.
feature_frame = dataset.iloc[:, 2:]
target_series = dataset.iloc[:, 0]

# 70/30 split with a fixed seed so the partition is reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    feature_frame,
    target_series,
    test_size=0.3,
    random_state=4,
)


In [None]:
# Selecting Features
# Keep the 7 features with the highest chi-squared score; the selector is
# fitted on the training split only and then applied to both splits.
selector = SelectKBest(score_func=chi2, k=7)
selector.fit(x_train, y_train)
x_train, x_val = selector.transform(x_train), selector.transform(x_val)
chosen_features = selector.get_feature_names_out()
print('Feature select', list(chosen_features))

# Normalization
# The scaler statistics also come from the training split only.
sc = StandardScaler()
sc.fit(x_train)
x_train, x_val = sc.transform(x_train), sc.transform(x_val)

#### Training Infra

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve

In [None]:
## Dataset wrapper that feeds the preprocessed features/labels to a DataLoader
class TDataset(Dataset):
    """In-memory dataset pairing float32 feature rows with float32 labels.

    Args:
        x: array-like of feature rows.
        y: array-like of labels, one per feature row.
    """

    def __init__(self, x, y):
        # Convert the array-likes to float32 NumPy arrays, then to tensors.
        feature_array = np.array(x).astype(np.float32)
        label_array = np.array(y).astype(np.float32)
        self.x = torch.from_numpy(feature_array)
        self.y = torch.from_numpy(label_array)

    def __len__(self):
        """Return the number of feature rows."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the (features, label) pair stored at ``index``."""
        sample = self.x[index]
        label = self.y[index]
        return sample, label

In [None]:
## Model definition
class LogisticRegression_model(nn.Module):
    """Binary classifier: Linear -> Tanh -> Linear -> Sigmoid.

    Args:
        n_features: number of input features per sample.
        hyper_param: mapping that must contain ``'neurals_1'``, the width of
            the hidden layer.
    """

    def __init__(self, n_features, hyper_param):
        super().__init__()

        # Define layers
        hidden_units = hyper_param['neurals_1']
        self.neurals_1 = hidden_units
        self.linear_0 = nn.Linear(n_features, hidden_units)
        self.act0 = nn.Tanh()
        self.linear_1 = nn.Linear(hidden_units, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return one sigmoid output per input row."""
        hidden = self.linear_0(x)
        hidden = self.act0(hidden)
        logit = self.linear_1(hidden)
        return self.sigmoid(logit)

#### Hyperparameter optimization

In [None]:
import optuna

In [None]:
def objective(trial, dataset=dataset):
    """Optuna objective: train one candidate network and score it.

    Samples a hyperparameter set from ``trial``, trains a
    ``LogisticRegression_model`` on the module-level ``x_train``/``y_train``
    and returns the ROC AUC obtained on ``x_val``/``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        dataset: unused; kept so the signature stays unchanged.

    Returns:
        The validation ROC AUC.
    """
    # The input width is needed to size the first layer of the network.
    n_samples, n_features = x_train.shape
    print(f'Samples number: {n_samples}, Features number :{n_features}')

    # Model hyperparameters
    hyper_param = {
        # `step` is passed by keyword: it is keyword-only in current Optuna.
        'batch_size': trial.suggest_int('batch_size', 128, 512, step=64),
        'epochs': trial.suggest_int('epochs', 5, 15, step=1),
        'lr': trial.suggest_float('lr', 0.05, 0.3),
        # Clamp the lower bound so the range stays valid when fewer than
        # 4 features are available.
        'neurals_1': trial.suggest_int('neurals_1', min(4, n_features), n_features),
    }

    train_dataset = TDataset(x=x_train, y=y_train)
    val_dataset = TDataset(x=x_val, y=y_val)
    train_dataloader = DataLoader(
        dataset=train_dataset,
        batch_size=hyper_param['batch_size'],
        shuffle=False,
    )

    model = LogisticRegression_model(n_features, hyper_param)
    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=hyper_param['lr'])

    for epoch in range(hyper_param['epochs']):
        for steps, (train_x, train_y) in enumerate(train_dataloader, start=1):
            y_hat = model(train_x)
            # Labels are reshaped to match the model output before the loss.
            loss = criterion(y_hat, train_y.reshape_as(y_hat))

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            print(f'steps: {steps}', end='\r')

        if (epoch + 1) % 5 == 0:
            # Reports the loss of the last mini-batch of the epoch.
            print(f'epoch {epoch + 1}: loss = {loss:.8f}')

    with torch.no_grad():
        # Validation part: score the whole validation set in one pass.
        val_hat = model(val_dataset.x)
        # Flatten the predictions so both arguments are 1-D arrays.
        auc = roc_auc_score(val_dataset.y.numpy(), val_hat.numpy().ravel())

    return auc  # Objective value maximised by the study

In [None]:
# Run the hyperparameter search: maximise the objective with a TPE sampler.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(),
)
study.optimize(objective, n_trials=3)

# Persist the per-trial records to a spreadsheet.
df = pd.DataFrame(study.trials_dataframe())
df.to_excel('Hyperparameter_trial.xlsx')

# Showing optimization results
finished_trial_count = len(study.trials)
print('Number of finished trials:', finished_trial_count)
print('Best trial parameters:', study.best_trial.params)
print('Best score:', study.best_value)

In [None]:
## Visualize the hyperparameter optimization process
from optuna.visualization import plot_optimization_history

# Config passed to every fig.show() call in this notebook
# (presumably renders the figures without interactivity -- confirm against Plotly docs).
plotly_config = dict(staticPlot=True)

fig = plot_optimization_history(study=study)
fig.show(config=plotly_config)

In [None]:
## Visualize the relative importance of each hyperparameter
from optuna.visualization import plot_param_importances

# Reuses the plotly_config dictionary defined earlier in the notebook.
fig = plot_param_importances(study=study)
fig.show(config=plotly_config)

#### Training

In [None]:
# Retrain a fresh model with the best hyperparameters found by the study.
best_params = study.best_trial.params

# The input width is needed to size the first layer of the network.
n_samples, n_features = x_train.shape
print(f'Samples number: {n_samples}, Features number :{n_features}')

model = LogisticRegression_model(n_features=n_features, hyper_param=best_params)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=best_params['lr'])

train_dataset = TDataset(x=x_train, y=y_train)
val_dataset = TDataset(x=x_val, y=y_val)
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=best_params['batch_size'],
    shuffle=False,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=best_params['batch_size'],
    shuffle=False,
)

for epoch in range(best_params['epochs']):
    for train_x, train_y in train_dataloader:
        y_hat = model(train_x)
        # Labels are reshaped to match the model output before the loss.
        loss = criterion(y_hat, train_y.reshape_as(y_hat))

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # Report every second epoch; the value is the last mini-batch's loss.
    if epoch % 2 == 1:
        print(f'epoch {epoch + 1}: loss = {loss:.8f}')

#### Testing

In [None]:
# Evaluate the trained model on the validation split

with torch.no_grad():
    # One output column per sample (the model's last layer has a single unit).
    y_predicted = model(val_dataset.x)
    y_predicted_cls = y_predicted.round()  # 0.5 threshold -> hard class labels

    # Flatten everything to 1-D NumPy arrays so labels and predictions have
    # matching shapes when handed to scikit-learn.
    y_true = val_dataset.y.numpy()
    y_prob = y_predicted.numpy().ravel()
    y_cls = y_predicted_cls.numpy().ravel()

    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc = accuracy_score(y_true, y_cls)

    # BCELoss expects (predicted probability, target). It is computed on the
    # raw probabilities, not the rounded classes, so it is comparable with
    # the training loss.
    test_loss = criterion(y_predicted, val_dataset.y.reshape_as(y_predicted))

    auc = roc_auc_score(y_true, y_prob)
    print(f'accuracy = {acc: .4f}, auc = {auc: .4f}')
    print(f'test loss = {test_loss.item(): .4f}')

    fpr, tpr, thresholds = roc_curve(y_true, y_prob)

    plt.plot(fpr, tpr, label=f"AUC={auc: .4f}")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc=4)
    plt.show()