<a href="https://colab.research.google.com/github/Tsuke007/dseg1_w2/blob/main/Insurance_data_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Download dataset from: https://www.kaggle.com/anmolkumar/health-insurance-cross-sell-prediction

In [None]:
!gdown --id 1E0eztrotQ9CPH5glLgONIwxV-M4WqXJ_

In [None]:
# -o overwrites existing files without prompting, so re-running this cell
# (or Run All in a session where the archive was already extracted) does not
# stall on unzip's interactive "replace?" question.
!unzip -o insurance_cross_sell.zip

# Pipeline
1. Collect Dataset
2. Clean Data
3. Extract Features
4. Split Training and Test Sets
5. Train
6. Evaluate
7. Create Pipeline

# 1. Collect Dataset

In [None]:
import pandas as pd
import numpy as np

In [None]:
insured_df = pd.read_csv('train.csv')

In [None]:
insured_df.head()

In [None]:
def collect_data(path='train.csv'):
  """Load the raw insurance dataset from a CSV file.

  Args:
    path: Location of the CSV file to read. Defaults to 'train.csv' in the
      current working directory, which is what the pipeline used before this
      parameter existed.

  Returns:
    A pandas DataFrame holding the file's contents, unmodified.
  """
  return pd.read_csv(path)

# 2. Clean Data

In [None]:
insured_df.head()

In [None]:
insured_df.info()

In [None]:
insured_df['Response']

In [None]:
insured_df.isna().sum()

# 3. Extract Features

In [None]:
insured_df.head()

In [None]:
def extract_feature(df, is_training=True):
  """Turn the raw insurance frame into a numeric feature matrix plus labels.

  Args:
    df: Raw DataFrame. Must contain the categorical columns 'Gender',
      'Vehicle_Age' and 'Vehicle_Damage'. The 'id' column is optional.
      'Response' is required only when ``is_training`` is True.
    is_training: When True, the 'Response' label column must be present.
      When False (e.g. unlabeled inference data) it may be absent.

  Returns:
    A ``(features, labels)`` tuple. ``features`` is a new DataFrame (``df``
    is not modified) without 'id' and 'Response', with the categorical
    columns replaced by one-hot indicator columns and every column passed
    through ``pd.to_numeric(errors='coerce')``. ``labels`` is a copy of the
    'Response' column, or an empty list when that column is absent.

  Raises:
    KeyError: If ``is_training`` is True and 'Response' is missing, or if
      one of the categorical columns is missing.
  """
  col_names = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

  has_label = 'Response' in df.columns
  if is_training and not has_label:
    raise KeyError("'Response' column is required when is_training=True")

  label_df = df['Response'].copy() if has_label else []

  # 'id' is a row identifier and 'Response' is the target: neither may end up
  # in the feature matrix, regardless of is_training. Keeping them would leak
  # the label and produce columns the trained model has never seen.
  insured_new_df = df.drop(columns=['Response', 'id'], errors='ignore')

  # One-hot encode the categorical columns; the source columns are replaced
  # by their indicator columns, which are appended after the remaining ones.
  # NOTE: the indicator columns depend on the categories present in `df`.
  insured_new_df = pd.get_dummies(insured_new_df, columns=col_names)

  # Force every column to a numeric dtype; unparseable values become NaN.
  insured_new_df = insured_new_df.apply(pd.to_numeric, errors='coerce')

  return insured_new_df, label_df

In [None]:
insured_new_df,label_df = extract_feature(insured_df)

In [None]:
insured_new_df

# 4. Train & Test Split

In [None]:
len(insured_new_df)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
train_insured, test_insured, train_label, test_label = train_test_split(
    insured_new_df,
    label_df,
    test_size=0.2,
    random_state=42,
)

In [None]:
len(train_insured)

In [None]:
len(test_insured)

In [None]:
def split_train_test(insured_new_df, label, test_size=0.2, random_state=42):
  """Split features and labels into training and held-out test subsets.

  Args:
    insured_new_df: Feature matrix, one row per sample.
    label: Labels aligned row-for-row with ``insured_new_df``.
    test_size: Portion of the data reserved for the test subset, forwarded
      to ``train_test_split``. Defaults to 0.2.
    random_state: Seed forwarded to ``train_test_split`` so the split is
      reproducible. Defaults to 42.

  Returns:
    A tuple ``(train_features, test_features, train_labels, test_labels)``.
  """
  train_features, test_features, train_labels, test_labels = train_test_split(
      insured_new_df, label, test_size=test_size, random_state=random_state)
  return train_features, test_features, train_labels, test_labels

In [None]:
train_insured, test_insured, train_label, test_label = \
        split_train_test(insured_new_df, label_df)

# 5. Train (Classification)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
def train_model(insured_new_df, label):
  """Fit a gradient-boosting classifier on the given training data.

  Args:
    insured_new_df: Feature matrix used for fitting.
    label: Target values aligned with ``insured_new_df``.

  Returns:
    The fitted ``GradientBoostingClassifier``.
  """
  # Fixed seed so repeated training runs build the same ensemble.
  classifier = GradientBoostingClassifier(random_state=2020)
  classifier.fit(insured_new_df, label)
  return classifier

In [25]:
model = train_model(train_insured, train_label)

In [26]:
model

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=2020, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

# 6. Evaluation (Classification)

In [27]:
def eval_acc(prediction, actual):
  """Return the fraction of predictions that equal the true labels.

  The comparison is positional: element i of ``prediction`` is compared with
  element i of ``actual``, ignoring any pandas index.

  Args:
    prediction: Predicted labels (list, numpy array or pandas Series).
    actual: True labels, same length as ``prediction``.

  Returns:
    Accuracy as a float in [0, 1].

  Raises:
    ValueError: If the two inputs do not have the same shape.
    ZeroDivisionError: If the inputs are empty.
  """
  predicted = np.asarray(prediction)
  expected = np.asarray(actual)

  if predicted.shape != expected.shape:
    raise ValueError(
        'prediction and actual must have the same shape, got %s and %s'
        % (predicted.shape, expected.shape))
  if expected.size == 0:
    raise ZeroDivisionError('cannot compute accuracy of an empty label set')

  # Vectorised comparison: the mean of the boolean match mask is the accuracy.
  return float(np.mean(predicted == expected))

In [28]:
pred = model.predict(test_insured)
eval_acc(pred, test_label)

0.8750754375377188

In [29]:
train_insured

Unnamed: 0,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Gender_Female,Gender_Male,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_No,Vehicle_Damage_Yes
332803,39,1,15.0,0,52906.0,55.0,227,1,0,1,0,0,0,1
116248,38,1,11.0,0,23038.0,26.0,29,0,1,1,0,0,0,1
255005,22,1,30.0,1,45318.0,152.0,166,0,1,0,1,0,1,0
317474,23,1,41.0,1,29132.0,151.0,277,1,0,0,1,0,1,0
344212,56,1,48.0,0,2630.0,154.0,155,0,1,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259178,24,1,36.0,1,22575.0,152.0,287,1,0,0,1,0,1,0
365838,56,1,35.0,0,41287.0,124.0,298,0,1,1,0,0,0,1
131932,22,1,2.0,0,18857.0,152.0,76,1,0,0,1,0,0,1
146867,44,1,32.0,1,2630.0,156.0,51,0,1,1,0,0,0,1


In [30]:
pred = model.predict(test_insured)
pred

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
test_label

200222    0
49766     0
172201    0
160713    0
53272     0
         ..
258403    0
234155    0
24476     0
60423     0
185839    0
Name: Response, Length: 76222, dtype: int64

# 7. Create Pipeline (Classification)

In [32]:
def run_pipeline():
  """Run the whole workflow end to end and return the test accuracy.

  Steps: load the raw data, extract features and labels, split into train
  and test subsets, fit the classifier on the training subset, then score
  its predictions on the held-out subset.

  Every intermediate value is produced inside this function, so the result
  does not depend on variables left in the notebook namespace by earlier
  cells.

  Returns:
    Accuracy of the freshly trained model on the held-out test subset.
  """
  # Collect data
  insured_df = collect_data()

  # Extract features and labels
  features_df, labels = extract_feature(insured_df)

  # Split training and test
  train_insured, test_insured, train_label, test_label = \
        split_train_test(features_df, labels)

  # Train on the training subset only
  model = train_model(train_insured, train_label)

  # Evaluate on the held-out subset
  pred = model.predict(test_insured)
  acc = eval_acc(pred, test_label)

  return acc

In [33]:
print(run_pipeline())

0.8750754375377188


In [46]:
import pickle

# scikit-learn estimators have no .save() method (that is the Keras API), so
# the fitted model is serialised with pickle instead. The file is a pickle,
# not HDF5, hence the .pkl extension.
# NOTE: only unpickle model files that come from a trusted source.
with open('Insurance.pkl', 'wb') as model_file:
  pickle.dump(model, model_file)