In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
from sklearn.model_selection import train_test_split

ds_sbi = pd.read_csv(r'https://raw.githubusercontent.com/dsrscientist/DSData/master/SBI_Life_insurance.csv')
ds_sbi.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.9,0,1,3,16884.924,1
1,18,1,33.77,1,0,2,1725.5523,1
2,28,1,33.0,3,0,2,4449.462,0
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.88,0,0,1,3866.8552,1


In [2]:
ds_sbi.isnull().sum().any()

False

In [3]:
#dataset does not have any null values.
ds_sbi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
age               1338 non-null int64
sex               1338 non-null int64
bmi               1338 non-null float64
children          1338 non-null int64
smoker            1338 non-null int64
region            1338 non-null int64
charges           1338 non-null float64
insuranceclaim    1338 non-null int64
dtypes: float64(2), int64(6)
memory usage: 83.8 KB


In [4]:
ds_sbi.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,1.515695,13270.422265,0.585202
std,14.04996,0.50016,6.098187,1.205493,0.403694,1.104885,12110.011237,0.492871
min,18.0,0.0,15.96,0.0,0.0,0.0,1121.8739,0.0
25%,27.0,0.0,26.29625,0.0,0.0,1.0,4740.28715,0.0
50%,39.0,1.0,30.4,1.0,0.0,2.0,9382.033,1.0
75%,51.0,1.0,34.69375,2.0,0.0,2.0,16639.912515,1.0
max,64.0,1.0,53.13,5.0,1.0,3.0,63770.42801,1.0


In [5]:
ds_sbi.corr()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008,0.113723
sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.004588,0.057292,0.031565
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.157566,0.198341,0.384198
children,0.042469,0.017163,0.012759,1.0,0.007673,0.016569,0.067998,-0.409526
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,-0.002181,0.787251,0.333261
region,0.002127,0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208,0.020891
charges,0.299008,0.057292,0.198341,0.067998,0.787251,-0.006208,1.0,0.309418
insuranceclaim,0.113723,0.031565,0.384198,-0.409526,0.333261,0.020891,0.309418,1.0


In [6]:
#Based on above data, we can see that, age, bmi, smoker and charges have more influence on the insurance claim.  Children has negative influence.
#split the data to test the model with all the columns
x = ds_sbi.drop('insuranceclaim', axis=1)
y = ds_sbi['insuranceclaim']

#Let's normalize the data
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
x_new = ss.fit_transform(x)

x_train, x_test, y_train, y_test = train_test_split(x_new, y, test_size=0.3, random_state = 1, stratify = y)

In [7]:
# Spot Check Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn import model_selection

models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
kfold=10
scoring='accuracy'
#scoring = 'roc_auc'
seed = 1
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.864287 (0.018119)
LDA: 0.849348 (0.029682)
KNN: 0.880210 (0.037194)
DTC: 0.975418 (0.012701)
NB: 0.723370 (0.033897)
SVM: 0.895276 (0.018518)


In [8]:
models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('DTC', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
# evaluate each model in turn
kfold=10
#scoring='accuracy'
scoring='roc_auc'
seed = 1
results = []
names = []
for name, model in models:
	kfold = model_selection.KFold(n_splits=10, random_state=seed)
	cv_results = model_selection.cross_val_score(model, x_test, y_test, cv=kfold, scoring=scoring)
	results.append(cv_results)
	names.append(name)
	msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
	print(msg)

LR: 0.944336 (0.034800)
LDA: 0.938815 (0.033337)
KNN: 0.932399 (0.022621)
DTC: 0.944333 (0.052955)
NB: 0.884331 (0.039437)
SVM: 0.950140 (0.031015)


In [9]:
#Based on above data, we can see that SVM is better results with "roc_auc" scoring.  Let's plot ROC_AUC curve for this model.
svm = SVC(gamma='auto', probability=True, random_state=1)
print(svm)
svm_model = svm.fit(x_train, y_train)
y_pred_prob = svm_model.predict_proba(x_test)[:,1]



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=True, random_state=1, shrinking=True, tol=0.001,
    verbose=False)


In [10]:
from sklearn import metrics
import matplotlib.pyplot as plt

fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.title('ROC curve for diabetes classifier')
plt.xlabel('False Positive Rate (1 - Specificity)')
plt.ylabel('True Positive Rate (Sensitivity)')
plt.grid(True)
plt.show()

<Figure size 640x480 with 1 Axes>

In [11]:
#ROC-AUC value
print(metrics.roc_auc_score(y_test, y_pred_prob))

0.9490126130717289
