In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.cluster import KMeans

import warnings
warnings.filterwarnings("ignore")

In [49]:
df = pd.read_csv("insurance.csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.900,0,1,3,16884.92400,1
1,18,1,33.770,1,0,2,1725.55230,1
2,28,1,33.000,3,0,2,4449.46200,0
3,33,1,22.705,0,0,1,21984.47061,0
4,32,1,28.880,0,0,1,3866.85520,1
...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830,0
1334,18,0,31.920,0,0,0,2205.98080,1
1335,18,0,36.850,0,0,2,1629.83350,1
1336,21,0,25.800,0,0,3,2007.94500,0


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1338 non-null   int64  
 1   sex             1338 non-null   int64  
 2   bmi             1338 non-null   float64
 3   children        1338 non-null   int64  
 4   smoker          1338 non-null   int64  
 5   region          1338 non-null   int64  
 6   charges         1338 non-null   float64
 7   insuranceclaim  1338 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 83.8 KB


In [51]:
df.describe()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
count,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0,1338.0
mean,39.207025,0.505232,30.663397,1.094918,0.204783,1.515695,13270.422265,0.585202
std,14.04996,0.50016,6.098187,1.205493,0.403694,1.104885,12110.011237,0.492871
min,18.0,0.0,15.96,0.0,0.0,0.0,1121.8739,0.0
25%,27.0,0.0,26.29625,0.0,0.0,1.0,4740.28715,0.0
50%,39.0,1.0,30.4,1.0,0.0,2.0,9382.033,1.0
75%,51.0,1.0,34.69375,2.0,0.0,2.0,16639.912515,1.0
max,64.0,1.0,53.13,5.0,1.0,3.0,63770.42801,1.0


In [52]:
features=df.iloc[:,:-1]
target=df.iloc[:,-1]

In [53]:
from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(features,target,test_size=0.3,random_state=1)

In [54]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(xtrain,ytrain)
ypred=lr.predict(xtest)

In [55]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
ac = accuracy_score(ytest,ypred)
cm = confusion_matrix(ytest,ypred)
cr = classification_report(ytest,ypred)

print(f"Accuracy: {ac}\n{cm}\n{cr}")

Accuracy: 0.8059701492537313
[[112  45]
 [ 33 212]]
              precision    recall  f1-score   support

           0       0.77      0.71      0.74       157
           1       0.82      0.87      0.84       245

    accuracy                           0.81       402
   macro avg       0.80      0.79      0.79       402
weighted avg       0.80      0.81      0.80       402



In [56]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4,random_state=1)

In [57]:
xtrain.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [58]:
xtrain_pc = pca.fit_transform(xtrain)
xtest_pc = pca.transform(xtest)

In [59]:
xtrain_pc

array([[-4.44948491e+03, -1.25727799e+01, -4.30826018e+00,
        -1.14865024e+00],
       [-1.06789255e+04,  1.42216632e+01, -2.67668531e+00,
         1.14638840e-01],
       [-1.05553823e+04,  1.02797040e+01, -3.10311218e+00,
        -1.22893647e+00],
       ...,
       [ 3.13645015e+04, -1.33121822e+00,  1.11438714e+00,
         6.01557753e-01],
       [ 6.16756645e+03,  1.29585579e+00, -9.01273174e+00,
         9.24975658e-01],
       [-1.72246880e+03, -1.86705010e+01, -3.23646100e+00,
        -5.27336144e-02]])

In [60]:
lr.fit(xtrain_pc,ytrain)
ypred = lr.predict(xtest_pc)

In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
ac = accuracy_score(ytest,ypred)
cm = confusion_matrix(ytest,ypred)
cr = classification_report(ytest,ypred)

print(f"Accuracy: {ac}\n{cm}\n{cr}")

Accuracy: 0.8557213930348259
[[132  25]
 [ 33 212]]
              precision    recall  f1-score   support

           0       0.80      0.84      0.82       157
           1       0.89      0.87      0.88       245

    accuracy                           0.86       402
   macro avg       0.85      0.85      0.85       402
weighted avg       0.86      0.86      0.86       402



In [62]:
from sklearn.preprocessing import StandardScaler

In [63]:
for col in xtrain:
    ss=StandardScaler()
    xtrain[col]=ss.fit_transform(xtrain[[col]])

In [64]:
xtrain_pc = pca.fit_transform(xtrain)
xtest_pc = pca.transform(xtest)

In [65]:
lr.fit(xtrain_pc,ytrain)
ypred = lr.predict(xtest_pc)

In [66]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
ac = accuracy_score(ytest,ypred)
cm = confusion_matrix(ytest,ypred)
cr = classification_report(ytest,ypred)

print(f"Accuracy: {ac}\n{cm}\n{cr}")

Accuracy: 0.6094527363184079
[[  0 157]
 [  0 245]]
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       157
           1       0.61      1.00      0.76       245

    accuracy                           0.61       402
   macro avg       0.30      0.50      0.38       402
weighted avg       0.37      0.61      0.46       402

