In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket "ignore" also hides any deprecation or convergence
# warnings that later cells may raise — consider narrowing the filter.
warnings.filterwarnings("ignore")

In [49]:
# Read the insurance dataset from the working directory and preview the first two rows.
df = pd.read_csv(filepath_or_buffer="insurance.csv")
df.head(n=2)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,insuranceclaim
0,19,0,27.9,0,1,3,16884.924,1
1,18,1,33.77,1,0,2,1725.5523,1


In [50]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             1338 non-null   int64  
 1   sex             1338 non-null   int64  
 2   bmi             1338 non-null   float64
 3   children        1338 non-null   int64  
 4   smoker          1338 non-null   int64  
 5   region          1338 non-null   int64  
 6   charges         1338 non-null   float64
 7   insuranceclaim  1338 non-null   int64  
dtypes: float64(2), int64(6)
memory usage: 83.8 KB


In [51]:
# Class balance of the target column: number of rows per insuranceclaim value.
df["insuranceclaim"].value_counts()

insuranceclaim
1    783
0    555
Name: count, dtype: int64

In [52]:
# Separate the label column from the predictors.
target = df["insuranceclaim"]
features = df.drop(columns=["insuranceclaim"])

In [53]:
# Number of predictor columns.
features.shape[1]

7

In [54]:
from sklearn.preprocessing import StandardScaler

# Standardise every predictor to zero mean and unit variance.
# The scaled values are floats, so build a fresh float DataFrame instead of
# writing them back through `features.iloc[:] = ...`: assigning floats into the
# int64 columns relies on implicit upcasting during setitem, which pandas has
# deprecated (the resulting warning is hidden by the global warnings filter).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# performed in a later cell, so test-set statistics leak into the transform;
# fitting on the training split only would avoid that.
scaler = StandardScaler()
features = pd.DataFrame(
    scaler.fit_transform(features),
    index=features.index,
    columns=features.columns,
)
features.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,-1.438764,-1.010519,-0.45332,-0.908614,1.970587,1.343905,0.298584
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,0.438495,-0.953689
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,0.438495,-0.728675
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,-0.466915,0.719843
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.466915,-0.776802


In [55]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation. Stratifying on the target keeps the
# class proportions the same in both splits; the fixed seed makes it repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=1,
)

In [56]:
from sklearn.linear_model import LogisticRegression

# Baseline model: default logistic regression trained on every predictor column.
lr = LogisticRegression().fit(xtrain, ytrain)
ypred = lr.predict(xtest)

In [57]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report

In [58]:
# Confusion matrix as a labelled table (rows = actual class, columns = predicted class),
# followed by per-class precision / recall / F1.
c = pd.DataFrame(
    data=confusion_matrix(y_true=ytest, y_pred=ypred),
    columns=["Not Claimed", "Claimed"],
    index=["Not Claimed", "Claimed"],
)
print(f"Confusion Matrix : \n{c}")
print("Classification Report : \n", classification_report(y_true=ytest, y_pred=ypred))

Confusion Matrix : 
             Not Claimed  Claimed
Not Claimed           91       20
Claimed                7      150
Classification Report : 
               precision    recall  f1-score   support

           0       0.93      0.82      0.87       111
           1       0.88      0.96      0.92       157

    accuracy                           0.90       268
   macro avg       0.91      0.89      0.89       268
weighted avg       0.90      0.90      0.90       268



In [59]:
from sklearn.decomposition import PCA
pca = PCA(n_components=4)
xtrain_pca=pca.fit_transform(xtrain)
xtest_pca=pca.transform(xtest)

In [60]:
# Refit the default logistic regression on the four PCA components.
lr = LogisticRegression().fit(xtrain_pca, ytrain)
ypred = lr.predict(xtest_pca)

In [61]:
# Evaluation of the PCA-based model: labelled confusion matrix
# (rows = actual class, columns = predicted class) and the per-class report.
c = pd.DataFrame(
    data=confusion_matrix(y_true=ytest, y_pred=ypred),
    columns=["Not Claimed", "Claimed"],
    index=["Not Claimed", "Claimed"],
)
print(f"Confusion Matrix : \n{c}")
print("Classification Report : \n", classification_report(y_true=ytest, y_pred=ypred))

Confusion Matrix : 
             Not Claimed  Claimed
Not Claimed           78       33
Claimed               20      137
Classification Report : 
               precision    recall  f1-score   support

           0       0.80      0.70      0.75       111
           1       0.81      0.87      0.84       157

    accuracy                           0.80       268
   macro avg       0.80      0.79      0.79       268
weighted avg       0.80      0.80      0.80       268



In [62]:
# Names of the 7 predictor columns.
features.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [63]:
from sklearn.feature_selection import SelectKBest,chi2

In [66]:
from sklearn.model_selection import train_test_split

# Rebuild predictors and target from the original df (discarding the
# standardised copy) and repeat the same stratified 80/20 split.
# NOTE(review): presumably done because chi2 requires non-negative inputs,
# which standardised columns are not — confirm.
target = df["insuranceclaim"]
features = df.drop(columns=["insuranceclaim"])
xtrain, xtest, ytrain, ytest = train_test_split(
    features,
    target,
    test_size=0.2,
    stratify=target,
    random_state=1,
)

In [67]:
# Keep the four predictors with the highest chi-squared score against the target.
# Scores are computed from the training split only; the test split is just filtered
# down to the same columns.
chi = SelectKBest(score_func=chi2, k=4)
xtrain_chi = chi.fit_transform(X=xtrain, y=ytrain)
xtest_chi = chi.transform(X=xtest)

In [69]:
# Tabulate, per predictor column, whether the selector kept it.
d = dict(Columns=features.columns, Selected=chi.get_support())
d1 = pd.DataFrame(data=d)
d1

Unnamed: 0,Columns,Selected
0,age,True
1,sex,False
2,bmi,True
3,children,True
4,smoker,False
5,region,False
6,charges,True


In [70]:
# Logistic regression on the four chi2-selected columns.
# These columns keep their raw scales (the frame was rebuilt from df without
# standardisation), and the default max_iter=100 may be too few iterations for
# the solver on such inputs; any ConvergenceWarning would be hidden by the
# global warnings filter. A higher cap lets the optimiser run to convergence
# and leaves a fit that already converged unchanged.
lr = LogisticRegression(max_iter=1000)
lr.fit(xtrain_chi, ytrain)
ypred = lr.predict(xtest_chi)

In [71]:
# Evaluation of the chi2-selected model: labelled confusion matrix
# (rows = actual class, columns = predicted class) and the per-class report.
c = pd.DataFrame(
    data=confusion_matrix(y_true=ytest, y_pred=ypred),
    columns=["Not Claimed", "Claimed"],
    index=["Not Claimed", "Claimed"],
)
print(f"Confusion Matrix : \n{c}")
print("Classification Report : \n", classification_report(y_true=ytest, y_pred=ypred))

Confusion Matrix : 
             Not Claimed  Claimed
Not Claimed           87       24
Claimed               10      147
Classification Report : 
               precision    recall  f1-score   support

           0       0.90      0.78      0.84       111
           1       0.86      0.94      0.90       157

    accuracy                           0.87       268
   macro avg       0.88      0.86      0.87       268
weighted avg       0.88      0.87      0.87       268

