## Health insurence Prediction

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score


## Load Data

In [15]:
df = pd.read_csv("datasheet/health_insurance.csv")
df.shape # (rows,colums)

(1338, 7)

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [17]:
df.head(10)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692


In [18]:
# Are there any missing values
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [19]:
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [20]:
df["sex"].value_counts()

sex
male      676
female    662
Name: count, dtype: int64

### Label Encording

In [21]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
from sklearn.preprocessing import LabelEncoder

In [23]:
cat_feature = ["sex", "smoker", "region"]

for feature in cat_feature:
    encode = LabelEncoder()
    encoded_data = encode.fit_transform(df[feature])
    df[feature] = encoded_data

    mapping = dict(zip(encode.classes_, encode.transform(encode.classes_)))
    print(f"Mapping for {feature}: {mapping}")

Mapping for sex: {'female': 0, 'male': 1}
Mapping for smoker: {'no': 0, 'yes': 1}
Mapping for region: {'northeast': 0, 'northwest': 1, 'southeast': 2, 'southwest': 3}


In [24]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


### Spilt dataset

In [25]:
# Split data into x and y
X = df.drop("charges",axis=1)
y = df["charges"]

### Feature Scaling

In [26]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
standerd_X = scaler.fit_transform(X)
X = pd.DataFrame(standerd_X, columns=X.columns)


In [27]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-1.438764,-1.010519,-0.45332,-0.908614,1.970587,1.343905
1,-1.509965,0.989591,0.509621,-0.078767,-0.507463,0.438495
2,-0.797954,0.989591,0.383307,1.580926,-0.507463,0.438495
3,-0.441948,0.989591,-1.305531,-0.908614,-0.507463,-0.466915
4,-0.513149,0.989591,-0.292556,-0.908614,-0.507463,-0.466915


In [28]:
# Split data into train and test sets
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=32)

In [29]:
x_train

Unnamed: 0,age,sex,bmi,children,smoker,region
280,0.056461,-1.010519,-0.417230,-0.078767,1.970587,-1.372326
679,0.697271,-1.010519,-1.227610,0.751079,-0.507463,-0.466915
172,-1.509965,0.989591,-2.412011,-0.908614,-0.507463,-1.372326
45,1.124479,0.989591,1.088698,-0.908614,-0.507463,1.343905
330,1.551686,-1.010519,0.938597,-0.078767,1.970587,-1.372326
...,...,...,...,...,...,...
252,1.053277,0.989591,0.581801,0.751079,1.970587,0.438495
88,0.483668,-1.010519,-0.479567,-0.908614,-0.507463,-0.466915
892,1.053277,0.989591,-1.087352,-0.908614,-0.507463,-1.372326
1334,-1.509965,-1.010519,0.206139,-0.908614,-0.507463,-1.372326


In [30]:
y_train


280     22331.56680
679     10156.78320
172      1694.79640
45      20630.28351
330     48517.56315
           ...     
252     44260.74990
88       8026.66660
892     10422.91665
1334     2205.98080
727     16657.71745
Name: charges, Length: 1070, dtype: float64

### Model

In [55]:
from sklearn.ensemble import RandomForestRegressor

In [56]:
model_rf = RandomForestRegressor()

model_rf.fit(x_train,y_train)

### Evaluation

In [57]:
y_preds = model_rf.predict(x_test)

In [58]:
y_preds

array([17157.4479097 , 17004.1673753 ,  5222.9370275 , 19271.1412972 ,
       10962.5639065 ,  2954.7614751 , 44346.4206385 ,  4365.1545498 ,
        6004.632183  , 10136.6696435 ,  3775.5541821 ,  4474.3364062 ,
       14730.7035749 , 47737.4917155 ,  9775.2821819 , 43145.0184318 ,
       12215.9474831 , 11360.7290154 , 10242.8698752 , 16214.1627635 ,
        6712.9086305 , 15203.26833   , 14692.3574912 ,  5612.8448877 ,
       44741.6492908 ,  2609.856456  ,  7649.3317982 , 17091.2125453 ,
       10772.4450364 , 20400.122183  ,  2374.3384998 ,  7883.0258575 ,
       13681.901603  , 14526.1121869 ,  9322.4958172 , 40062.0935365 ,
        7095.3458454 ,  4347.5595873 , 27387.7184848 , 19303.0381521 ,
        8182.5865224 ,  1632.011307  , 10065.9547526 ,  1945.927683  ,
        4908.9292564 ,  9446.3384008 ,  6294.5504549 , 12133.4883677 ,
       15552.354702  , 35909.7029158 , 16981.5727818 , 46788.6965716 ,
        4584.617034  ,  4098.9703872 , 16916.1490383 , 17568.242848  ,
      

In [59]:
mse = mean_squared_error(y_test, y_preds)
mse

22694239.651255723

In [60]:
mae = mean_absolute_error(y_test, y_preds)
mae

2679.480236318471

In [61]:
r2 = r2_score(y_test, y_preds)
r2

0.8323629108291346

In [62]:
import pickle

In [63]:
with open("model/model_2.pkl", "wb") as f:
    pickle.dump(model_rf, f)

In [64]:
## Load model

with open("model/model_2.pkl", "rb") as f:
    model = pickle.load(f)

gender	age	hypertension	heart_disease	smoking_history	bmi	HbA1c_level	blood_glucose_level

In [65]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [66]:
## Example

test_1 = {
    "age": 27,
    "sex" : 0,
    "bmi" : 24,
    "children" : 2,
    "smoker": 1,
    "region": 3
}

test_1 = pd.DataFrame([test_1])

In [67]:
test_scaled = scaler.transform(test_1)
test_1_df = pd.DataFrame(test_scaled, columns=test_1.columns)

In [68]:
test_1_df

Unnamed: 0,age,sex,bmi,children,smoker,region
0,-0.869155,-1.010519,-1.093093,0.751079,1.970587,1.343905


In [72]:
result = model.predict(test_1_df)
print(f"Changes : {result[0]:.2f}")

Changes : 17365.48
