# Medical Cost Prediction

![Pipeline](pipeline.svg)

### Importing libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.model_selection import train_test_split
from sklearn.preprocessing  import OneHotEncoder,StandardScaler
from sklearn.tree import DecisionTreeRegressor

from joblib import dump, load


### Read Data

In [3]:
# Load the raw insurance dataset from the project-relative data directory.
df = pd.read_csv("data/insurance.csv")

In [4]:
# Peek at five randomly chosen rows; no seed is set, so the rows differ on every run.
df.sample(n=5)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
64,20,female,22.42,0,yes,northwest,14711.7438
1153,35,female,35.815,1,no,northwest,5630.45785
31,18,female,26.315,0,no,northeast,2198.18985
693,24,male,23.655,0,no,northwest,2352.96845
761,23,male,35.2,1,no,southwest,2416.955


In [5]:
# Summary statistics for every column, numeric and categorical alike.
df.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [6]:
# (rows, columns) of the raw frame.
df.shape

(1338, 7)

In [7]:
# Column dtypes and non-null counts, to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [8]:
# Count fully duplicated rows (True = the row repeats an earlier row).
df.duplicated().value_counts()

False    1337
True        1
Name: count, dtype: int64

In [9]:
# Remove repeated rows, keeping the first occurrence of each, then confirm the new size.
df = df.drop_duplicates(keep="first")
df.shape

(1337, 7)

### Splitting data 

In [10]:
# Feature matrix: every column except the target.
X = df.drop(columns=["charges"])
# Target vector: the charges column to be predicted.
y = df["charges"]

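The data is split into 60% train, 20% validation and 20% test (100% in total), in two steps:

1. Hold out 40% of the rows as a temporary test set, leaving 60% for training.

2. Split that 40% hold-out in half, giving 20% validation and 20% test.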

In [11]:
# Step 1: hold out 40% of the rows; the remaining 60% is the training set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.40, random_state=123
)
# Step 2: halve the hold-out into validation and test sets (20% each overall).
X_val, X_test, y_val, y_test = train_test_split(
    X_test, y_test, test_size=0.50, random_state=123
)

# Persist every split as CSV without the index column; the validation files
# are passed by path to evaluvation_pipeline later in this notebook.
for split_name, split_data in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
    ("X_val", X_val),
    ("y_val", y_val),
):
    split_data.to_csv(f"data/{split_name}.csv", index=False)

In [12]:
# Sanity-check the sizes of the three splits (feature frames first, then targets).
print(*(part.shape for part in (X_train, X_test, X_val, y_train, y_test, y_val)))

(802, 6) (268, 6) (267, 6) (802,) (268,) (267,)


### Separating numerical and categorical columns

In [13]:
# Non-object (numeric) feature columns of the training split.
# Despite the name, this is a DataFrame of values, not a list of column names.
numerical_cols = X_train.select_dtypes(exclude=["object"])

In [14]:
# Object-typed (categorical) feature columns of the training split.
# Despite the name, this is a DataFrame of values, not a list of column names.
categorical_cols = X_train.select_dtypes(include=["object"])

In [15]:
# Display the numeric training columns (rows keep their original index labels).
numerical_cols

Unnamed: 0,age,bmi,children
596,42,29.480,2
1061,57,27.940,1
971,34,23.560,0
672,36,29.700,0
345,34,29.260,3
...,...,...,...
1239,25,42.130,1
1148,55,21.500,1
106,19,28.400,1
1042,20,30.685,0


In [16]:
# Display the categorical training columns (rows keep their original index labels).
categorical_cols

Unnamed: 0,sex,smoker,region
596,female,no,southeast
1061,male,no,southeast
971,female,no,northeast
672,male,no,southeast
345,female,no,southeast
...,...,...,...
1239,female,no,southeast
1148,male,no,southwest
106,female,no,southwest
1042,male,yes,northeast


### Encoding categorical data

In [17]:
# One-hot encoder with default settings.
# NOTE(review): with the defaults, transforming a category that was not seen
# during fit raises an error — confirm that is the desired behavior for new data.
oh=OneHotEncoder()

In [18]:
# Learn the category vocabulary from the training split only.
encoder=oh.fit(categorical_cols)

# Generated one-hot column names, used later to label the encoded DataFrame.
col=encoder.get_feature_names_out()

In [19]:
# Persist the fitted one-hot encoder to disk with joblib.
dump(encoder,'models/encoder/one_hot_encoder.pkl')

['models/encoder/one_hot_encoder.pkl']

In [20]:
# Encode the training categorical columns with the in-memory encoder and
# convert the result to a dense array.
# The persisted copy could be loaded and used instead:
# encoder_model=load('models/encoder/one_hot_encoder.pkl')
transform=encoder.transform(categorical_cols).toarray()


In [21]:
# Wrap the dense array in a DataFrame labelled with the generated column names.
# No index is passed, so the rows get a fresh 0..n-1 index.
categorical_encode_data = pd.DataFrame(data=transform, columns=col)
categorical_encode_data

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...
797,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
798,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
799,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
800,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


### Scaling Numerical data

In [22]:
# Standardizer that will be fitted on the numeric training columns (numerical_cols).
scaler=StandardScaler()

In [23]:
# Fit the scaler on the numeric training columns only; fit() returns the
# scaler itself, so numerical_scaler and scaler are the same fitted object.
numerical_scaler=scaler.fit(numerical_cols)
numerical_scaler

In [24]:
# Persist the fitted standard scaler to disk with joblib.
dump(numerical_scaler,'models/scaling/standard_scaler.pkl')

['models/scaling/standard_scaler.pkl']

In [25]:
# Optional: reload the persisted scaler instead of reusing the in-memory one.
# model_scaling=load('models/scaling/standard_scaler.pkl')

In [26]:
# Standardize the numeric training columns with the scaler fitted on them.
scaled_data=numerical_scaler.transform(numerical_cols)

In [27]:
# Put the scaled values back into a DataFrame under the original column names.
# No index is passed, so the rows get a fresh 0..n-1 index.
numerical_scaled_data = pd.DataFrame(
    data=scaled_data,
    columns=numerical_cols.columns,
)
numerical_scaled_data

Unnamed: 0,age,bmi,children
0,0.199951,-0.194066,0.742221
1,1.288864,-0.446239,-0.079963
2,-0.380803,-1.163457,-0.902147
3,-0.235614,-0.158042,-0.902147
4,-0.380803,-0.230091,1.564405
...,...,...,...
797,-1.034150,1.877352,-0.079963
798,1.143675,-1.500779,-0.079963
799,-1.469715,-0.370915,-0.079963
800,-1.397121,0.003250,-0.902147


### Concatenating numerical and categorical columns

In [28]:
# Join the scaled numeric and one-hot encoded columns side by side.
# Both frames carry the same 0..n-1 index, so rows line up one-to-one.
Features = pd.concat(
    [numerical_scaled_data, categorical_encode_data],
    axis="columns",
)

In [29]:
# Display the final training feature matrix (scaled numeric + one-hot columns).
Features

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,0.199951,-0.194066,0.742221,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,1.288864,-0.446239,-0.079963,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
2,-0.380803,-1.163457,-0.902147,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,-0.235614,-0.158042,-0.902147,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.380803,-0.230091,1.564405,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
797,-1.034150,1.877352,-0.079963,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
798,1.143675,-1.500779,-0.079963,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0
799,-1.469715,-0.370915,-0.079963,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
800,-1.397121,0.003250,-0.902147,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0


In [30]:
# Baseline model: ordinary linear regression on the prepared training features.
# The model classes are imported once in the imports cell at the top of the
# notebook, so no imports are repeated here.
model = LinearRegression()
model.fit(Features, y_train)

# Persist the fitted model; the evaluation cells load it from this path.
dump(model, 'models/lr.pkl')

# score() is R^2 for regressors; reported here on the training data as a percentage.
print('train score: ', model.score(Features, y_train)*100)

train score:  74.38983158807748


In [31]:
# Decision tree regressor with default hyper-parameters.
# random_state is fixed (same seed as the data split) so the fitted tree and
# every score derived from it are reproducible across kernel restarts.
model_DT = DecisionTreeRegressor(random_state=123)
model_DT.fit(Features, y_train)

# Persist the fitted model; the evaluation cells load it from this path.
dump(model_DT, 'models/DT.pkl')

# Training R^2 as a percentage. An unconstrained tree can nearly memorize the
# training set, so compare against the validation score before trusting it.
print('train score: ', model_DT.score(Features, y_train)*100)

train score:  99.9321796631031


In [32]:
# Random forest regressor with default hyper-parameters.
# random_state is fixed (same seed as the data split) so the bootstrap samples
# and feature sampling, and therefore the scores, are reproducible across runs.
model_RF = RandomForestRegressor(random_state=123)
model_RF.fit(Features, y_train)

# Persist the fitted model; the evaluation cells load it from this path.
dump(model_RF, 'models/RF.pkl')

# Training R^2 as a percentage.
print('train score: ', model_RF.score(Features, y_train)*100)

train score:  97.70169938187404


In [40]:
# Evaluation: import the project-local helper that scores a saved model on a
# CSV split. (The module and function are spelled "evaluvation" in the project.)
# NOTE(review): kept here rather than in the top imports cell in case the module
# loads the saved encoder/scaler at import time — confirm before moving it.
from evaluvation_pipeline import evaluvation_pipeline

In [41]:
# Validation score for the saved linear regression model; the first return
# value is discarded.
# NOTE(review): presumably the same R^2 * 100 metric as the train score —
# confirm in evaluvation_pipeline.
_, score = evaluvation_pipeline(
    'data/X_val.csv',
    "data/y_val.csv",
    "models/lr.pkl",
)
score

74.98647426234041

In [42]:
# Validation score for the saved decision tree model; the first return value
# is discarded.
# NOTE(review): presumably the same R^2 * 100 metric as the train score —
# confirm in evaluvation_pipeline.
_, score = evaluvation_pipeline(
    'data/X_val.csv',
    "data/y_val.csv",
    "models/DT.pkl",
)
score

75.83805846083274

In [43]:
# Validation score for the saved random forest model; the first return value
# is discarded.
# NOTE(review): presumably the same R^2 * 100 metric as the train score —
# confirm in evaluvation_pipeline.
_, score = evaluvation_pipeline(
    'data/X_val.csv',
    "data/y_val.csv",
    "models/RF.pkl",
)
score

85.01817383786859