# Medical Cost Prediction

In [41]:
import pandas as pd
import numpy as np

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Ridge,Lasso,ElasticNet,RidgeCV,LassoCV,ElasticNetCV

In [43]:
# Read the insurance dataset from a CSV file located in the working directory.
df=pd.read_csv("insurance.csv")
# Bare last expression: render the loaded frame as the cell output.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [44]:
# Print the column dtypes, non-null counts and memory usage of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [10]:
# Sum the per-column missing-value counts into one total for the whole frame.
print("Number of Null value:", df.isna().sum().sum())

Number of Null value: 0


In [9]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns (unique / top / freq rows).
df.describe(include="all")

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [45]:
# Number of distinct values in each column.
print(df.nunique())

age           47
sex            2
bmi          548
children       6
smoker         2
region         4
charges     1337
dtype: int64


In [57]:
# Frequency of each region label.
df["region"].value_counts()

southeast    364
southwest    325
northwest    325
northeast    324
Name: region, dtype: int64

In [46]:
# Cast the children count to strings, which makes it show up as an
# object-dtype column in the inspection cells that follow.
# NOTE(review): no cast back to a numeric dtype is visible before this column
# is placed in the feature matrix — confirm that fitting on string digits is
# intended rather than keeping the original integer column.
df["children"]=df["children"].astype(str)

In [47]:
# Show only the columns stored with object dtype.
print(df.select_dtypes(include="object"))

         sex children smoker     region
0     female        0    yes  southwest
1       male        1     no  southeast
2       male        3     no  southeast
3       male        0     no  northwest
4       male        0     no  northwest
...      ...      ...    ...        ...
1333    male        3     no  northwest
1334  female        0     no  northeast
1335  female        0     no  southeast
1336  female        0     no  southwest
1337  female        0    yes  northwest

[1338 rows x 4 columns]


In [48]:
# Report how many columns are stored with object dtype (second axis length
# of the object-only selection).
print("'Total Non-Numeric Column':", df.select_dtypes(include="object").shape[1])

'Total Non-Numeric Column': 4


In [49]:
# Map each object-dtype column name to the list of distinct values it holds.
{
    name: list(values.unique())
    for name, values in df.select_dtypes(include="object").items()
}

{'sex': ['female', 'male'],
 'children': ['0', '1', '3', '2', '5', '4'],
 'smoker': ['yes', 'no'],
 'region': ['southwest', 'southeast', 'northwest', 'northeast']}

In [50]:
# Encode sex as a binary indicator: male -> 1, female -> 0.
df["sex"] = df["sex"].replace(to_replace={"male": 1, "female": 0})
print(df["sex"])

0       0
1       1
2       1
3       1
4       1
       ..
1333    1
1334    0
1335    0
1336    0
1337    0
Name: sex, Length: 1338, dtype: int64


In [59]:
# Encode smoker status as a binary indicator: no -> 0, yes -> 1.
df["smoker"] = df["smoker"].replace(to_replace={"no": 0, "yes": 1})
print(df)

      age  sex     bmi children  smoker     region      charges
0      19    0  27.900        0       1  southwest  16884.92400
1      18    1  33.770        1       0  southeast   1725.55230
2      28    1  33.000        3       0  southeast   4449.46200
3      33    1  22.705        0       0  northwest  21984.47061
4      32    1  28.880        0       0  northwest   3866.85520
...   ...  ...     ...      ...     ...        ...          ...
1333   50    1  30.970        3       0  northwest  10600.54830
1334   18    0  31.920        0       0  northeast   2205.98080
1335   18    0  36.850        0       0  southeast   1629.83350
1336   21    0  25.800        0       0  southwest   2007.94500
1337   61    0  29.070        0       1  northwest  29141.36030

[1338 rows x 7 columns]


In [60]:
# Replace each region label with an integer code.
# NOTE(review): these codes give a nominal feature an arbitrary numeric order
# (southeast < southwest < northwest < northeast), which a linear model will
# treat as a magnitude — consider one-hot encoding instead; confirm intent.
df["region"] = df["region"].replace(
    to_replace={
        "southeast": 0,
        "southwest": 1,
        "northwest": 2,
        "northeast": 3,
    }
)
print(df)

      age  sex     bmi children  smoker  region      charges
0      19    0  27.900        0       1       1  16884.92400
1      18    1  33.770        1       0       0   1725.55230
2      28    1  33.000        3       0       0   4449.46200
3      33    1  22.705        0       0       2  21984.47061
4      32    1  28.880        0       0       2   3866.85520
...   ...  ...     ...      ...     ...     ...          ...
1333   50    1  30.970        3       0       2  10600.54830
1334   18    0  31.920        0       0       3   2205.98080
1335   18    0  36.850        0       0       0   1629.83350
1336   21    0  25.800        0       0       1   2007.94500
1337   61    0  29.070        0       1       2  29141.36030

[1338 rows x 7 columns]


In [61]:
# Separate the frame into the feature matrix (everything except the target)
# and the regression target (charges), then display the features.
X = df.drop(columns=["charges"])
y = df["charges"]
X

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.900,0,1,1
1,18,1,33.770,1,0,0
2,28,1,33.000,3,0,0
3,33,1,22.705,0,0,2
4,32,1,28.880,0,0,2
...,...,...,...,...,...,...
1333,50,1,30.970,3,0,2
1334,18,0,31.920,0,0,3
1335,18,0,36.850,0,0,0
1336,21,0,25.800,0,0,1


In [63]:
# Shuffled 70/30 train/test split; the fixed random_state keeps the partition
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    shuffle=True,
    random_state=123,
)

In [64]:
# Display the training feature matrix produced by the split.
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
300,36,1,27.550,3,0,3
904,60,0,35.100,0,0,1
670,30,1,31.570,3,0,0
617,49,1,25.600,2,1,1
373,26,1,32.900,2,1,1
...,...,...,...,...,...,...
1238,37,1,22.705,3,0,3
1147,20,0,31.920,0,0,2
106,19,0,28.400,1,0,1
1041,18,1,23.085,0,0,3


In [65]:
# Display the training target values produced by the split.
y_train

300      6746.74250
904     12644.58900
670      4837.58230
617     23306.54700
373     36085.21900
           ...     
1238     6985.50695
1147     2261.56880
106      2331.51900
1041     1704.70015
1122    46661.44240
Name: charges, Length: 936, dtype: float64

In [67]:
# Baseline: ordinary least-squares regression fitted on the training split.
model_1 = LinearRegression()
model_1.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_1.score(X_test, y_test) * 100

76.30561420408382

In [69]:
# Ridge regression with its default hyperparameters, fitted on the training split.
model_2 = Ridge()
model_2.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_2.score(X_test, y_test) * 100

76.2888256803132

In [70]:
# Lasso regression with its default hyperparameters, fitted on the training split.
model_3 = Lasso()
model_3.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_3.score(X_test, y_test) * 100

76.30500879855147

In [72]:
# Elastic-net regression with its default hyperparameters, fitted on the
# training split.
# NOTE(review): the recorded score for this cell is far below the other
# models. No feature scaling is visible before the fit and the penalty is
# scale-sensitive — presumably the cause; confirm, and consider standardizing
# the features or tuning the regularization strength.
model_4 = ElasticNet()
model_4.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_4.score(X_test, y_test) * 100

35.147507567005555

In [73]:
# Elastic-net regression with its default hyperparameters, fitted on the
# training split.
# NOTE(review): this cell builds exactly the same estimator, with the same
# data, as the model_4 cell and therefore reproduces its score. Presumably a
# different estimator or different hyperparameters were intended here —
# confirm and adjust, or drop the duplicate.
model_5 = ElasticNet()
model_5.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_5.score(X_test, y_test) * 100

35.147507567005555

In [74]:
# Ridge regression with built-in cross-validation (RidgeCV, default settings),
# fitted on the training split.
model_6 = RidgeCV()
model_6.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_6.score(X_test, y_test) * 100

76.30420627978194

In [75]:
# Lasso regression with built-in cross-validation (LassoCV, default settings),
# fitted on the training split.
model_7 = LassoCV()
model_7.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_7.score(X_test, y_test) * 100

76.2572590072474

In [76]:
# Elastic-net regression with built-in cross-validation (ElasticNetCV, default
# settings), fitted on the training split.
# NOTE(review): the recorded score for this cell is far below the other
# models. No feature scaling is visible before the fit and the penalty is
# scale-sensitive — presumably the cause; confirm, and consider standardizing
# the features or widening the searched hyperparameter grid.
model_8 = ElasticNetCV()
model_8.fit(X_train, y_train)
# Score on the held-out split, scaled by 100 to read as a percentage.
# Evaluated once: only the cell's last expression is displayed, so a separate
# un-scaled score() call before it would be computed and thrown away.
model_8.score(X_test, y_test) * 100

7.429381549049674