In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the insurance records from a CSV expected in the working directory.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
# Trailing bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
# Per-column count of missing entries (isnull is the pandas alias of isna).
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [4]:
from sklearn.impute import SimpleImputer

In [5]:
# Two imputers, both treating NaN as the missing marker:
#   - mean_imputer replaces NaN with the column mean (numeric columns)
#   - mode_imputer replaces NaN with the most frequent value (categorical columns)
mean_imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
mode_imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [None]:
# Fill missing values in place on `df`.
#
# The numeric columns go through ONE fit so that mean_imputer.statistics_ keeps
# a mean for each of them; refitting the same imputer column by column would
# leave it holding only the last column's mean, making it useless for
# transforming new data later.
# NOTE: the missing-value count above reports 0 NaNs for every column, so on
# this dataset the step leaves the values unchanged.
numeric_cols = ['age', 'bmi']
df[numeric_cols] = mean_imputer.fit_transform(df[numeric_cols])

# fit_transform returns a 2-D (n_rows, 1) array for a single column; flatten it
# explicitly instead of relying on pandas to squeeze it during assignment.
df['region'] = mode_imputer.fit_transform(df[['region']]).ravel()

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
le = LabelEncoder()

In [8]:
df['sex']=le.fit_transform(df['sex'])
df['smoker']=le.fit_transform(df['smoker'])
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,southwest,16884.92400
1,18,1,33.770,1,0,southeast,1725.55230
2,28,1,33.000,3,0,southeast,4449.46200
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.880,0,0,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,northwest,10600.54830
1334,18,0,31.920,0,0,northeast,2205.98080
1335,18,0,36.850,0,0,southeast,1629.83350
1336,21,0,25.800,0,0,southwest,2007.94500


In [9]:
# One-hot encode the remaining text column ('region') as 0/1 integer columns.
# drop_first removes the first category so the dummies are not collinear; the
# dropped level ('northeast') becomes the implicit baseline.
df = pd.get_dummies(data=df, dtype=int, drop_first=True)
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,1
1,18,1,33.770,1,0,1725.55230,0,1,0
2,28,1,33.000,3,0,4449.46200,0,1,0
3,33,1,22.705,0,0,21984.47061,1,0,0
4,32,1,28.880,0,0,3866.85520,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,1,0,0
1334,18,0,31.920,0,0,2205.98080,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,1


In [10]:
from sklearn.preprocessing import MinMaxScaler

In [11]:
# Separate the predictors from the regression target ('charges').
x = df.drop(columns='charges')
y = df['charges']

In [12]:
# Rescale every feature column to the [0, 1] range. The scaler learns each
# column's min/max from `x` and then applies them to the same data.
mxc = MinMaxScaler()
x_scl = mxc.fit(x).transform(x)
# Sanity check: row/column count of the scaled matrix.
x_scl.shape

(1338, 8)

In [13]:
# Display the scaled feature matrix (a NumPy array, not a DataFrame) to eyeball the values.
x_scl

array([[0.02173913, 0.        , 0.3212268 , ..., 0.        , 0.        ,
        1.        ],
       [0.        , 1.        , 0.47914985, ..., 0.        , 1.        ,
        0.        ],
       [0.2173913 , 1.        , 0.45843422, ..., 0.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.56201238, ..., 0.        , 1.        ,
        0.        ],
       [0.06521739, 0.        , 0.26472962, ..., 0.        , 0.        ,
        1.        ],
       [0.93478261, 0.        , 0.35270379, ..., 1.        , 0.        ,
        0.        ]])

In [14]:
from sklearn.linear_model import LinearRegression

In [15]:
lr = LinearRegression()

In [16]:
lr.fit(x_scl,y)

In [17]:
preds = lr.predict(x_scl)

In [18]:
from sklearn.metrics import r2_score

In [19]:
# In-sample R^2 of the linear model (evaluated on the rows it was fitted on).
print('r2 score', r2_score(y_true=y, y_pred=preds))

r2 score 0.7509130345985207


In [20]:
from sklearn.preprocessing import PolynomialFeatures

In [21]:
# Expand the scaled features into polynomial and interaction terms up to
# degree 3 so the linear model can capture non-linear effects.
poly = PolynomialFeatures(degree=3)
poly.fit(x_scl)
x_poly = poly.transform(x_scl)

In [22]:
# Rescale the expanded polynomial features to [0, 1] with a dedicated scaler.
# Reusing `mxc` here would refit it on x_poly and discard the min/max it
# learned from the raw features, so it could no longer transform raw inputs.
poly_scaler = MinMaxScaler()
x_scaled_poly = poly_scaler.fit_transform(x_poly)

In [23]:
lr.fit(x_scaled_poly,y)

In [24]:
preds_poly = lr.predict(x_scaled_poly)

In [25]:
print('r2 score', r2_score(y,preds_poly))

r2 score 0.8575495296011789


In [26]:
# Model evaluation and splitting the data
from sklearn.model_selection import train_test_split

In [27]:
x_train,y_train,x_test,y_test= train_test_split(x,y,test_size=0.2)

In [28]:
lr.fit(x_train, x_test)

In [29]:
train_preds = lr.predict(x_train)

In [30]:
test_preds = lr.predict(y_train)

In [31]:
print('r2 score train', r2_score(x_test,train_preds))
print('r2 score test', r2_score(y_test,test_preds))

r2 score train 0.7436219724472379
r2 score test 0.7775998131325909


In [32]:
x_train,y_train,x_test,y_test= train_test_split(x_scaled_poly,y,test_size=0.2)

In [33]:
lr.fit(x_train, x_test)

In [34]:
train_preds = lr.predict(x_train)
test_preds = lr.predict(y_train)

In [35]:
print('r2 score train', r2_score(x_test,train_preds))
print('r2 score test', r2_score(y_test,test_preds))

r2 score train 0.8601525946769304
r2 score test 0.8165560967609221
