In [1]:
import pandas as pd 
import numpy as np 

# data visualization#
import matplotlib.pyplot as plt 
import seaborn as sns

# models
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.neighbors import KNeighborsRegressor

# model evalution matrices
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV

import warnings 
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv(r"D:\vijay\medical_insurance (1).csv")
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [3]:
from sklearn.preprocessing import LabelEncoder

In [4]:
L1=LabelEncoder()
df["sex"]=L1.fit_transform(df['sex'])
df['smoker']=L1.fit_transform(df['smoker'])


In [5]:
sex_value = {'female':0,'male':1}
smoker_value = {'yes':1,'no':0}

In [6]:
df = pd.get_dummies(df,columns= ['region'])
df

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,16884.92400,0,0,0,1
1,18,1,33.770,1,0,1725.55230,0,0,1,0
2,28,1,33.000,3,0,4449.46200,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.880,0,0,3866.85520,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,10600.54830,0,1,0,0
1334,18,0,31.920,0,0,2205.98080,1,0,0,0
1335,18,0,36.850,0,0,1629.83350,0,0,1,0
1336,21,0,25.800,0,0,2007.94500,0,0,0,1


# Spliting Data

In [7]:
x = df.drop(['charges'],axis = 1)
y = df['charges']

In [14]:
x

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.900,0,1,0,0,0,1
1,18,1,33.770,1,0,0,0,1,0
2,28,1,33.000,3,0,0,0,1,0
3,33,1,22.705,0,0,0,1,0,0
4,32,1,28.880,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,0,1,0,0
1334,18,0,31.920,0,0,1,0,0,0
1335,18,0,36.850,0,0,0,0,1,0
1336,21,0,25.800,0,0,0,0,0,1


In [16]:
y

0       16884.92400
1        1725.55230
2        4449.46200
3       21984.47061
4        3866.85520
           ...     
1333    10600.54830
1334     2205.98080
1335     1629.83350
1336     2007.94500
1337    29141.36030
Name: charges, Length: 1338, dtype: float64

In [24]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=100)

# Model Training 

In [25]:
linear_reg_model = LinearRegression()
linear_reg_model.fit(x_train,y_train)

In [26]:
linear_reg_model.score(x_test,y_test)

0.7946968492150814

In [27]:
linear_reg_model.score(x_train,y_train)

0.7380636904176421

In [28]:
linear_reg_model.intercept_

-11958.930496823798

In [29]:
linear_reg_model.coef_

array([ 2.63750608e+02,  1.26152749e+01,  3.11571755e+02,  4.73045625e+02,
        2.34931453e+04,  6.09598800e+02,  2.05975445e+02, -3.07731627e+02,
       -5.07842618e+02])

In [30]:
linear_reg_model.predict([[-1.43876426, -1.0105187 , -0.45332   , -0.90861367,  1.97058663,
       -0.56526686, -0.56641788, -0.61132367,  1.76548098]])

array([32203.35023372])

In [18]:
from sklearn.preprocessing import StandardScaler

In [19]:
scaler=StandardScaler()

In [20]:
x_data=scaler.fit_transform(x)

In [21]:
x_data

array([[-1.43876426, -1.0105187 , -0.45332   , ..., -0.56641788,
        -0.61132367,  1.76548098],
       [-1.50996545,  0.98959079,  0.5096211 , ..., -0.56641788,
         1.63579466, -0.56641788],
       [-0.79795355,  0.98959079,  0.38330685, ..., -0.56641788,
         1.63579466, -0.56641788],
       ...,
       [-1.50996545, -1.0105187 ,  1.0148781 , ..., -0.56641788,
         1.63579466, -0.56641788],
       [-1.29636188, -1.0105187 , -0.79781341, ..., -0.56641788,
        -0.61132367,  1.76548098],
       [ 1.55168573, -1.0105187 , -0.26138796, ...,  1.76548098,
        -0.61132367, -0.56641788]])

In [22]:
x_data[0]

array([-1.43876426, -1.0105187 , -0.45332   , -0.90861367,  1.97058663,
       -0.56526686, -0.56641788, -0.61132367,  1.76548098])

In [None]:
x_data[0]