In [1]:
# Import the base Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Load the data
df_insurance_main = pd.read_csv("insurance.csv")
df_insurance_main.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552



# Overview of the data

In [3]:
df_insurance_main.shape

(1338, 7)

In [4]:
# 1338 observations
# 7 Features

In [5]:
df_insurance_main.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [6]:
df_insurance_main.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
df_insurance_main.isnull().sum()*100/len(df_insurance_main)

age         0.0
sex         0.0
bmi         0.0
children    0.0
smoker      0.0
region      0.0
charges     0.0
dtype: float64

In [8]:
df_insurance_main.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [9]:
## Saperate the Target Veribale
df_target = df_insurance_main["charges"]
df_insurance_main.drop("charges",axis=1,inplace=True)

# Encoding & Scalling

## Scalling

In [10]:
# numerical df
df_insurance_main_num = df_insurance_main.select_dtypes(include="number")
df_insurance_main_num.head()

Unnamed: 0,age,bmi,children
0,19,27.9,0
1,18,33.77,1
2,28,33.0,3
3,33,22.705,0
4,32,28.88,0


In [11]:
# Scale the numerical data using Standard SWcaller
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
scalled = ss.fit_transform(df_insurance_main_num)

In [12]:
scalled_df = pd.DataFrame(scalled,columns=df_insurance_main_num.columns)
scalled_df.head()

Unnamed: 0,age,bmi,children
0,-1.438764,-0.45332,-0.908614
1,-1.509965,0.509621,-0.078767
2,-0.797954,0.383307,1.580926
3,-0.441948,-1.305531,-0.908614
4,-0.513149,-0.292556,-0.908614


## Encoding

In [13]:
# Saperate the Cat. Veriabls
df_insurance_main_cat = df_insurance_main.select_dtypes(include="object")
df_insurance_main_cat.head()

Unnamed: 0,sex,smoker,region
0,female,yes,southwest
1,male,no,southeast
2,male,no,southeast
3,male,no,northwest
4,male,no,northwest


In [14]:
# Encode using Dummy Encoding
encoded = pd.get_dummies(df_insurance_main_cat)
encoded.head()

Unnamed: 0,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,1,0,0,1,0,0,0,1
1,0,1,1,0,0,0,1,0
2,0,1,1,0,0,0,1,0
3,0,1,1,0,0,1,0,0
4,0,1,1,0,0,1,0,0


In [15]:
#### Concate the Scalled and encoded Dataframe
X = pd.concat([scalled_df,encoded],axis=1)
X.head()

Unnamed: 0,age,bmi,children,sex_female,sex_male,smoker_no,smoker_yes,region_northeast,region_northwest,region_southeast,region_southwest
0,-1.438764,-0.45332,-0.908614,1,0,0,1,0,0,0,1
1,-1.509965,0.509621,-0.078767,0,1,1,0,0,0,1,0
2,-0.797954,0.383307,1.580926,0,1,1,0,0,0,1,0
3,-0.441948,-1.305531,-0.908614,0,1,1,0,0,1,0,0
4,-0.513149,-0.292556,-0.908614,0,1,1,0,0,1,0,0


In [16]:
## Splitting the data for Training and Testing
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test = train_test_split(X,df_target,random_state= 1, test_size = 0.2)


# Traing Data



print("X_train---indepandant veriabls for traing ",X_train.shape)
print("y_train---dependant veriabls for traing ",Y_train.shape)

print("__________________________________________")

# Testing Data
print("X_test---indepandant veriabls for testing ",X_test.shape)
print("y_test---dependant veriabls for testing ",Y_test.shape)


X_train---indepandant veriabls for traing  (1070, 11)
y_train---dependant veriabls for traing  (1070,)
__________________________________________
X_test---indepandant veriabls for testing  (268, 11)
y_test---dependant veriabls for testing  (268,)


# Base model

In [17]:
# OLS (Ordinory Least Squired Method)
import statsmodels.api as sm

# Build the model
base_model_df_insurance_main = sm.OLS(Y_train,X_train).fit()

# Print the summary
base_model_df_insurance_main.summary()

0,1,2,3
Dep. Variable:,charges,R-squared:,0.748
Model:,OLS,Adj. R-squared:,0.746
Method:,Least Squares,F-statistic:,393.2
Date:,"Wed, 07 Sep 2022",Prob (F-statistic):,4.76e-311
Time:,09:29:39,Log-Likelihood:,-10838.0
No. Observations:,1070,AIC:,21690.0
Df Residuals:,1061,BIC:,21740.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
age,3616.3756,188.872,19.147,0.000,3245.770,3986.981
bmi,1960.5774,194.806,10.064,0.000,1578.328,2342.826
children,491.7307,186.451,2.637,0.008,125.875,857.586
sex_female,8238.8444,213.372,38.613,0.000,7820.165,8657.523
sex_male,7996.6914,203.714,39.255,0.000,7596.963,8396.419
smoker_no,-3775.4751,191.749,-19.690,0.000,-4151.725,-3399.225
smoker_yes,2.001e+04,295.679,67.678,0.000,1.94e+04,2.06e+04
region_northeast,4643.2603,332.008,13.985,0.000,3991.793,5294.728
region_northwest,4247.1638,334.245,12.707,0.000,3591.307,4903.020

0,1,2,3
Omnibus:,241.621,Durbin-Watson:,1.967
Prob(Omnibus):,0.0,Jarque-Bera (JB):,566.273
Skew:,1.217,Prob(JB):,1.08e-123
Kurtosis:,5.604,Cond. No.,7970000000000000.0


In [18]:
from sklearn.linear_model import SGDRegressor, LinearRegression

lr = LinearRegression()
sgd = SGDRegressor()
model1 = lr.fit(X_train, Y_train)
predected=model1.predict(X_test)

In [19]:
predected

array([ 4383.68089988, 12885.03892192, 12589.21653212, 13286.22919217,
         544.72832757, 32117.58400779, 12919.04237221, 12318.62183013,
        3784.29145555, 29468.45725408, 11002.8139431 , 17539.69473777,
        8681.35471964,  8349.04325528,  3130.12725504, 10445.83896118,
        3863.74357865,  6944.62510786, 15009.63121084, 14441.59911874,
       12543.65768867, 32958.72553095,  9072.63608136,  8986.85860053,
        3022.85773294,  8164.97136102,  9556.07558002, 10743.20363927,
        7694.01743692,  4373.43771674, 14140.93557984,  5811.78545062,
       34631.91316718, 27009.11191231, 33348.14098668,  9532.96786929,
       30421.65017927, 26648.91186842, 15157.78333287, 33895.76121465,
        6303.38552088, 14059.15156303, 10713.4467824 , 15089.36171493,
        4187.95334069, 13106.4297513 ,  4336.19603407, 28607.05556216,
        7243.57117377, 14269.4643165 , 13282.36924936, 12329.61280721,
        1851.87215658,  8876.2837892 , 26089.18341811, 10125.8221046 ,
      