In [1]:
from datetime import datetime
# Capture the moment this notebook was executed (datetime.now() with no
# argument returns a naive local-time timestamp) and show it.
now = datetime.now()
print(now)

2021-01-15 19:30:20.570568


In [2]:
# Author banner: both lines are emitted by one call, separated by a newline.
print("Name : Ramesh Bhutka", "Sap ID:- 53004190003", sep="\n")

Name : Ramesh Bhutka
Sap ID:- 53004190003


## Multiple linear regression

Columns

age: age of primary beneficiary

sex: gender of the insurance contractor (female / male)

bmi: Body mass index, an objective measure of whether body weight is high or low relative to height,
computed as weight divided by height squared (kg / m ^ 2); ideally 18.5 to 24.9

children: Number of children covered by health insurance / Number of dependents

smoker: whether the beneficiary smokes (yes / no)

region: the beneficiary's residential area in the US, northeast, southeast, southwest, northwest.

charges: Individual medical costs billed by health insurance

In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Location of the insurance dataset (CSV hosted on GitHub).
DATA_URL = "https://raw.githubusercontent.com/Ramesh-Bhutka/Multiple-Linear-Regression/main/insurance.csv"
# Read the CSV straight from the URL into a DataFrame.
df = pd.read_csv(DATA_URL)

In [5]:
# Preview the first five rows of the freshly loaded data.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [6]:
# Per-column flag: True when the column contains at least one missing value.
df.isna().any()

age         False
sex         False
bmi         False
children    False
smoker      False
region      False
charges     False
dtype: bool

In [7]:
# Size of the dataset as (rows, columns).
df.shape

(1338, 7)

In [8]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [9]:
# Column dtypes, non-null counts and memory usage; the object-dtype columns
# are the ones that still need to be encoded numerically.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 57.6+ KB


In [10]:
# The 'sex', 'smoker' and 'region' columns hold strings, so they must be converted to numbers before fitting:
# label encoding is used for the two-valued 'sex' and 'smoker' columns, and one-hot encoding for 'region'.

In [11]:
# scikit-learn's preprocessing module provides LabelEncoder.
from sklearn import preprocessing

# A single encoder instance; it is fitted afresh each time fit_transform is called.
label_encoder = preprocessing.LabelEncoder()

# Turn the string labels in 'sex' into integer codes (female -> 0, male -> 1)
# and write them back over the original column.
sex_codes = label_encoder.fit_transform(df['sex'])
df['sex'] = sex_codes

# Show the distinct codes now present in the column.
df['sex'].unique()


array([0, 1])

In [12]:
# Turn the string labels in 'smoker' into integer codes (no -> 0, yes -> 1)
# and write them back over the original column.
smoker_codes = label_encoder.fit_transform(df['smoker'])
df['smoker'] = smoker_codes

# Show the distinct codes now present in the column.
df['smoker'].unique()

array([1, 0])

In [13]:
# Check the result of label encoding: 'sex' and 'smoker' are now integers,
# while 'region' is still a string column.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [14]:
# One-hot encode 'region': one indicator column per region value, each
# named with the 'region_' prefix.
# dtype=int pins the indicators to 0/1 integers; without it pandas >= 2.0
# produces True/False booleans instead of the 0/1 values expected here.
region_dummies = pd.get_dummies(df['region'], prefix='region', dtype=int)

# NOTE(review): all four indicators are kept, so they sum to 1 on every row
# and are collinear with the regression intercept. Predictions are unaffected,
# but the individual region coefficients are not uniquely determined. Passing
# drop_first=True would fix this, but the later column-reordering cell names
# all four region columns and would need to change with it.

# Append the indicators and remove the original string column; drop() returns
# a new frame, so no in-place mutation is needed.
df = pd.concat([df, region_dummies], axis=1).drop(columns=["region"])
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [15]:
# Preview the frame again after 'region' has been one-hot encoded.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


In [16]:
# Reorder the columns so that every feature comes first and the target
# column 'charges' is last.
df = df[[
    'age',
    'sex',
    'bmi',
    'children',
    'smoker',
    'region_northeast',
    'region_northwest',
    'region_southeast',
    'region_southwest',
    'charges',
]]

In [17]:
# Confirm the new column order, with 'charges' as the final column.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest,charges
0,19,0,27.9,0,1,0,0,0,1,16884.924
1,18,1,33.77,1,0,0,0,1,0,1725.5523
2,28,1,33.0,3,0,0,0,1,0,4449.462
3,33,1,22.705,0,0,0,1,0,0,21984.47061
4,32,1,28.88,0,0,0,1,0,0,3866.8552


In [18]:
# Feature matrix: every column except the target.
X = df.drop(columns=['charges'])
# Target vector: the billed medical charges.
y = df['charges']

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [20]:
# Training split size as (rows, feature columns).
X_train.shape

(1070, 9)

In [21]:
# Held-out test split size as (rows, feature columns).
X_test.shape

(268, 9)

In [22]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares linear regression with default settings.
regressor = LinearRegression()

# Fit on the training split only; fit() returns the estimator, so its repr
# is what this cell displays.
regressor.fit(X=X_train, y=y_train)

LinearRegression()

In [23]:
# Fitted intercept (the constant term of the linear model).
print(regressor.intercept_)

-12311.91360565046


In [24]:
# Fitted coefficients, one per feature in the column order of X_train:
# age, sex, bmi, children, smoker, then the four region indicators.
print(regressor.coef_)

[ 2.53700500e+02 -1.54637279e+01  3.35962814e+02  4.36910121e+02
  2.36050173e+04  4.83840068e+02  2.23707336e+02 -4.29438766e+02
 -2.78108638e+02]


In [25]:
# Predict charges for the held-out test features.
y_pred = regressor.predict(X_test)

In [26]:
# Side-by-side table of the true and predicted charges for the test rows
# (the index is the original row label carried by y_test).
# The table gets its own name instead of rebinding `df`: overwriting `df`
# would discard the encoded feature frame, so re-running any earlier cell
# that reads `df` (for example the X / y split) would fail.
comparison = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
comparison

Unnamed: 0,Actual,Predicted
578,9724.53000,11169.927119
610,8547.69130,9486.709085
569,45702.02235,38181.123053
1034,12950.07120,16266.313289
198,9644.25250,6914.648007
...,...,...
1084,15019.76005,14760.230968
726,6664.68595,8277.984346
1132,20709.02034,16149.973370
725,40932.42950,32904.758143


In [27]:
from sklearn import metrics

# Evaluate the predictions against the true charges of the held-out test split.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # reuse the MSE instead of computing it a second time
r2 = metrics.r2_score(y_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
# R^2 is a goodness-of-fit score (1.0 is a perfect fit; higher is better),
# not an error measure, so it is labelled as a score.
print('R Squared Score:', r2)

Mean Absolute Error: 3933.2726494052304
Mean Squared Error: 31827950.22952382
Root Mean Squared Error: 5641.626558850188
R Squared Error          :  0.7999876970680435
