# **MULTIPLE LINEAR REGRESSION**

## Medical Cost Prediction for Insurance in the United States

Importing Libraries

In [52]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


Importing Dataset

In [53]:
# Read the insurance CSV into a DataFrame.
# NOTE(review): the path is absolute and machine-specific (looks like a Colab
# workspace path — confirm); the notebook will not run elsewhere unchanged.
csv_path = '/content/datasets_13720_18513_insurance.csv'
data = pd.read_csv(csv_path)

In [54]:
# Plain-text preview of the loaded DataFrame (pandas truncates the middle rows
# and appends the overall shape).
print(data)

      age     sex     bmi  children smoker     region      charges
0      19  female  27.900         0    yes  southwest  16884.92400
1      18    male  33.770         1     no  southeast   1725.55230
2      28    male  33.000         3     no  southeast   4449.46200
3      33    male  22.705         0     no  northwest  21984.47061
4      32    male  28.880         0     no  northwest   3866.85520
...   ...     ...     ...       ...    ...        ...          ...
1333   50    male  30.970         3     no  northwest  10600.54830
1334   18  female  31.920         0     no  northeast   2205.98080
1335   18  female  36.850         0     no  southeast   1629.83350
1336   21  female  25.800         0     no  southwest   2007.94500
1337   61  female  29.070         0    yes  northwest  29141.36030

[1338 rows x 7 columns]


In [55]:
# Split the frame positionally: every column except the last becomes the
# feature matrix, the last column (charges) becomes the target vector.
# Both are materialised as NumPy arrays.
X = data.iloc[:, :-1].to_numpy()
Y = data.iloc[:, -1].to_numpy()

In [56]:
# Inspect the raw feature matrix; at this point it still mixes numbers and
# string categories (sex, smoker, region).
print(X)

[[19 'female' 27.9 0 'yes' 'southwest']
 [18 'male' 33.77 1 'no' 'southeast']
 [28 'male' 33.0 3 'no' 'southeast']
 ...
 [18 'female' 36.85 0 'no' 'southeast']
 [21 'female' 25.8 0 'no' 'southwest']
 [61 'female' 29.07 0 'yes' 'northwest']]


In [57]:
# Inspect the target vector (values of the `charges` column).
print(Y)

[16884.92  1725.55  4449.46 ...  1629.83  2007.94 29141.36]


Cleaning Data

In [58]:
# Despite its name, `null_columns` is a single boolean: True when at least one
# value anywhere in the DataFrame is missing, False otherwise.
null_columns = data.isna().to_numpy().any()
print(null_columns)

False


One-hot Encoding

In [59]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Positions of the categorical columns inside X: sex (1), smoker (4), region (5).
categorical_columns = [1, 4, 5]

# One-hot encode the categorical columns; the remaining columns are passed
# through unchanged.
# NOTE(review): every category gets its own indicator column, so each group of
# indicators sums to 1 and is collinear with the intercept fitted later
# (dummy-variable trap). `OneHotEncoder(drop='first')` would avoid this, but it
# shifts the column positions that the scaling cell's `8:` slice relies on, so
# both cells must be changed together.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), categorical_columns)],
    remainder='passthrough',
)

# X is overwritten with its encoded form, so this cell is not safe to re-run
# without first re-running the cell that builds X from `data`.
X = np.array(ct.fit_transform(X))

In [60]:
# Wrap the encoded array in a DataFrame purely for inspection. In the printed
# output, columns 0-7 are the one-hot indicators and columns 8-10 are the
# passed-through age, bmi and children values.
test_data_frame = pd.DataFrame(X)
print(test_data_frame)

      0  1  2  3  4  5  6  7   8       9 10
0     1  0  0  1  0  0  0  1  19    27.9  0
1     0  1  1  0  0  0  1  0  18   33.77  1
2     0  1  1  0  0  0  1  0  28      33  3
3     0  1  1  0  0  1  0  0  33  22.705  0
4     0  1  1  0  0  1  0  0  32   28.88  0
...  .. .. .. .. .. .. .. ..  ..     ... ..
1333  0  1  1  0  0  1  0  0  50   30.97  3
1334  1  0  1  0  1  0  0  0  18   31.92  0
1335  1  0  1  0  0  0  1  0  18   36.85  0
1336  1  0  1  0  0  0  0  1  21    25.8  0
1337  1  0  0  1  0  1  0  0  61   29.07  0

[1338 rows x 11 columns]


Splitting Training and Testing Data

In [61]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

Feature Scaling - Standardizing

In [63]:
from sklearn.preprocessing import StandardScaler

# The encoding step uses remainder='passthrough', which places the untouched
# numeric columns (age, bmi, children) after the one-hot indicator columns.
# Addressing them from the end selects the same columns as the previous
# hard-coded `8:` slice, but no longer depends on how many indicator columns
# the encoder happens to emit.
numeric_cols = slice(-3, None)

sc = StandardScaler()
# Fit the scaler on the training split only, then reuse its mean/std for the
# test split so no test-set statistics leak into training.
X_train[:, numeric_cols] = sc.fit_transform(X_train[:, numeric_cols])
X_test[:, numeric_cols] = sc.transform(X_test[:, numeric_cols])

Training the Model

In [64]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression with default settings on the training split.
# The fit call is left as the cell's final expression so the notebook still
# displays the fitted estimator.
mlr = LinearRegression()
mlr.fit(X=X_train, y=Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Predicting results from the trained model

In [65]:
# Predict charges for the held-out rows.
Y_pred = mlr.predict(X_test)

# Print floats with two decimals. This changes NumPy's global print options,
# so it also affects every later cell.
np.set_printoptions(precision=2)

# Side-by-side comparison: column 0 = predicted charge, column 1 = actual charge.
print(np.column_stack((Y_pred, Y_test)))

[[ 4320.    1646.43]
 [12848.   11353.23]
 [12640.    8798.59]
 [13568.   10381.48]
 [  400.    2103.08]
 [31712.   38746.36]
 [13232.    9304.7 ]
 [12416.   11658.12]
 [ 3824.    3070.81]
 [29520.   19539.24]
 [11328.   12629.9 ]
 [17600.   11538.42]
 [ 8944.    6338.08]
 [ 8304.    7050.64]
 [ 3456.    1137.47]
 [10368.    8968.33]
 [ 3728.   21984.47]
 [ 6800.    6414.18]
 [15392.   28287.9 ]
 [14496.   13462.52]
 [12672.    9722.77]
 [32992.   40932.43]
 [ 9008.    8026.67]
 [ 8960.    8444.47]
 [ 3088.    2203.47]
 [ 8032.    6664.69]
 [ 9520.    8606.22]
 [11120.    8283.68]
 [ 7632.    5375.04]
 [ 4416.    3645.09]
 [14160.   11674.13]
 [ 5680.   11737.85]
 [34336.   24873.38]
 [26688.   33750.29]
 [33296.   24180.93]
 [ 9424.    9863.47]
 [30176.   36837.47]
 [26176.   17942.11]
 [15584.   11856.41]
 [33552.   39725.52]
 [ 6576.    4349.46]
 [14032.   11743.93]
 [10928.   19749.38]
 [15120.   12347.17]
 [ 4048.    4931.65]
 [13216.   30260.  ]
 [ 4640.   27724.29]
 [28704.   34

Evaluating the Model

In [76]:
# NOTE(review): `metrics` is not used in this cell — confirm whether a later
# cell needs it before removing the import.
from sklearn import metrics
# LinearRegression.score returns the coefficient of determination (R^2) of the
# predictions on the given data; here it is evaluated on the held-out test split.
mlr.score(X_test, Y_test)

0.7622133026650104