In [1]:
import numpy as np
import pandas as pd

In [2]:
# Taking data into pandas dataframe.
data = pd.read_csv("insurance.csv")

In [3]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,female,27.9,0,yes,southwest,16884.92
1,18,male,33.8,1,no,southeast,1725.55
2,28,male,33.0,3,no,southeast,4449.46
3,33,male,22.7,0,no,northwest,21984.47
4,32,male,28.9,0,no,northwest,3866.86
...,...,...,...,...,...,...,...
1333,50,male,31.0,3,no,northwest,10600.55
1334,18,female,31.9,0,no,northeast,2205.98
1335,18,female,36.9,0,no,southeast,1629.83
1336,21,female,25.8,0,no,southwest,2007.95


In [4]:
# Replacing "sex" column values from string to integer.
data['sex'].replace(to_replace=['male','female'],value=[0,1],inplace=True)

In [5]:
# Replacing "smoker" column values from string to integer.
data['smoker'].replace(to_replace=['no','yes'],value=[0,1],inplace=True)

In [6]:
# Replacing "region" column values from string to integer.
data['region'].replace(to_replace=data['region'].unique(),value=list(range(len(data['region'].unique()))),
                      inplace=True)

In [7]:
data

Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,19,1,27.9,0,1,0,16884.92
1,18,0,33.8,1,0,1,1725.55
2,28,0,33.0,3,0,1,4449.46
3,33,0,22.7,0,0,2,21984.47
4,32,0,28.9,0,0,2,3866.86
...,...,...,...,...,...,...,...
1333,50,0,31.0,3,0,2,10600.55
1334,18,1,31.9,0,0,3,2205.98
1335,18,1,36.9,0,0,1,1629.83
1336,21,1,25.8,0,0,0,2007.95


In [8]:
# Labeling Values of "region" column to One Hot Encoding Vector.
region_variable_ohe = np.eye(len(data['region'].unique()),len(data['region'].unique()))[data['region']]

In [9]:
processed_data = data.drop(['region'],axis=1)

In [10]:
# Region dataframe after One Hot Vector Encoding.
region_variable_df = pd.DataFrame(data=region_variable_ohe,columns=['Region 0','Region 1','Region 2','Region 3'])

In [11]:
# Joining the processed data and region_variable_df.
processed_data = pd.concat([processed_data,region_variable_df],axis=1)

In [12]:
# Taking labels of training data to a variable labels.
labels = processed_data['expenses']

In [13]:
processed_data.drop(['expenses'],axis=1,inplace=True)

In [14]:
# Adding expense(label) column in the end of dataframe.
processed_data['expenses'] = labels

In [15]:
columns_name = processed_data.columns

In [16]:
# Taking labels of training data to a variable Y.
Y = np.array(processed_data['expenses']).reshape(processed_data.shape[0],1)

In [17]:
X_transpose = np.array(processed_data.drop(['expenses'],axis=1))

In [18]:
# Normalizing the Data.
X_transpose = (X_transpose - np.mean(X_transpose,axis=0))/np.std(X_transpose,axis=0)

In [19]:
X_transpose_df = pd.DataFrame(data=X_transpose,columns=columns_name[0:9])

In [20]:
X_transpose_df

Unnamed: 0,age,sex,bmi,children,smoker,Region 0,Region 1,Region 2,Region 3
0,-1.438764,1.010519,-0.453646,-0.908614,1.970587,1.765481,-0.611324,-0.566418,-0.565267
1,-1.509965,-0.989591,0.514186,-0.078767,-0.507463,-0.566418,1.635795,-0.566418,-0.565267
2,-0.797954,-0.989591,0.382954,1.580926,-0.507463,-0.566418,1.635795,-0.566418,-0.565267
3,-0.441948,-0.989591,-1.306650,-0.908614,-0.507463,-0.566418,-0.611324,1.765481,-0.565267
4,-0.513149,-0.989591,-0.289606,-0.908614,-0.507463,-0.566418,-0.611324,1.765481,-0.565267
...,...,...,...,...,...,...,...,...,...
1333,0.768473,-0.989591,0.054876,1.580926,-0.507463,-0.566418,-0.611324,1.765481,-0.565267
1334,-1.509965,1.010519,0.202511,-0.908614,-0.507463,-0.566418,-0.611324,-0.566418,1.769076
1335,-1.509965,1.010519,1.022707,-0.908614,-0.507463,-0.566418,1.635795,-0.566418,-0.565267
1336,-1.296362,1.010519,-0.798128,-0.908614,-0.507463,1.765481,-0.611324,-0.566418,-0.565267


In [21]:
from sklearn.preprocessing import PolynomialFeatures

In [22]:
feature_engineering_obj = PolynomialFeatures()

In [23]:
# Incresing the number of features we have with polynomail degree 2.
X_transpose_quadratic = feature_engineering_obj.fit_transform(X=X_transpose)

In [24]:
X_transpose_quadratic_df = pd.DataFrame(data = X_transpose_quadratic)

In [25]:
X_transpose_quadratic_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,54
0,1.0,-1.438764,1.010519,-0.453646,-0.908614,1.970587,1.765481,-0.611324,-0.566418,-0.565267,...,3.116923,-1.079280,-1.000000,-0.997968,0.373717,0.346265,0.345561,0.320829,0.320177,0.319527
1,1.0,-1.509965,-0.989591,0.514186,-0.078767,-0.507463,-0.566418,1.635795,-0.566418,-0.565267,...,0.320829,-0.926543,0.320829,0.320177,2.675824,-0.926543,-0.924661,0.320829,0.320177,0.319527
2,1.0,-0.797954,-0.989591,0.382954,1.580926,-0.507463,-0.566418,1.635795,-0.566418,-0.565267,...,0.320829,-0.926543,0.320829,0.320177,2.675824,-0.926543,-0.924661,0.320829,0.320177,0.319527
3,1.0,-0.441948,-0.989591,-1.306650,-0.908614,-0.507463,-0.566418,-0.611324,1.765481,-0.565267,...,0.320829,0.346265,-1.000000,0.320177,0.373717,-1.079280,0.345561,3.116923,-0.997968,0.319527
4,1.0,-0.513149,-0.989591,-0.289606,-0.908614,-0.507463,-0.566418,-0.611324,1.765481,-0.565267,...,0.320829,0.346265,-1.000000,0.320177,0.373717,-1.079280,0.345561,3.116923,-0.997968,0.319527
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1333,1.0,0.768473,-0.989591,0.054876,1.580926,-0.507463,-0.566418,-0.611324,1.765481,-0.565267,...,0.320829,0.346265,-1.000000,0.320177,0.373717,-1.079280,0.345561,3.116923,-0.997968,0.319527
1334,1.0,-1.509965,1.010519,0.202511,-0.908614,-0.507463,-0.566418,-0.611324,-0.566418,1.769076,...,0.320829,0.346265,0.320829,-1.002036,0.373717,0.346265,-1.081478,0.320829,-1.002036,3.129630
1335,1.0,-1.509965,1.010519,1.022707,-0.908614,-0.507463,-0.566418,1.635795,-0.566418,-0.565267,...,0.320829,-0.926543,0.320829,0.320177,2.675824,-0.926543,-0.924661,0.320829,0.320177,0.319527
1336,1.0,-1.296362,1.010519,-0.798128,-0.908614,-0.507463,1.765481,-0.611324,-0.566418,-0.565267,...,3.116923,-1.079280,-1.000000,-0.997968,0.373717,0.346265,0.345561,0.320829,0.320177,0.319527


In [26]:
X_transpose_quadratic = X_transpose_quadratic[:,1:]

In [27]:
X_transpose_quadratic.shape

(1338, 54)

In [28]:
# Diving data into training, cross validation and testing data.
training_data = X_transpose_quadratic[0:int(0.7*X_transpose_quadratic.shape[0])]
Cv_data = X_transpose_quadratic[int(0.7*X_transpose_quadratic.shape[0]):int(0.9*X_transpose_quadratic.shape[0])]
test_data = X_transpose_quadratic[int(0.9*X_transpose_quadratic.shape[0]):]

In [29]:
# Normalizing training labels.
Y_train = (Y[0:int(0.7*X_transpose_quadratic.shape[0])] - np.mean(Y[0:int(0.7*X_transpose_quadratic.shape[0])]))/np.std(Y[0:int(0.7*X_transpose_quadratic.shape[0])])

In [30]:
N_train = training_data.shape[0]
m = training_data.shape[1]

In [31]:
from sklearn.linear_model import LinearRegression

In [32]:
lr_obj = LinearRegression()

In [33]:
# Fit data on Linear Regression Model.
lr_obj.fit(X=training_data,y=Y_train)

LinearRegression()

In [34]:
# Trained Parameters.
theta_trained = lr_obj.coef_

In [35]:
theta_trained

array([[ 3.00154283e-01, -2.48309019e+10,  1.64520397e-01,
         8.16945639e-02,  2.11860822e+10,  7.65281892e+11,
        -7.52382893e+11,  5.30014018e+11, -1.74953664e+11,
         6.36901855e-02, -9.83428955e-03, -6.88934326e-03,
        -1.34124756e-02, -1.42860413e-02,  8.68660039e+11,
         9.01433327e+11,  8.68660039e+11,  8.67750600e+11,
         1.18649743e+12,  1.27258301e-02,  1.10168457e-02,
        -2.05993652e-03,  8.40414606e+11,  8.72122235e+11,
         8.40414606e+11,  8.39534739e+11, -1.24206543e-02,
        -5.82885742e-03,  3.13201904e-01, -1.61285734e+12,
        -1.67370812e+12, -1.61285734e+12, -1.61116877e+12,
        -1.67083740e-02, -3.38745117e-03, -3.84654033e+10,
        -3.99166473e+10, -3.84654033e+10, -3.84251321e+10,
        -1.44800365e+10,  6.96420959e+11,  7.22695916e+11,
         6.96420959e+11,  6.95691845e+11, -7.62367558e+11,
        -2.64297023e+11,  7.55911026e+10, -3.31018119e+11,
         3.81998351e+11, -1.83445322e+11, -4.78314261e+1

In [36]:
theta_trained = theta_trained.reshape(theta_trained.shape[1],1)

In [37]:
theta_trained.shape

(54, 1)

In [38]:
theta0_trained = lr_obj.intercept_

In [39]:
# Function for calculating predicted answers.
def predicted_answer(theta0,theta,X_transpose):
    
    return theta0 + np.matmul(X_transpose,theta)

In [40]:
# Function for Calculating Mean Squared Error(Without Regularization).
def compute_mse(labels,predicted_labels):
    E = (labels - predicted_labels)
    return (1/labels.shape[0])*np.matmul(E.T,E)

In [41]:
# Function for calculating Root Mean Squared Error.
def rmse(gt_labels,trained_theta0,trained_theta,X_transpose):
    
    predicted_labels = predicted_answer(trained_theta0,trained_theta,X_transpose)
    
    return np.sqrt(compute_mse(gt_labels,predicted_labels))

In [42]:
train_rmse = rmse(Y_train,theta0_trained,theta_trained,training_data)

In [43]:
train_rmse[0][0] # RMSE on Training Data.

0.3714379800850182

In [44]:
# Normalizing Cross Validation labels.
Y_cv = (Y[int(0.7*X_transpose_quadratic.shape[0]):int(0.9*X_transpose_quadratic.shape[0])] - np.mean(Y[int(0.7*X_transpose_quadratic.shape[0]):int(0.9*X_transpose_quadratic.shape[0])]))/np.std(Y[int(0.7*X_transpose_quadratic.shape[0]):int(0.9*X_transpose_quadratic.shape[0])])

In [45]:
Cv_rmse = rmse(Y_cv,theta0_trained,theta_trained,Cv_data)

In [46]:
Cv_rmse[0][0] # RMSE on Cross Validation Data.

0.4472421965747066

In [47]:
# Normalizing Testing labels.
Y_test = (Y[int(0.9*X_transpose_quadratic.shape[0]):] - np.mean(Y[int(0.9*X_transpose_quadratic.shape[0]):]))/np.std(Y[int(0.9*X_transpose_quadratic.shape[0]):])

In [48]:
test_rmse = rmse(Y_test,theta0_trained,theta_trained,test_data)

In [49]:
test_rmse[0][0] # RMSE on Testing Data.

0.4175565632687205