Import necessary dependencies

In [None]:
import numpy as np
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

Data analysis

In [None]:
# Load the insurance dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(r"Datasets/Medical charges/insurance.csv")

In [None]:
# Preview the first rows of the raw data.
df.head()

In [None]:
# Summary statistics of the raw data.
df.describe()

In [None]:
# Column dtypes and non-null counts before any conversion.
df.info()

Convert text values to numeric codes

In [None]:
# Label-encode the three text columns in place.
# Each column gets its own fitted encoder (kept in `encoders`) so that any of them
# can still be inverse-transformed later; refitting one shared encoder would only
# retain the classes of the last column.
# `label_maps[column]` maps integer code -> original label for that column.
encoders = {}
label_maps = {}
for column in ("sex", "smoker", "region"):
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    encoders[column] = encoder
    label_maps[column] = dict(enumerate(encoder.classes_))

# Names kept for backward compatibility with the rest of the notebook.
# `labelEncoder` ends up fitted on "region", as it did before.
labelEncoder = encoders["region"]
# map for sex
sex = label_maps["sex"]
# map for smoker
smoker = label_maps["smoker"]
# map for region
region = label_maps["region"]

In [None]:
# Preview the frame after label-encoding sex, smoker and region.
df.head()

In [None]:
# Re-check dtypes now that sex, smoker and region hold integer codes.
df.info()

In [None]:
# Downcast the two small-integer columns to 32-bit ints to save memory.
for column in ("children", "age"):
    df[column] = df[column].astype(np.int32)

In [None]:
# Confirm that children and age are now int32.
df.info()

In [None]:
# Pairwise scatter-matrix of df's columns, drawn on a 12x8 inch figure.
scatter_matrix(df, figsize=(12,8))

In [None]:
# Charges against region. The x values are label-encoded codes, so the ticks are
# relabelled with the original region names from the `region` code->label map.
fig, ax = plt.subplots()
ax.scatter(df["region"], df["charges"])
ax.set_xticks(list(region))
ax.set_xticklabels(list(region.values()))
ax.set(xlabel="region", ylabel="charges", title="Charges by region");

In [None]:
# Charges against number of children, with labelled axes so the figure stands alone.
fig, ax = plt.subplots()
ax.scatter(df["children"], df["charges"])
ax.set(xlabel="children", ylabel="charges", title="Charges by number of children");

In [None]:
# Charges against sex. The x values are label-encoded codes, so the ticks are
# relabelled with the original labels from the `sex` code->label map.
fig, ax = plt.subplots()
ax.scatter(df["sex"], df["charges"])
ax.set_xticks(list(sex))
ax.set_xticklabels(list(sex.values()))
ax.set(xlabel="sex", ylabel="charges", title="Charges by sex");

In [None]:
# Charges against the label-encoded smoker flag.
# The `smoker` name is deliberately not used for tick labels here: a later cell
# rebinds it to a DataFrame, which would break this cell when it is re-run.
fig, ax = plt.subplots()
ax.scatter(df["smoker"], df["charges"])
ax.set_xticks([0, 1])
ax.set(xlabel="smoker (label-encoded)", ylabel="charges", title="Charges by smoking status");

As observed above, there is little or no obvious relationship between region, sex, or number of children and the insurance charges. However, there does appear to be a relationship between smoking status and insurance charges. We can check this by comparing the median and mean insurance charges of the smoking and non-smoking groups.

In [None]:
# Rows whose encoded smoker flag is 1.
# NOTE(review): this rebinds `smoker`, which the encoding cell used for the smoker
# code->label map; that map is no longer reachable under this name afterwards.
smoker = df.loc[df["smoker"] == 1]
smoker.info()

In [None]:
# Rows whose encoded smoker flag is 0.
nonSmoker = df.loc[df["smoker"] == 0]
# Summarise a seeded 26% sample of those rows; the sample itself is not stored.
nonSmoker.sample(frac=0.26,random_state=42).info()

Because the non-smoking group is larger than the smoking group, we draw a sample from the non-smoking group that is roughly the same size as the smoking group and use it for the comparison.

In [None]:
# Median charges of the smoker subset.
smoker.charges.median()

In [None]:
# Median charges of the seeded 26% non-smoker sample (same frac and seed as the
# sample summarised with .info() earlier, so the same rows are drawn).
nonSmoker.sample(frac=0.26,random_state=42).charges.median()

In [None]:
# Mean charges of the smoker subset.
smoker.charges.mean()

In [None]:
# Mean charges of the seeded 26% non-smoker sample. The same sample is used as for
# the median comparison, so both statistics describe the same group of rows.
nonSmoker.sample(frac=0.26,random_state=42).charges.mean()

In general, both the mean and the median insurance charges of the smoking group are much higher than those of the non-smoking group. Hence we include the smoker attribute in our Linear Regression model.

Now we drop the unnecessary columns from the dataframe

In [None]:
# Target vector. This must run before the cell that drops "charges" from df.
y = np.array(df["charges"])

In [None]:
# Remove the target and the features judged uninformative, leaving only the model
# inputs in df. A single call with errors="ignore" makes the cell safe to re-run:
# columns that were already dropped are skipped instead of raising KeyError.
df.drop(
    columns=["charges", "region", "sex", "children"],
    inplace=True,
    errors="ignore",
)

In [None]:
# Feature matrix built from whatever columns remain in df after the drops above.
# NOTE(review): presumably age, bmi and smoker — confirm with df.columns.
X = np.array(df)

In [None]:
# check shape of X (rows, feature columns)
X.shape

In [None]:
# check shape of y; its length should match the first dimension of X
y.shape

Train the model

In [None]:
# Baseline model: scikit-learn's LinearRegression with default settings.
model1 = LinearRegression()

In [None]:
# Hold out 10% of the rows for testing. The fixed seed keeps the split, and therefore
# every score computed from it, reproducible across kernel restarts.
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.9,random_state=42)

In [None]:
# Fit the baseline on the training split.
model1.fit(X_train,y_train)

In [None]:
# Predict on the held-out split. `y_pred` is reassigned by each later model, so the
# scoring cell below must run before the next predict cell.
y_pred = model1.predict(X_test)

In [None]:
# Score model 1 on the held-out split; RMSE is derived from the MSE already computed.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(f"Model 1 Score\nMSE:{MSE}\nRMSE:{RMSE}\nMAE:{MAE}")

Testing the performance of my own implementation of Linear Regression

Batch Gradient Descent

In [None]:
from LinearRegression import LinearRegression as LinearReg

In [None]:
# Custom implementation, imported from the local LinearRegression module as LinearReg.
model2 = LinearReg()

In [None]:
# Train for 20000 epochs at a learning rate of 0.00075.
# NOTE(review): no optimizer argument is passed, so this presumably uses LinearReg's
# default (batch gradient descent) — confirm in the LinearRegression module.
model2.fit(X_train,y_train,epoch=20000,learning_rate=0.00075)

In [None]:
# Predict on the held-out split with model 2 (overwrites the previous `y_pred`).
y_pred = model2.predict(X_test)

In [None]:
# Score model 2 on the held-out split; RMSE is derived from the MSE already computed.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(f"Model 2 Score\nMSE:{MSE}\nRMSE:{RMSE}\nMAE:{MAE}")

Stochastic Gradient Descent

In [None]:
# Fresh instance of the custom implementation for the stochastic run.
model3 = LinearReg()

In [None]:
# Train for 500 epochs at a learning rate of 0.00035 with optimizer="SGD".
# NOTE(review): no batch_size is passed, so this presumably updates on one sample at
# a time — confirm against LinearReg.fit. No seed is set here, so results may vary
# between runs if the optimizer shuffles or samples randomly.
model3.fit(X_train,y_train,epoch=500,learning_rate=0.00035,optimizer="SGD")

In [None]:
# Predict on the held-out split with model 3 (overwrites the previous `y_pred`).
y_pred = model3.predict(X_test)

In [None]:
# Score model 3 on the held-out split; RMSE is derived from the MSE already computed.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(f"Model 3 Score\nMSE:{MSE}\nRMSE:{RMSE}\nMAE:{MAE}")

Mini-batch Gradient Descent

In [None]:
# Fresh instance of the custom implementation for the mini-batch run.
model4 = LinearReg()

In [None]:
# Train for 7500 epochs at a learning rate of 0.00035 with optimizer="SGD" and
# batch_size=64.
# NOTE(review): presumably batch_size switches the SGD optimizer to mini-batch
# updates of 64 rows — confirm against LinearReg.fit.
model4.fit(X_train,y_train,epoch=7500,learning_rate=0.00035,optimizer="SGD",batch_size=64)

In [None]:
# Predict on the held-out split with model 4 (overwrites the previous `y_pred`).
y_pred = model4.predict(X_test)

In [None]:
# Score model 4 on the held-out split; RMSE is derived from the MSE already computed.
MSE = mean_squared_error(y_test, y_pred)
MAE = mean_absolute_error(y_test, y_pred)
RMSE = np.sqrt(MSE)
print(f"Model 4 Score\nMSE:{MSE}\nRMSE:{RMSE}\nMAE:{MAE}")