<a href="https://colab.research.google.com/github/WoradeeKongthong/medical_cost_regression/blob/master/03_Medical_Cost_Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [0]:
## Data Preprocessing

# importing the dataset from part1 (Data Cleansing and EDA)
df = pd.read_csv('/content/drive/My Drive/life-long learning/MyProjects/Medical Cost/cleaned_insurance.csv', usecols=[0,1,2,3,4,5,6])

In [75]:
df.head()   

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,True,southwest,16884.924
1,18,male,33.77,1,False,southeast,1725.5523
2,28,male,33.0,3,False,southeast,4449.462
3,33,male,22.705,0,False,northwest,21984.47061
4,32,male,28.88,0,False,northwest,3866.8552


In [0]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [101]:
print(X)

[[19 'female' 27.9 0 True 'southwest']
 [18 'male' 33.77 1 False 'southeast']
 [28 'male' 33.0 3 False 'southeast']
 ...
 [18 'female' 36.85 0 False 'southeast']
 [21 'female' 25.8 0 False 'southwest']
 [61 'female' 29.07 0 True 'northwest']]


In [102]:
print(y)

[16884.92  1725.55  4449.46 ...  1629.83  2007.94 29141.36]


In [0]:
# encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [1,4,5])], remainder='passthrough')
X = ct.fit_transform(X)

In [104]:
print('X.shape = ',X.shape)
print(X[:5])

X.shape =  (1338, 8)
[[0.0 1.0 0.0 0.0 1.0 19 27.9 0]
 [1.0 0.0 0.0 1.0 0.0 18 33.77 1]
 [1.0 0.0 0.0 1.0 0.0 28 33.0 3]
 [1.0 0.0 1.0 0.0 0.0 33 22.705 0]
 [1.0 0.0 1.0 0.0 0.0 32 28.88 0]]


In [0]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [106]:
# Training the Multiple Linear Regression model on the Training set
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [107]:
# Predicting the Test set results
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)[:10])

[[ 7534.8   2755.02]
 [10138.99  9447.38]
 [ 9830.23  9282.48]
 [14646.14 11944.59]
 [ 8243.76  7626.99]
 [12791.65 14001.13]
 [12653.76 12096.65]
 [35121.75 41661.6 ]
 [ 1953.39  2710.83]
 [11247.14  9620.33]]


In [108]:
# Model Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
n = X_test.shape[0]
p = X_test.shape[1]
r_square = r2_score(y_test, y_pred)
adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1)) 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R-square : ",r_square)
print("Adjusted R-square : ",adj_r_square)
print("MAE : ", mae)
print("MSE : ",mse)

R-square :  0.778643304401302
Adjusted R-square :  0.771806031950377
MAE :  3836.186105975616
MSE :  30096261.49713123


In [109]:
# Applying K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
k_fold_accuracy = accuracies.mean()
k_fold_sd = accuracies.std()
print("K-fold Accuracy: {:.2f} %".format(k_fold_accuracy*100))
print("K-fold Standard Deviation: {:.2f} %".format(k_fold_sd*100))

K-fold Accuracy: 73.82 %
K-fold Standard Deviation: 3.72 %


In [121]:
k_fold_accuracy

0.7382060792628369

In [0]:
## Improving the model

# Store Feature Performance 
# get x column name
enc_group = len(ct.named_transformers_['encoder'].categories_)
X_columns = []
for i in range(enc_group):
  catielem = len(ct.named_transformers_['encoder'].categories_[i])
  for j in range(1,catielem):
    X_columns.append(ct.named_transformers_['encoder'].categories_[i][j])
X_columns.extend(['age','bmi','children'])
X_columns = np.array(X_columns)

num_features = [len(X_columns)]
features_name = [str(X_columns)]
r2 = [r_square]
adjR2 = [adj_r_square]
mae_score = [mae]
mse_score = [mse]
k_fold_accuracy_mean = [k_fold_accuracy]
k_fold_accuracy_sd = [k_fold_sd]

In [0]:
# Backward Elimination
# p values
from sklearn.feature_selection import f_regression

In [0]:
for i in range(len(X[0])):
  _,p_values = f_regression(X,y)
  max_p = p_values.max()
  max_arg = p_values.argmax()

  if max_p > 0.05 :
    # Re-train the model
    X = np.delete(X,max_arg,1)
    X_columns = np.delete(X_columns,max_arg,0)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    y_pred = regressor.predict(X_test)

    n = X_test.shape[0]
    p = X_test.shape[1]
    r_square = r2_score(y_test, y_pred)
    adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1)) 
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)

    accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
    k_fold_accuracy = accuracies.mean()
    k_fold_sd = accuracies.std()

    # Store Feature Performance
    num_features.append(len(X_columns))
    features_name.append(str(X_columns))
    r2.append(r_square)
    adjR2.append(adj_r_square)
    mae_score.append(mae)
    mse_score.append(mse)
    k_fold_accuracy_mean.append(k_fold_accuracy)
    k_fold_accuracy_sd.append(k_fold_sd)

  else:
    break

In [146]:
performance_df = pd.DataFrame({'num features':num_features, 'features':features_name,'R-sq':r2,'AdjR-sq':adjR2,'MAE':mae,
                              'MSE':mse,'k_fold_acc_mean':k_fold_accuracy_mean,'k_fold_acc_sd':k_fold_accuracy_sd})
performance_df

Unnamed: 0,num features,features,R-sq,AdjR-sq,MAE,MSE,k_fold_acc_mean,k_fold_acc_sd
0,8,['male' 'True' 'northwest' 'southeast' 'southw...,0.778643,0.771806,4568.698448,43249040.0,0.738206,0.037168
1,7,['male' 'True' 'southeast' 'southwest' 'age' '...,0.74148,0.73452,4568.698448,43249040.0,0.739627,0.060179
2,6,['male' 'True' 'southeast' 'age' 'bmi' 'childr...,0.724661,0.718332,4568.698448,43249040.0,0.728742,0.075396


In [0]:
# Note : original 8-argument-X is most optimal for this model