<a href="https://colab.research.google.com/github/WoradeeKongthong/medical_cost_regression/blob/master/05_Medical_Cost_SVR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd

In [0]:
## Data Preprocessing

# importing the dataset
df = pd.read_csv('/content/drive/My Drive/life-long learning/MyProjects/Medical Cost/cleaned_insurance.csv', usecols=[0,1,2,3,4,5,6])

In [3]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,True,southwest,16884.924
1,18,male,33.77,1,False,southeast,1725.5523
2,28,male,33.0,3,False,southeast,4449.462
3,33,male,22.705,0,False,northwest,21984.47061
4,32,male,28.88,0,False,northwest,3866.8552


In [0]:
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

In [5]:
print(X)

[[19 'female' 27.9 0 True 'southwest']
 [18 'male' 33.77 1 False 'southeast']
 [28 'male' 33.0 3 False 'southeast']
 ...
 [18 'female' 36.85 0 False 'southeast']
 [21 'female' 25.8 0 False 'southwest']
 [61 'female' 29.07 0 True 'northwest']]


In [6]:
print(y)

[16884.924   1725.5523  4449.462  ...  1629.8335  2007.945  29141.3603]


In [0]:
y = y.reshape(len(y),1)

In [0]:
# encoding categorical data
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(drop='first'), [1,4,5])], remainder='passthrough')
X = ct.fit_transform(X)

In [9]:
print(X[:5])

[[0.0 1.0 0.0 0.0 1.0 19 27.9 0]
 [1.0 0.0 0.0 1.0 0.0 18 33.77 1]
 [1.0 0.0 0.0 1.0 0.0 28 33.0 3]
 [1.0 0.0 1.0 0.0 0.0 33 22.705 0]
 [1.0 0.0 1.0 0.0 0.0 32 28.88 0]]


In [0]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [0]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
sc_y = StandardScaler()

X_train[:,5:] = sc_X.fit_transform(X_train[:,5:])
X_test[:,5:] = sc_X.transform(X_test[:,5:])

y_train = sc_y.fit_transform(y_train)

In [12]:
# Training the SVR model on the Training set
from sklearn.svm import SVR

regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [13]:
# Predicting the Test set results
y_pred = sc_y.inverse_transform(regressor.predict(X_test))
np.set_printoptions(precision=2)
print(np.concatenate((y_pred.reshape(len(y_pred),1), y_test.reshape(len(y_test),1)),1)[:10])

[[ 3634.57  2775.19]
 [42020.51 39871.7 ]
 [44746.79 47462.89]
 [16819.75 15612.19]
 [ 3148.9   2055.32]
 [ 3035.1   1826.84]
 [ 6481.66  5375.04]
 [13040.29 11848.14]
 [ 7226.46  6393.6 ]
 [14408.48 13451.12]]


In [14]:
# Model Evaluation
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
n = X_test.shape[0]
p = X_test.shape[1]
r_square = r2_score(y_test, y_pred)
adj_r_square = 1 - (1 - r_square) * ((n - 1) / (n - p - 1)) 
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("R-square : ",r_square)
print("Adjusted R-square : ",adj_r_square)
print("MAE : ", mae)
print("MSE : ",mse)

R-square :  0.8893015160965227
Adjusted R-square :  0.8858822579064539
MAE :  2266.3752049571685
MSE :  16255067.913474571


In [15]:
# Applying K-Fold Cross Validation
from sklearn.model_selection import cross_val_score
y_train = y_train.reshape(len(y_train),)
accuracies = cross_val_score(estimator = regressor, X = X_train, y = y_train, cv = 10)
print("Accuracy: {}".format(accuracies.mean()))
print("Standard Deviation: {}".format(accuracies.std()))

Accuracy: 0.8043097651890665
Standard Deviation: 0.053540733813163224


In [17]:
# Improving the model

# Grid Search
from sklearn.model_selection import GridSearchCV
parameters = [{'C':[1,10,100],'kernel':['linear']},
              {'C':[1,10,100],'kernel':['rbf'],'gamma': ['scale','auto']},]
grid_search = GridSearchCV(estimator = regressor,
                          param_grid = parameters,
                          cv = 10)
grid_search = grid_search.fit(X_train,y_train)

best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best Accuracy : {}".format(best_accuracy))
print("Best Parameters : ", best_parameters)

Best Accuracy : 0.8130553851134656
Best Parameters :  {'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}
