In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.simplefilter("ignore")

In [44]:
df = pd.read_csv("Insurance_Analyzed.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,age,sex,bmi,children,smoker,region,expenses
0,0,19,0,27.9,0,1,southwest,16884.92
1,1,18,1,33.8,1,0,southeast,1725.55
2,2,28,1,33.0,3,0,southeast,4449.46
3,3,33,1,22.7,0,0,northwest,21984.47
4,4,32,1,28.9,0,0,northwest,3866.86


In [45]:
df.drop(["Unnamed: 0", "region"], axis = 1, inplace = True)

# ***Hyper-Parameter-Tunning***

In [46]:
X = df.drop("expenses", axis = 1)
y = (df["expenses"] ** (1/4))

In [47]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, accuracy_score

***Best Random State***

In [48]:
sc = StandardScaler()

In [49]:
train_score = []
test_score = []
cv_score = []
MAE = []

for i in range(100):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = i)
    X_train.loc[:, ["age", "bmi", "children"]] = sc.fit_transform(X_train.loc[:, ["age", "bmi", "children"]])
    X_test.loc[:, ["age", "bmi", "children"]] = sc.transform(X_test.loc[:, ["age", "bmi", "children"]])
    model = LinearRegression().fit(X_train, y_train)
    
    pred_train = model.predict(X_train)
    pred_test = model.predict(X_test)
    
    train_accuracy = model.score(X_train, y_train)
    test_accuracy = model.score(X_test, y_test)
    score = cross_val_score(model, X_train, y_train, cv = 5).mean()
    mae = mean_absolute_error(y_test, pred_test)
    
    train_score.append(train_accuracy)
    test_score.append(test_accuracy)
    cv_score.append(score)
    MAE.append(mae)

random_state = pd.DataFrame({
    "Train": train_score,
    "Test": test_score,
    "CV": cv_score,
    "MAE": mae
})

In [50]:
random_state.sort_values(by = "MAE", ascending = True)

Unnamed: 0,Train,Test,CV,MAE
0,0.775671,0.777442,0.772798,0.73167
72,0.787826,0.725591,0.782489,0.73167
71,0.769614,0.803231,0.768008,0.73167
70,0.781086,0.755970,0.776724,0.73167
69,0.778294,0.768328,0.775275,0.73167
...,...,...,...,...
28,0.779120,0.763917,0.776195,0.73167
27,0.781605,0.750047,0.773452,0.73167
26,0.776114,0.776855,0.772355,0.73167
36,0.776559,0.771586,0.771000,0.73167


## ***GridSearchCV For - PolynomialFeatures***

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)
scaler = StandardScaler()
X_train.loc[:, ["age", "bmi", "children"]] = scaler.fit_transform(X_train.loc[:, ["age", "bmi", "children"]])
X_test.loc[:, ["age", "bmi", "children"]] = scaler.transform(X_test.loc[:, ["age", "bmi", "children"]])

In [52]:
estimator = PolynomialFeatures()

param_grid = {"degree": list(range(1, 10))}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'degree': 1}

# ***GridSearchCV - Ridge Regression***

In [54]:
estimator = Ridge()

param_grid = {"alpha": [0.001, 0.1, 1, 2, 3, 4, 10, 12]}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'alpha': 0.1}

# ***GridSearchCV - Lasso Regression***

In [55]:
estimator = Lasso()

param_grid = {"alpha": [0.001, 0.1, 1, 2, 3, 4]}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'alpha': 0.001}

# ***GridSearchCV - ElasticNet***

In [33]:
estimator = ElasticNet()

param_grid = {"alpha": list(range(1, 15)), "l1_ratio": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'alpha': 1, 'l1_ratio': 0}

# ***GridSearchCV - SVC***

In [56]:
estimator = SVR()

param_grid = {"C": [0.001, 0.1, 1, 10, 100], "kernel": ["linear", "poly", "sigmoid", "rbf"]}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'C': 10, 'kernel': 'rbf'}

# ***GridSearchCV - KNearestNeighbors - KNeighborsRegressor***

In [57]:
estimator = KNeighborsRegressor()

param_grid = {"n_neighbors": list(range(1, 20))}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

grid_model.best_params_

{'n_neighbors': 6}

# ***GridSearchCV - Decision Tree - DecisionTreeRegressor***

In [58]:
estimator = DecisionTreeRegressor(random_state = 0)

param_grid = {"max_depth": list(range(1, 20))}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

print(grid_model.best_params_)

grid_model.best_estimator_.feature_importances_

{'max_depth': 4}


array([0.30699037, 0.        , 0.05862965, 0.0151286 , 0.61925138])

# ***GridSearchCV - RandomForestRegressor***

In [59]:
estimator = RandomForestRegressor(random_state = 0)

param_grid = {"n_estimators": list(range(1, 40))}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

print("Best Params :", grid_model.best_params_)

grid_model.best_estimator_.feature_importances_

Best Params : {'n_estimators': 32}


array([0.29640107, 0.0113004 , 0.14101859, 0.03286646, 0.51841349])

# ***AdaBoostRegressor***

In [60]:
estimator = AdaBoostRegressor(random_state = 0)

param_grid = {"n_estimators": list(range(1, 40))}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

print("Best Params :", grid_model.best_params_)

grid_model.best_estimator_.feature_importances_

Best Params : {'n_estimators': 1}


array([0.28089139, 0.        , 0.05664248, 0.        , 0.66246613])

# ***GradientBoostingRegressor***

In [61]:
estimator = GradientBoostingRegressor(random_state = 0)

param_grid = {"n_estimators": list(range(1, 20)), "learning_rate": [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1]}

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

print("Best Params :", grid_model.best_params_)

grid_model.best_estimator_.feature_importances_

Best Params : {'learning_rate': 0.2, 'n_estimators': 18}


array([0.30491808, 0.00134019, 0.06100893, 0.0230108 , 0.60972199])

# ***XGBRegressor***

In [62]:
estimator = XGBRegressor(random_state = 0)

param_grid = {"n_estimators": list(range(1, 20)), "learning_rate": [0, 0.1, 0.2, 0.3],
             "gamma":[0, 0.1, 0.2, 0.3] }

grid_model = GridSearchCV(estimator, param_grid, cv = 5, scoring = "r2")

grid_model.fit(X_train, y_train)

print("Best Params :", grid_model.best_params_)

grid_model.best_estimator_.feature_importances_

Best Params : {'gamma': 0.3, 'learning_rate': 0.2, 'n_estimators': 13}


array([0.04038018, 0.0031036 , 0.01093058, 0.00645762, 0.93912804],
      dtype=float32)