In [41]:
# Import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [42]:
# Load dataset
# Read the insurance data from 'insurance.csv' (a relative path, so it is
# resolved against the notebook's working directory). The bare expression on
# the last line makes the notebook display the resulting DataFrame.
ins_data = pd.read_csv('insurance.csv')
ins_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [43]:
# Name of the target column, kept as a list so it can be handed to `drop`.
target_col = ['charges']

# Categorical features: every object-dtype column.
cat_cols = ins_data.select_dtypes(include='object').columns

# Numeric features: every non-object column except the target.
numeric_frame = ins_data.select_dtypes(exclude='object')
num_cols = numeric_frame.drop(columns=target_col).columns

In [44]:
# Features are every column except the target. The target is removed by name
# rather than by position, so the split stays correct even if the column order
# of the CSV changes (a positional slice would silently leak 'charges' into x).
x = ins_data.drop(columns=['charges'])
# Target: the 'charges' column, as a Series.
y = ins_data['charges']

In [45]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=23)

In [46]:
# Display the training features produced by the split above.
X_train

Unnamed: 0,age,sex,bmi,children,smoker,region
708,31,female,30.495,3,no,northeast
45,55,male,37.300,0,no,southwest
434,31,male,28.595,1,no,northwest
995,39,female,23.275,3,no,northeast
1299,19,female,25.745,1,no,northwest
...,...,...,...,...,...,...
31,18,female,26.315,0,no,northeast
950,57,male,18.335,0,no,northeast
1064,29,female,25.600,4,no,southwest
742,53,male,34.105,0,yes,northeast


In [47]:
# Preprocessing for the categorical columns:
#   1. 'si'  - fill missing values with the most frequent category.
#   2. 'ohe' - one-hot encode. handle_unknown='ignore' encodes a category that
#              was not seen during fit as all zeros instead of raising, so
#              transforming held-out or new data cannot crash the pipeline.
#   3. 'ss'  - scale each indicator column to unit variance. with_mean=False is
#              required because the encoder emits a sparse matrix, which
#              StandardScaler cannot centre.
cat_pipeline = Pipeline([
    ('si', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore')),
    ('ss', StandardScaler(with_mean=False))
])

In [48]:
# Preprocessing for the numeric columns: fill missing values with the column
# median, then scale to unit variance (with_mean=False, so values are not
# centred).
num_steps = [
    ('si', SimpleImputer(strategy='median')),
    ('ss', StandardScaler(with_mean=False)),
]
num_pipeline = Pipeline(steps=num_steps)

In [49]:
# Route each group of columns through its own preprocessing pipeline:
# categorical columns first, numeric columns second.
column_routes = [
    ('cat_pipeline', cat_pipeline, cat_cols),
    ('num_pipeline', num_pipeline, num_cols),
]
transformer = ColumnTransformer(transformers=column_routes)

In [50]:
# Fit the preprocessing on the training features only, then apply the already
# fitted transformer to the test features, so the imputation and scaling
# statistics come from the training data alone.
X_train_new = transformer.fit_transform(X_train)
X_test_new = transformer.transform(X_test)

In [51]:
def evaluate_models(actual, predicted):
    """Compute regression metrics for one set of predictions.

    Parameters
    ----------
    actual : array-like
        Ground-truth target values.
    predicted : array-like
        Model predictions, aligned element-wise with ``actual``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2_sco)``: mean absolute error, mean squared
        error, root mean squared error and R2 score, in that order.
    """
    mae = mean_absolute_error(actual, predicted)
    mse = mean_squared_error(actual, predicted)
    # Derive RMSE from the MSE computed above instead of evaluating the
    # metric a second time.
    rmse = np.sqrt(mse)
    r2_sco = r2_score(actual, predicted)
    return mae, mse, rmse, r2_sco

In [52]:
# Seed for every estimator that draws random numbers, so a fresh
# "Restart & Run All" reproduces the same scores. Same value as the
# train/test split seed.
MODEL_SEED = 23

# Candidate regressors, keyed by the display name used in the report.
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree": DecisionTreeRegressor(random_state=MODEL_SEED),
    "Random Forest": RandomForestRegressor(random_state=MODEL_SEED),
    "AdaBoost": AdaBoostRegressor(random_state=MODEL_SEED),
    "Gradient Boost": GradientBoostingRegressor(random_state=MODEL_SEED),
    "K-Nearest Regressor": KNeighborsRegressor(),
    "Support Vector Regressor": SVR(),
}

# Parallel lists: model name and its test-set R2, filled in the loop below.
model_list = []
r2_list = []


def _print_metrics(header, mae, mse, rmse, r2_sco):
    """Print one block of the report: a header line followed by four metrics."""
    print(header)
    print("Mean Absolute Error: {:.2f}".format(mae))
    print("Mean Squared Error: {:.2f}".format(mse))
    print("Root Mean Squared Error: {:.2f}".format(rmse))
    print("R2_scorer: {:.2f}".format(r2_sco))


# Fit each candidate on the preprocessed training data and report its
# metrics on both the training and the test set.
for model_name, model in models.items():
    model.fit(X_train_new, y_train)

    y_train_pred = model.predict(X_train_new)
    y_test_pred = model.predict(X_test_new)

    train_model_mae, train_model_mse, train_model_rmse, train_model_r2_sco = evaluate_models(y_train, y_train_pred)
    test_model_mae, test_model_mse, test_model_rmse, test_model_r2_sco = evaluate_models(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    _print_metrics("Model Performance for training set:",
                   train_model_mae, train_model_mse, train_model_rmse, train_model_r2_sco)
    _print_metrics("Model Performance for testing set:",
                   test_model_mae, test_model_mse, test_model_rmse, test_model_r2_sco)

    # Only the test-set R2 is kept for the ranking table.
    r2_list.append(test_model_r2_sco)

    print("="*35)
    print("\n")
    

Linear Regression
Model Performance for training set:
Mean Absolute Error: 4265.72
Mean Squared Error: 37227579.75
Root Mean Squared Error: 6101.44
R2_scorer: 0.75
Model Performance for testing set:
Mean Absolute Error: 4074.18
Mean Squared Error: 35546664.46
Root Mean Squared Error: 5962.10
R2_scorer: 0.75


Decision Tree
Model Performance for training set:
Mean Absolute Error: 17.89
Mean Squared Error: 171211.32
Root Mean Squared Error: 413.78
R2_scorer: 1.00
Model Performance for testing set:
Mean Absolute Error: 3210.41
Mean Squared Error: 44151320.36
Root Mean Squared Error: 6644.65
R2_scorer: 0.69


Random Forest
Model Performance for training set:
Mean Absolute Error: 1005.70
Mean Squared Error: 3464401.74
Root Mean Squared Error: 1861.29
R2_scorer: 0.98
Model Performance for testing set:
Mean Absolute Error: 2675.43
Mean Squared Error: 22950083.66
Root Mean Squared Error: 4790.62
R2_scorer: 0.84


AdaBoost
Model Performance for training set:
Mean Absolute Error: 3760.41
Mean Sq

In [53]:
# Pair each model name with its test-set R2 and rank the models best-first.
score_rows = list(zip(model_list, r2_list))
score_table = pd.DataFrame(score_rows, columns=['Models', 'R2_score'])
model_performance = score_table.sort_values(by='R2_score', ascending=False)
model_performance

Unnamed: 0,Models,R2_score
4,Gradient Boost,0.862709
2,Random Forest,0.836838
3,AdaBoost,0.812665
5,K-Nearest Regressor,0.800318
0,Linear Regression,0.747284
1,Decision Tree,0.68611
6,Support Vector Regressor,-0.056556


## Gradient Boosting

In [54]:
# Fit a standalone gradient-boosting model (the top-ranked model in the R2
# table) so its individual test predictions can be inspected.
# random_state is fixed so re-running the notebook gives the same predictions.
g_boost = GradientBoostingRegressor(random_state=23)
g_boost.fit(X_train_new, y_train)

y_test_pred_boost = g_boost.predict(X_test_new)

In [55]:
# Side-by-side table of actual vs. predicted charges for the test set, plus
# the absolute error of each prediction rounded to 2 decimals.
comparison_rows = list(zip(y_test, y_test_pred_boost))
performance = pd.DataFrame(comparison_rows, columns=['Actual', 'Predicted'])
abs_error = (performance['Actual'] - performance['Predicted']).abs()
performance['Difference'] = abs_error.round(2)
performance

Unnamed: 0,Actual,Predicted,Difference
0,13041.92100,14238.316195,1196.40
1,5031.26955,7807.126812,2775.86
2,20984.09360,21738.509421,754.42
3,25656.57526,10560.919496,15095.66
4,3201.24515,4603.793875,1402.55
...,...,...,...
263,38746.35510,39622.845763,876.49
264,3481.86800,6827.401191,3345.53
265,19964.74630,22911.046307,2946.30
266,20462.99766,14195.008430,6267.99
