In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
import warnings

In [3]:
df = pd.read_csv('data/calories.csv')

In [4]:
df.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8,231.0
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3,66.0
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7,26.0
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5,71.0
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8,35.0


In [5]:
X = df.drop(columns=['Calories'], axis=1)

In [6]:
y = df['Calories']

In [7]:
X.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [8]:
num_features = X.select_dtypes(exclude='object').columns
cat_features = X.select_dtypes(include='object').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from  sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ('OneHotEncoder', oh_transformer, cat_features),
        ('StandardScaler', numeric_transformer, num_features)
    ]
)

In [9]:
X = preprocessor.fit_transform(X)

In [11]:
X.shape

(15000, 9)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((12000, 9), (3000, 9))

In [21]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_sqare = r2_score(true, predicted)
    return mae, rmse, r2_sqare

In [22]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "KNeighborsRegressor": KNeighborsRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "RandomForestRegressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(),
    "AdaBoostRegressor": AdaBoostRegressor(),
    "SVR": SVR(),
    "CatBoostRegressor": CatBoostRegressor()
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)

    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model performance for Training set')
    print("- Root mean squared error: {: 4f}".format(model_train_rmse))
    print("- Mean absolute error: {: 4f}".format(model_test_mae))
    print("- R2 score: {: 4f}".format(model_train_r2))

    print("--------------------------------")

    print('Model performance for Testing set')
    print("- Root mean squared error: {: 4f}".format(model_test_rmse))
    print("- Mean absolute error: {: 4f}".format(model_test_mae))
    print("- R2 score: {: 4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('=' * 35)
    print('\n')


Linear Regression
Model performance for Training set
- Root mean squared error:  11.266085
- Mean absolute error:  8.442604
- R2 score:  0.967168
--------------------------------
Model performance for Testing set
- Root mean squared error:  11.494940
- Mean absolute error:  8.442604
- R2 score:  0.967260


Lasso
Model performance for Training set
- Root mean squared error:  12.254423
- Mean absolute error:  9.147320
- R2 score:  0.961154
--------------------------------
Model performance for Testing set
- Root mean squared error:  12.565645
- Mean absolute error:  9.147320
- R2 score:  0.960876


Ridge
Model performance for Training set
- Root mean squared error:  11.265870
- Mean absolute error:  8.443383
- R2 score:  0.967169
--------------------------------
Model performance for Testing set
- Root mean squared error:  11.492341
- Mean absolute error:  8.443383
- R2 score:  0.967274


KNeighborsRegressor
Model performance for Training set
- Root mean squared error:  5.581902
- Mean a

In [24]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2 score']).sort_values(by=["R2 score"], ascending=False)

Unnamed: 0,Model Name,R2 score
9,CatBoostRegressor,0.999894
6,XGBRegressor,0.99879
5,RandomForestRegressor,0.998013
4,DecisionTreeRegressor,0.992375
8,SVR,0.989873
3,KNeighborsRegressor,0.989264
2,Ridge,0.967274
0,Linear Regression,0.96726
7,AdaBoostRegressor,0.966326
1,Lasso,0.960876
