Feature Enginnering

#### 1.1 Import Data and Required Packages
##### Importing Pandas, Numpy, Matplotlib, Seaborn and Warings Library.

In [None]:
# Basic Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
# Modelling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
import warnings

In [102]:
# Importing the Csv data as Pandas data frame
data = pd.read_csv('Calories.csv')

In [103]:
data.head()

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,14733363,male,68,190,94,29,105,40.8,231
1,14861698,female,20,166,60,14,94,40.3,66
2,11179863,male,69,179,79,5,88,38.7,26
3,16180408,female,34,179,71,13,100,40.5,71
4,17771927,female,27,154,58,10,81,39.8,35


In [104]:
to_remove = ['User_ID', 'Weight']
data.drop(to_remove, axis=1, inplace=True)

In [105]:
data.head()

Unnamed: 0,Gender,Age,Height,Duration,Heart_Rate,Body_Temp,Calories
0,male,68,190,29,105,40.8,231
1,female,20,166,14,94,40.3,66
2,male,69,179,5,88,38.7,26
3,female,34,179,13,100,40.5,71
4,female,27,154,10,81,39.8,35


In [106]:
# Preparing X and Y
X = data.drop(columns= 'Calories' , axis = 1)
y = data['Calories']

In [107]:
# Since all are numerical variable except One Categorical variable Gender .
# We do one Label encoding on Categorical variables and Standard scaling on Numerical variable.
num_features = [feat for feat in X.columns if X[feat].dtype != 'O' and X[feat].nunique() >= 15]

cat_features = [feat for feat in X.columns if X[feat].nunique() < 15]

In [108]:
print(num_features)
print(cat_features)

['Age', 'Height', 'Duration', 'Heart_Rate', 'Body_Temp']
['Gender']


In [109]:
# Colomn transformer with 3 types of transformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop= 'first')

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
         ("StandardScaler", numeric_transformer, num_features),        
    ]
)

In [110]:
X = preprocessor.fit_transform(X)

In [111]:
X.shape

(15000, 6)

In [112]:
X[0]

array([1.        , 1.48472604, 1.08958204, 1.61912727, 0.98940395,
       0.99402302])

In [113]:
# Seperate dataset for Train and Test
from sklearn.model_selection import train_test_split
X_train,X_test , y_train , y_test = train_test_split(X,y,test_size= 0.2 , random_state= 42)
print(X_train.shape , y_train.shape)
print(X_test.shape , y_test.shape)

(12000, 6) (12000,)
(3000, 6) (3000,)


In [114]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:

models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression" : Ridge(),
    "Lasso Regression" : Lasso(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 11.3108
- Mean Absolute Error: 8.3299
- R2 Score: 0.9669
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.5548
- Mean Absolute Error: 8.4814
- R2 Score: 0.9669


Ridge Regression
Model performance for Training set
- Root Mean Squared Error: 11.3109
- Mean Absolute Error: 8.3297
- R2 Score: 0.9669
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.5550
- Mean Absolute Error: 8.4812
- R2 Score: 0.9669


Lasso Regression
Model performance for Training set
- Root Mean Squared Error: 12.2837
- Mean Absolute Error: 9.0226
- R2 Score: 0.9610
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 12.6030
- Mean Absolute Error: 9.1554
- R2 Score: 0.9606


Decision Tree
Model performance for Training set
- Root Mean Squared Error: 0.0091
- Mean Absolute Error: 0.0002
- R2 Score: 1.0000
---

In [116]:
pd.DataFrame(list(zip(model_list,r2_list)),columns= ["Model Name", "R2_score"]).sort_values(by = ['R2_score'],ascending=False)

Unnamed: 0,Model Name,R2_score
5,XGBRegressor,0.998521
4,Random Forest Regressor,0.997997
3,Decision Tree,0.993313
0,Linear Regression,0.966918
1,Ridge Regression,0.966917
6,AdaBoost Regressor,0.964235
2,Lasso Regression,0.960643


In [None]:
#Initialize few parameter for Hyperparamter tuning

linear_params = {}

decision_tree_params = {
            "criterion": ['squared_error', 'absolute_error'],
            #"criterion":['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
            "max_depth": [None, 5, 10, 15],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 5],
            # 'splitter':['best','random'],
            "max_features": [None, 'sqrt', 'log2']}

xgboost_reg_params = {
    "learning_rate": [0.1, 0.05, 0.01],
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 4, 5, 6],
    "min_child_weight": [1, 3, 5],
    "subsample": [0.7, 0.8, 0.9],
    "colsample_bytree": [0.5, 0.7, 0.9],
    "gamma": [0, 0.1, 0.3]
}

random_forest_params = {
    "n_estimators": [100, 300, 500],
    "max_depth": [None, 5, 10, 15],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
    "max_features": ['sqrt', 'log2', None]
}

In [None]:
# Models list for Hyperparameter tuning
gridcv_models = [
                ("Linear",LinearRegression(),linear_params),  
                ("Decision",DecisionTreeRegressor(),decision_tree_params),
                ("XGBoostRegressor",XGBRegressor(),xgboost_reg_params), 
                ("RandomForest", RandomForestRegressor(),random_forest_params)
               ]

In [None]:
##Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV

model_param = {}
for name, model, params in gridcv_models:
    grid = GridSearchCV(estimator=model,
                                   param_grid=params,
                                   cv=3,
                                   verbose=2,
                                   n_jobs=-1)
    grid.fit(X_train, y_train)
    model_param[name] = grid.best_params_

for model_name in model_param:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 1 candidates, totalling 3 fits
Fitting 3 folds for each of 216 candidates, totalling 648 fits
Fitting 3 folds for each of 2916 candidates, totalling 8748 fits


In [None]:
## Retraining the models with best parameters
models = {
    "Linear Regression": LinearRegression(),
    "Decision Tree Regression": DecisionTreeRegressor(criterion = 'squared_error', max_depth = 15, max_features = None, min_samples_leaf = 2, min_samples_split = 5),
    "XGBoost Regression": XGBRegressor(colsample_bytree = 0.9, gamma = 0, learning_rate = 0.1, max_depth = 6, min_child_weight = 3, n_estimators = 300, subsample = 0.7),
    "Random Forest Regression" : RandomForestRegressor(max_depth = 15, max_features = None, min_samples_leaf = 1, min_samples_split = 2, n_estimators = 300)
}

model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)

    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])
    
    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')
    
    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)
    
    print('='*35)
    print('\n')

Linear Regression
Model performance for Training set
- Root Mean Squared Error: 11.3108
- Mean Absolute Error: 8.3299
- R2 Score: 0.9669
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 11.5548
- Mean Absolute Error: 8.4814
- R2 Score: 0.9669


Decision Tree Regression
Model performance for Training set
- Root Mean Squared Error: 1.8695
- Mean Absolute Error: 1.1837
- R2 Score: 0.9991
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 4.9538
- Mean Absolute Error: 3.2924
- R2 Score: 0.9939


XGBoost Regression
Model performance for Training set
- Root Mean Squared Error: 1.1399
- Mean Absolute Error: 0.8516
- R2 Score: 0.9997
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 1.8763
- Mean Absolute Error: 1.3247
- R2 Score: 0.9991


Random Forrest Regression
Model performance for Training set
- Root Mean Squared Error: 1.1362
- Mean Absolute Error: 0.7279
- R2

In [None]:
pd.DataFrame(list(zip(model_list,r2_list)),columns= ["Model Name", "R2_score"]).sort_values(by = ['R2_score'],ascending=False)

Unnamed: 0,Model Name,R2_score
2,XGBoost Regression,0.999128
3,Random Forrest Regression,0.998013
1,Decision Tree Regression,0.993919
0,Linear Regression,0.966918


Among all evaluated models, XGBoost Regression achieved the highest predictive performance with an RÂ² score of 0.999, indicating an excellent fit to the data.