In [41]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score

In [2]:
# Load the prepared dataset; the CSV's first column becomes the row index.
df = pd.read_csv(
    'final.csv',
    index_col=0,
)

In [3]:
# Preview the first rows to sanity-check the loaded columns and values.
df.head()

Unnamed: 0,Age,Weight,Duration,Heart_Rate,Body_Temp,Calories
0,68,94.0,29.0,105.0,40.8,231.0
1,20,60.0,14.0,94.0,40.3,66.0
2,69,79.0,5.0,88.0,38.7,26.0
3,34,71.0,13.0,100.0,40.5,71.0
4,27,58.0,10.0,81.0,39.8,35.0


In [4]:
# (rows, columns) of the loaded frame.
df.shape

(14697, 6)

In [5]:
# Candidate 0: StandardScaler followed by LinearRegression (default settings).
pipeline_lr = Pipeline(
    steps=[
        ('scaler1', StandardScaler()),
        ('Linear Regression', LinearRegression()),
    ]
)

In [6]:
# Candidate 1: StandardScaler followed by Ridge (default settings).
pipeline_r = Pipeline(
    steps=[
        ('scaler2', StandardScaler()),
        ('Ridge Regression', Ridge()),
    ]
)

In [7]:
# Candidate 2: StandardScaler followed by Lasso (default settings).
pipeline_l = Pipeline(
    steps=[
        ('scaler3', StandardScaler()),
        ('Lasso Regression', Lasso()),
    ]
)

In [8]:
# Candidate 3: StandardScaler followed by SVR (default settings).
pipeline_svm = Pipeline(
    steps=[
        ('scaler4', StandardScaler()),
        ('Support Vector Machine', SVR()),
    ]
)

In [9]:
# Candidate 4: StandardScaler followed by DecisionTreeRegressor (default settings).
pipeline_dt = Pipeline(
    steps=[
        ('scaler5', StandardScaler()),
        ('Decision Tree Regression', DecisionTreeRegressor()),
    ]
)

In [10]:
# Candidate 5: StandardScaler followed by RandomForestRegressor (default settings).
pipeline_rf = Pipeline(
    steps=[
        ('scaler6', StandardScaler()),
        ('Random Forest Regression', RandomForestRegressor()),
    ]
)

In [11]:
# Candidate 6: StandardScaler followed by AdaBoostRegressor (default settings).
pipeline_ab = Pipeline(
    steps=[
        ('scaler7', StandardScaler()),
        ('Adda Boost Regression', AdaBoostRegressor()),
    ]
)

In [12]:
# Candidate 7: StandardScaler followed by GradientBoostingRegressor (default settings).
pipeline_gb = Pipeline(
    steps=[
        ('scaler8', StandardScaler()),
        ('Gradient Boost Regression', GradientBoostingRegressor()),
    ]
)

In [13]:
# Candidate 8: StandardScaler followed by XGBRegressor (default settings).
pipeline_xb = Pipeline(
    steps=[
        ('scaler9', StandardScaler()),
        ('XGboost Regression', XGBRegressor()),
    ]
)

In [14]:
# Candidate 9: StandardScaler followed by KNeighborsRegressor (default settings).
pipeline_nn = Pipeline(
    steps=[
        ('scaler10', StandardScaler()),
        ('Nearest Neighbours Regression', KNeighborsRegressor()),
    ]
)

In [15]:
# Every candidate pipeline, in a fixed order: list positions must line up with
# the integer keys of pipe_dict, which supplies the display names.
pipelines = [
    pipeline_lr,
    pipeline_r,
    pipeline_l,
    pipeline_svm,
    pipeline_dt,
    pipeline_rf,
    pipeline_ab,
    pipeline_gb,
    pipeline_xb,
    pipeline_nn,
]

In [16]:
# Number of candidate pipelines collected above.
len(pipelines)

10

In [17]:
# Trackers consumed by the model-selection loop further down.
# Start from -inf rather than 0.0: the score being compared is R^2, which can
# be negative, so a 0.0 floor could reject every candidate.
best_accuracy = float("-inf")
best_regressor = 0
# The selection loop assigns and reads `best_classifier` (not `best_regressor`),
# so define it here as well; otherwise it is unbound whenever no candidate
# beats the initial score.
best_classifier = 0
best_pipeline = ""

In [18]:
# Display names for the candidates, keyed by their position in `pipelines`.
pipe_dict = dict(enumerate([
    'Linear Regressor',
    'Ridge Regressor',
    'Lasso Regressor',
    'Support Vector Machine Regression',
    'Decision Tree Regressor',
    'Random Forest Tree Regressor',
    'Adda Boost Regressor',
    'Gradient Boost Regressor',
    'XGBOost Regressor',
    'KNearest Neighbours Regressor',
]))

In [19]:
# Separate the features (every column except the target) from the target column.
X = df.drop(columns=['Calories'])
y = df['Calories']
print(X.shape, y.shape)

(14697, 5) (14697,)


In [20]:
# Preview the feature matrix; 'Calories' should no longer be among the columns.
X.head()

Unnamed: 0,Age,Weight,Duration,Heart_Rate,Body_Temp
0,68,94.0,29.0,105.0,40.8
1,20,60.0,14.0,94.0,40.3
2,69,79.0,5.0,88.0,38.7
3,34,71.0,13.0,100.0,40.5
4,27,58.0,10.0,81.0,39.8


In [21]:
# Preview the target values (the 'Calories' column).
y.head()

0    231.0
1     66.0
2     26.0
3     71.0
4     35.0
Name: Calories, dtype: float64

In [49]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [51]:
# Preview the training features; the index keeps the original row labels from df.
X_train.head()

Unnamed: 0,Age,Weight,Duration,Heart_Rate,Body_Temp
12371,40,84.0,27.0,114.0,40.8
7711,79,83.0,2.0,89.0,38.3
2287,57,78.0,14.0,85.0,40.0
4251,37,66.0,15.0,103.0,39.9
7964,27,104.0,10.0,90.0,39.6


In [23]:
# Show the (features, target) shapes of both splits as a quick size check.
print(
    f"Train size: {X_train.shape, y_train.shape}, "
    f"Test size: {X_test.shape, y_test.shape}"
)

Train size: ((10287, 5), (10287,)), Test size: ((4410, 5), (4410,))


In [24]:
# Train every candidate on the training split; each pipeline is fitted in place.
for candidate in pipelines:
    candidate.fit(X_train, y_train)

In [25]:
# Report each fitted candidate's score on the held-out split.
# NOTE(review): for regressors `.score` is presumably R^2, not accuracy,
# despite the label in the printed text — confirm against the sklearn docs.
for i, model in enumerate(pipelines):
    label = pipe_dict[i]
    test_score = model.score(X_test, y_test)
    print(f"{label} Test Accuracy: {test_score}")

Linear Regressor Test Accuracy: 0.9666797545316131
Ridge Regressor Test Accuracy: 0.9666814056330361
Lasso Regressor Test Accuracy: 0.960687052906832
Support Vector Machine Regression Test Accuracy: 0.9887353058948671
Decision Tree Regressor Test Accuracy: 0.9873776054923608
Random Forest Tree Regressor Test Accuracy: 0.9950711379199962
Adda Boost Regressor Test Accuracy: 0.9665481158888034
Gradient Boost Regressor Test Accuracy: 0.9943053827243243
XGBOost Regressor Test Accuracy: 0.9952566530553637
KNearest Neighbours Regressor Test Accuracy: 0.9911672472955586


In [26]:
# Pick the candidate with the highest score on the held-out split.
#
# The trackers are reset here so the cell gives the same answer on every run
# and does not rely on values left behind by other cells. The floor is -inf
# because the score (R^2 for regressors) can be negative.
# NOTE(review): selecting on X_test makes the winning score an optimistic
# estimate; consider choosing via cross-validation on the training split.
best_accuracy = float("-inf")
best_pipeline = None
best_classifier = None
for i, model in enumerate(pipelines):
    score = model.score(X_test, y_test)  # evaluate once per candidate
    if score > best_accuracy:
        best_accuracy = score
        best_pipeline = model
        best_classifier = i

if best_classifier is None:
    # Only reachable when `pipelines` is empty or every score is NaN.
    raise ValueError("No pipeline produced a comparable test score.")

# Keep the tracker initialised earlier in the notebook in sync with the winner.
best_regressor = best_classifier
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:XGBOost Regressor


In [27]:
# 10-fold cross-validation of a default XGBRegressor on the training split only.
# NOTE(review): no `scoring` is passed, so these are presumably the estimator's
# default scores (R^2) rather than classification accuracies — confirm.
accuracies = cross_val_score(
    estimator=XGBRegressor(),
    X=X_train,
    y=y_train,
    cv=10,
)

In [28]:
# One score per cross-validation fold (cv=10, so ten values).
accuracies

array([0.99402655, 0.99550486, 0.99479908, 0.995653  , 0.99434361,
       0.99539369, 0.99359089, 0.99383645, 0.99610361, 0.99590017])

In [29]:
# Summarise the fold scores as percentages: mean and spread across folds.
cv_mean_pct = accuracies.mean() * 100
cv_std_pct = accuracies.std() * 100
print(f"Accuracy: {cv_mean_pct:.2f} %")
print(f"Standard Deviation: {cv_std_pct:.2f} %")

Accuracy: 99.49 %
Standard Deviation: 0.09 %


In [31]:
# Fresh XGBRegressor with library defaults, trained on its own below
# (outside any scaling pipeline).
xbg = XGBRegressor()

In [64]:
# Fit on plain arrays (.values) rather than DataFrames; later predict calls
# therefore pass arrays whose columns follow the same order as X.
xbg.fit(X=X_train.values, y=y_train.values)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.300000012, max_delta_step=0, max_depth=6,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [65]:
# Predict on the held-out features, passed as a plain array to match how xbg was fitted.
y_pred = xbg.predict(X_test.values)

In [66]:
# R^2 of the standalone model's predictions against the held-out targets.
r2_score(y_true=y_test, y_pred=y_pred)

0.9952352117645191

In [68]:
# Spot check with one hand-written sample, in the column order of X
# (Age, Weight, Duration, Heart_Rate, Body_Temp). These are the values of
# row 0 shown by df.head(), whose recorded Calories is 231.0.
sample_row = np.array([[68, 94.0, 29.0, 105.0, 40.8]])
xbg.predict(sample_row)

array([236.6492], dtype=float32)

In [69]:
import pickle

# Persist the trained model to disk. The `with` block closes the handle even if
# pickling fails, so the file is always flushed and never left open.
with open('my_model.pkl', 'wb') as file:
    pickle.dump(xbg, file)