In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

In [3]:
# Colab-only step: mount Google Drive at /content/drive so that the
# /content/drive/MyDrive/... paths read and written by later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the target table (calories) and the feature table (exercise) from the
# Calorie folder on the mounted Drive.
# pandas is imported as `pd` in the notebook's first cell, so no import is
# needed here; the folder is named once so the two paths cannot drift apart.
DATA_DIR = '/content/drive/MyDrive/Calorie'
calories = pd.read_csv(f'{DATA_DIR}/calories.csv')
exercise = pd.read_csv(f'{DATA_DIR}/exercise.csv')

In [5]:
# Preview the first five rows of the target table.
calories.head(n=5)

Unnamed: 0,User_ID,Calories
0,14733363,231.0
1,14861698,66.0
2,11179863,26.0
3,16180408,71.0
4,17771927,35.0


In [6]:
# Preview the first five rows of the feature table.
exercise.head(n=5)

Unnamed: 0,User_ID,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,female,27,154.0,58.0,10.0,81.0,39.8


In [7]:
# Join the target table to the feature table on the shared User_ID key
# (inner join, the pandas default).
# validate='one_to_one' makes pandas raise MergeError if either frame holds a
# repeated User_ID, instead of silently multiplying rows in the result.
data = pd.merge(calories, exercise, on='User_ID', validate='one_to_one')

In [8]:

# Preview the first five rows of the merged frame.
data.head(n=5)

Unnamed: 0,User_ID,Calories,Gender,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
0,14733363,231.0,male,68,190.0,94.0,29.0,105.0,40.8
1,14861698,66.0,female,20,166.0,60.0,14.0,94.0,40.3
2,11179863,26.0,male,69,179.0,79.0,5.0,88.0,38.7
3,16180408,71.0,female,34,179.0,71.0,13.0,100.0,40.5
4,17771927,35.0,female,27,154.0,58.0,10.0,81.0,39.8


In [9]:
# (rows, columns) of the merged frame; the row count shows how many rows the
# User_ID join produced.
data.shape

(15000, 9)

In [10]:
# Column dtypes and non-null counts for the merged frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   User_ID     15000 non-null  int64  
 1   Calories    15000 non-null  float64
 2   Gender      15000 non-null  object 
 3   Age         15000 non-null  int64  
 4   Height      15000 non-null  float64
 5   Weight      15000 non-null  float64
 6   Duration    15000 non-null  float64
 7   Heart_Rate  15000 non-null  float64
 8   Body_Temp   15000 non-null  float64
dtypes: float64(6), int64(2), object(1)
memory usage: 1.0+ MB


In [11]:
# Summary statistics for the numeric columns.
# NOTE: User_ID is only the join key (it is dropped before modelling), so its
# statistics in this table carry no analytical meaning.
data.describe()

Unnamed: 0,User_ID,Calories,Age,Height,Weight,Duration,Heart_Rate,Body_Temp
count,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0,15000.0
mean,14977360.0,89.539533,42.7898,174.465133,74.966867,15.5306,95.518533,40.025453
std,2872851.0,62.456978,16.980264,14.258114,15.035657,8.319203,9.583328,0.77923
min,10001160.0,1.0,20.0,123.0,36.0,1.0,67.0,37.1
25%,12474190.0,35.0,28.0,164.0,63.0,8.0,88.0,39.6
50%,14997280.0,79.0,39.0,175.0,74.0,16.0,96.0,40.2
75%,17449280.0,138.0,56.0,185.0,87.0,23.0,103.0,40.6
max,19999650.0,314.0,79.0,222.0,132.0,30.0,128.0,41.5


In [12]:
# Count missing values per column.
data.isna().sum()

Unnamed: 0,0
User_ID,0
Calories,0
Gender,0
Age,0
Height,0
Weight,0
Duration,0
Heart_Rate,0
Body_Temp,0


In [13]:
# True if any row is an exact repeat of an earlier row.
data.duplicated(keep='first').any()

np.False_

In [14]:
# Features: every column except the identifier and the target.
X = data.drop(['User_ID', 'Calories'], axis=1)
# Target: the Calories column.
y = data['Calories']

In [15]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# List the column names so the transformer column lists in the next cell can
# be written against them.
data.columns

Index(['User_ID', 'Calories', 'Gender', 'Age', 'Height', 'Weight', 'Duration',
       'Heart_Rate', 'Body_Temp'],
      dtype='object')

In [17]:
# Column-wise preprocessing: encode the Gender strings as ordinal codes and
# standardise the six numeric measurements. Any column not listed would be
# passed through unchanged.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        ('ordinal', OrdinalEncoder(), ['Gender']),
        (
            'num',
            StandardScaler(),
            ['Age', 'Height', 'Weight', 'Duration', 'Heart_Rate', 'Body_Temp'],
        ),
    ],
)

In [18]:
# Chain preprocessing and the gradient-boosted regressor so both are fitted,
# cross-validated and pickled as a single estimator.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor()),
])

In [19]:
# Fit the preprocessing steps and the model on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [20]:
# Predict calories for the held-out rows.
y_pred = pipeline.predict(X=X_test)

In [21]:
# Coefficient of determination on the held-out split.
r2_score(y_true=y_test, y_pred=y_pred)

0.9988678909361673

In [22]:
# Mean absolute error on the held-out split, in the same units as Calories.
mean_absolute_error(y_true=y_test, y_pred=y_pred)

1.4981198125282924

In [23]:
# Shuffled 5-fold cross-validation of the whole pipeline over the full data
# set, scored with R^2. The seed fixes the fold assignment.
kfold = KFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)
cv_results = cross_val_score(estimator=pipeline, X=X, y=y, scoring='r2', cv=kfold)

In [24]:
# Average R^2 across the five folds.
np.mean(cv_results)

np.float64(0.9988510864545181)

In [26]:
# Persist the fitted pipeline (preprocessing + model) to the Drive folder.
file_path = '/content/drive/MyDrive/Calorie/pipeline_model.pkl'

with open(file_path, mode='wb') as model_file:
    pickle.dump(pipeline, model_file)