### Import Libraries and Load Data

In [1]:
import pandas as pd

In [2]:
# Read both competition files. The test frame gets a placeholder `Calories`
# column so it can be stacked with the training frame during preprocessing.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv').assign(Calories=0)

### Data Preparation Function

In [None]:
def Preprocess(df_train, df_test):
    """Build model-ready features for the train and test frames in one pass.

    The two frames are stacked before any encoding so that the one-hot
    columns for ``Sex`` and the equal-width ``Age`` bins are derived from the
    same data and therefore line up between the two outputs.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames expected to share the same columns, including ``id``, ``Sex``,
        ``Age``, ``Height`` and ``Weight``. Neither input is modified.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``: the first ``len(df_train)`` rows of the processed
        data and the remaining rows. Both are independent copies, so callers
        can modify them without pandas chained-assignment warnings.

    Raises
    ------
    KeyError
        If a required column (for example ``id``) is missing.
    """
    n_train = len(df_train)

    # Stack the frames and discard the identifier, which is not a feature.
    combined = pd.concat([df_train, df_test]).drop(columns='id')

    # Create feature columns (BMI).
    # `Height` is divided by 100 (presumably centimetres -> metres).
    combined['height'] = combined['Height'] / 100
    combined['BMI'] = combined['Weight'] / (combined['height'] ** 2)
    combined = pd.get_dummies(combined, columns=['Sex'])

    # Create bin columns to split the data.
    # Age: four equal-width bins over the observed range, labelled 1-4.
    combined['Age'] = pd.to_numeric(
        pd.cut(combined['Age'], bins=4, labels=[1, 2, 3, 4])
    )
    # BMI: fixed cut points, labelled 1-4 from lowest to highest.
    bmi_edges = [0, 18.5, 24.9, 29.9, float('inf')]
    combined['BMI'] = pd.to_numeric(
        pd.cut(combined['BMI'], bins=bmi_edges, labels=[1, 2, 3, 4])
    )

    # Split back by position and copy, so the results are not slices of
    # `combined` and can be edited freely by the caller.
    train_part = combined.iloc[:n_train].copy()
    test_part = combined.iloc[n_train:].copy()

    return train_part, test_part

In [16]:
# Run the shared preprocessing on both frames, then preview the training features.
train, test = Preprocess(df_train, df_test)
train.head()

Unnamed: 0,Age,Height,Weight,Duration,Heart_Rate,Body_Temp,Calories,height,BMI,Sex_female,Sex_male
0,2,189.0,82.0,26.0,101.0,41.0,150.0,1.89,2,False,True
1,3,163.0,60.0,8.0,85.0,39.7,34.0,1.63,2,True,False
2,3,161.0,64.0,7.0,84.0,39.8,29.0,1.61,2,True,False
3,1,192.0,90.0,25.0,105.0,40.7,140.0,1.92,2,False,True
4,2,166.0,61.0,25.0,102.0,40.6,146.0,1.66,2,True,False


### Model Training

In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_log_error

In [17]:
# Separate the features from the target column.
X = train.drop(columns=['Calories'])
y = train['Calories']
# Hold out 20% of the rows for evaluation. The fixed seed keeps the split,
# and every score computed from it, identical across kernel restarts.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Feature standardization: fit the scaler on the training split only, then
# apply the same fitted transform to both splits.

scaled = StandardScaler().fit(X_train)
X_train_scaled = scaled.transform(X_train)
X_test_scaled = scaled.transform(X_test)

In [8]:
# Linear regression baseline on the standardized features.
model_1 = LinearRegression().fit(X_train_scaled, Y_train)
# Despite its name, `accuracy` holds the regressor's `score` on the hold-out
# split (the coefficient of determination, R^2), not a classification accuracy.
accuracy = model_1.score(X_test_scaled, Y_test)
accuracy

0.9672362959074906

In [28]:
# Decision tree on the standardized features. The seed pins the tree's
# internal randomness so the fitted model, and the submission built from it,
# can be reproduced on a fresh kernel.
model_2 = DecisionTreeRegressor(random_state=42)
model_2.fit(X_train_scaled, Y_train)
# `accuracy` holds the hold-out predictions; the displayed value is their
# mean squared log error against the true targets (lower is better).
accuracy = model_2.predict(X_test_scaled)
mean_squared_log_error(Y_test, accuracy)

np.float64(0.009445910599175112)

In [19]:
# Gradient-boosted trees with default settings on the standardized features.
model_3 = XGBRegressor().fit(X_train_scaled, Y_train)
# `accuracy` holds the hold-out predictions; the cell displays their mean
# squared log error against the true targets (lower is better).
accuracy = model_3.predict(X_test_scaled)
mean_squared_log_error(y_true=Y_test, y_pred=accuracy)

np.float64(0.005239071678137012)

### Overfitting Detection

In [23]:
# 5-fold cross-validation of the linear model on the full, unscaled feature
# matrix; shows the per-fold scores followed by their mean.
score = cross_val_score(estimator=model_1, X=X, y=y, cv=5)
(score, score.mean())

(array([0.96730887, 0.96705561, 0.96697812, 0.96714344, 0.96689002]),
 np.float64(0.9670752117007883))

In [26]:
# 5-fold cross-validation of the decision tree on the full, unscaled feature
# matrix; shows the per-fold scores followed by their mean.
score = cross_val_score(estimator=model_2, X=X, y=y, cv=5)
(score, score.mean())

(array([0.98938592, 0.98941202, 0.98926157, 0.98938554, 0.98931222]),
 np.float64(0.9893514537751866))

In [None]:
# 5-fold cross-validation of the XGBoost model on the full, unscaled feature
# matrix; shows the per-fold scores followed by their mean.
score = cross_val_score(estimator=model_3, X=X, y=y, cv=5)
(score, score.mean())

np.float64(0.9943922232646505)

### Create Final Data (Submission)

In [30]:
# Remove the placeholder target. Reassigning (rather than dropping in place)
# with errors='ignore' keeps this cell safe to re-run.
test = test.drop(columns=['Calories'], errors='ignore')

# model_2 was fitted on standardized features, so the competition rows must go
# through the same fitted scaler before prediction; feeding raw values would
# compare unscaled numbers against thresholds learned in scaled space.
test_scaled = scaled.transform(test)

# `accuracy` holds the predicted Calories for the submission rows.
# NOTE(review): model_3 recorded the lower hold-out error above; confirm that
# model_2 is the intended model for the submission.
accuracy = model_2.predict(test_scaled)

final = pd.DataFrame({'id': df_test['id'], 'Calories': accuracy})
final.to_csv('Submission.csv', index=False)

