In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

from IPython.display import HTML
import plotly.express as px

import warnings
warnings.filterwarnings('ignore')

In [None]:
import seaborn as sns

In [None]:
# Load the two raw CSV tables (calories first, then exercise) into DataFrames.
calories, exercise = (
    pd.read_csv(csv_path) for csv_path in ("calories.csv", "exercise.csv")
)

In [None]:
# Preview the first rows of the calories table.
calories.head()

In [None]:
# Preview the first rows of the exercise table.
exercise.head()

In [None]:
# Join the exercise and calories tables on their shared User_ID key
# (default inner join), then preview the combined frame.
exercise_df = pd.merge(exercise, calories, on = "User_ID")
exercise_df.head()

In [None]:
# DataFrame.shape is (rows, columns): shape[0] is the number of instances,
# shape[1] the number of columns.
n_instances, n_columns = exercise_df.shape
print("This Dataset has ", n_instances, " instances and ", n_columns, " columns ")

In [None]:
# Show the column labels of the merged dataset.
exercise_df.columns

In [None]:
# Iterate through the columns and print them as a 1-based numbered list.
print("Columns : ")
for position, column in enumerate(exercise_df.columns, start = 1):
    print("\t", position, ".", column)

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
exercise_df.describe()

In [None]:
# Draw one horizontal box plot per numeric column of the merged dataset.
# Numeric columns are selected explicitly instead of wrapping every plot in a
# bare `except:`; the old handler hid *any* failure (typos, bad data, even a
# keyboard interrupt), not just the non-numeric columns it was meant to skip.
c = ['b','g','r','c','m','y','k','w','b']
numeric_columns = exercise_df.select_dtypes(include = "number").columns
# squeeze = False keeps `axes` a 2-D array even if only one column is plotted,
# so the flatten() below always works.
fig1, axes = plt.subplots(len(numeric_columns), 1 , figsize = (10, 20), squeeze = False)
plt.subplots_adjust(wspace = 0.3, hspace = 0.7)
axes = axes.flatten()
for i, column in enumerate(numeric_columns):
    # Cycle through the palette so extra columns can never index past its end.
    sns.boxplot(data = exercise_df, x = column, color = c[i % len(c)], ax = axes[i])

In [None]:
# info() lists each column's name, non-null count and dtype.
exercise_df.info()

In [None]:
# Visualise missing values: the heatmap is drawn over the boolean isnull() mask,
# so any null cell stands out from the rest.
missing_mask = exercise_df.isnull()
sns.heatmap(
    missing_mask,
    cmap = "viridis",
    cbar = False,
    yticklabels = False,
)

In [None]:
# Keep a single row per User_ID (the last occurrence) and print the shape
# before and after, so the number of removed rows is visible.
print("The shape of the dataset before duplicates : ", exercise_df.shape)
exercise_df.drop_duplicates(subset = ['User_ID'], keep = 'last',inplace = True)
# This second message previously also said "before", mislabelling the result.
print("The shape of the dataset after duplicates : ", exercise_df.shape)

In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
exercise_train_data, exercise_test_data = train_test_split(
    exercise_df,
    test_size = 0.2,
    random_state = 1,
)
print("Shape of train dataset : ", exercise_train_data.shape)
print("Shape of test dataset : ", exercise_test_data.shape)

In [None]:
# Compare train and test distributions: one row per numeric column, with the
# train KDE in the left panel and the test KDE in the right panel.
# Numeric columns are selected explicitly rather than relying on a bare
# `except:` to skip columns that cannot be plotted; that handler also hid
# every other kind of error.
c = ['b','g','r','c','m','y','k','w','b']
numeric_columns = exercise_train_data.select_dtypes(include = "number").columns
# squeeze = False guarantees a 2-D (rows, 2) axes array even for a single row.
fig1, axes = plt.subplots(len(numeric_columns) , 2 , figsize = (10, 20), squeeze = False)
plt.subplots_adjust(wspace = 0.3, hspace = 0.7)

splits = (("Train", exercise_train_data), ("Test", exercise_test_data))
for i, column in enumerate(numeric_columns):
    # Same colour for both panels of a row; cycle so the palette never runs out.
    color = c[i % len(c)]
    for ax, (split_name, split_data) in zip(axes[i], splits):
        ax.title.set_text(column + " " + split_name + " Distribution")
        sns.kdeplot(data = split_data, x = column, ax = ax, color = color)
plt.show()

In [None]:
# Pairwise scatter/density grid of the selected training columns, coloured by Gender.
pairplot_columns = ["Weight", "Height" , "Duration" , "Heart_Rate" , "Calories" , "Gender"]
sns.pairplot(exercise_train_data[pairplot_columns], hue = "Gender")

In [None]:
# Report the age range covered by the training split.
train_ages = exercise_train_data["Age"]
print("Minimum age in dataset is : ", train_ages.min())
print("Maximum age in dataset is : ", train_ages.max())


In [None]:
# Bucket training ages into three left-closed bands: [20, 40), [40, 60), [60, 80).
# With right = False an age of exactly 80 (or anything outside the edges) maps to NaN.
age_groups = ["Young", "Middled-Aged" , "old"]
age_bin_edges = (20, 40, 60, 80)
exercise_train_data["Age_groups"] = pd.cut(
    exercise_train_data["Age"],
    bins = age_bin_edges,
    labels = age_groups,
    right = False,
)
exercise_train_data["Age_groups"].head()

In [None]:
# Number of training rows that fall into each age group.
exercise_train_data["Age_groups"].value_counts()

In [None]:
# Bar chart of how many training rows belong to each age group.
plt.rcParams["figure.figsize"] = (8, 6)
sns.countplot(x = "Age_groups", data = exercise_train_data)

In [None]:
# Interactive box plot: Calories per age group, split by Gender.
fig = px.box(
    exercise_train_data,
    x = "Age_groups",
    y = "Calories",
    color = "Gender",
)
fig.update_layout(width = 700, height = 450)
fig.show()

In [None]:
# Interactive box plot: Duration per age group, split by Gender.
fig = px.box(
    exercise_train_data,
    x = "Age_groups",
    y = "Duration",
    color = "Gender",
)
fig.update_layout(width = 700, height = 450)
fig.show()

In [None]:
# Central tendency of the Duration column in the training split.
train_duration = exercise_train_data["Duration"]
print("Dataset's median exercise duration in minutes : ", train_duration.median())
print("Dataset's mean exercise duration in minutes : ", train_duration.mean())

In [None]:
# Bar chart of how many training rows there are per Gender value.
plt.rcParams["figure.figsize"] = (8, 6)
sns.countplot(x = "Gender", data = exercise_train_data)

In [None]:
# Interactive box plot: Duration distribution for each Gender value.
fig = px.box(exercise_train_data, x = "Gender", y = "Duration")
fig.update_layout(width = 700, height = 450)
fig.show()

In [None]:
# Interactive box plot: Heart_Rate distribution for each Gender value.
fig = px.box(exercise_train_data, x = "Gender", y = "Heart_Rate")
fig.update_layout(width = 700, height = 450)
fig.show()

In [None]:
# Add a BMI column (weight / height^2, rounded to 2 decimals) to both splits.
# Height is divided by 100 first — presumably cm -> m, with Weight in kg; TODO confirm units.
for data in [exercise_train_data , exercise_test_data]:
    height_scaled = data["Height"] / 100
    data["BMI"] = (data["Weight"] / height_scaled ** 2).round(2)

In [None]:
# Label each training row with a BMI category using left-closed bins;
# a BMI of 50 or more falls outside the last edge and becomes NaN.
bmi_category = ["Very severely underweight" , "severely underweight" ,"Underweight" , "Normal" , "Overweight" , "Obese Class I" , "Obese Class II" , "Obese Class III"]
bmi_bin_edges = (0 , 15 , 16 , 18.5 , 25 , 30 , 35 , 40 , 50)
categorized_bmi = pd.cut(
    exercise_train_data["BMI"],
    bins = bmi_bin_edges,
    labels = bmi_category,
    right = False,
)
# Store the labels as plain objects rather than a categorical dtype.
exercise_train_data["Categorized_BMI"] = categorized_bmi.astype("object")
exercise_train_data.head()

In [None]:
# Tabulate how many training rows fall into each BMI category.
bmi_counts = exercise_train_data["Categorized_BMI"].value_counts()
ds = bmi_counts.reset_index()
ds.columns = ["Categorized_BMI" , "Count"]
ds

In [None]:
# Restrict the counts to the "Normal" and "Overweight" categories and plot them.
ds = ds[ds["Categorized_BMI"].isin(["Normal", "Overweight"])]

plt.rcParams["figure.figsize"] = (8, 6)
sns.barplot(x = "Categorized_BMI", y = "Count", data = ds)

In [None]:
# Count training rows per (Gender, BMI category) pair, ordered by both keys.
gender_bmi_counts = exercise_train_data[["Gender", "Categorized_BMI"]].value_counts()
ds = gender_bmi_counts.reset_index().sort_values(by = ["Gender" , "Categorized_BMI"])
ds.columns = ["Gender" , "Categorized_BMI" , "Count"]
ds

In [None]:
# Grouped bar chart of BMI-category counts, one bar per Gender value.
plt.rcParams["figure.figsize"] = (8, 6)
sns.barplot(x = "Categorized_BMI", y = "Count", hue = "Gender", data = ds)

In [None]:
# Count training rows per (age group, BMI category) pair, ordered by both keys.
# The age-band column on exercise_train_data is named "Age_groups" (capital A);
# selecting "age_groups" raised a KeyError. The result keeps the lowercase
# "age_groups" header that this cell has always assigned to `ds`.
ds = exercise_train_data[["Age_groups" , "Categorized_BMI"]].value_counts().reset_index().sort_values(by = ["Age_groups" , "Categorized_BMI"])
ds.columns = ["age_groups" , "Categorized_BMI" , "Count"]