# Predicting obesity level based on eating habits and physical condition

In [1]:
# import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.pipeline import make_pipeline

In [2]:
# install necessary packages
# keep the install line below commented out when re-running the notebook
#!pip install ucimlrepo

In [3]:
# Load the dataset (fetched by id 544 through the ucimlrepo package)

from ucimlrepo import fetch_ucirepo

# fetch the dataset object
estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition = fetch_ucirepo(id=544)

# split out the feature and target tables (pandas dataframes)
uci_data = estimation_of_obesity_levels_based_on_eating_habits_and_physical_condition.data
features, target = uci_data.features, uci_data.targets

# combine both tables column-wise into one dataframe, target on the right
merged_df = pd.concat(objs=[features, target], axis="columns")

In [4]:
# View the merged dataframe: as the last expression in the cell it is rendered
# as the cell output (the rendering elides the middle rows with "...")
merged_df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.000000,1.620000,64.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,0.000000,1.000000,no,Public_Transportation,Normal_Weight
1,Female,21.000000,1.520000,56.000000,yes,no,3.0,3.0,Sometimes,yes,3.000000,yes,3.000000,0.000000,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.000000,1.800000,77.000000,yes,no,2.0,3.0,Sometimes,no,2.000000,no,2.000000,1.000000,Frequently,Public_Transportation,Normal_Weight
3,Male,27.000000,1.800000,87.000000,no,no,3.0,3.0,Sometimes,no,2.000000,no,2.000000,0.000000,Frequently,Walking,Overweight_Level_I
4,Male,22.000000,1.780000,89.800000,no,no,2.0,1.0,Sometimes,no,2.000000,no,0.000000,0.000000,Sometimes,Public_Transportation,Overweight_Level_II
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,Female,20.976842,1.710730,131.408528,yes,yes,3.0,3.0,Sometimes,no,1.728139,no,1.676269,0.906247,Sometimes,Public_Transportation,Obesity_Type_III
2107,Female,21.982942,1.748584,133.742943,yes,yes,3.0,3.0,Sometimes,no,2.005130,no,1.341390,0.599270,Sometimes,Public_Transportation,Obesity_Type_III
2108,Female,22.524036,1.752206,133.689352,yes,yes,3.0,3.0,Sometimes,no,2.054193,no,1.414209,0.646288,Sometimes,Public_Transportation,Obesity_Type_III
2109,Female,24.361936,1.739450,133.346641,yes,yes,3.0,3.0,Sometimes,no,2.852339,no,1.139107,0.586035,Sometimes,Public_Transportation,Obesity_Type_III


In [5]:
# Report the dimensions of the dataframe
# NOTE: merged_df holds the features plus the target, so the column count
# printed below includes the target column as well as the predictors.
n_rows, n_cols = merged_df.shape
print(f"The dataframe has {n_rows} instances and {n_cols} features.")

The dataframe has 2111 instances and 17 features.


In [6]:
# Review summary information of the dataframe: index range, column names,
# non-null counts, and dtypes
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          2111 non-null   object 
 1   Age                             2111 non-null   float64
 2   Height                          2111 non-null   float64
 3   Weight                          2111 non-null   float64
 4   family_history_with_overweight  2111 non-null   object 
 5   FAVC                            2111 non-null   object 
 6   FCVC                            2111 non-null   float64
 7   NCP                             2111 non-null   float64
 8   CAEC                            2111 non-null   object 
 9   SMOKE                           2111 non-null   object 
 10  CH2O                            2111 non-null   float64
 11  SCC                             2111 non-null   object 
 12  FAF                             21

In [7]:
# Give the columns descriptive, lower-case names.
# rename_dict maps each original column name to its replacement; columns that
# are not listed (e.g. family_history_with_overweight) keep their current name.

rename_dict = dict(
    Gender='gender',
    Age='age',
    Height='height',
    Weight='weight',
    FAVC='frequent_high_calorie_intake',
    FCVC='frequent_vegetable_intake',
    NCP='meals_per_day',
    CAEC='food_intake_between_meals',
    SMOKE='smoker',
    CH2O='daily_water_intake',
    SCC='monitor_calories',
    FAF='days_per_week_with_physical_activity',
    TUE='daily_screen_time',
    CALC='frequent_alcohol_intake',
    MTRANS='mode_of_transportation',
    NObeyesdad='obesity_level',
)

# apply the mapping, then print the summary again to confirm the new names
merged_df = merged_df.rename(columns=rename_dict)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 17 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   gender                                2111 non-null   object 
 1   age                                   2111 non-null   float64
 2   height                                2111 non-null   float64
 3   weight                                2111 non-null   float64
 4   family_history_with_overweight        2111 non-null   object 
 5   frequent_high_calorie_intake          2111 non-null   object 
 6   frequent_vegetable_intake             2111 non-null   float64
 7   meals_per_day                         2111 non-null   float64
 8   food_intake_between_meals             2111 non-null   object 
 9   smoker                                2111 non-null   object 
 10  daily_water_intake                    2111 non-null   float64
 11  monitor_calories 