## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import roc_curve

## Read in the data

In [2]:
train_df = pd.read_csv("data/train.csv")
X_test = pd.read_csv("data/test.csv")

In [3]:
train_df.shape

(20758, 18)

## EDA

In [4]:
train_df.describe()

Unnamed: 0,id,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0,20758.0
mean,10378.5,23.841804,1.700245,87.887768,2.445908,2.761332,2.029418,0.981747,0.616756
std,5992.46278,5.688072,0.087312,26.379443,0.533218,0.705375,0.608467,0.838302,0.602113
min,0.0,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,5189.25,20.0,1.631856,66.0,2.0,3.0,1.792022,0.008013,0.0
50%,10378.5,22.815416,1.7,84.064875,2.393837,3.0,2.0,1.0,0.573887
75%,15567.75,26.0,1.762887,111.600553,3.0,3.0,2.549617,1.587406,1.0
max,20757.0,61.0,1.975663,165.057269,3.0,4.0,3.0,3.0,2.0


In [5]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20758 entries, 0 to 20757
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              20758 non-null  int64  
 1   Gender                          20758 non-null  object 
 2   Age                             20758 non-null  float64
 3   Height                          20758 non-null  float64
 4   Weight                          20758 non-null  float64
 5   family_history_with_overweight  20758 non-null  object 
 6   FAVC                            20758 non-null  object 
 7   FCVC                            20758 non-null  float64
 8   NCP                             20758 non-null  float64
 9   CAEC                            20758 non-null  object 
 10  SMOKE                           20758 non-null  object 
 11  CH2O                            20758 non-null  float64
 12  SCC                             

<div style="border-radius:10px; border:#808080 solid; padding: 15px; background-color: ##F0E68C ; font-size:100%; text-align:left">

<h3 align="left"><font color=brown> 🔍 Inference:</font></h3>

* We don't have missing values in the dataset.

In [9]:
train_df.nunique()

id                                20758
Gender                                2
Age                                1703
Height                             1833
Weight                             1979
family_history_with_overweight        2
FAVC                                  2
FCVC                                934
NCP                                 689
CAEC                                  4
SMOKE                                 2
CH2O                               1506
SCC                                   2
FAF                                1360
TUE                                1297
CALC                                  3
MTRANS                                5
NObeyesdad                            7
dtype: int64

<div style="border-radius:10px; border:#808080 solid; padding: 15px; background-color: ##F0E68C ; font-size:100%; text-align:left">

<h3 align="left"><font color=brown> 🔍 Inference:</font></h3>

* Drop feature: `id` since it's unique identifier.
* Binary categorical features: `Gender`, `family_history_with_overweight`, `FAVC`, `SMOKE`, `SCC`
* Multiclass categorical features: `CAEC`, `CALC`
* Numerical features: `Age`, `Height`, `Weight`, `FCVC`, `NCP`, `CH2O`, `FAF`, `TUE`
* Target: `NObeyesdad` with 7 categories.

In [10]:
for col in train_df.columns:
    unique_values = train_df[col].unique()
    print(f"Unique values in '{col}': {unique_values}")

Unique values in 'id': [    0     1     2 ... 20755 20756 20757]
Unique values in 'Gender': ['Male' 'Female']
Unique values in 'Age': [24.443011 18.       20.952737 ... 25.746113 38.08886  33.852953]
Unique values in 'Height': [1.699998 1.56     1.71146  ... 1.791366 1.672594 1.536819]
Unique values in 'Weight': [ 81.66995   57.        50.165754 ... 152.063947  79.5       80.615325]
Unique values in 'family_history_with_overweight': ['yes' 'no']
Unique values in 'FAVC': ['yes' 'no']
Unique values in 'FCVC': [2.         1.880534   3.         2.679664   2.919751   1.99124
 1.397468   2.636719   1.         1.392665   2.203962   2.971588
 2.668949   1.98989905 2.417635   2.219186   2.919526   2.263245
 2.649406   1.754401   2.303656   2.020785   2.068834   2.689929
 2.979383   2.225731   2.843456   2.312528   2.962415   2.945967
 2.108638   1.826885   2.200588   2.598051   2.984425   1.387489
 2.76533    2.941627   2.490776   2.801514   2.336044   1.270448
 2.9673     2.325623   2.722161  

In [11]:
train_df.sample(n=10)

Unnamed: 0,id,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
2573,2573,Male,23.0,1.729996,82.414477,yes,yes,2.273548,2.395785,Sometimes,no,1.544357,no,0.256323,1.544357,Sometimes,Public_Transportation,Overweight_Level_II
10073,10073,Male,18.0,1.845399,80.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,0.085388,0.42954,no,Public_Transportation,Overweight_Level_I
19546,19546,Female,25.66668,1.7199,109.810012,yes,yes,3.0,3.0,Sometimes,no,2.523793,no,0.001015,0.912345,Sometimes,Public_Transportation,Obesity_Type_III
2994,2994,Female,25.966504,1.643332,104.790549,yes,yes,3.0,3.0,Sometimes,no,2.476002,no,0.0,0.413106,Sometimes,Public_Transportation,Obesity_Type_III
11683,11683,Female,19.0,1.56,42.0,no,no,3.0,3.0,Frequently,no,1.0,no,1.0,0.0,Sometimes,Public_Transportation,Insufficient_Weight
357,357,Female,20.0,1.58,48.0,no,yes,1.0,1.0,Sometimes,no,1.0,no,0.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8513,8513,Female,21.28253,1.741192,133.043941,yes,yes,3.0,3.0,Sometimes,no,2.852254,no,1.537639,0.758897,Sometimes,Public_Transportation,Obesity_Type_III
2780,2780,Male,26.740655,1.759324,120.423567,yes,yes,3.0,3.0,Sometimes,no,2.827773,no,0.868721,0.333673,Sometimes,Public_Transportation,Obesity_Type_II
6088,6088,Male,19.0,1.62,70.0,no,yes,3.0,1.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I
1950,1950,Male,21.0,1.65,75.0,yes,yes,3.0,4.0,Always,no,2.0,no,1.0,0.0,Sometimes,Public_Transportation,Overweight_Level_I


<div style="border-radius:10px; border:#808080 solid; padding: 15px; background-color: ##F0E68C ; font-size:100%; text-align:left">

<h3 align="left"><font color=brown> 🔍 Some insights:</font></h3>

* reference to research https://dergipark.org.tr/tr/download/article-file/1777821
* Multiclass categorical features: `CAEC`, `CALC` <= maybe can be encoded into decimals between 0 and 1. 
* CAEC Consumption of food between meals (0:No, 1:Sometimes, 2:Frequently, 3:Always)
* MTRANS => transportation used
* CALC Consumption of alcohol (0:No, 1:Sometimes, 2:Frequently, 3:Always)
* Numerical features: `FCVC`(Frequency of eating vegetables), `NCP`(number of main meals), `CH2O`(consumption of water), `FAF`(physical activity frequency), `TUE` (time using technology)
* Target: `NObeyesdad` with 7 categories.