In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


### Loading Dataset

In [90]:
# Read the obesity dataset and give the two awkwardly named columns shorter names.
COLUMN_RENAMES = {
    'family_history_with_overweight': 'FAM_OVERWEIGHT',
    'NObeyesdad': 'Obesity_level',
}
df_obesity = pd.read_csv("data/ObesityDataSet.csv").rename(columns=COLUMN_RENAMES)

# Label column, and the index of every other column in the frame.
TARGET_VAR = 'Obesity_level'
OTHER_FEATURES = df_obesity.columns.drop(TARGET_VAR)

In [None]:
# Print a summary of the frame (columns, dtypes, non-null counts) right after loading.
df_obesity.info()

In [None]:
# Display the frame to eyeball the values before any encoding is applied.
df_obesity

In [None]:
# Distinct values present in the MTRANS column.
set(df_obesity['MTRANS'])

In [94]:
# Encode the transport mode as a number between 0 and 1:
# walking (0.0) and bike (0.15) at the low end, public transportation (0.5)
# in the middle, motorbike (0.75) and automobile (1.0) at the top.
# Note: Series.map leaves NaN for any value that is not a key of the dict.
MTRANS_to_NUM = {
    'Walking': 0.0,
    'Bike': 0.15,
    'Public_Transportation': 0.5,
    'Motorbike': 0.75,
    'Automobile': 1.0,
}

df_obesity['MTRANS'] = df_obesity['MTRANS'].map(MTRANS_to_NUM)

In [None]:
# Distinct values of the CAEC and CALC columns, shown as a pair of sets.
set(df_obesity['CAEC']), set(df_obesity['CALC'])

In [96]:
# Frequency scale shared by CAEC and CALC, from 'no' (0.0) up to 'Always' (1.0).
FREQ_CATEG_to_NUM = {
    'Always': 1.0,
    'Frequently': 0.66,
    'Sometimes': 0.33,
    'no': 0.0,
}

# Both columns use the same categories, so encode them with one loop.
for freq_col in ('CAEC', 'CALC'):
    df_obesity[freq_col] = df_obesity[freq_col].map(FREQ_CATEG_to_NUM)

In [None]:
# Distinct values of FAM_OVERWEIGHT, FAVC, SMOKE and SCC, shown as a tuple of sets.
set(df_obesity['FAM_OVERWEIGHT']), set(df_obesity['FAVC']), set(df_obesity['SMOKE']), set(df_obesity['SCC'])

In [98]:
# 'yes' / 'no' answers become 1.0 / 0.0.
BIN_CATEG_to_NUM = {'yes': 1.0, 'no': 0.0}

# Apply the same yes/no encoding to each of the four columns listed here.
for bin_col in ('FAM_OVERWEIGHT', 'FAVC', 'SMOKE', 'SCC'):
    df_obesity[bin_col] = df_obesity[bin_col].map(BIN_CATEG_to_NUM)

In [None]:
# Distinct values present in the Gender column.
set(df_obesity['Gender'])

In [100]:
# Binary-encode gender: 'Male' -> 1.0, 'Female' -> 0.0.
GENDER_to_NUM = {'Male': 1.0, 'Female': 0.0}
df_obesity['Gender'] = df_obesity['Gender'].map(GENDER_to_NUM)

In [None]:
# Distinct values present in the Obesity_level (target) column.
set(df_obesity['Obesity_level'])


In [102]:
# Target levels in ascending order; a level's position in this list is its integer code.
OBESITY_LEVELS_ORDERED = [
    'Insufficient_Weight',
    'Normal_Weight',
    'Overweight_Level_I',
    'Overweight_Level_II',
    'Obesity_Type_I',
    'Obesity_Type_II',
    'Obesity_Type_III',
]
OBESITY_TRANSFORM_MAP = {level: code for code, level in enumerate(OBESITY_LEVELS_ORDERED)}

# Replace the level names in the target column with their integer codes (0..6).
df_obesity['Obesity_level'] = df_obesity['Obesity_level'].map(OBESITY_TRANSFORM_MAP)

In [None]:
# Level names in the order they were inserted into the mapping.
list(OBESITY_TRANSFORM_MAP)

In [None]:
# Print the frame summary again to check dtypes and non-null counts after the mappings above.
df_obesity.info()

In [None]:
# Pairwise correlations between all columns.
# Assumes every column is numeric at this point (i.e. the encoding cells have run).
corr = df_obesity.corr()

fig, ax = plt.subplots(figsize=(10, 10))
# Draw on the axes created above explicitly instead of relying on pyplot's
# implicit "current axes", and give the figure a title so it stands alone.
sns.heatmap(corr, cmap='coolwarm', annot=True, fmt=".2f", ax=ax)
ax.set_title("Correlation matrix of the encoded columns")
plt.show()

# Plots

In [106]:
# for i in range(len(OTHER_FEATURES)):
#     fig_bp = plt.figure()
#     ax_bp = sns.boxplot(x=TARGET_VAR, y=OTHER_FEATURES[i], data=df_obesity)

In [None]:
# sns.pairplot(df_obesity.drop(['Gender', 'SMOKE', 'FAM_OVERWEIGHT', 'SCC', 'FAVC'], axis=1), hue=TARGET_VAR, palette = 'Set1')

# Data normalization

In [None]:
from sklearn.preprocessing import LabelEncoder

# Human-readable class labels of the form "<code> (<level>)",
# e.g. "0 (Insufficient_Weight)", keyed by the integer code.
CODE_TO_LABEL = {code: f"{code} ({level})" for level, code in OBESITY_TRANSFORM_MAP.items()}

# The encoder is still fitted on the labels so that it stays available as `enc_m`.
enc_m = LabelEncoder()
enc_m.fit(list(CODE_TO_LABEL.values()))

# LabelEncoder sorts its classes lexicographically. The encoder index only equals
# the numeric code while that sort order matches the numeric order (it would stop
# matching at 10+ classes, where "10 (...)" sorts before "2 (...)"), so check it.
assert enc_m.classes_.tolist() == [CODE_TO_LABEL[code] for code in sorted(CODE_TO_LABEL)], \
    "LabelEncoder class order does not match the numeric obesity codes"

target_col = df_obesity['Obesity_level']
# Re-running this cell must be a no-op: skip the conversion when the column
# already holds the "<code> (<level>)" labels.
if not target_col.isin(enc_m.classes_).all():
    if not target_col.isin(list(CODE_TO_LABEL)).all():
        raise ValueError("Obesity_level contains values that are neither known codes nor known labels")
    # Explicit code -> label lookup; does not depend on the encoder's internal ordering.
    df_obesity['Obesity_level'] = target_col.map(CODE_TO_LABEL)

df_obesity

### Splitting into train and test sets

In [85]:
from sklearn.model_selection import train_test_split

# Fixed seed so that "Restart & Run All" always yields the same split
# (without it every run shuffles differently and results are not reproducible).
SPLIT_SEED = 42

# Features are every column except the target; the target is the label column.
X, y = df_obesity.drop(['Obesity_level'], axis=1), df_obesity['Obesity_level']

# Hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=SPLIT_SEED)

### Standardizing the Variables

In [86]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply that same fitted
# transformation to both splits (the test split is never used for fitting).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## kNN model