In [104]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler


**Context**

Welcome to the Scaler Healthcare data analysis team! As part of our ongoing efforts to understand and combat obesity globally, we're leveraging data to gain insights into factors contributing to obesity. You've been tasked with analyzing the Obesity Dataset, applying the k-Nearest Neighbors (kNN) algorithm to predict obesity levels based on individuals' eating habits and physical conditions.


**Dataset Description**

The dataset you'll be working with contains attributes related to individuals' eating habits and physical conditions. Here's a breakdown of the features you'll encounter:

**Eating Habits Attributes**:

- FAVC (Frequent consumption of high caloric food): Indicates if the individual frequently eats high caloric food.
- FCVC (Frequency of consumption of vegetables): Reflects how often the individual consumes vegetables.
- NCP (Number of main meals): Represents the number of main meals the individual has in a day.
- CAEC (Consumption of food between meals): Shows how frequently the individual eats between meals.
- CH20 (Consumption of water daily): Details the daily water consumption.
- CALC (Consumption of alcohol): Provides information on the individual's alcohol consumption.

**Physical Condition Attributes**:
- SCC (Calories consumption monitoring): Indicates if the individual monitors their calorie intake.
- FAF (Physical activity frequency): Reflects the frequency of physical activity.
- TUE (Time using technology devices): Denotes the time spent using technology devices.
- MTRANS (Transportation used): Details the primary mode of transportation.

In [105]:

df = pd.read_csv('../data/ObesityDataSet.csv')
df.head(10)

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II
5,Male,29.0,1.62,53.0,no,yes,2.0,3.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Automobile,Normal_Weight
6,Female,23.0,1.5,55.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,0.0,Sometimes,Motorbike,Normal_Weight
7,Male,22.0,1.64,53.0,no,no,2.0,3.0,Sometimes,no,2.0,no,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
8,Male,24.0,1.78,64.0,yes,yes,3.0,3.0,Sometimes,no,2.0,no,1.0,1.0,Frequently,Public_Transportation,Normal_Weight
9,Male,22.0,1.72,68.0,yes,yes,2.0,3.0,Sometimes,no,2.0,no,1.0,1.0,no,Public_Transportation,Normal_Weight


In [106]:
def skewness_based_on_mean_median(data):
    """
    Calculate skewness based on mean and median.

    """
    mean_val = np.mean(data).round(2)
    median_val = np.median(data).round(2)
    
    skewness = 3 * (mean_val - median_val) / np.std(data)
    print(f"Skewness based on mean {mean_val} and median {median_val} -->  {np.round(skewness, 2)}")
    
    if skewness > 0:
        print(f"The distribution is right-skewed.")
    elif skewness < 0:
        print(f"The distribution is left-skewed.")
    else:
        print(f"The distribution is approximately symmetric.")
    
    return skewness

numerical_features = df.select_dtypes(include=['number']).columns
for nfeat in numerical_features : 
    print (nfeat)
    skewness_based_on_mean_median(df[nfeat])
    print('*'*20)

Age
Skewness based on mean 24.31 and median 22.78 -->  0.72
The distribution is right-skewed.
********************
Height
Skewness based on mean 1.7 and median 1.7 -->  0.0
The distribution is approximately symmetric.
********************
Weight
Skewness based on mean 86.59 and median 83.0 -->  0.41
The distribution is right-skewed.
********************
FCVC
Skewness based on mean 2.42 and median 2.39 -->  0.17
The distribution is right-skewed.
********************
NCP
Skewness based on mean 2.69 and median 3.0 -->  -1.2
The distribution is left-skewed.
********************
CH2O
Skewness based on mean 2.01 and median 2.0 -->  0.05
The distribution is right-skewed.
********************
FAF
Skewness based on mean 1.01 and median 1.0 -->  0.04
The distribution is right-skewed.
********************
TUE
Skewness based on mean 0.66 and median 0.63 -->  0.15
The distribution is right-skewed.
********************


In [4]:
# split the data into training and test split

In [107]:
cat_features = df.select_dtypes(exclude=['number']).columns
cat_features

Index(['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE',
       'SCC', 'CALC', 'MTRANS', 'NObeyesdad'],
      dtype='object')

In [108]:
from sklearn.model_selection import train_test_split


# Label Encoding

In [109]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import pandas as pd

In [111]:
numeric = df.select_dtypes(include='number').columns
categorical = df.select_dtypes(exclude='number').columns



In [113]:
en = LabelEncoder()
df_encoded = pd.DataFrame()
for col in cat_features:
    df_encoded[col] = en.fit_transform(df[col])


In [114]:
df_encoded

Unnamed: 0,Gender,family_history_with_overweight,FAVC,CAEC,SMOKE,SCC,CALC,MTRANS,NObeyesdad
0,0,1,0,2,0,0,3,3,1
1,0,1,0,2,1,1,2,3,1
2,1,1,0,2,0,0,1,3,1
3,1,0,0,2,0,0,1,4,5
4,1,0,0,2,0,0,2,3,6
...,...,...,...,...,...,...,...,...,...
2106,0,1,1,2,0,0,2,3,4
2107,0,1,1,2,0,0,2,3,4
2108,0,1,1,2,0,0,2,3,4
2109,0,1,1,2,0,0,2,3,4


In [115]:
std = StandardScaler()
df_scaled = pd.DataFrame(std.fit_transform(df[numeric]), columns=numeric)

TypeError: concat() takes 1 positional argument but 2 were given

In [103]:
df

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,0,-0.522124,-0.875589,-0.862558,1,0,-0.785019,0.404153,2,0,-0.013073,0,-1.188039,0.561997,3,3,1
1,0,-0.522124,-1.947599,-1.168077,1,0,1.088342,0.404153,2,1,1.618759,1,2.339750,-1.080625,2,3,1
2,1,-0.206889,1.054029,-0.366090,1,0,-0.785019,0.404153,2,0,-0.013073,0,1.163820,0.561997,1,3,1
3,1,0.423582,1.054029,0.015808,0,0,1.088342,0.404153,2,0,-0.013073,0,1.163820,-1.080625,1,4,5
4,1,-0.364507,0.839627,0.122740,0,0,-0.785019,-2.167023,2,0,-0.013073,0,-1.188039,-1.080625,2,3,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,0,-0.525774,0.097045,1.711763,1,1,1.088342,0.404153,2,0,-0.456705,0,0.783135,0.407996,2,3,4
2107,0,-0.367195,0.502844,1.800914,1,1,1.088342,0.404153,2,0,-0.004702,0,0.389341,-0.096251,2,3,4
2108,0,-0.281909,0.541672,1.798868,1,1,1.088342,0.404153,2,0,0.075361,0,0.474971,-0.019018,2,3,4
2109,0,0.007776,0.404927,1.785780,1,1,1.088342,0.404153,2,0,1.377801,0,0.151471,-0.117991,2,3,4


In [85]:
df[cat_features]

Unnamed: 0,NObeyesdad
0,1
1,1
2,1
3,5
4,6
...,...
2106,4
2107,4
2108,4
2109,4


In [64]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.25, random_state=42)

In [66]:
X_train.shape, X_test.shape

((1583, 16), (528, 16))

In [67]:
y_train.shape, y_test.shape

((1583,), (528,))