In [27]:
import pandas as pd

#Load the data
df = pd.read_csv("../data/obesity.csv")

In [28]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import joblib
#Encoding the data in machine understandable way

#Binary encoding for yes/no to 1/0
binary_features = ["family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
df[binary_features] = df[binary_features].replace({"yes":1, "no":0})

#Encode the target column
label_encoder = LabelEncoder()
df["NObeyesdad"] = label_encoder.fit_transform(df["NObeyesdad"])

joblib.dump(label_encoder, "label_encoder.pkl")

#One-Hot Encoding for categorical features
categorical_features =["Gender", "CAEC", "CALC", "MTRANS"]
encoder = OneHotEncoder(drop="first",sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_features])
joblib.dump(encoder, "encoder.pkl") #Save the encoder

#convert encoded categories to DataFrame
encoded_cat_df = pd.DataFrame(encoded_cats,columns=encoder.get_feature_names_out(categorical_features))

#drop original categoriacla columns and merge encoded data
df = df.drop(columns=categorical_features)
df = pd.concat([df, encoded_cat_df], axis=1)

#scale the continuous features
continuous_features = ["Age", "Height", "Weight", "NCP", "CH2O", "FAF"]
scaler = StandardScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])
joblib.dump(scaler, "scaler.pkl")

df.info()
df.to_csv("../data/obesity_nice_encoded.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             2111 non-null   float64
 1   Height                          2111 non-null   float64
 2   Weight                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   int64  
 4   FAVC                            2111 non-null   int64  
 5   FCVC                            2111 non-null   float64
 6   NCP                             2111 non-null   float64
 7   SMOKE                           2111 non-null   int64  
 8   CH2O                            2111 non-null   float64
 9   SCC                             2111 non-null   int64  
 10  FAF                             2111 non-null   float64
 11  TUE                             2111 non-null   float64
 12  NObeyesdad                      21

  df[binary_features] = df[binary_features].replace({"yes":1, "no":0})


In [29]:
df = df.sample(frac=1).reset_index(drop=True)
df = df.sample(frac=1).reset_index(drop=True)
df.dropna()
df

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,SMOKE,CH2O,SCC,...,CAEC_Frequently,CAEC_Sometimes,CAEC_no,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.364399,-0.981508,-0.161010,1,1,1.88,0.159853,0,0.492730,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.736997,0.625412,1.043117,1,1,2.32,0.404102,0,0.264272,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
2,-0.049714,0.839668,1.298607,1,1,2.11,0.404102,0,0.509048,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.993768,0.411156,-0.000612,1,1,3.00,0.404102,0,-0.013141,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,-0.207057,-0.552996,-1.053507,0,0,3.00,-2.166941,0,-0.013141,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,3.726499,0.411156,-0.070882,1,1,2.54,0.404102,0,-1.400207,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2107,-0.049714,-0.981508,0.510750,1,1,3.00,-1.627022,0,-1.644983,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2108,0.736997,-0.017356,-0.327900,1,1,3.00,0.404102,0,-0.812744,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
2109,2.625104,-1.410020,-0.211039,1,1,2.15,0.378391,0,-0.812744,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

#Split the dataset
x = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]