In [49]:
import pandas as pd

#Load the data
df = pd.read_csv("../data/obesity.csv")

In [50]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
import joblib
#Encoding the data in machine understandable way

#Binary encoding for yes/no to 1/0
binary_features = ["family_history_with_overweight", "FAVC", "SMOKE", "SCC"]
df[binary_features] = df[binary_features].replace({"yes":1, "no":0})

#Encode the target column
label_encoder = LabelEncoder()
df["NObeyesdad"] = label_encoder.fit_transform(df["NObeyesdad"])

joblib.dump(label_encoder, "label_encoder.pkl")

#One-Hot Encoding for categorical features
categorical_features =["Gender", "CAEC", "CALC", "MTRANS"]
encoder = OneHotEncoder(drop="first",sparse_output=False)
encoded_cats = encoder.fit_transform(df[categorical_features])
joblib.dump(encoder, "encoder.pkl") #Save the encoder

#convert encoded categories to DataFrame
encoded_cat_df = pd.DataFrame(encoded_cats,columns=encoder.get_feature_names_out(categorical_features))

#drop original categoriacla columns and merge encoded data
df = df.drop(columns=categorical_features)
df = pd.concat([df, encoded_cat_df], axis=1)

#scale the continuous features
continuous_features = ["Age", "Height", "Weight", "NCP", "CH2O", "FAF"]
scaler = StandardScaler()
df[continuous_features] = scaler.fit_transform(df[continuous_features])
joblib.dump(scaler, "scaler.pkl")

df.info()
df.to_csv("../data/obesity_nice_encoded.csv")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2111 entries, 0 to 2110
Data columns (total 24 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Age                             2111 non-null   float64
 1   Height                          2111 non-null   float64
 2   Weight                          2111 non-null   float64
 3   family_history_with_overweight  2111 non-null   int64  
 4   FAVC                            2111 non-null   int64  
 5   FCVC                            2111 non-null   float64
 6   NCP                             2111 non-null   float64
 7   SMOKE                           2111 non-null   int64  
 8   CH2O                            2111 non-null   float64
 9   SCC                             2111 non-null   int64  
 10  FAF                             2111 non-null   float64
 11  TUE                             2111 non-null   float64
 12  NObeyesdad                      21

  df[binary_features] = df[binary_features].replace({"yes":1, "no":0})


In [51]:
df = df.sample(frac=1).reset_index(drop=True)
df = df.sample(frac=1).reset_index(drop=True)
df.dropna()
df

Unnamed: 0,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,SMOKE,CH2O,SCC,...,CAEC_Frequently,CAEC_Sometimes,CAEC_no,CALC_Frequently,CALC_Sometimes,CALC_no,MTRANS_Bike,MTRANS_Motorbike,MTRANS_Public_Transportation,MTRANS_Walking
0,-0.679083,1.589564,-0.051787,1,1,2.15,0.404102,0,0.150043,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
1,0.264970,1.161052,0.742944,1,1,3.00,0.404102,0,1.390243,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,1.366366,-1.624276,-1.206266,1,1,3.00,-2.166941,0,1.618701,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
3,-0.521741,-1.838532,-0.824368,1,0,2.00,0.404102,0,-1.644983,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.264970,-0.660124,0.965973,1,1,3.00,0.404102,0,1.243377,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2106,-0.521741,-0.874380,-0.633418,0,1,2.00,-2.166941,0,1.292332,0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2107,-0.993768,1.375308,-1.017990,1,1,2.86,1.689623,0,-0.013141,0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2108,-0.207057,-0.231612,-0.633418,0,1,2.00,0.404102,0,-0.013141,0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0
2109,0.579654,1.375308,1.308918,1,1,3.00,0.404102,0,1.618701,0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [None]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

#Split the dataset
x = df.drop(columns=["NObeyesdad"])
y = df["NObeyesdad"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

model = RandomForestClassifier(n_estimators=500, max_depth=None, random_state=42,min_samples_split=5,min_samples_leaf=1)
model.fit(x_train, y_train)
cv_scores_rf = cross_val_score(model, x_train, y_train, cv=5, scoring='accuracy')

print("Random Forest Mean Accuracy:", cv_scores_rf.mean())
print("Random Forest Std Dev:", cv_scores_rf.std())

In [None]:
y_pred = model.predict(x_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test,y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Absolute Error (MAE): {mae:.4f}")
print(f"Mean Squared Error (MSE): {mse:.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.4f}")
print(f"R² Score: {r2:.4f}")  # Measures how well the model fits

NameError: name 'mean_absolute_error' is not defined