In [26]:
import pandas as pd

# Step 1: Load dataset
df = pd.read_csv('Final_combined_dataset.csv')

# Preview columns and data
print(df.columns)
print(df.head())


Index(['age', 'height', 'weight', 'family_history_with_overweight', 'favc',
       'fcvc', 'ncp', 'smoke', 'ch2o', 'scc', 'faf', 'tue', 'gender_Male',
       'caec_Always', 'caec_Frequently', 'caec_Sometimes', 'calc_Frequently',
       'calc_Sometimes', 'calc_no', 'mtrans_Bike', 'mtrans_Motorbike',
       'mtrans_Public_Transportation', 'mtrans_Walking', 'nobeyesdad'],
      dtype='object')
    age  height  weight  family_history_with_overweight  favc  fcvc  ncp  \
0  17.0    1.65    67.0                               1     1   3.0  1.0   
1  18.0    1.56    51.0                               1     1   2.0  4.0   
2  17.0    1.75    57.0                               1     1   3.0  3.0   
3  15.0    1.65    86.0                               1     1   3.0  3.0   
4  17.0    1.70    85.0                               1     0   2.0  3.0   

   smoke  ch2o  scc  ...  caec_Frequently  caec_Sometimes  calc_Frequently  \
0      0   2.0    0  ...            False            True            Fa

In [27]:
# Step 2: Drop irrelevant columns
df = df.drop(columns=['smoke', 'calc_Frequently', 'calc_Sometimes', 'calc_no'])


In [28]:
# Strip whitespace from all object (categorical) columns
df = df.apply(lambda col: col.str.strip() if col.dtype == "object" else col)

In [29]:
# Show unique values for every column in the DataFrame
for col in df.columns:
    print(f"{col} → {df[col].unique()}")

age → [17.       18.       15.       16.       14.       17.188754 17.038222
 16.30687  16.198153 17.486869 17.082867 17.000433 16.270434 17.908114
 17.120699 17.065445 17.767432 17.402028 17.70368  17.210933 17.469417
 16.496978 17.377131 16.611837 17.080493 17.764764 17.06713  17.491272
 17.580627 17.729923 16.834813 17.521754 17.405104 16.613108 16.928791
 17.758315 17.282945 16.910997 17.203917 17.671064 17.823438 17.888073
 17.925497 17.000752 17.362129 16.941489 16.172992 17.451085 16.950499
 16.38009  16.865984 16.093234 17.781183 17.08525  17.420269 17.807828
 16.240576 16.370009 17.992717 17.099015 17.25813  17.570089 17.971574
 17.178483 17.441593 17.894784 17.689057 17.997009 17.971786 17.039058
 17.6739   16.129279 16.913841 17.052914 17.073648 17.412629 17.768071
 17.503343 16.178483 17.370009 17.049121 17.024853 17.216048 17.431786
 16.120699 16.140751 17.06739  16.931489 16.261402 17.290765 17.684891
 17.288249 17.068767 16.203917 17.38009  17.504873 16.441786 17.288205]

In [None]:
# Optional: filter children only if 'Age' is available
#if 'Age' in df.columns:
#    df = df[df['Age'] <= 18]

#in pre_processing step filter made between the age 14 and 18

In [30]:
print(df.dtypes)

age                               float64
height                            float64
weight                            float64
family_history_with_overweight      int64
favc                                int64
fcvc                              float64
ncp                               float64
ch2o                              float64
scc                                 int64
faf                               float64
tue                               float64
gender_Male                          bool
caec_Always                          bool
caec_Frequently                      bool
caec_Sometimes                       bool
mtrans_Bike                          bool
mtrans_Motorbike                     bool
mtrans_Public_Transportation         bool
mtrans_Walking                       bool
nobeyesdad                          int64
dtype: object


In [None]:
# Step 3: Encode categorical columns

#from sklearn.preprocessing import LabelEncoder

#df_encoded = df.copy()
#label_encoders = {}
#for col in df_encoded.columns:
#    if df_encoded[col].dtype == 'object':
#        le = LabelEncoder()
#        df_encoded[col] = le.fit_transform(df_encoded[col])
#        label_encoders[col] = le

In [None]:
# Preview columns and data after encoded (to check all are numerical values)
#print(df_encoded.columns)
#print(df_encoded.head())

In [31]:
# Step 4: Prepare features and target
X = df.drop('nobeyesdad', axis=1)
y = df['nobeyesdad']

In [32]:
# Step 5: Split data

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [33]:
# Step 6: Train model

from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [36]:
# Step 7: Evaluate

from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.8569206842923794
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.94      0.92       231
           1       0.84      0.82      0.83       182
           2       0.89      0.92      0.91        90
           3       0.00      0.00      0.00         3
           5       0.80      0.62      0.70        66
           6       0.79      0.83      0.81        71

    accuracy                           0.86       643
   macro avg       0.70      0.69      0.69       643
weighted avg       0.85      0.86      0.85       643



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [41]:
label_encoders = {
    'family_history_with_overweight': {'no': 0, 'yes': 1},
    'favc': {'no': 0, 'yes': 1},
    'scc': {'no': 0, 'yes': 1},
    'nobeyesdad': {
        'Insufficient Weight': 0,
        'Normal Weight': 1,
        'Obesity Type_I': 2,
        'Obesity Type_II': 3,
        'Obesity Type_III': 4,
        'Overweight Level_I': 5,
        'Overweight Level_II': 6
    }
}


In [42]:
# Step 8: Export model and encoders
import joblib

joblib.dump(model, 'obesity_model.pkl')
joblib.dump(label_encoders, 'encoders.pkl')

['encoders.pkl']