In [113]:
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBClassifier


#### load data

In [116]:
# Read ObesityDataSet1.csv into a DataFrame; the bare name on the last line
# makes the notebook render the frame as the cell output.
data = pd.read_csv(filepath_or_buffer="ObesityDataSet1.csv")
data

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,31,1.87,128.87,yes,yes,2.96,3.00,Sometimes,yes,1.28,no,0.90,1.875,Sometimes,Automobile,Obesity_Type_II
1,Female,18,1.59,40.00,yes,yes,2.00,1.00,Frequently,no,1.00,no,0.00,2.000,no,Public_Transportation,Insufficient_Weight
2,Female,44 years,1.59,77.00,yes,yes,2.00,3.00,Sometimes,no,2.81,no,0.00,0.000,Sometimes,Automobile,Obesity_Type_I
3,Male,31,1.68,102.00,yes,yes,2.94,2.14,Sometimes,no,1.25,no,1.18,0.778,no,Public_Transportation,Obesity_Type_II
4,Male,22,1.74,75.00,yes,yes,3.00,3.00,Frequently,no,1.00,no,1.00,0.000,no,Automobile,Normal_Weight
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1050,Female,19,1.62,69.98,yes,yes,2.25,2.75,Sometimes,no,2.33,no,0.82,0.024,no,Public_Transportation,Overweight_Level_I
1051,Male,27,1.78,113.15,yes,yes,2.22,3.00,Sometimes,no,2.09,no,0.55,1.743,Sometimes,Automobile,Obesity_Type_II
1052,Male,23,1.72,70.00,no,no,2.00,3.00,Sometimes,no,2.00,no,3.00,1.000,Frequently,Public_Transportation,Normal_Weight
1053,Male,21,1.86,89.56,yes,yes,2.00,3.00,Sometimes,no,1.01,no,0.00,0.798,Sometimes,Public_Transportation,Overweight_Level_I


In [118]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1055 non-null   object 
 1   Age                             1055 non-null   object 
 2   Height                          1055 non-null   float64
 3   Weight                          1055 non-null   float64
 4   family_history_with_overweight  1055 non-null   object 
 5   FAVC                            1055 non-null   object 
 6   FCVC                            1055 non-null   float64
 7   NCP                             1055 non-null   float64
 8   CAEC                            1029 non-null   object 
 9   SMOKE                           1055 non-null   object 
 10  CH2O                            1055 non-null   float64
 11  SCC                             1055 non-null   object 
 12  FAF                             10

In [119]:
# Count the missing entries in every column.
data.isna().sum()

Gender                             0
Age                                0
Height                             0
Weight                             0
family_history_with_overweight     0
FAVC                               0
FCVC                               0
NCP                                0
CAEC                              26
SMOKE                              0
CH2O                               0
SCC                                0
FAF                                0
TUE                               37
CALC                               0
MTRANS                             0
NObeyesdad                         0
dtype: int64

##### isi variabel TUE dengan median (karena numerik ordinal)

In [123]:
# Fill the gaps in TUE with the column median.
# Note: the median is taken over the whole dataset, before the train/test
# split that happens later in the notebook.
tue_median = data['TUE'].median()
data['TUE'] = data['TUE'].fillna(tue_median)

##### isi variabel CAEC dengan modus (karena bersifat kategorik)

In [126]:
# Fill the gaps in CAEC with its most frequent category. mode() returns a
# Series, so the first entry is taken as the fill value.
# Note: the mode is taken over the whole dataset, before the train/test
# split that happens later in the notebook.
caec_mode = data['CAEC'].mode()[0]
data['CAEC'] = data['CAEC'].fillna(caec_mode)

In [128]:
# Re-count missing values in the two imputed columns; both should now be 0.
imputed_cols = ['TUE', 'CAEC']
print(data[imputed_cols].isna().sum())

TUE     0
CAEC    0
dtype: int64


##### cek inkonsistensi data (seluruh variabel)

In [131]:
# Walk every column and print its distinct values so inconsistent encodings
# (such as mixed text/number entries) are easy to spot.
for col, values in data.items():
    print(f"\n--- {col} ---")
    print(values.unique())


--- Gender ---
['Male' 'Female']

--- Age ---
['31' '18' '44 years' '22' '21' '25' '56' '24' '19' '23' '41' '20' '26'
 '35' '28' '17' '33' '27' '40' '39' '30' '32' '37' '38' '42' '34' '29'
 '19 years' '16' '44' '52' '61' '21 years' '22 years' '18 years' '43'
 '23 years' '36' '26 years' '55' '20 years' '51' '45' '47' '32 years'
 '33 years']

--- Height ---
[1.87 1.59 1.68 1.74 1.69 1.51 1.79 1.7  1.82 1.56 1.62 1.76 1.54 1.88
 1.6  1.81 1.86 1.67 1.65 1.93 1.63 1.53 1.61 1.85 1.66 1.8  1.75 1.91
 1.78 1.84 1.83 1.89 1.64 1.52 1.57 1.72 1.9  1.77 1.55 1.71 1.73 1.58
 1.5  1.98 1.46 1.49 1.48 1.94 1.92]

--- Weight ---
[128.87  40.    77.   102.    75.    99.53  63.72  90.   141.92  49.
  58.    79.99  82.58  80.    45.    86.75 126.42  79.75  67.   118.56
  53.66 110.07  84.85  47.   125.42  84.49  60.    85.    86.24  94.45
 120.42  64.   107.01  68.   133.74 102.78 128.83 109.96 119.62  78.43
 103.19  50.95  55.01 105.26 106.69 101.78  43.53  99.61  56.    53.
  66.4  129.16 111.83  7

##### pembersihan variabel age

In [134]:
# Age arrives as text, sometimes with a unit suffix (e.g. '44 years').
# Extract the first numeric token with the vectorized .str accessor and
# convert it to float64. Entries that contain no number at all (blank or
# missing) become NaN instead of raising ValueError, and digits from separate
# tokens are no longer glued together. Re-running the cell is safe: an
# already-numeric column is stringified and parsed back to the same floats.
age_token = data['Age'].astype(str).str.extract(r'(\d*\.?\d+)', expand=False)
data['Age'] = pd.to_numeric(age_token, errors='coerce').astype('float64')

In [136]:
# Show the distinct Age values after cleaning to confirm they are all numeric.
data['Age'].unique()

array([31., 18., 44., 22., 21., 25., 56., 24., 19., 23., 41., 20., 26.,
       35., 28., 17., 33., 27., 40., 39., 30., 32., 37., 38., 42., 34.,
       29., 16., 52., 61., 43., 36., 55., 51., 45., 47.])

##### split fitur dan target

In [139]:
# Separate the label column from the features: y is the NObeyesdad column,
# X is every other column.
target_col = 'NObeyesdad'
y = data[target_col]
X = data.drop(columns=[target_col])

##### label encoding pada target

In [142]:
# Learn the class-name -> integer mapping from the target, then encode it.
# The fitted encoder is kept in `le` so predictions can be mapped back to names.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

##### identifikasi fitur numerik dan kategorikal

In [145]:
# Columns treated as numeric are listed by hand; every remaining feature
# column (in its original order) is treated as categorical.
numerical = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
is_numeric = X.columns.isin(numerical)
categorical = X.columns[~is_numeric].tolist()

##### pipelines

In [148]:
# Imputation lives inside the pipeline so that its statistics are learned
# during fit() and travel with the pickled model; rows with missing values at
# prediction time are then handled the same way as during training.

# Numeric branch: fill missing values with the fitted median, then standardize.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps categories unseen during
# fit from raising at transform time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column list to its branch.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numerical),
    ('cat', categorical_transformer, categorical)
])

##### Train-test split

In [151]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# encoded target and seeded so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded,
)

##### model pipelines

In [165]:
# Each model pipeline receives its own unfitted copy of the preprocessor.
# Passing the same ColumnTransformer object to both would make them share one
# transformer, so fitting one pipeline would silently refit the preprocessing
# step of the other.
pipe_rf = Pipeline(steps=[
    ('preprocess', clone(preprocessor)),
    ('clf', RandomForestClassifier(random_state=42))
])

pipe_xgb = Pipeline(steps=[
    ('preprocess', clone(preprocessor)),
    ('clf', XGBClassifier(eval_metric='mlogloss', random_state=42))
])

##### training

In [170]:
# Fit both pipelines on the training split; each fit runs the pipeline's
# 'preprocess' step followed by its 'clf' step.
pipe_rf.fit(X_train, y_train)
pipe_xgb.fit(X_train, y_train)

In [171]:
# Predict the encoded class for every held-out row with each fitted pipeline
# (Random Forest first, then XGBoost).
y_pred_rf, y_pred_xgb = (
    fitted_pipe.predict(X_test) for fitted_pipe in (pipe_rf, pipe_xgb)
)

In [172]:
# Print a per-class precision/recall/F1 report for each model, labelling the
# rows with the original class names stored in the label encoder.
report_inputs = (
    ("Random Forest Classification Report:\n", y_pred_rf),
    ("\nXGBoost Classification Report:\n", y_pred_xgb),
)
for heading, predictions in report_inputs:
    print(heading)
    print(classification_report(y_test, predictions, target_names=le.classes_))

Random Forest Classification Report:

                     precision    recall  f1-score   support

Insufficient_Weight       0.96      1.00      0.98        27
      Normal_Weight       0.84      0.90      0.87        29
     Obesity_Type_I       1.00      1.00      1.00        35
    Obesity_Type_II       1.00      1.00      1.00        30
   Obesity_Type_III       1.00      1.00      1.00        32
 Overweight_Level_I       0.90      0.90      0.90        29
Overweight_Level_II       0.96      0.86      0.91        29

           accuracy                           0.95       211
          macro avg       0.95      0.95      0.95       211
       weighted avg       0.95      0.95      0.95       211


XGBoost Classification Report:

                     precision    recall  f1-score   support

Insufficient_Weight       0.90      1.00      0.95        27
      Normal_Weight       0.93      0.86      0.89        29
     Obesity_Type_I       0.94      0.97      0.96        35
    Obesit

##### simpan model ke pickle

In [None]:
# Persist the fitted XGBoost pipeline and the target label encoder to disk.
# pickle is already imported in the notebook's import cell, so it is not
# re-imported here.
# NOTE(review): pipe_xgb is hard-coded as the "best" model — confirm against
# the classification reports before relying on best_model.pkl.
with open('best_model.pkl', 'wb') as f:
    pickle.dump(pipe_xgb, f)

with open('label_encoder.pkl', 'wb') as f:
    pickle.dump(le, f)