In [None]:
import pandas as pd

# Read the dataset and normalise the 'Age' column in one chained expression:
# every value is cast to text, a literal ' years' suffix is removed, and the
# result is parsed as a number (anything unparseable becomes NaN).
df = (
    pd.read_csv('ObesityDataSet1.csv')
    .assign(
        Age=lambda frame: pd.to_numeric(
            frame['Age'].astype(str).str.replace(' years', '', regex=False),
            errors='coerce',
        )
    )
)

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Male,31,1.87,128.87,yes,yes,2.96,3.0,Sometimes,yes,1.28,no,0.9,1.875,Sometimes,Automobile,Obesity_Type_II
1,Female,18,1.59,40.0,yes,yes,2.0,1.0,Frequently,no,1.0,no,0.0,2.0,no,Public_Transportation,Insufficient_Weight
2,Female,44,1.59,77.0,yes,yes,2.0,3.0,Sometimes,no,2.81,no,0.0,0.0,Sometimes,Automobile,Obesity_Type_I
3,Male,31,1.68,102.0,yes,yes,2.94,2.14,Sometimes,no,1.25,no,1.18,0.778,no,Public_Transportation,Obesity_Type_II
4,Male,22,1.74,75.0,yes,yes,3.0,3.0,Frequently,no,1.0,no,1.0,0.0,no,Automobile,Normal_Weight


In [None]:
# Print the per-column dtype / non-null summary, then display the number of
# missing values in each column.
df.info()
df.isna().sum()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1055 entries, 0 to 1054
Data columns (total 17 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   Gender                          1055 non-null   object 
 1   Age                             1055 non-null   int64  
 2   Height                          1055 non-null   float64
 3   Weight                          1055 non-null   float64
 4   family_history_with_overweight  1055 non-null   object 
 5   FAVC                            1055 non-null   object 
 6   FCVC                            1055 non-null   float64
 7   NCP                             1055 non-null   float64
 8   CAEC                            1029 non-null   object 
 9   SMOKE                           1055 non-null   object 
 10  CH2O                            1055 non-null   float64
 11  SCC                             1055 non-null   object 
 12  FAF                             10

Unnamed: 0,0
Gender,0
Age,0
Height,0
Weight,0
family_history_with_overweight,0
FAVC,0
FCVC,0
NCP,0
CAEC,26
SMOKE,0


In [None]:
import pickle

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Separate the target label from the feature columns.
y = df['NObeyesdad']
X = df.drop(columns='NObeyesdad')

# Group the feature columns by data type: the numeric ones are listed
# explicitly, and every other feature column is treated as categorical.
numerical = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE']
numeric_names = set(numerical)
categorical = [name for name in X.columns if name not in numeric_names]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
# NOTE(review): the split is not stratified on y — confirm whether
# stratify=y is wanted for this multi-class target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Preprocessing for numeric columns: fill gaps with the column mean, then
# standardise.
numeric_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Preprocessing for categorical columns: fill gaps with the most frequent
# value, then one-hot encode (categories unseen during fit are ignored).
categorical_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own sub-pipeline.
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_steps, numerical),
    ('cat', categorical_steps, categorical),
])

2. Model Training & Evaluation

In [None]:
# Random Forest pipeline: preprocessing followed by the classifier.
#
# The pipeline gets its own unfitted copy of `preprocessor`. Pipeline.fit fits
# its steps in place without copying them, so passing the very same
# `preprocessor` object to several pipelines would make them share fitted
# state: refitting any one of them would silently change the transformer used
# by the others (including a pipeline that is later saved to disk).
pipeline_rf = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('clf', RandomForestClassifier(random_state=42))
])

# Fit on the training split and predict the held-out rows.
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("Random Forest Report:\n", classification_report(y_test, y_pred_rf))


Random Forest Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.87      0.96      0.92        28
      Normal_Weight       0.90      0.84      0.87        31
     Obesity_Type_I       0.97      0.92      0.94        37
    Obesity_Type_II       1.00      1.00      1.00        33
   Obesity_Type_III       1.00      1.00      1.00        25
 Overweight_Level_I       0.95      0.95      0.95        21
Overweight_Level_II       0.92      0.94      0.93        36

           accuracy                           0.94       211
          macro avg       0.94      0.95      0.94       211
       weighted avg       0.94      0.94      0.94       211



In [None]:
# K-nearest-neighbours pipeline: preprocessing followed by the classifier
# (default KNeighborsClassifier settings).
#
# The pipeline gets its own unfitted copy of `preprocessor`. Pipeline.fit fits
# its steps in place without copying them, so reusing the same `preprocessor`
# object here would refit the transformer instance held by any other pipeline
# built from it, coupling their fitted state.
pipeline_knn = Pipeline([
    ('preprocess', clone(preprocessor)),
    ('clf', KNeighborsClassifier())
])

# Fit on the training split and predict the held-out rows.
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict(X_test)

# Per-class precision / recall / F1 on the test split.
print("KNN Report:\n", classification_report(y_test, y_pred_knn))


KNN Report:
                      precision    recall  f1-score   support

Insufficient_Weight       0.70      0.93      0.80        28
      Normal_Weight       0.73      0.35      0.48        31
     Obesity_Type_I       0.73      0.86      0.79        37
    Obesity_Type_II       0.86      0.97      0.91        33
   Obesity_Type_III       1.00      1.00      1.00        25
 Overweight_Level_I       0.61      0.81      0.69        21
Overweight_Level_II       0.80      0.56      0.66        36

           accuracy                           0.77       211
          macro avg       0.78      0.78      0.76       211
       weighted avg       0.78      0.77      0.76       211



In [None]:
# Taking Random Forest as the best model, serialise its fitted pipeline
# (preprocessing + classifier) to disk.
model_path = 'model.pkl'
with open(model_path, mode='wb') as model_file:
    pickle.dump(pipeline_rf, model_file)
