In [9]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix

In [10]:
# Read the obesity dataset CSV (path is relative to the notebook's working directory)
file_path = "../data/ObesityDataSet.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

In [11]:
# Integer-encode the categorical feature columns on a copy, leaving `df` untouched.
# One LabelEncoder per column is kept in `label_encoders` so the mapping can be
# reused later.
df_encoded = df.copy()
categorical_cols = ['Gender', 'MTRANS']
label_encoders = {name: LabelEncoder() for name in categorical_cols}

for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df_encoded[col])

In [12]:
# Separate features and target
selected_features = ['Age', 'Height', 'Weight', 'FCVC', 'NCP', 'CH2O', 'FAF', 'TUE', 'Gender', 'MTRANS']
target = 'NObeyesdad'

# Fit the target encoder on the raw labels in `df`, not on `df_encoded[target]`:
# that column is overwritten with integer codes just below, so re-running this
# cell on its own would otherwise re-fit the encoder on the codes and lose the
# class names stored in `label_encoder.classes_`.
label_encoder = LabelEncoder()
df_encoded[target] = label_encoder.fit_transform(df[target])

# Feature matrix and integer-encoded target vector used by the cells below
X = df_encoded[selected_features]
y = df_encoded[target]

In [13]:
# Split into train and test sets: 20% held out, stratified on the target, fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardize the features. The scaler is fitted on the training split only and
# is applied to every column of X, including the label-encoded categorical ones.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)

# Make sure X_test has the same columns as X (missing ones filled with 0) before transforming
X_test = scaler.transform(X_test.reindex(columns=X.columns, fill_value=0))

In [14]:
# Candidate classifiers, keyed by the display name used in the printed report
models = {
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Support Vector Machine": SVC(kernel='rbf', C=1.0, random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42),
}

# Training and evaluation: containers filled once per model
results = {}             # model name -> accuracy on the test split
conf_matrices = {}       # model name -> confusion matrix on the test split
evaluation_metrics = []  # rows of [name, accuracy, precision, recall, f1]

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 are all computed with average='weighted'
    precision, recall, f1 = [
        metric(y_test, y_pred, average='weighted')
        for metric in (precision_score, recall_score, f1_score)
    ]

    results[name] = acc
    conf_matrices[name] = confusion_matrix(y_test, y_pred)
    evaluation_metrics.append([name, acc, precision, recall, f1])

    # Per-model report; class names come from the fitted target encoder
    print(f"\n{name} Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print("="*60)


Random Forest Accuracy: 0.9574
                     precision    recall  f1-score   support

Insufficient_Weight       1.00      0.93      0.96        54
      Normal_Weight       0.85      0.97      0.90        58
     Obesity_Type_I       0.97      0.97      0.97        70
    Obesity_Type_II       0.98      0.98      0.98        60
   Obesity_Type_III       1.00      0.98      0.99        65
 Overweight_Level_I       0.98      0.88      0.93        58
Overweight_Level_II       0.93      0.98      0.96        58

           accuracy                           0.96       423
          macro avg       0.96      0.96      0.96       423
       weighted avg       0.96      0.96      0.96       423


Support Vector Machine Accuracy: 0.8983
                     precision    recall  f1-score   support

Insufficient_Weight       0.94      0.91      0.92        54
      Normal_Weight       0.78      0.84      0.81        58
     Obesity_Type_I       0.92      0.94      0.93        70
    Obes

In [15]:
# Save the best model: the key of `results` with the highest test accuracy.
# The fitted scaler and that model are pickled together as a (scaler, model) tuple.
best_model = max(results, key=lambda model_name: results[model_name])
with open("trained_model.pkl", "wb") as model_file:
    model_file.write(pickle.dumps((scaler, models[best_model])))

# Function to load the saved model and make a prediction
def predict_obesity(features):
    """Predict the obesity class label for a single sample.

    Loads the ``(scaler, model)`` tuple from ``trained_model.pkl``, scales the
    sample with the stored scaler, predicts with the stored model and decodes
    the result with the notebook-level ``label_encoder``.

    Parameters
    ----------
    features : array-like
        Exactly one value per column of ``X``, in the order of ``X.columns``.
        Any array-like that flattens to that many values is accepted. For
        columns that have an entry in ``label_encoders``, the value may be
        either the already-encoded number or the raw string label known to
        that encoder; raw labels are encoded here.

    Returns
    -------
    The class label produced by ``label_encoder.inverse_transform`` for the
    model's prediction.

    Raises
    ------
    ValueError
        If the number of values does not match ``X.columns``, or a value
        cannot be converted to a number.

    Notes
    -----
    Relies on the notebook globals ``X``, ``label_encoders`` and
    ``label_encoder``, which are not stored in the pickle file.
    """
    # SECURITY: unpickling executes arbitrary code; only load a
    # trained_model.pkl that was produced by this notebook / a trusted source.
    with open("trained_model.pkl", "rb") as model_file:
        scaler, model = pickle.load(model_file)

    feature_names = list(X.columns)

    # dtype=object keeps numbers and strings as they were passed instead of
    # letting NumPy coerce a mixed sequence to an all-string array.
    values = list(np.asarray(features, dtype=object).ravel())
    if len(values) != len(feature_names):
        raise ValueError(
            f"Expected {len(feature_names)} feature values ({feature_names}), "
            f"got {len(values)}"
        )
    row = dict(zip(feature_names, values))

    # Encode raw categorical labels with the encoders fitted during
    # preprocessing. Values that are not a known label (e.g. already-encoded
    # numbers) are left alone and handled by the float conversion below.
    for col, encoder in label_encoders.items():
        value = row.get(col)
        if isinstance(value, str) and value in encoder.classes_:
            row[col] = encoder.transform([value])[0]

    # Build the frame with the training column order so the scaler sees the
    # same feature layout it was fitted on; non-numeric leftovers fail here.
    features_df = pd.DataFrame([row], columns=feature_names).astype(float)
    features_scaled = scaler.transform(features_df)

    prediction = model.predict(features_scaled)

    return label_encoder.inverse_transform(prediction)[0]