In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

# IMPORT DATA

In [2]:
# Read the obesity dataset workbook into a DataFrame.
DATA_FILE = 'Data/obesitas smote.xlsx'
df = pd.read_excel(DATA_FILE)

# EDA

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Age,Height,Weight,FCVC,NCP,CH2O,FAF,TUE
count,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0,2111.0
mean,24.315964,1.701677,86.586058,2.419043,2.685628,2.008011,1.010298,0.657866
std,6.357078,0.093305,26.191172,0.533927,0.778039,0.612953,0.850592,0.608927
min,14.0,1.45,39.0,1.0,1.0,1.0,0.0,0.0
25%,20.0,1.63,65.473343,2.0,2.658738,1.584812,0.124505,0.0
50%,23.0,1.700499,83.0,2.385502,3.0,2.0,1.0,0.62535
75%,26.0,1.768464,107.430682,3.0,3.0,2.47742,1.666678,1.0
max,61.0,1.98,173.0,3.0,4.0,3.0,3.0,2.0


In [4]:
# Columns holding string categories. The last entry, 'NObeyesdad', is the
# prediction target rather than an input feature.
categorical_features = [
    'Gender',
    'family_history_with_overweight',
    'FAVC',
    'CAEC',
    'SMOKE',
    'SCC',
    'CALC',
    'MTRANS',
    'NObeyesdad',
]

In [5]:
# Print the frequency of every category, one column at a time, with a blank
# line between columns. No column filter is needed here: the list being
# iterated contains only categorical columns, so the former `!= 'Age'` guard
# could never be false.
for column in categorical_features:
    print(df[column].value_counts())
    print()

Gender
Male      1068
Female    1043
Name: count, dtype: int64

family_history_with_overweight
yes    1726
no      385
Name: count, dtype: int64

FAVC
yes    1866
no      245
Name: count, dtype: int64

CAEC
Sometimes     1765
Frequently     242
Always          53
no              51
Name: count, dtype: int64

SMOKE
no     2067
yes      44
Name: count, dtype: int64

SCC
no     2015
yes      96
Name: count, dtype: int64

CALC
Sometimes     1401
no             639
Frequently      70
Always           1
Name: count, dtype: int64

MTRANS
Public_Transportation    1580
Automobile                457
Walking                    56
Motorbike                  11
Bike                        7
Name: count, dtype: int64

NObeyesdad
Obesity_Type_I         351
Obesity_Type_III       324
Obesity_Type_II        297
Overweight_Level_I     290
Overweight_Level_II    290
Normal_Weight          287
Insufficient_Weight    272
Name: count, dtype: int64



Setelah dilakukan **EDA**, tidak ditemukan missing value maupun data duplikat. Data juga sudah seimbang karena **SMOTE** telah diterapkan oleh pembuat data.

# TRANSFORM KATEGORIK KE NUMERIK 
Melakukan transformasi pada semua kolom kategorik (termasuk kolom target `NObeyesdad`) dengan menggunakan LabelEncoder.

In [6]:
# Integer-encode each categorical column of `df` in place.
# The fitted encoder of every column is kept in `label_encoders` so that the
# integer codes (including the encoded target classes) can later be mapped
# back to their original labels via `label_encoders[col].inverse_transform`.
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [7]:
# Separate the target column from the predictor columns.
target_column = 'NObeyesdad'
y = df[target_column]
X = df.drop(columns=[target_column])

# NORMALISASI DATA 
Menggunakan StandardScaler

In [8]:
# Fit the scaler on the training rows only, so that statistics of the held-out
# test rows do not leak into the features the models are trained on.
# The actual train/test split is performed in a later cell; the training row
# positions are derived here with the SAME `test_size` and `random_state`, which
# produces the same partition for the same number of samples. Keep the two
# calls in sync if either parameter is changed.
train_positions, _ = train_test_split(
    np.arange(len(X)), test_size=0.2, random_state=42
)
X_unscaled = np.asarray(X, dtype=float)

scaler = StandardScaler()
scaler.fit(X_unscaled[train_positions])

# All rows are transformed with the train-only statistics; X stays a NumPy array.
X = scaler.transform(X_unscaled)

# SPLIT DATA
Split data menggunakan train_test_split

In [9]:
# Hold out 20% of the rows as the test set; the fixed seed keeps the partition
# identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


# MODEL ML YANG DIGUNAKAN

In [14]:
# Estimators to compare, keyed by the display name used in the results table.
# Both estimators draw random numbers while fitting (feature permutation at
# each split), so a fixed `random_state` is set to make the reported scores
# reproducible across runs.
models = {
    "Decision Tree": DecisionTreeClassifier(
        max_depth=50, min_samples_split=2, random_state=42
    ),
    "Gradient Boosting": GradientBoostingClassifier(
        learning_rate=0.5, max_depth=7, n_estimators=200, random_state=42
    ),
}

#  MELATIH MODEL

In [15]:
# One summary row per model: 5-fold CV accuracy on the training split,
# train/test accuracy after a final fit, and the weighted-average
# precision / recall / F1 on the test split.
results = []

for model_name, model in models.items():
    # Cross-validation runs first, on the training portion only.
    cv_mean_accuracy = cross_val_score(
        model, X_train, y_train, cv=5, scoring='accuracy'
    ).mean()

    # Final fit on the whole training split, then predict both splits.
    model.fit(X_train, y_train)
    y_train_pred = model.predict(X_train)
    y_pred = model.predict(X_test)

    weighted_avg = classification_report(
        y_test, y_pred, output_dict=True
    )['weighted avg']

    results.append({
        'Model': model_name,
        'CV Mean Accuracy': cv_mean_accuracy,
        'Test Accuracy': accuracy_score(y_test, y_pred),
        'Train Accuracy': accuracy_score(y_train, y_train_pred),
        'Precision': weighted_avg['precision'],
        'Recall': weighted_avg['recall'],
        'F1-Score': weighted_avg['f1-score'],
    })

results_df = pd.DataFrame(results)

# RESULT

In [16]:
# Display the per-model comparison table built in the training cell.
results_df

Unnamed: 0,Model,CV Mean Accuracy,Test Accuracy,Train Accuracy,Precision,Recall,F1-Score
0,Decision Tree,0.9218,0.93617,1.0,0.936681,0.93617,0.936147
1,Gradient Boosting,0.968021,0.964539,1.0,0.964739,0.964539,0.964542
