In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [2]:
# Read the obesity dataset from a CSV file located in the working directory
data = pd.read_csv("ObesityDataset.csv")

# Preview the first five rows (head() defaults to n=5)
data.head()

Unnamed: 0,Gender,Age,Height,Weight,family_history_with_overweight,FAVC,FCVC,NCP,CAEC,SMOKE,CH2O,SCC,FAF,TUE,CALC,MTRANS,NObeyesdad
0,Female,21.0,1.62,64.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,0.0,1.0,no,Public_Transportation,Normal_Weight
1,Female,21.0,1.52,56.0,yes,no,3.0,3.0,Sometimes,yes,3.0,yes,3.0,0.0,Sometimes,Public_Transportation,Normal_Weight
2,Male,23.0,1.8,77.0,yes,no,2.0,3.0,Sometimes,no,2.0,no,2.0,1.0,Frequently,Public_Transportation,Normal_Weight
3,Male,27.0,1.8,87.0,no,no,3.0,3.0,Sometimes,no,2.0,no,2.0,0.0,Frequently,Walking,Overweight_Level_I
4,Male,22.0,1.78,89.8,no,no,2.0,1.0,Sometimes,no,2.0,no,0.0,0.0,Sometimes,Public_Transportation,Overweight_Level_II


In [3]:
# Count the null entries in every column and print the per-column totals
missing_data = data.isna().sum()
print("\nMissing Data Information:", missing_data, sep="\n")


Missing Data Information:
Gender                            0
Age                               0
Height                            0
Weight                            0
family_history_with_overweight    0
FAVC                              0
FCVC                              0
NCP                               0
CAEC                              0
SMOKE                             0
CH2O                              0
SCC                               0
FAF                               0
TUE                               0
CALC                              0
MTRANS                            0
NObeyesdad                        0
dtype: int64


In [4]:
# Encode every categorical column (features and the target) as integer codes.
# The fitted encoder of each column is kept in `label_encoders` so that new
# inputs can be encoded the same way and predictions can be decoded back to
# their original labels.
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC',
                       'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', 'NObeyesdad']

# This cell overwrites the columns of `data` in place. Running it a second time
# would fit the encoders on the integer codes instead of the original labels,
# leaving `label_encoders` unable to encode label strings or decode predictions.
# Fail loudly instead of silently corrupting the encoders.
already_encoded = [name for name in categorical_columns
                   if pd.api.types.is_numeric_dtype(data[name])]
if already_encoded:
    raise RuntimeError(
        f"Columns already encoded: {already_encoded}. "
        "Re-run the data loading cell before running this cell again."
    )

label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le  # keyed by column name for later transform / inverse_transform

In [5]:
# Separate the target column from the feature matrix
y = data['NObeyesdad']
X = data.drop(columns=['NObeyesdad'])

In [6]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# partition repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [7]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transformation is applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Support Vector Classifier Model

In [8]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Train an RBF-kernel support vector classifier on the standardized features
svm_model = SVC(kernel='rbf', random_state=42).fit(X_train_scaled, y_train)

# Score the classifier on the held-out test split
svm_y_pred = svm_model.predict(X_test_scaled)
svm_cm = confusion_matrix(y_test, svm_y_pred)
svm_accuracy = accuracy_score(y_test, svm_y_pred)

# Report accuracy, per-class metrics and the confusion matrix
print(
    f"SVM Model Accuracy: {svm_accuracy * 100:.2f}%",
    "\nSVM Classification Report:",
    classification_report(y_test, svm_y_pred),
    sep="\n",
)
print("\nConfusion Matrix: ", svm_cm)

SVM Model Accuracy: 87.00%

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.93      0.93        54
           1       0.65      0.74      0.69        58
           2       0.90      0.93      0.92        70
           3       0.95      0.95      0.95        60
           4       1.00      0.98      0.99        65
           5       0.80      0.74      0.77        58
           6       0.85      0.79      0.82        58

    accuracy                           0.87       423
   macro avg       0.87      0.87      0.87       423
weighted avg       0.87      0.87      0.87       423


Confusion Matrix:  [[50  4  0  0  0  0  0]
 [ 3 43  1  0  0  9  2]
 [ 0  2 65  1  0  0  2]
 [ 0  2  1 57  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0 10  1  0  0 43  4]
 [ 0  5  4  1  0  2 46]]


### Random Forest Classifier Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Train a 100-tree random forest on the standardized features
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

# Score the forest on the held-out test split
rf_y_pred = rf_model.predict(X_test_scaled)
rf_cm = confusion_matrix(y_test, rf_y_pred)
rf_accuracy = accuracy_score(y_test, rf_y_pred)

# Report accuracy, per-class metrics and the confusion matrix
print(
    f"Random Forest Model Accuracy: {rf_accuracy * 100:.2f}%",
    "\nRandom Forest Classification Report:",
    classification_report(y_test, rf_y_pred),
    sep="\n",
)
print("\nConfusion Matrix: ", rf_cm)

Random Forest Model Accuracy: 95.27%

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        54
           1       0.82      0.97      0.89        58
           2       0.97      0.97      0.97        70
           3       0.98      0.98      0.98        60
           4       1.00      0.98      0.99        65
           5       0.94      0.88      0.91        58
           6       0.96      0.95      0.96        58

    accuracy                           0.95       423
   macro avg       0.96      0.95      0.95       423
weighted avg       0.96      0.95      0.95       423


Confusion Matrix:  [[50  4  0  0  0  0  0]
 [ 0 56  0  0  0  2  0]
 [ 0  0 68  0  0  0  2]
 [ 0  0  1 59  0  0  0]
 [ 0  0  0  1 64  0  0]
 [ 0  7  0  0  0 51  0]
 [ 0  1  1  0  0  1 55]]


### Gradient Boosting Classifier Model

In [10]:
from sklearn.ensemble import GradientBoostingClassifier
# Train a gradient boosting classifier (library defaults, fixed seed) on the
# standardized features
gb_model = GradientBoostingClassifier(random_state=42).fit(X_train_scaled, y_train)

# Score the model on the held-out test split
gb_y_pred = gb_model.predict(X_test_scaled)
gb_cm = confusion_matrix(y_test, gb_y_pred)
gb_accuracy = accuracy_score(y_test, gb_y_pred)

# Report accuracy, per-class metrics and the confusion matrix
print(
    f"Gradient Boosting Model Accuracy: {gb_accuracy * 100:.2f}%",
    "\nGradient Boosting Classification Report:",
    classification_report(y_test, gb_y_pred),
    sep="\n",
)
print("\nConfusion Matrix: ", gb_cm)


Gradient Boosting Model Accuracy: 95.98%

Gradient Boosting Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95        54
           1       0.85      0.98      0.91        58
           2       0.97      0.97      0.97        70
           3       0.98      0.97      0.97        60
           4       1.00      0.98      0.99        65
           5       0.93      0.91      0.92        58
           6       1.00      0.98      0.99        58

    accuracy                           0.96       423
   macro avg       0.96      0.96      0.96       423
weighted avg       0.96      0.96      0.96       423


Confusion Matrix:  [[49  4  0  0  0  1  0]
 [ 0 57  0  0  0  1  0]
 [ 0  0 68  1  0  1  0]
 [ 0  1  1 58  0  0  0]
 [ 0  0  1  0 64  0  0]
 [ 0  5  0  0  0 53  0]
 [ 0  0  0  0  0  1 57]]


### Artificial Neural Network Model

In [11]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
# Seed Python, NumPy and TensorFlow so that weight initialization, dropout and
# batch shuffling are repeatable and a fresh "Restart & Run All" reports the
# same metrics
tf.keras.utils.set_random_seed(42)

# Convert the target to one-hot encoding. The width is fixed to the number of
# classes known to the target encoder, so the train and test matrices always
# have the same number of columns regardless of which classes each split holds
num_classes = len(label_encoders['NObeyesdad'].classes_)
y_train_nn = to_categorical(y_train, num_classes=num_classes)
y_test_nn = to_categorical(y_test, num_classes=num_classes)

# Two hidden ReLU layers with dropout, followed by a softmax over the classes
nn_model = Sequential([
    # Declare the input shape with an explicit Input object; passing
    # `input_dim` to the first Dense layer makes Keras emit a warning
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

nn_model.compile(optimizer='adam',
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])

# 20% of the training rows are set aside by Keras for per-epoch validation
nn_model.fit(X_train_scaled, y_train_nn,
             epochs=50,
             batch_size=32,
             validation_split=0.2,
             verbose=1)

nn_loss, nn_accuracy = nn_model.evaluate(X_test_scaled, y_test_nn, verbose=0)
print(f"Neural Network Model Accuracy: {nn_accuracy * 100:.2f}%")

# Generate classification report for Neural Network:
# argmax turns the softmax probabilities / one-hot rows back into class indices
nn_y_pred = nn_model.predict(X_test_scaled)
nn_y_pred_classes = nn_y_pred.argmax(axis=1)
nn_y_test_classes = y_test_nn.argmax(axis=1)
nn_cm = confusion_matrix(nn_y_test_classes, nn_y_pred_classes)
print("\nNeural Network Classification Report:")
print(classification_report(nn_y_test_classes, nn_y_pred_classes))
print("\nConfusion Matrix: ", nn_cm)

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.1679 - loss: 2.0974 - val_accuracy: 0.4260 - val_loss: 1.7699
Epoch 2/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.3369 - loss: 1.7659 - val_accuracy: 0.5118 - val_loss: 1.5029
Epoch 3/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4529 - loss: 1.5047 - val_accuracy: 0.5828 - val_loss: 1.3041
Epoch 4/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5195 - loss: 1.3235 - val_accuracy: 0.6095 - val_loss: 1.1630
Epoch 5/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5134 - loss: 1.2315 - val_accuracy: 0.6331 - val_loss: 1.0596
Epoch 6/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5349 - loss: 1.1662 - val_accuracy: 0.6598 - val_loss: 0.9791
Epoch 7/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━

In [12]:
# Input new data. Values must use the same units and category spellings as the
# training data. Height is in metres (the dataset preview shows values such as
# 1.62 and 1.80), so a 175 cm person is entered as 1.75, not 175.
new_data = {
    'Gender': 'Male',
    'Age': 25,
    'Height': 1.75,
    'Weight': 70,
    'family_history_with_overweight': 'yes',
    'FAVC': 'yes',
    'FCVC': 2,
    'NCP': 3,
    'CAEC': 'Sometimes',
    'SMOKE': 'no',
    'CH2O': 2,
    'SCC': 'no',
    'FAF': 1,
    'TUE': 2,
    'CALC': 'Sometimes',
    'MTRANS': 'Public_Transportation'
}

# Convert new data to a single-row DataFrame whose columns follow the order of
# the training features; a feature missing from `new_data` raises a KeyError
# here. The copy keeps the column assignments below from targeting a slice.
new_data_df = pd.DataFrame([new_data])[list(X.columns)].copy()

# Encode the categorical features with the encoders fitted on the training data
# (the target column is excluded because it is what is being predicted)
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC',
                       'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

for col in categorical_columns:
    new_data_df[col] = label_encoders[col].transform(new_data_df[col])

# Scale the features with the scaler fitted on the training split
new_data_scaled = scaler.transform(new_data_df)

# Predict with the gradient boosting model
prediction = gb_model.predict(new_data_scaled)
predicted_class = prediction[0]  # Get the predicted class

# Decode the encoded target label back to its original class name
decoded_prediction = label_encoders['NObeyesdad'].inverse_transform([predicted_class])

print(f"Predicted Class (Encoded): {predicted_class}")
print(f"Predicted Class (Decoded): {decoded_prediction[0]}")

Predicted Class (Encoded): 1
Predicted Class (Decoded): Normal_Weight


In [13]:
import joblib
# Persist the fitted model together with the scaler and the label encoders,
# since all three are needed to preprocess inputs and decode predictions
joblib.dump(gb_model, "gradient_boosting_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")

['label_encoders.pkl']