# ปัญหาเราเป็น multiclass (1–6)

ใช้เมตริกหลัก: 
    - Macro-F1 (เฉลี่ยทุกคลาสเท่ากัน—กันปัญหาคลาสใหญ่ครอบ)
    - Accuracy (ดูรวม ๆ)
    - Confusion Matrix (ดูผิด/ถูกรายคลาส)

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import tensorflow as tf
import matplotlib.pyplot as plt
tf.keras.backend.clear_session()
from tensorflow.keras import layers, Model

df = pd.read_csv("MOBS_02S_Clean.csv")
y = df["Obesity"].astype(int)

# สัดส่วนคลาส
cnt = y.value_counts().sort_index()
N = len(y); K = cnt.shape[0]
display("Class percents (%):\n", (cnt/N*100).round(2))

# class weights: N / (K * n_k)
class_weight = {int(k): float(N/(K*v)) for k, v in cnt.items()}  
print("\nClass weights (ใช้ตอนเทรน):", class_weight)






'Class percents (%):\n'

Obesity
0    12.88
1    13.60
2    13.74
3    13.74
4    16.63
5    14.07
6    15.35
Name: count, dtype: float64


Class weights (ใช้ตอนเทรน): {0: 1.108718487394958, 1: 1.0507715281234444, 2: 1.0399014778325124, 3: 1.0399014778325124, 4: 0.8591778591778592, 5: 1.0153920153920153, 6: 0.9307760141093474}


In [2]:
# Step 1 — Stratified split 70/15/15 + class_weight(from train only)
from sklearn.model_selection import train_test_split

# 15% test โดย stratify
train_val_df, test_df = train_test_split(df, test_size=0.15, stratify=df["Obesity"], random_state=42)

# จาก train_val แบ่ง val อีก ~15% ของทั้งก้อน (0.1765 * 0.85 ≈ 0.15)
train_df, val_df = train_test_split(train_val_df, test_size=0.1765, stratify=train_val_df["Obesity"], random_state=42)

# 1) จำนวนแถวต้องเท่ากันเมื่อรวม 3 ชุด
assert len(train_df) + len(val_df) + len(test_df) == len(df)

# 2) ดัชนีห้ามซ้ำระหว่างชุด (ไม่มีแถวใดรั่วไปมา)
assert set(train_df.index).isdisjoint(val_df.index)
assert set(train_df.index).isdisjoint(test_df.index)
assert set(val_df.index).isdisjoint(test_df.index)


In [3]:
# --- ตรวจความถูกต้องของการแบ่ง ---
n_total = len(df)
print("\nตรวจรวมจำนวนแถว:", len(train_df) + len(val_df) + len(test_df), "จาก", n_total)
assert set(train_df.index).isdisjoint(val_df.index)
assert set(train_df.index).isdisjoint(test_df.index)
assert set(val_df.index).isdisjoint(test_df.index)

# --- คำนวณ class_weight จาก TRAIN เท่านั้น ---
N = len(train_df)
K = train_df["Obesity"].nunique()
cw = {int(k): float(N / (K * v)) for k, v in train_df["Obesity"].value_counts().items()}
print("\nclass_weight (จาก TRAIN เท่านั้น):", cw)

# ตัวแปรที่ได้ไว้ใช้ต่อ:
# train_df, val_df, test_df, cw


ตรวจรวมจำนวนแถว: 2111 จาก 2111

class_weight (จาก TRAIN เท่านั้น): {4: 0.8612244897959184, 6: 0.9295154185022027, 5: 1.0144230769230769, 3: 1.0394088669950738, 2: 1.0394088669950738, 1: 1.0497512437810945, 0: 1.1105263157894736}


In [4]:
# Step 2: กำหนดคอลัมน์ที่ใช้ในโมเดล

# 1) ระบุคอลัมน์ (ตามที่เราจัดกลุ่มไว้)
NUMERIC_COLS = ["age_clean", "bmi", "height_m", "weight_kg"]
CATEG_COLS = [
    "gender", "overweight_family", "consum_cf", "consum_vf", "consum_daily",
    "consum_other", "smoking", "consum_water", "cal_monitoring",
    "phyical_activity", "device_usage", "consum_alchohol", "transportation"
]
TARGET = "Obesity"

# สร้าง Keras Inputs

inputs = {}
for c in NUMERIC_COLS:
    inputs[c] = layers.Input(shape=(1,), name=c, dtype=tf.float32)
for c in CATEG_COLS:
    inputs[c] = layers.Input(shape=(1,), name=c, dtype=tf.int32)

# Numeric: Normalization (adapt จาก train เท่านั้น)
encoded_parts = []
for c in NUMERIC_COLS:
    norm = layers.Normalization(name=f"{c}_norm")
    norm.adapt(train_df[c].to_numpy().reshape(-1,1))
    encoded_parts.append(norm(inputs[c]))

# Categorical (int-coded): IntegerLookup -> one-hot (adapt จาก train เท่านั้น)
for c in CATEG_COLS:
    lk = layers.IntegerLookup(output_mode="one_hot", name=f"{c}_onehot")
    lk.adapt(train_df[c].to_numpy())
    encoded_parts.append(lk(inputs[c]))

# รวมเป็นเวกเตอร์ฟีเจอร์
features = layers.Concatenate(name="features_concat")(encoded_parts)
preprocessor = tf.keras.Model(inputs=inputs, outputs=features, name="preprocessor")

# helper แพ็ก Pandas → dict (ไม่แตะ missing)
def to_keras_inputs(df):
    x = {}
    for c in NUMERIC_COLS: x[c] = df[c].to_numpy()
    for c in CATEG_COLS:   x[c] = df[c].to_numpy()
    return x

Xtr, ytr = to_keras_inputs(train_df), train_df[TARGET].to_numpy().astype("int32")
Xva, yva = to_keras_inputs(val_df),   val_df[TARGET].to_numpy().astype("int32")
Xte, yte = to_keras_inputs(test_df),  test_df[TARGET].to_numpy().astype("int32")

In [5]:
from tensorflow import keras

n_classes = int(train_df[TARGET].nunique())

x = preprocessor(inputs)

# block 1
x = layers.Dense(128, kernel_initializer="he_normal",
                 kernel_regularizer=keras.regularizers.l2(1e-4))(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.Dropout(0.5)(x)

# block 2
x = layers.Dense(64, kernel_initializer="he_normal",
                 kernel_regularizer=keras.regularizers.l2(1e-4))(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.Dropout(0.4)(x)

x = layers.Dense(32, kernel_initializer="he_normal",
                 kernel_regularizer=keras.regularizers.l2(1e-4))(x)
x = layers.BatchNormalization()(x)
x = layers.Activation("relu")(x)
x = layers.Dropout(0.3)(x)

out = layers.Dense(n_classes, activation="softmax")(x)

model = Model(inputs=inputs, outputs=out)
model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=1e-1),   # ค่อยๆลด lr ให้เสถียรก่อน
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)



In [6]:
callbacks = [
    tf.keras.callbacks.EarlyStopping(patience=10, monitor="val_loss",restore_best_weights=True),
    tf.keras.callbacks.ReduceLROnPlateau(monitor="val_accuracy", factor=0.1, patience=5, min_lr=1e-4 )
]

base_cw = {4: 0.8612244897959184, 
           6: 0.9295154185022027, 
           5: 1.0144230769230769, 
           3: 1.0394088669950738, 
           2: 1.0394088669950738, 
           1: 1.0497512437810945, 
           0: 1.1105263157894736}


history = model.fit(
    Xtr, ytr,
    validation_data=(Xva, yva),
    epochs=500,
    batch_size=256,
    class_weight=base_cw,     
    callbacks=callbacks
)


Epoch 1/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 65ms/step - accuracy: 0.3595 - loss: 1.7757 - val_accuracy: 0.4164 - val_loss: 16.9332 - learning_rate: 0.1000
Epoch 2/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.5782 - loss: 1.2562 - val_accuracy: 0.3943 - val_loss: 42.5332 - learning_rate: 0.1000
Epoch 3/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6608 - loss: 1.1269 - val_accuracy: 0.4164 - val_loss: 36.2950 - learning_rate: 0.1000
Epoch 4/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7014 - loss: 1.0517 - val_accuracy: 0.4290 - val_loss: 32.7568 - learning_rate: 0.1000
Epoch 5/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7265 - loss: 0.9945 - val_accuracy: 0.4259 - val_loss: 24.4785 - learning_rate: 0.1000
Epoch 6/500
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [7]:
print("VAL:", model.evaluate(Xva, yva, verbose=0))
print("TEST:", model.evaluate(Xte, yte, verbose=0))

import numpy as np
from sklearn.metrics import confusion_matrix, classification_report

pred = model.predict(Xte, batch_size=256).argmax(axis=1)
print(confusion_matrix(yte, pred))
print(classification_report(yte, pred, digits=4))


VAL: [0.31378576159477234, 0.9463722109794617]
TEST: [0.39927423000335693, 0.9526813626289368]
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step
[[40  1  0  0  0  0  0]
 [ 2 38  3  0  0  0  0]
 [ 0  3 37  3  0  0  0]
 [ 0  0  0 43  0  0  0]
 [ 0  0  0  0 53  0  0]
 [ 0  0  0  0  0 44  1]
 [ 0  0  0  0  0  2 47]]
              precision    recall  f1-score   support

           0     0.9524    0.9756    0.9639        41
           1     0.9048    0.8837    0.8941        43
           2     0.9250    0.8605    0.8916        43
           3     0.9348    1.0000    0.9663        43
           4     1.0000    1.0000    1.0000        53
           5     0.9565    0.9778    0.9670        45
           6     0.9792    0.9592    0.9691        49

    accuracy                         0.9527       317
   macro avg     0.9504    0.9510    0.9503       317
weighted avg     0.9525    0.9527    0.9522       317

