# ปัญหาเราเป็น multiclass 7 คลาส (0–6)

ใช้เมตริกหลัก: 
    - Macro-F1 (เฉลี่ยทุกคลาสเท่ากัน—กันปัญหาคลาสใหญ่ครอบ)
    - Accuracy (ดูรวม ๆ)
    - Confusion Matrix (ดูผิด/ถูกรายคลาส)

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import tensorflow as tf
import matplotlib.pyplot as plt
# Reset Keras global state before anything is built.
tf.keras.backend.clear_session()

# Load the cleaned dataset; the "Obesity" column is read as the integer label.
df = pd.read_csv("MOBS_02S_Clean.csv")
y = df["Obesity"].astype(int)

# Class distribution over the full dataset, ordered by class label.
cnt = y.value_counts().sort_index()
N = len(y)
K = cnt.shape[0]
# Print the caption instead of handing it to display(): display() shows a
# str as its repr, i.e. with quotes and a literal "\n" instead of a heading.
print("Class percents (%):")
display((cnt / N * 100).round(2))

# Balanced class weights: w_k = N / (K * n_k).
# NOTE(review): these are computed on the FULL dataset; the weights derived
# from the training split alone are computed separately (as `cw`). Confirm
# this dict is informational only and is not the one passed to fit().
class_weight = {int(k): float(N / (K * v)) for k, v in cnt.items()}
print("\nClass weights (ใช้ตอนเทรน):", class_weight)






'Class percents (%):\n'

Obesity
0    12.88
1    13.60
2    13.74
3    13.74
4    16.63
5    14.07
6    15.35
Name: count, dtype: float64


Class weights (ใช้ตอนเทรน): {0: 1.108718487394958, 1: 1.0507715281234444, 2: 1.0399014778325124, 3: 1.0399014778325124, 4: 0.8591778591778592, 5: 1.0153920153920153, 6: 0.9307760141093474}


In [5]:
# Step 1 — Stratified 70/15/15 split (train / validation / test)
from sklearn.model_selection import train_test_split

# Hold out 15% of all rows as the test set, stratified on the label.
train_val_df, test_df = train_test_split(
    df,
    test_size=0.15,
    stratify=df["Obesity"],
    random_state=42,
)
# Carve the validation set out of the remainder:
# 0.1765 * 0.85 ≈ 0.15 of the whole dataset.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1765,
    stratify=train_val_df["Obesity"],
    random_state=42,
)

# 1) Together the three parts must account for every row.
assert sum(map(len, (train_df, val_df, test_df))) == len(df)
# 2) No index label may appear in more than one part (no leakage).
assert all(
    set(left.index).isdisjoint(right.index)
    for left, right in (
        (train_df, val_df),
        (train_df, test_df),
        (val_df, test_df),
    )
)

# --- ตรวจสัดส่วนแต่ละชุด ---

def dist(pdf: pd.DataFrame, name: str) -> pd.Series:
    """Show the class distribution of one split and return its counts.

    Prints a ``<name> shape: ...`` header, displays a table holding the
    count and percentage of every ``Obesity`` class, and returns the counts.

    Args:
        pdf: DataFrame that contains an ``Obesity`` column.
        name: Label for the split; used only in the printed header.

    Returns:
        Per-class row counts, sorted by class label.
    """
    vc = pdf["Obesity"].value_counts().sort_index()
    pct = (vc / len(pdf) * 100).round(2)
    # print() rather than display(): display() renders a str as its repr,
    # showing the quotes and a literal "\n" instead of a line break.
    print(f"\n{name} shape: {pdf.shape}")
    display(pd.DataFrame({"count": vc, "percent": pct}))
    return vc

# Report each split's class distribution (in TRAIN, VAL, TEST order) and keep
# the per-class counts returned for each one.
vc_tr, vc_va, vc_te = (
    dist(part, label)
    for part, label in ((train_df, "TRAIN"), (val_df, "VAL"), (test_df, "TEST"))
)
# -------------------------------

'\nTRAIN shape: (1477, 18)'

Unnamed: 0_level_0,count,percent
Obesity,Unnamed: 1_level_1,Unnamed: 2_level_1
0,190,12.86
1,201,13.61
2,203,13.74
3,203,13.74
4,245,16.59
5,208,14.08
6,227,15.37


'\nVAL shape: (317, 18)'

Unnamed: 0_level_0,count,percent
Obesity,Unnamed: 1_level_1,Unnamed: 2_level_1
0,41,12.93
1,43,13.56
2,44,13.88
3,44,13.88
4,53,16.72
5,44,13.88
6,48,15.14


'\nTEST shape: (317, 18)'

Unnamed: 0_level_0,count,percent
Obesity,Unnamed: 1_level_1,Unnamed: 2_level_1
0,41,12.93
1,43,13.56
2,43,13.56
3,43,13.56
4,53,16.72
5,45,14.2
6,49,15.46


In [3]:
# --- Re-check that the split is sound ---
n_total = len(df)
print(
    "\nตรวจรวมจำนวนแถว:",
    sum(map(len, (train_df, val_df, test_df))),
    "จาก",
    n_total,
)
# No index label may be shared between any two of the three parts.
assert all(
    set(left.index).isdisjoint(right.index)
    for left, right in (
        (train_df, val_df),
        (train_df, test_df),
        (val_df, test_df),
    )
)

# --- Class weights from TRAIN only: w_k = N / (K * n_k) ---
N = len(train_df)
K = train_df["Obesity"].nunique()
cw = {
    int(label): float(N / (K * n_rows))
    for label, n_rows in train_df["Obesity"].value_counts().items()
}
print("\nclass_weight (จาก TRAIN เท่านั้น):", cw)

# Names produced so far for later use:
# train_df, val_df, test_df, cw


ตรวจรวมจำนวนแถว: 2111 จาก 2111

class_weight (จาก TRAIN เท่านั้น): {4: 0.8612244897959184, 6: 0.9295154185022027, 5: 1.0144230769230769, 3: 1.0394088669950738, 2: 1.0394088669950738, 1: 1.0497512437810945, 0: 1.1105263157894736}


In [9]:
# Step 2 — Define feature lists from columns that actually exist

assert "Obesity" in train_df.columns
# 1) Continuous numeric columns (to be normalized).
continuous_candidates = ["age_clean", "height_m", "weight_kg", "bmi"]
# 2) Categorical columns stored as strings.
string_cat_candidates = ["gender", "smoking", "cal_monitoring", "consum_other",
                         "consum_alchohol", "transportation"]
# 3) Categorical columns stored as a small set of integers (0/1, 1–3, ...).
int_cat_candidates = ["overweight_family", "consum_cf", "consum_vf",
                      "consum_daily", "consum_water", "phyical_activity", "device_usage"]

# Keep only the candidates that are really present in train_df.
continuous_features = [c for c in continuous_candidates if c in train_df.columns]
string_cat_features = [c for c in string_cat_candidates if c in train_df.columns]
int_cat_features = [c for c in int_cat_candidates if c in train_df.columns]

# Report dropped candidates so a renamed or misspelled column does not
# silently vanish from the model's inputs.
skipped = [
    c
    for c in (*continuous_candidates, *string_cat_candidates, *int_cat_candidates)
    if c not in train_df.columns
]
if skipped:
    print("Skipped (not in train_df):", skipped)

print("Continuous :", continuous_features)
print("String-cat :", string_cat_features)
print("Int-cat    :", int_cat_features)

# Map each label to an index 0..(K-1) for SparseCategoricalCrossentropy.
y_train_raw = train_df["Obesity"].astype(int).values
# np.unique already returns sorted values; cast to builtin int so the mapping
# keys and the printed list are plain ints rather than np.int64 scalars.
classes_sorted = [int(lab) for lab in np.unique(y_train_raw)]
label_to_idx = {lab: i for i, lab in enumerate(classes_sorted)}
idx_to_label = dict(enumerate(classes_sorted))
print("Classes:", classes_sorted)

Continuous : ['age_clean', 'height_m', 'weight_kg', 'bmi']
String-cat : ['gender', 'smoking', 'cal_monitoring', 'consum_other', 'consum_alchohol', 'transportation']
Int-cat    : ['overweight_family', 'consum_cf', 'consum_vf', 'consum_daily', 'consum_water', 'phyical_activity', 'device_usage']
Classes: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6)]


In [18]:
# Step 3 — Build preprocessing layers (train-only adaptation)

# One (batch, 1) Keras Input per feature, keyed by column name.
inputs = {}

for c in continuous_features:
    inputs[c] = tf.keras.Input(shape=(1,), name=c, dtype=tf.float32)
for c in string_cat_features:
    inputs[c] = tf.keras.Input(shape=(1,), name=c, dtype=tf.string)
for c in int_cat_features:
    inputs[c] = tf.keras.Input(shape=(1,), name=c, dtype=tf.int64)

norm_outs, str_outs, int_outs = [], [], []

# Continuous features: standardize with statistics learned from TRAIN only.
for c in continuous_features:
    norm = tf.keras.layers.Normalization(name=f"norm_{c}")
    # Adapt on an (n_rows, 1) array so the data has the same layout as the
    # (batch, 1) Input. With the default axis=-1 the last axis is the feature
    # axis, so a flat (n_rows,) array would not yield one mean/variance for
    # the column.
    norm.adapt(train_df[[c]].to_numpy(dtype="float32"))
    norm_outs.append(norm(inputs[c]))

# String categoricals: vocabulary learned from TRAIN, one-hot encoded.
for c in string_cat_features:
    lookup = tf.keras.layers.StringLookup(output_mode="one_hot", name=f"lkp_{c}")
    lookup.adapt(train_df[c].astype(str).values)
    str_outs.append(lookup(inputs[c]))
    print(f"{c:18s}  vocab={lookup.vocabulary_size()}")

# Integer categoricals: fixed vocabularies, one-hot encoded.
# NOTE(review): every column except "overweight_family" is assumed to take
# only the values 1, 2, 3; any other value lands in the OOV slot — confirm
# the actual value range of each column.
for c in int_cat_features:
    if c == "overweight_family":
        # Binary 0/1 -> one-hot of length 3 (including the OOV slot).
        vocabulary = [0, 1]
    else:
        # Fixed to 1, 2, 3 even if TRAIN lacks one of the values
        # -> one-hot of length 4 (including the OOV slot).
        vocabulary = [1, 2, 3]
    ilk = tf.keras.layers.IntegerLookup(
        output_mode="one_hot",
        vocabulary=vocabulary,
        name=f"ilk_{c}",
    )
    int_outs.append(ilk(inputs[c]))

# Concatenate all encoded features in a fixed order: numeric, string, integer.
parts = [*norm_outs, *str_outs, *int_outs]
if not parts:
    raise ValueError("No feature columns were selected; nothing to feed the model.")
features_concat = (
    tf.keras.layers.Concatenate(name="feat_concat")(parts)
    if len(parts) > 1
    else parts[0]
)
print("feat dim =", features_concat.shape[-1])   # expected to be about 56



gender              vocab=3
smoking             vocab=3
cal_monitoring      vocab=3
consum_other        vocab=5
consum_alchohol     vocab=5
transportation      vocab=6
feat dim = 56


In [31]:
# Take one element from the training dataset and compare its feature keys
# with the names of the model's Input layers.
# (sample_x is presumably a dict of feature tensors — train_ds is defined
# outside this view.)
sample_x, _ = next(iter(train_ds.take(1)))
print("Model expects:", sorted(inputs))
print("Dataset keys :", sorted(sample_x))

# Keys the model needs but the dataset lacks, and the reverse.
missing = set(inputs) - set(sample_x)
extra = set(sample_x) - set(inputs)
assert not (missing or extra), f"คีย์ไม่ตรง! missing={missing}, extra={extra}"

print("✅ keys ตรงกัน พร้อมเทรน")

Model expects: ['age_clean', 'bmi', 'cal_monitoring', 'consum_alchohol', 'consum_cf', 'consum_daily', 'consum_other', 'consum_vf', 'consum_water', 'device_usage', 'gender', 'height_m', 'overweight_family', 'phyical_activity', 'smoking', 'transportation', 'weight_kg']
Dataset keys : ['age_clean', 'bmi', 'cal_monitoring', 'consum_alchohol', 'consum_cf', 'consum_daily', 'consum_other', 'consum_vf', 'consum_water', 'device_usage', 'gender', 'height_m', 'overweight_family', 'phyical_activity', 'smoking', 'transportation', 'weight_kg']
✅ keys ตรงกัน พร้อมเทรน


In [32]:
# Step 4 — Build & compile a small baseline DNN (softmax output)

# NOTE(review): Keras state is cleared here although `features_concat` comes
# from layers created before this point — confirm this ordering is intended.
tf.keras.backend.clear_session()
tf.random.set_seed(42)

n_classes = len(classes_sorted)

# Hidden stack as (units, kernel_regularizer) pairs; every Dense layer is
# followed by Dropout(0.30).
hidden_specs = (
    (128, tf.keras.regularizers.l2(1e-4)),
    (64, None),
)
x = features_concat
for units, regularizer in hidden_specs:
    x = tf.keras.layers.Dense(
        units,
        activation="relu",
        kernel_regularizer=regularizer,
    )(x)
    x = tf.keras.layers.Dropout(0.30)(x)

# One softmax unit per class.
outputs = tf.keras.layers.Dense(n_classes, activation="softmax")(x)

model = tf.keras.Model(inputs=inputs, outputs=outputs)
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)

model.summary()

In [None]:
# Stop once val_loss has not improved for 10 epochs and restore the best
# weights seen during training.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=10,
    restore_best_weights=True,
)
callbacks = [early_stopping]

# NOTE(review): train_ds, val_ds and class_weight_idx are defined outside this
# view. The recorded run of this cell ended with "IndexError: tuple index out
# of range"; check the dataset element shapes/batching and the class-weight
# keys before re-running.
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=200,
    callbacks=callbacks,
    class_weight=class_weight_idx,  # presumably the TRAIN-only weights keyed by class index
    verbose=1,
)


IndexError: tuple index out of range