In [None]:

import os
import glob
import json
import warnings
from typing import Tuple, Dict

import numpy as np
import pandas as pd

from sklearn.model_selection import GroupShuffleSplit, GroupKFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report,
    matthews_corrcoef,
    brier_score_loss,
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.inspection import permutation_importance


warnings.filterwarnings("ignore", category=UserWarning)
np.random.seed(42)

# ปรับ path ตรงนี้ให้ตรงเครื่องคุณ
DATA_DIR = "/Users/wysuttida/pattern-project/mockdata_transaction"
ARTIFACT_DIR_API = "/Users/wysuttida/pattern-project/API-Statement-IntelliScan"  # สำหรับ export preprocessors / Keras
os.makedirs(ARTIFACT_DIR_API, exist_ok=True)



In [18]:
# %% [2] Loaders
# โหลดไฟล์ csv/xlsx ทั้งโฟลเดอร์ พร้อมตั้งคอลัมน์ file_id เพื่อใช้ทำ group split

def load_all_statements(data_dir: str = DATA_DIR) -> pd.DataFrame:
    paths = sorted(
        glob.glob(os.path.join(data_dir, "*.csv"))
        + glob.glob(os.path.join(data_dir, "*.xlsx"))
    )
    if not paths:
        raise FileNotFoundError(f"ไม่พบไฟล์ใน {data_dir}")

    dfs = []
    for p in paths:
        ext = os.path.splitext(p)[1].lower()
        df = pd.read_excel(p, engine="openpyxl") if ext == ".xlsx" else pd.read_csv(p)
        df["file_id"] = os.path.basename(p)
        dfs.append(df)
    return pd.concat(dfs, ignore_index=True)


raw_df = load_all_statements(DATA_DIR)
print("Loaded shape:", raw_df.shape)


Loaded shape: (10273, 33)


In [19]:
# %% [3] Preprocess
# แปลง datetime, สร้างฟีเจอร์ numeric/time, แยก code/channel, ทำความสะอาด text/label

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    # 3.1 datetime
    df["tx_datetime"] = pd.to_datetime(df["tx_datetime"], errors="coerce")
    df = (
        df.dropna(subset=["tx_datetime"])
        .sort_values(["file_id", "tx_datetime"])
        .reset_index(drop=True)
    )

    # 3.2 split code/channel
    sp = df["code_channel_raw"].astype(str).str.split("/", n=1, expand=True)
    df["tx_code"] = sp[0].str.strip()
    df["channel"] = sp[1].str.strip() if sp.shape[1] > 1 else ""

    # 3.3 numeric ensure + engineer
    for col in ["debit_amount", "credit_amount", "balance_amount"]:
        if col not in df.columns:
            df[col] = 0.0
        df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0.0)

    df["net_amount"] = df["credit_amount"] - df["debit_amount"]
    df["abs_amount"] = df["debit_amount"].abs() + df["credit_amount"].abs()
    df["log1p_amount"] = np.log1p(df["abs_amount"])

    # 3.4 time features
    dt = df["tx_datetime"]
    df["hour"] = dt.dt.hour
    df["dayofweek"] = dt.dt.dayofweek
    df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
    df["day"] = dt.dt.day
    df["month"] = dt.dt.month
    df["year"] = dt.dt.year

    # 3.5 text & label
    df["description_text"] = df["description_text"].astype(str).fillna("")
    df["fraud_label"] = df["fraud_label"].astype(int)

    return df


df = preprocess_dataframe(raw_df)
print("Preprocessed shape:", df.shape)
print("Class balance:\n", df["fraud_label"].value_counts(dropna=False))
print("Positive rate:", round(df["fraud_label"].mean(), 4))


Preprocessed shape: (10273, 44)
Class balance:
 fraud_label
0    7881
1    2392
Name: count, dtype: int64
Positive rate: 0.2328


In [20]:
# %% [4] Split (Group by file_id)
# แบ่ง 80/10/10 โดยใช้ GroupShuffleSplit เพื่อให้ไฟล์เดียวกันไม่ข้ามชุด

groups = df["file_id"].values
y_all = df["fraud_label"].values

gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, temp_idx = next(gss.split(df, y_all, groups=groups))

groups_temp = groups[temp_idx]
y_temp = y_all[temp_idx]
gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=42)
val_rel, test_rel = next(gss2.split(df.iloc[temp_idx], y_temp, groups=groups_temp))

val_idx = temp_idx[val_rel]
test_idx = temp_idx[test_rel]

def take(idx):
    return df.iloc[idx].reset_index(drop=True)

train_df, val_df, test_df = take(train_idx), take(val_idx), take(test_idx)
print(f"Split sizes => train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")



Split sizes => train: 8309, val: 862, test: 1102


In [21]:
# %% [5] Feature Columns & XY helpers
# นิยามคอลัมน์และฟังก์ชันดึง X/y/groups

NUMERIC_FEATURES = [
    "debit_amount",
    "credit_amount",
    "balance_amount",
    "net_amount",
    "abs_amount",
    "log1p_amount",
    "hour",
    "dayofweek",
    "is_weekend",
    "day",
    "month",
    "year",
]
CATEGORICAL_FEATURES = ["tx_code", "channel"]
TEXT_FEATURE = "description_text"

def xy(df_: pd.DataFrame) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
    X = df_[
        NUMERIC_FEATURES + CATEGORICAL_FEATURES + [TEXT_FEATURE]
    ].copy()
    y = df_["fraud_label"].values
    groups = df_["file_id"].values
    return X, y, groups

X_train, y_train, g_train = xy(train_df)
X_val, y_val, g_val = xy(val_df)
X_test, y_test, g_test = xy(test_df)


In [22]:
# %% [6] Preprocessor (ColumnTransformer)
# หมายเหตุ: หาก scikit-learn <1.2 ให้เปลี่ยน OneHotEncoder(sparse_output=True) เป็น OneHotEncoder(sparse=True)

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler(with_mean=True, with_std=True))])
categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=True)
text_transformer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=5,
    max_df=0.95,
    strip_accents="unicode",
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, NUMERIC_FEATURES),
        ("cat", categorical_transformer, CATEGORICAL_FEATURES),
        ("txt", text_transformer, TEXT_FEATURE),
    ],
    remainder="drop",
    sparse_threshold=0.3,
)


In [23]:
# %% [7] Metrics helpers (score + threshold tuning + report)
# รวมทุกเมตริกที่ใช้บ่อย พร้อมเลือก threshold จาก validation ตาม metric ที่สนใจ

def _scores_from_estimator(clf, X):
    if hasattr(clf, "predict_proba"):
        return clf.predict_proba(X)[:, 1]
    # fallback decision_function → scale เข้า [0,1]
    d = clf.decision_function(X)
    return (d - d.min()) / (d.max() - d.min() + 1e-9)

def get_scores(clf, X, y, threshold=0.5) -> Dict:
    scores = _scores_from_estimator(clf, X)
    y_pred = (scores >= threshold).astype(int)
    tn, fp, fn, tp = confusion_matrix(y, y_pred, labels=[0, 1]).ravel()
    return {
        "accuracy": accuracy_score(y, y_pred),
        "precision": precision_score(y, y_pred, zero_division=0),
        "recall": recall_score(y, y_pred, zero_division=0),
        "f1": f1_score(y, y_pred, zero_division=0),
        "roc_auc": roc_auc_score(y, scores) if len(np.unique(y)) > 1 else np.nan,
        "pr_auc(AP)": average_precision_score(y, scores),
        "brier": brier_score_loss(y, scores),
        "mcc": matthews_corrcoef(y, y_pred) if len(np.unique(y_pred)) > 1 else 0.0,
        "tp": tp,
        "fp": fp,
        "tn": tn,
        "fn": fn,
        "threshold": threshold,
    }

def find_best_threshold(clf, X_val, y_val, target_metric="accuracy"):
    scores = _scores_from_estimator(clf, X_val)
    thresholds = np.linspace(0.05, 0.95, 19)
    best_t, best_v = 0.5, -1.0
    for t in thresholds:
        pred = (scores >= t).astype(int)
        if target_metric == "f1":
            v = f1_score(y_val, pred, zero_division=0)
        elif target_metric == "recall":
            v = recall_score(y_val, pred, zero_division=0)
        elif target_metric == "precision":
            v = precision_score(y_val, pred, zero_division=0)
        elif target_metric == "accuracy":
            v = accuracy_score(y_val, pred)
        else:
            v = f1_score(y_val, pred, zero_division=0)
        if v > best_v:
            best_v, best_t = v, float(t)
    return best_t, best_v

def report_model(name, clf, X_tr, y_tr, X_va, y_va, X_te, y_te, tune_threshold=True, target_metric="accuracy"):
    thr = 0.5
    if tune_threshold:
        thr, _ = find_best_threshold(clf, X_va, y_va, target_metric=target_metric)
    res_tr = get_scores(clf, X_tr, y_tr, threshold=thr)
    res_va = get_scores(clf, X_va, y_va, threshold=thr)
    res_te = get_scores(clf, X_te, y_te, threshold=thr)
    df_res = pd.DataFrame([res_tr, res_va, res_te], index=["train", "val", "test"])
    df_res.insert(0, "model", name)
    return df_res, thr

def print_full_report(name, clf, X, y, threshold):
    scores = _scores_from_estimator(clf, X)
    pred = (scores >= threshold).astype(int)
    print(f"\n{name} classification_report:\n", classification_report(y, pred, digits=4))
    print("Confusion matrix [0,1]:\n", confusion_matrix(y, pred, labels=[0, 1]))


In [24]:
# %% [8] Baseline (Dummy)
pipe_dummy = DummyClassifier(strategy="stratified", random_state=42)
pipe_dummy.fit(X_train, y_train)
dummy_res, dummy_thr = report_model(
    "Dummy", pipe_dummy, X_train, y_train, X_val, y_val, X_test, y_test, tune_threshold=False
)
print("\n== Dummy baseline ==")
print(dummy_res)



== Dummy baseline ==
       model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train  Dummy  0.643519   0.230729  0.225026  0.227842  0.498095    0.233049   
val    Dummy  0.660093   0.215686  0.248588  0.230971  0.507505    0.207909   
test   Dummy  0.639746   0.263359  0.252747  0.257944  0.509968    0.251681   

          brier       mcc   tp    fp    tn    fn  threshold  
train  0.356481 -0.003844  437  1457  4910  1505        0.5  
val    0.339907  0.014266   44   160   525   133        0.5  
test   0.360254  0.020217   69   193   636   204        0.5  


In [25]:
# %% [9] Logistic Regression (GridSearchCV, scoring=accuracy)
pipe_lr = Pipeline([
    ("prep", preprocessor),
    ("clf", LogisticRegression(max_iter=500, solver="saga", n_jobs=-1, class_weight=None)),
])

param_lr = {
    "prep__txt__max_features": [2000, 5000, 10000],
    "prep__txt__min_df": [3, 5],
    "prep__txt__ngram_range": [(1,1), (1,2)],
    "clf__C": [0.5, 1.0, 2.0, 5.0],
    "clf__penalty": ["l2"],
}

cv_lr = GridSearchCV(
    pipe_lr,
    param_grid=param_lr,
    cv=GroupKFold(n_splits=5),
    scoring="accuracy",
    n_jobs=-1,
    verbose=0,
)
cv_lr.fit(X_train, y_train, groups=g_train)
best_lr = cv_lr.best_estimator_
print("\nBest LR params:", cv_lr.best_params_, " CV(acc):", round(cv_lr.best_score_, 4))

lr_res, lr_thr = report_model("LogReg", best_lr, X_train, y_train, X_val, y_val, X_test, y_test, True, "accuracy")
print(lr_res)






Best LR params: {'clf__C': 5.0, 'clf__penalty': 'l2', 'prep__txt__max_features': 2000, 'prep__txt__min_df': 3, 'prep__txt__ngram_range': (1, 2)}  CV(acc): 0.8127
        model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train  LogReg  0.814659   0.758355  0.303811  0.433824  0.787100    0.613695   
val    LogReg  0.837587   0.760563  0.305085  0.435484  0.770844    0.583992   
test   LogReg  0.791289   0.741573  0.241758  0.364641  0.812670    0.606466   

          brier       mcc   tp   fp    tn    fn  threshold  
train  0.135718  0.398451  590  188  6179  1352        0.5  
val    0.126111  0.411800   54   17   668   123        0.5  
test   0.142106  0.339079   66   23   806   207        0.5  


In [29]:
# %% [10] RandomForest (GridSearchCV, scoring=accuracy)
pipe_rf = Pipeline([
    ("prep", preprocessor),
    ("clf", RandomForestClassifier(n_estimators=600, n_jobs=-1, class_weight=None, random_state=42)),
])

param_rf = {
    "prep__txt__max_features": [2000, 5000, 10000],
    "prep__txt__min_df": [3, 5],
    "prep__txt__ngram_range": [(1,1), (1,2)],
    "clf__max_depth": [None, 20, 30],
    "clf__min_samples_leaf": [1, 3, 5],
    "clf__max_features": ["sqrt", 0.5, None],
}

cv_rf = GridSearchCV(
    pipe_rf,
    param_grid=param_rf,
    cv=GroupKFold(n_splits=5),
    scoring="accuracy",
    n_jobs=-1,
    verbose=0,
)
cv_rf.fit(X_train, y_train, groups=g_train)
best_rf = cv_rf.best_estimator_
print("\nBest RF params:", cv_rf.best_params_, " CV(acc):", round(cv_rf.best_score_, 4))

rf_res, rf_thr = report_model("RandomForest", best_rf, X_train, y_train, X_val, y_val, X_test, y_test, True, "accuracy")
print(rf_res)



Best RF params: {'clf__max_depth': 30, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 3, 'prep__txt__max_features': 2000, 'prep__txt__min_df': 3, 'prep__txt__ngram_range': (1, 2)}  CV(acc): 0.8165
              model  accuracy  precision    recall        f1   roc_auc  \
train  RandomForest  0.857504   0.990933  0.393924  0.563744  0.984404   
val    RandomForest  0.836427   0.875000  0.237288  0.373333  0.832430   
test   RandomForest  0.782214   0.753846  0.179487  0.289941  0.787899   

       pr_auc(AP)     brier       mcc   tp  fp    tn    fn  threshold  
train    0.948008  0.071376  0.572641  765   7  6360  1177        0.6  
val      0.609913  0.119182  0.402569   42   6   679   135        0.6  
test     0.591206  0.146055  0.293521   49  16   813   224        0.6  


In [27]:
# %% [11] MLP (GridSearchCV, scoring=accuracy)
pipe_mlp = Pipeline([
    ("prep", preprocessor),
    ("clf", MLPClassifier(
        max_iter=600,
        random_state=42,
        early_stopping=True,
        n_iter_no_change=8,
        validation_fraction=0.1,
    )),
])

param_mlp = {
    "prep__txt__max_features": [2000, 5000, 10000],
    "prep__txt__min_df": [3, 5],
    "prep__txt__ngram_range": [(1,1), (1,2)],
    "clf__hidden_layer_sizes": [(128,), (256,), (256,128)],
    "clf__alpha": [1e-4, 1e-3],
    "clf__learning_rate_init": [1e-3, 3e-3],
    "clf__batch_size": [256, 512],
    "clf__activation": ["relu"],
    "clf__solver": ["adam"],
}

cv_mlp = GridSearchCV(
    pipe_mlp,
    param_grid=param_mlp,
    cv=GroupKFold(n_splits=5),
    scoring="accuracy",
    n_jobs=-1,
    verbose=0,
)
cv_mlp.fit(X_train, y_train, groups=g_train)
best_mlp = cv_mlp.best_estimator_
print("\nBest MLP params:", cv_mlp.best_params_, " CV(acc):", round(cv_mlp.best_score_, 4))

mlp_res, mlp_thr = report_model("MLP", best_mlp, X_train, y_train, X_val, y_val, X_test, y_test, True, "accuracy")
print(mlp_res)



Best MLP params: {'clf__activation': 'relu', 'clf__alpha': 0.0001, 'clf__batch_size': 512, 'clf__hidden_layer_sizes': (256,), 'clf__learning_rate_init': 0.003, 'clf__solver': 'adam', 'prep__txt__max_features': 2000, 'prep__txt__min_df': 3, 'prep__txt__ngram_range': (1, 1)}  CV(acc): 0.8273
      model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train   MLP  0.843302   0.864465  0.390834  0.538298  0.875452    0.743055   
val     MLP  0.853828   0.814815  0.372881  0.511628  0.822153    0.650232   
test    MLP  0.792196   0.761905  0.234432  0.358543  0.802874    0.603220   

          brier       mcc   tp   fp    tn    fn  threshold  
train  0.109285  0.512309  759  119  6248  1183       0.55  
val    0.113669  0.485904   66   15   670   111       0.55  
test   0.145269  0.342136   64   20   809   209       0.55  


In [28]:
# %% [12] Pick Winner on Validation & Refit on Train+Val, Evaluate on Test
present = {vn: isinstance(globals().get(vn), pd.DataFrame) for vn in ["dummy_res", "lr_res", "rf_res", "mlp_res"]}
print("present result frames:", present)

frames = []
for label, varname in [("Dummy", "dummy_res"), ("LogReg", "lr_res"), ("RandomForest", "rf_res"), ("MLP", "mlp_res")]:
    df_var = globals().get(varname)
    if isinstance(df_var, pd.DataFrame) and "val" in df_var.index:
        frames.append(df_var.loc[["val"]].copy())

if not frames:
    raise RuntimeError("No model validation results found.")

summary_val = pd.concat(frames, axis=0).set_index("model", drop=True)
summary_val_no_dummy = summary_val.drop(index="Dummy", errors="ignore")
candidates = summary_val_no_dummy if len(summary_val_no_dummy) > 0 else summary_val

winner_name = candidates.sort_values(by=["pr_auc(AP)", "f1", "recall"], ascending=False).index[0]
print("\n== Validation Summary (sorted by PR-AUC) ==")
print(summary_val.loc[:, ["pr_auc(AP)", "f1", "recall"]].sort_values("pr_auc(AP)", ascending=False))
print("winner_name =", winner_name)

winners = {}
if "LogReg" in summary_val.index:       winners["LogReg"] = (best_lr, lr_thr)
if "RandomForest" in summary_val.index:  winners["RandomForest"] = (best_rf, rf_thr)
if "MLP" in summary_val.index:           winners["MLP"] = (best_mlp, mlp_thr)

best_model_template, _ = winners.get(winner_name, (None, None))
if best_model_template is None:
    raise RuntimeError(f"Winner '{winner_name}' not available in winners map.")

# concat X for train+val
def concat_X(*dfs):
    return pd.concat(dfs, axis=0, ignore_index=True)

X_trval = concat_X(X_train, X_val)
y_trval = np.concatenate([y_train, y_val])

best_model_final = best_model_template
best_model_final.fit(X_trval, y_trval)

# เลือก threshold จาก validation โดย optimize F1 (ปรับได้)
final_thr, _ = find_best_threshold(best_model_final, X_val, y_val, target_metric="f1")
final_res = get_scores(best_model_final, X_test, y_test, threshold=final_thr)

print(f"\n== FINAL on TEST using {winner_name} (thr={final_thr:.2f}) ==")
print(pd.DataFrame([final_res], index=["test"]))
print_full_report("BEST(TEST)", best_model_final, X_test, y_test, final_thr)


present result frames: {'dummy_res': True, 'lr_res': True, 'rf_res': False, 'mlp_res': True}

== Validation Summary (sorted by PR-AUC) ==
        pr_auc(AP)        f1    recall
model                                 
MLP       0.650232  0.511628  0.372881
LogReg    0.583992  0.435484  0.305085
Dummy     0.207909  0.230971  0.248588
winner_name = MLP

== FINAL on TEST using MLP (thr=0.30) ==
      accuracy  precision    recall        f1  roc_auc  pr_auc(AP)     brier  \
test  0.777677   0.571429  0.410256  0.477612  0.77714      0.5783  0.152842   

           mcc   tp  fp   tn   fn  threshold  
test  0.348759  112  84  745  161        0.3  

BEST(TEST) classification_report:
               precision    recall  f1-score   support

           0     0.8223    0.8987    0.8588       829
           1     0.5714    0.4103    0.4776       273

    accuracy                         0.7777      1102
   macro avg     0.6969    0.6545    0.6682      1102
weighted avg     0.7601    0.7777    0.7644 

In [30]:
# %% [13] Feature Importance / Interpretability
# สำหรับ LogReg แสดง coefficients, RF แสดง feature_importances_, อื่น ๆ ใช้ permutation_importance

def get_feature_names(ct: ColumnTransformer):
    names = []
    for name, trans, cols in ct.transformers_:
        if name == "num":
            names.extend(cols)
        elif name == "cat":
            ohe = trans
            try:
                names.extend(list(ohe.get_feature_names_out(cols)))
            except Exception:
                names.extend(cols)
        elif name == "txt":
            tfidf = trans
            try:
                names.extend(list(tfidf.get_feature_names_out()))
            except Exception:
                names.append("tfidf_features")
    return names

if winner_name == "LogReg":
    coefs = best_model_final.named_steps["clf"].coef_[0]
    feat_names = get_feature_names(best_model_final.named_steps["prep"])
    top_pos_idx = np.argsort(coefs)[-15:][::-1]
    top_neg_idx = np.argsort(coefs)[:15]
    print("\nTop + coefficients:")
    for i in top_pos_idx:
        print(feat_names[i], round(coefs[i], 4))
    print("\nTop - coefficients:")
    for i in top_neg_idx:
        print(feat_names[i], round(coefs[i], 4))

elif winner_name == "RandomForest":
    rf = best_model_final.named_steps["clf"]
    feat_names = get_feature_names(best_model_final.named_steps["prep"])
    importances = getattr(rf, "feature_importances_", None)
    if importances is not None:
        idx = np.argsort(importances)[-20:][::-1]
        print("\nTop 20 RF importances:")
        for i in idx:
            print(feat_names[i], round(importances[i], 4))

else:
    print(f"{winner_name} has no direct coefficients — running permutation importance...")
    r = permutation_importance(
        best_model_final,
        X_test,
        y_test,
        n_repeats=5,
        random_state=42,
        n_jobs=-1,
        scoring="average_precision",
    )
    base_names = list(X_test.columns)
    idx = np.argsort(r.importances_mean)[-15:][::-1]
    print("\nTop 15 permutation importances (input level):")
    for i in idx:
        print(base_names[i], round(r.importances_mean[i], 6))



MLP has no direct coefficients — running permutation importance...

Top 15 permutation importances (input level):
log1p_amount 0.32219
channel 0.248087
tx_code 0.229019
description_text 0.162943
credit_amount 0.061991
debit_amount 0.027332
hour 0.026856
balance_amount 0.02416
abs_amount 0.021441
dayofweek 0.019027
net_amount 0.01681
month 0.008305
is_weekend 0.006585
year 0.00576
day -0.001862


In [31]:
# %% [14] (Optional) Install extras for Excel ifจำเป็น
# ใช้เฉพาะกรณี Notebook ยังไม่มี openpyxl
%pip install openpyxl


Note: you may need to restart the kernel to use updated packages.


In [32]:
# %% [15] External Preprocessors Export (scaler / tfidf / vocab)  → ใช้คู่กับ API
# กรณีที่ฝั่ง API ต้องการโหลดสเกลเลอร์/TF-IDF/หมวดหมู่ เพื่อแปลงฟีเจอร์ภายนอกโมเดล Keras

import joblib
import scipy.sparse as sp

# 15.1 Fit preprocessors จาก train_df
scaler = StandardScaler(with_mean=True, with_std=True).fit(train_df[NUMERIC_FEATURES])

tx_vocab = sorted(train_df["tx_code"].astype(str).unique().tolist())
ch_vocab = sorted(train_df["channel"].astype(str).unique().tolist())
tx_index = {t: i for i, t in enumerate(tx_vocab)}
ch_index = {t: i for i, t in enumerate(ch_vocab)}

tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    max_features=5000,
    min_df=5,
    max_df=0.95,
    strip_accents="unicode",
).fit(train_df[TEXT_FEATURE].astype(str))

# 15.2 Save
joblib.dump(scaler, f"{ARTIFACT_DIR_API}/pre_scaler.joblib")
joblib.dump(tfidf, f"{ARTIFACT_DIR_API}/pre_tfidf.joblib")
with open(f"{ARTIFACT_DIR_API}/pre_categ_vocab.json", "w", encoding="utf-8") as f:
    json.dump({"tx_vocab": tx_vocab, "ch_vocab": ch_vocab}, f, ensure_ascii=False, indent=2)

print("[OK] Saved scaler/tfidf/vocabs to", ARTIFACT_DIR_API)


[OK] Saved scaler/tfidf/vocabs to /Users/wysuttida/pattern-project/API-Statement-IntelliScan


In [34]:
# %% [16] Build Sparse Matrices (numeric + one-hot + tfidf)
# แปลง DataFrame เป็น CSR Matrix เพื่อป้อนให้ Keras (หรือโมเดลอื่น ๆ)

def _one_hot_from_vocab(series_str: pd.Series, index_map: dict, vocab_size: int):
    arr = series_str.astype(str).map(index_map).to_numpy()
    N = len(arr)
    rows = np.arange(N, dtype=np.int64)
    mask = ~pd.isna(arr)
    cols = arr[mask].astype(np.int64)
    data = np.ones(mask.sum(), dtype=np.float32)
    return sp.csr_matrix((data, (rows[mask], cols)), shape=(N, vocab_size), dtype=np.float32)

def transform_df_to_X(
    df_: pd.DataFrame,
    scaler_: StandardScaler,
    tfidf_: TfidfVectorizer,
    tx_index_: Dict[str, int],
    ch_index_: Dict[str, int],
    tx_vocab_: list,
    ch_vocab_: list,
):
    # numeric → scale → csr
    X_num = scaler_.transform(df_[NUMERIC_FEATURES]).astype(np.float32)
    X_num = sp.csr_matrix(X_num)

    # categorical → one-hot csr
    X_tx = _one_hot_from_vocab(df_["tx_code"].astype(str), tx_index_, len(tx_vocab_))
    X_ch = _one_hot_from_vocab(df_["channel"].astype(str), ch_index_, len(ch_vocab_))

    # text → tfidf csr
    X_txt = tfidf_.transform(df_[TEXT_FEATURE].astype(str)).astype(np.float32)

    # hstack
    return sp.hstack([X_num, X_tx, X_ch, X_txt], format="csr", dtype=np.float32)

X_train_ext = transform_df_to_X(train_df, scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)
X_val_ext   = transform_df_to_X(val_df,   scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)
X_test_ext  = transform_df_to_X(test_df,  scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)

print("Shapes:", "train", X_train_ext.shape, "val", X_val_ext.shape, "test", X_test_ext.shape)


Shapes: train (8309, 201) val (862, 201) test (1102, 201)


In [None]:
# ==== 16C) Keras model (no preprocessing inside) + save .h5 ====
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    average_precision_score,
)

# แปลงเป็น dense (ถ้าเมมไม่พอ ลดฟีเจอร์ก่อน)
Xtr = X_train_ext.toarray()
Xva = X_val_ext.toarray()
Xte = X_test_ext.toarray()

# Labels
y_tr = y_train.astype(np.float32)
y_va = y_val.astype(np.float32)
y_te = y_test.astype(np.float32)

# Class weights → sample_weight (เหมือนเดิม)
classes = np.array([0, 1])
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
w0, w1 = float(cw[0]), float(cw[1])
sw_tr = np.where(y_tr == 1, w1, w0).astype(np.float32)

# Build model (input = เวกเตอร์ฟีเจอร์รวม)
inp = layers.Input(shape=(Xtr.shape[1],), name="X")
x = layers.Dense(128, activation="relu")(inp)
x = layers.Dropout(0.2)(x)
out = layers.Dense(1, activation="sigmoid")(x)
model = Model(inp, out)
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=[
        tf.keras.metrics.AUC(name="auc"),
        tf.keras.metrics.Precision(name="precision"),
        tf.keras.metrics.Recall(name="recall"),
    ],
)

# Train
es = tf.keras.callbacks.EarlyStopping(
    patience=3, restore_best_weights=True, monitor="val_auc", mode="max"
)
history = model.fit(
    Xtr,
    y_tr,
    validation_data=(Xva, y_va),
    epochs=20,
    batch_size=512,
    sample_weight=sw_tr,
    callbacks=[es],
    verbose=0,
)

# เลือก threshold จาก validation เพื่อ maximize F1
val_scores = model.predict(Xva, verbose=0).ravel()
ths = np.linspace(0.05, 0.95, 19)


def f1_at(t):
    return f1_score(y_va, (val_scores >= t).astype(int), zero_division=0)


best_thr = float(ths[np.argmax([f1_at(t) for t in ths])])

# ประเมินบน test
test_scores = model.predict(Xte, verbose=0).ravel()
pred_test = (test_scores >= best_thr).astype(int)
final_metrics = {
    "accuracy": float((pred_test == y_te).mean()),
    "precision": float(precision_score(y_te, pred_test, zero_division=0)),
    "recall": float(recall_score(y_te, pred_test, zero_division=0)),
    "f1": float(f1_score(y_te, pred_test, zero_division=0)),
    "roc_auc": float(roc_auc_score(y_te, test_scores)),
    "pr_auc(AP)": float(average_precision_score(y_te, test_scores)),
    "threshold": best_thr,
}
print("Test metrics:", final_metrics)

# เซฟเป็น .h5 ได้แล้ว (ไม่มี StringLookup/TextVectorization ภายในโมเดล)
model.save(f"{ARTIFACT_DIR_API}/model.h5")
with open(f"{ARTIFACT_DIR_API}/model_meta.json", "w") as f:
    json.dump(
        {"threshold": best_thr, "metrics_test": final_metrics},
        f,
        ensure_ascii=False,
        indent=2,
    )

print("[OK] Saved model.h5 and model_meta.json at", ARTIFACT_DIR_API)

2025-10-09 01:01:23.682511: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M4
2025-10-09 01:01:23.682671: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-10-09 01:01:23.682677: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
2025-10-09 01:01:23.682868: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:306] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-10-09 01:01:23.683222: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:272] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)
2025-10-09 01:01:24.032299: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


Test metrics: {'accuracy': 0.779491833030853, 'precision': 0.5555555555555556, 'recall': 0.5494505494505495, 'f1': 0.5524861878453039, 'roc_auc': 0.8105975247109144, 'pr_auc(AP)': 0.60512966342085, 'threshold': 0.6}


NameError: name 'OUT_DIR' is not defined