In [1]:
import os, glob, re, math, warnings
import numpy as np
import pandas as pd


from datetime import datetime
from dateutil import tz


import numpy as np, json, os
from datetime import datetime
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import GroupShuffleSplit, GroupKFold
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
   accuracy_score, precision_score, recall_score, f1_score,
   roc_auc_score, average_precision_score, confusion_matrix,
   classification_report, matthews_corrcoef, brier_score_loss
)
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance

warnings.filterwarnings("ignore", category=UserWarning)
np.random.seed(42)


In [2]:
def load_all_statements(data_dir="/Users/wysuttida/pattern-project/mockdata_transaction"):
   """
   อ่านทุกไฟล์ .csv / .xlsx ในโฟลเดอร์ data/
   ต้องมีคอลัมน์: tx_datetime, code_channel_raw, debit_amount, credit_amount, balance_amount, description_text, fraud_label
   คืนค่า df รวมทุกไฟล์ พร้อมคอลัมน์ file_id (ชื่อไฟล์)
   """
   paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")) + glob.glob(os.path.join(data_dir, "*.xlsx")))
   if not paths:
       raise FileNotFoundError("ไม่พบไฟล์ในโฟลเดอร์ mockdata_transaction/")


   dfs = []
   for p in paths:
       ext = os.path.splitext(p)[1].lower()
       if ext == ".csv":
           df = pd.read_csv(p)
       else:
           df = pd.read_excel(p, engine="openpyxl")


       df["file_id"] = os.path.basename(p)  # ใช้เป็น group
       dfs.append(df)


   df = pd.concat(dfs, ignore_index=True)
   return df


df = load_all_statements("/Users/wysuttida/pattern-project/mockdata_transaction")


In [3]:
def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
   # 2.1 datetime
   df["tx_datetime"] = pd.to_datetime(df["tx_datetime"], errors="coerce")
   df = df.dropna(subset=["tx_datetime"]).sort_values(["file_id","tx_datetime"]).reset_index(drop=True)


   # 2.2 split code/channel
   sp = df["code_channel_raw"].astype(str).str.split("/", n=1, expand=True)
   df["tx_code"] = sp[0].str.strip()
   df["channel"] = sp[1].str.strip() if sp.shape[1] > 1 else ""


   # 2.3 amount features
   for col in ["debit_amount","credit_amount","balance_amount"]:
       if col not in df.columns:
           df[col] = 0.0
   df["net_amount"] = df["credit_amount"].fillna(0) - df["debit_amount"].fillna(0)
   df["abs_amount"] = (df["debit_amount"].fillna(0).abs() + df["credit_amount"].fillna(0).abs())
   df["log1p_amount"] = np.log1p(df["abs_amount"])


   # 2.4 time features
   dt = df["tx_datetime"]
   df["hour"] = dt.dt.hour
   df["dayofweek"] = dt.dt.dayofweek     # 0=Mon
   df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
   df["day"] = dt.dt.day
   df["month"] = dt.dt.month
   df["year"] = dt.dt.year


   # 2.5 text
   df["description_text"] = df["description_text"].astype(str).fillna("")


   # 2.6 label
   df["fraud_label"] = df["fraud_label"].astype(int)


   return df


df = preprocess_dataframe(df)


In [4]:
print("Shape:", df.shape)
print("Class balance (0/1):")
print(df["fraud_label"].value_counts(dropna=False))
print("Positive rate:", df["fraud_label"].mean().round(4))


Shape: (10273, 44)
Class balance (0/1):
fraud_label
0    7881
1    2392
Name: count, dtype: int64
Positive rate: 0.2328


In [5]:
groups = df["file_id"].values
y = df["fraud_label"].values


# แบ่ง 80% train vs 20% temp
gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, temp_idx = next(gss.split(df, y, groups=groups))


# แบ่ง temp (20%) เป็น val/test อย่างละครึ่ง
groups_temp = groups[temp_idx]
y_temp = y[temp_idx]
gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=42)
val_rel, test_rel = next(gss2.split(df.iloc[temp_idx], y_temp, groups=groups_temp))
val_idx = temp_idx[val_rel]
test_idx = temp_idx[test_rel]


def take(idx):
   return df.iloc[idx].reset_index(drop=True)


train_df = take(train_idx)
val_df   = take(val_idx)
test_df  = take(test_idx)


def xy(df_):
   X = df_[["debit_amount","credit_amount","balance_amount","net_amount","abs_amount","log1p_amount",
            "hour","dayofweek","is_weekend","day","month","year",
            "tx_code","channel","description_text"]].copy()
   y = df_["fraud_label"].values
   groups = df_["file_id"].values
   return X, y, groups


X_train, y_train, g_train = xy(train_df)
X_val,   y_val,   g_val   = xy(val_df)
X_test,  y_test,  g_test  = xy(test_df)


print(f"Split sizes => train: {len(train_df)}, val: {len(val_df)}, test: {len(test_df)}")


Split sizes => train: 8309, val: 862, test: 1102


In [6]:
numeric_features = [
   "debit_amount","credit_amount","balance_amount","net_amount","abs_amount","log1p_amount",
   "hour","dayofweek","is_weekend","day","month","year"
]
categorical_features = ["tx_code","channel"]
text_feature = "description_text"


numeric_transformer = Pipeline(steps=[
   ("scaler", StandardScaler(with_mean=True, with_std=True))
])


categorical_transformer = OneHotEncoder(handle_unknown="ignore", sparse_output=True)


text_transformer = TfidfVectorizer(
   ngram_range=(1,2), max_features=5000, min_df=5, max_df=0.95, strip_accents="unicode"
)


preprocessor = ColumnTransformer(
   transformers=[
       ("num", numeric_transformer, numeric_features),
       ("cat", categorical_transformer, categorical_features),
       ("txt", text_transformer, text_feature),
   ],
   remainder="drop",
   sparse_threshold=0.3,  # อนุญาต sparse
)


In [7]:
def get_scores(clf, X, y, threshold=0.5, average='binary'):
   # รองรับทั้ง predict_proba/decision_function
   if hasattr(clf, "predict_proba"):
       s = clf.predict_proba(X)[:, 1]
   else:
       # scale decision_function ให้เข้า [0,1] แบบคร่าว ๆ
       d = clf.decision_function(X)
       s = (d - d.min()) / (d.max() - d.min() + 1e-9)


   y_pred = (s >= threshold).astype(int)


   cm = confusion_matrix(y, y_pred, labels=[0,1])
   tn, fp, fn, tp = cm.ravel()


   return {
       "accuracy": accuracy_score(y, y_pred),
       "precision": precision_score(y, y_pred, zero_division=0),
       "recall": recall_score(y, y_pred, zero_division=0),
       "f1": f1_score(y, y_pred, zero_division=0),
       "roc_auc": roc_auc_score(y, s) if len(np.unique(y))>1 else np.nan,
       "pr_auc(AP)": average_precision_score(y, s),
       "brier": brier_score_loss(y, s),
       "mcc": matthews_corrcoef(y, y_pred) if len(np.unique(y_pred))>1 else 0.0,
       "tp": tp, "fp": fp, "tn": tn, "fn": fn,
       "threshold": threshold,
   }


def find_best_threshold(clf, X_val, y_val, target_metric="f1"):
   # ทดลอง threshold 101 ค่า
   if hasattr(clf, "predict_proba"):
       s = clf.predict_proba(X_val)[:, 1]
   else:
       d = clf.decision_function(X_val)
       s = (d - d.min()) / (d.max() - d.min() + 1e-9)


   thresholds = np.linspace(0.05, 0.95, 19)
   best_t, best_v = 0.5, -1
   for t in thresholds:
       y_pred = (s >= t).astype(int)
       if target_metric == "f1":
           v = f1_score(y_val, y_pred, zero_division=0)
       elif target_metric == "recall":
           v = recall_score(y_val, y_pred, zero_division=0)
       elif target_metric == "precision":
           v = precision_score(y_val, y_pred, zero_division=0)
       else:
           v = f1_score(y_val, y_pred, zero_division=0)
       if v > best_v:
           best_v, best_t = v, t
   return best_t, best_v


def report_model(name, clf, X_tr, y_tr, X_va, y_va, X_te, y_te, tune_threshold=True):
   thr = 0.5
   if tune_threshold:
       thr, _ = find_best_threshold(clf, X_va, y_va, target_metric="f1")
   res_tr = get_scores(clf, X_tr, y_tr, threshold=thr)
   res_va = get_scores(clf, X_va, y_va, threshold=thr)
   res_te = get_scores(clf, X_te, y_te, threshold=thr)
   df_res = pd.DataFrame([res_tr, res_va, res_te], index=["train","val","test"])
   df_res.insert(0, "model", name)
   return df_res, thr




In [8]:
pipe_dummy = DummyClassifier(strategy="stratified", random_state=42)
pipe_dummy.fit(X_train, y_train)
dummy_res, dummy_thr = report_model("Dummy", pipe_dummy, X_train, y_train, X_val, y_val, X_test, y_test, tune_threshold=False)
print("\n== Dummy baseline ==")
print(dummy_res)


== Dummy baseline ==
       model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train  Dummy  0.643519   0.230729  0.225026  0.227842  0.498095    0.233049   
val    Dummy  0.660093   0.215686  0.248588  0.230971  0.507505    0.207909   
test   Dummy  0.639746   0.263359  0.252747  0.257944  0.509968    0.251681   

          brier       mcc   tp    fp    tn    fn  threshold  
train  0.356481 -0.003844  437  1457  4910  1505        0.5  
val    0.339907  0.014266   44   160   525   133        0.5  
test   0.360254  0.020217   69   193   636   204        0.5  


In [None]:
pipe_lr = Pipeline([
   ("prep", preprocessor),
   ("clf", LogisticRegression(
       max_iter=1000, solver="saga", n_jobs=-1, class_weight="balanced"
   )),
])
param_lr = {
   "clf__C": [0.1, 0.5, 1.0, 2.0, 5.0],
   "clf__penalty": ["l1","l2"],  # saga รองรับ l1,l2
}
cv_lr = GridSearchCV(
   pipe_lr, param_grid=param_lr, cv=GroupKFold(n_splits=5),
   scoring="average_precision", n_jobs=-1, verbose=0
)
cv_lr.fit(X_train, y_train, clf__sample_weight=None, groups=g_train)
best_lr = cv_lr.best_estimator_
print("\nBest LR params:", cv_lr.best_params_, " CV(AP):", round(cv_lr.best_score_,4))


lr_res, lr_thr = report_model("LogReg", best_lr, X_train, y_train, X_val, y_val, X_test, y_test, tune_threshold=True)
print(lr_res)







Best LR params: {'clf__C': 5.0, 'clf__penalty': 'l1'}  CV(AP): 0.5962
        model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train  LogReg  0.786256   0.542828  0.541710  0.542268  0.786809    0.609055   
val    LogReg  0.801624   0.517857  0.491525  0.504348  0.771199    0.582585   
test   LogReg  0.781307   0.559259  0.553114  0.556169  0.813589    0.607523   

          brier       mcc    tp   fp    tn   fn  threshold  
train  0.183984  0.402843  1052  886  5481  890        0.6  
val    0.181923  0.380653    87   81   604   90        0.6  
test   0.179366  0.411093   151  119   710  122        0.6  


In [10]:
pipe_rf = Pipeline([
   ("prep", preprocessor),
   ("clf", RandomForestClassifier(
       n_estimators=400, n_jobs=-1, class_weight="balanced", random_state=42
   )),
])
param_rf = {
   "clf__max_depth": [None, 10, 20],
   "clf__min_samples_leaf": [1, 3, 5],
   "clf__max_features": ["sqrt", 0.5, None],
}
cv_rf = GridSearchCV(
   pipe_rf, param_grid=param_rf, cv=GroupKFold(n_splits=5),
   scoring="average_precision", n_jobs=-1, verbose=0
)
cv_rf.fit(X_train, y_train, groups=g_train)
best_rf = cv_rf.best_estimator_
print("\nBest RF params:", cv_rf.best_params_, " CV(AP):", round(cv_rf.best_score_,4))


rf_res, rf_thr = report_model("RandomForest", best_rf, X_train, y_train, X_val, y_val, X_test, y_test, tune_threshold=True)
print(rf_res)





Best RF params: {'clf__max_depth': None, 'clf__max_features': 'sqrt', 'clf__min_samples_leaf': 1}  CV(AP): 0.6203
              model  accuracy  precision    recall        f1   roc_auc  \
train  RandomForest  0.993982   0.974900  1.000000  0.987290  1.000000   
val    RandomForest  0.772622   0.463602  0.683616  0.552511  0.829634   
test   RandomForest  0.747731   0.492877  0.633700  0.554487  0.779067   

       pr_auc(AP)     brier       mcc    tp   fp    tn   fn  threshold  
train    1.000000  0.017714  0.983485  1942   50  6317    0       0.25  
val      0.614031  0.118739  0.421332   121  140   545   56       0.25  
test     0.575459  0.148896  0.388223   173  178   651  100       0.25  


In [11]:
pipe_mlp = Pipeline([
   ("prep", preprocessor),
   ("clf", MLPClassifier(
       hidden_layer_sizes=(128, ),
       activation="relu", solver="adam", max_iter=50, random_state=42,
       early_stopping=True, n_iter_no_change=5, validation_fraction=0.1
   )),
])
param_mlp = {
   "clf__hidden_layer_sizes": [(64,), (128,), (128,64)],
   "clf__alpha": [1e-4, 1e-3, 1e-2],
}
cv_mlp = GridSearchCV(
   pipe_mlp, param_grid=param_mlp, cv=GroupKFold(n_splits=5),
   scoring="average_precision", n_jobs=-1, verbose=0
)
cv_mlp.fit(X_train, y_train, groups=g_train)
best_mlp = cv_mlp.best_estimator_
print("\nBest MLP params:", cv_mlp.best_params_, " CV(AP):", round(cv_mlp.best_score_,4))


mlp_res, mlp_thr = report_model("MLP", best_mlp, X_train, y_train, X_val, y_val, X_test, y_test, tune_threshold=True)
print(mlp_res)



Best MLP params: {'clf__alpha': 0.001, 'clf__hidden_layer_sizes': (128, 64)}  CV(AP): 0.6232
      model  accuracy  precision    recall        f1   roc_auc  pr_auc(AP)  \
train   MLP  0.812492   0.578947  0.725026  0.643804  0.871259    0.733035   
val     MLP  0.790023   0.490741  0.598870  0.539440  0.809097    0.637196   
test    MLP  0.768603   0.528125  0.619048  0.569983  0.800921    0.595532   

          brier       mcc    tp    fp    tn   fn  threshold  
train  0.111069  0.524762  1408  1024  5343  534       0.25  
val    0.116764  0.408551   106   110   575   71       0.25  
test   0.146306  0.415492   169   151   678  104       0.25  


In [12]:
# ==== เลือกผู้ชนะจากผลบน validation (robust) ====
import pandas as pd


# ตรวจว่าบล็อกผลลัพธ์ใดรันสำเร็จแล้วบ้าง
present = {vn: isinstance(globals().get(vn), pd.DataFrame)
          for vn in ["dummy_res","lr_res","rf_res","mlp_res"]}
print("present result frames:", present)


frames = []
for label, varname in [
   ("Dummy",        "dummy_res"),
   ("LogReg",       "lr_res"),
   ("RandomForest", "rf_res"),
   ("MLP",          "mlp_res"),
]:
   df_var = globals().get(varname)
   if isinstance(df_var, pd.DataFrame) and "val" in df_var.index:
       frames.append(df_var.loc[["val"]].copy())


if not frames:
   raise RuntimeError(
       "No model validation results found. "
       "Make sure you ran the training cells that produce dummy_res/lr_res/rf_res/mlp_res."
   )


summary_val = pd.concat(frames, axis=0)
summary_val = summary_val.set_index("model", drop=True)


# ตัด Dummy ออก (ถ้ามี) แล้วเลือกผู้ชนะ โดยเน้น PR-AUC -> F1 -> Recall
summary_val_no_dummy = summary_val.drop(index="Dummy", errors="ignore")


# ถ้าเผลอลบจนว่าง (เช่นมีแต่ Dummy จริง ๆ) ให้ fallback กลับไปใช้ summary_val
candidates = summary_val_no_dummy if len(summary_val_no_dummy) > 0 else summary_val


winner_name = candidates.sort_values(
   by=["pr_auc(AP)", "f1", "recall"], ascending=False
).index[0]


print("\n== Validation Summary ==")
print(summary_val.loc[:, ["pr_auc(AP)", "f1", "recall"]].sort_values("pr_auc(AP)", ascending=False))


# map เฉพาะโมเดลที่มีจริง
winners = {}
if "LogReg"       in summary_val.index: winners["LogReg"]       = (best_lr, lr_thr)
if "RandomForest" in summary_val.index: winners["RandomForest"] = (best_rf, rf_thr)
if "MLP"          in summary_val.index: winners["MLP"]          = (best_mlp, mlp_thr)


best_model_template, best_thr_val = winners.get(winner_name, (None, None))
if best_model_template is None:
   raise RuntimeError(f"Winner '{winner_name}' not available in winners map. Check which models were trained.")




present result frames: {'dummy_res': True, 'lr_res': True, 'rf_res': True, 'mlp_res': True}

== Validation Summary ==
              pr_auc(AP)        f1    recall
model                                       
MLP             0.637196  0.539440  0.598870
RandomForest    0.614031  0.552511  0.683616
LogReg          0.582585  0.504348  0.491525
Dummy           0.207909  0.230971  0.248588


In [13]:
def concat_X(*dfs):
   return pd.concat(dfs, axis=0, ignore_index=True)


X_trval = concat_X(X_train, X_val)
y_trval = np.concatenate([y_train, y_val])


best_model_final = best_model_template
best_model_final.fit(X_trval, y_trval)


final_thr, _ = find_best_threshold(best_model_final, X_val, y_val, target_metric="f1")
final_res = get_scores(best_model_final, X_test, y_test, threshold=final_thr)


print(f"\n== FINAL on TEST using {winner_name} ==")
print(pd.DataFrame([final_res], index=["test"]))





== FINAL on TEST using MLP ==
      accuracy  precision    recall        f1   roc_auc  pr_auc(AP)     brier  \
test  0.783122      0.585  0.428571  0.494715  0.794912    0.597636  0.146171   

           mcc   tp  fp   tn   fn  threshold  
test  0.367884  117  83  746  156       0.35  


In [14]:
def print_full_report(name, clf, X, y, threshold):
   if hasattr(clf, "predict_proba"):
       s = clf.predict_proba(X)[:, 1]
   else:
       d = clf.decision_function(X)
       s = (d - d.min()) / (d.max() - d.min() + 1e-9)
   y_pred = (s >= threshold).astype(int)
   print(f"\n{name} classification_report:\n", classification_report(y, y_pred, digits=4))
   print("Confusion matrix [0,1]:\n", confusion_matrix(y, y_pred, labels=[0,1]))


print_full_report("BEST(TEST)", best_model_final, X_test, y_test, final_thr)





BEST(TEST) classification_report:
               precision    recall  f1-score   support

           0     0.8271    0.8999    0.8619       829
           1     0.5850    0.4286    0.4947       273

    accuracy                         0.7831      1102
   macro avg     0.7060    0.6642    0.6783      1102
weighted avg     0.7671    0.7831    0.7710      1102

Confusion matrix [0,1]:
 [[746  83]
 [156 117]]


In [15]:
def get_feature_names(prep: ColumnTransformer):
   names = []
   for name, trans, cols in prep.transformers_:
       if name == "num":
           # numeric columns
           names.extend(cols)
       elif name == "cat":
           ohe = trans
           try:
               names.extend(list(ohe.get_feature_names_out(cols)))
           except:
               names.extend(cols)
       elif name == "txt":
           tfidf = trans
           try:
               names.extend(list(tfidf.get_feature_names_out()))
           except:
               names.append("tfidf_features")
   return names


print("winner_name =", winner_name)
if winner_name == "LogReg":
   coefs = best_model_final.named_steps["clf"].coef_[0]
   feat_names = get_feature_names(best_model_final.named_steps["prep"])
   top_pos_idx = np.argsort(coefs)[-15:][::-1]
   top_neg_idx = np.argsort(coefs)[:15]
   print("\nTop + coefficients:")
   for i in top_pos_idx: print(feat_names[i], round(coefs[i], 4))
   print("\nTop - coefficients:")
   for i in top_neg_idx: print(feat_names[i], round(coefs[i], 4))


elif winner_name == "RandomForest":
   rf = best_model_final.named_steps["clf"]
   feat_names = get_feature_names(best_model_final.named_steps["prep"])
   importances = getattr(rf, "feature_importances_", None)
   if importances is not None:
       idx = np.argsort(importances)[-20:][::-1]
       print("\nTop 20 RF importances:")
       for i in idx: print(feat_names[i], round(importances[i], 4))


else:
   print(f"{winner_name} has no direct coefficients — running permutation importance...")
   r = permutation_importance(
       best_model_final, X_test, y_test,
       n_repeats=5, random_state=42, n_jobs=-1, scoring="average_precision"
   )
   base_names = list(X_test.columns)
   idx = np.argsort(r.importances_mean)[-15:][::-1]
   print("\nTop 15 permutation importances (input level):")
   for i in idx:
       print(base_names[i], round(r.importances_mean[i], 6))




%pip install openpyxl


# ==== PRELUDE: load + preprocess + split (run this BEFORE the Keras cell) ====
import os, glob
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupShuffleSplit


DATA_DIR = "/Users/wysuttida/pattern-project/mockdata_transaction"  # ปรับได้ถ้าตำแหน่งต่าง


def load_all_statements(data_dir=DATA_DIR):
   paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")) + glob.glob(os.path.join(data_dir, "*.xlsx")))
   if not paths:
       raise FileNotFoundError(f"ไม่พบไฟล์ใน {data_dir}")
   dfs = []
   for p in paths:
       ext = os.path.splitext(p)[1].lower()
       df = pd.read_excel(p, engine="openpyxl") if ext == ".xlsx" else pd.read_csv(p)
       df["file_id"] = os.path.basename(p)
       dfs.append(df)
   return pd.concat(dfs, ignore_index=True)


def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
   df = df.copy()
   # datetime
   df["tx_datetime"] = pd.to_datetime(df["tx_datetime"], errors="coerce")
   df = df.dropna(subset=["tx_datetime"]).sort_values(["file_id","tx_datetime"]).reset_index(drop=True)


   # split code/channel
   sp = df["code_channel_raw"].astype(str).str.split("/", n=1, expand=True)
   df["tx_code"] = sp[0].str.strip()
   df["channel"] = sp[1].str.strip() if sp.shape[1] > 1 else ""


   # numeric (ensure present)
   for col in ["debit_amount","credit_amount","balance_amount"]:
       if col not in df.columns:
           df[col] = 0.0
   for col in ["debit_amount","credit_amount","balance_amount"]:
       df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)


   # engineered
   df["net_amount"] = df["credit_amount"] - df["debit_amount"]
   df["abs_amount"] = df["debit_amount"].abs() + df["credit_amount"].abs()
   df["log1p_amount"] = np.log1p(df["abs_amount"])


   # time features
   dt = df["tx_datetime"]
   df["hour"] = dt.dt.hour
   df["dayofweek"] = dt.dt.dayofweek
   df["is_weekend"] = (df["dayofweek"] >= 5).astype(int)
   df["day"] = dt.dt.day
   df["month"] = dt.dt.month
   df["year"] = dt.dt.year


   # text & label
   df["description_text"] = df["description_text"].astype(str).fillna("")
   df["fraud_label"] = df["fraud_label"].astype(int)
   return df


# load + preprocess
df = preprocess_dataframe(load_all_statements(DATA_DIR))


# group split (80/10/10) by file_id
groups = df["file_id"].values
y_all = df["fraud_label"].values


gss = GroupShuffleSplit(n_splits=1, train_size=0.8, random_state=42)
train_idx, temp_idx = next(gss.split(df, y_all, groups=groups))


groups_temp = groups[temp_idx]
y_temp = y_all[temp_idx]
gss2 = GroupShuffleSplit(n_splits=1, train_size=0.5, random_state=42)
val_rel, test_rel = next(gss2.split(df.iloc[temp_idx], y_temp, groups=groups_temp))


val_idx = temp_idx[val_rel]
test_idx = temp_idx[test_rel]


def take(idx): return df.iloc[idx].reset_index(drop=True)


train_df = take(train_idx)
val_df   = take(val_idx)
test_df  = take(test_idx)


# y สำหรับประเมิน/คำนวณ threshold
y_train = train_df["fraud_label"].values
y_val   = val_df["fraud_label"].values
y_test  = test_df["fraud_label"].values


print("Split sizes:", len(train_df), len(val_df), len(test_df))


winner_name = MLP
MLP has no direct coefficients — running permutation importance...

Top 15 permutation importances (input level):
log1p_amount 0.33173
channel 0.223934
tx_code 0.200926
description_text 0.172914
debit_amount 0.088082
credit_amount 0.041641
balance_amount 0.032793
abs_amount 0.031147
hour 0.024019
net_amount 0.022721
is_weekend 0.007353
month 0.006993
dayofweek 0.003979
year -0.000762
day -0.01438
Note: you may need to restart the kernel to use updated packages.
Split sizes: 8309 862 1102


In [18]:
# ==== 16A) Fit external preprocessors & save artifacts ====
import os, json, joblib
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

OUT_DIR = "/Users/wysuttida/pattern-project/API-Statement-IntelliScan"
os.makedirs(OUT_DIR, exist_ok=True)

NUMERIC_FEATURES = [
    "debit_amount","credit_amount","balance_amount",
    "net_amount","abs_amount","log1p_amount",
    "hour","dayofweek","is_weekend","day","month","year"
]
TEXT_FEATURE = "description_text"

# 1) Numeric scaler
scaler = StandardScaler(with_mean=True, with_std=True)
scaler.fit(train_df[NUMERIC_FEATURES])

# 2) Categorical vocab (one-hot ภายนอก)
tx_vocab = sorted(train_df["tx_code"].astype(str).unique().tolist())
ch_vocab = sorted(train_df["channel"].astype(str).unique().tolist())
tx_index = {t:i for i,t in enumerate(tx_vocab)}
ch_index = {t:i for i,t in enumerate(ch_vocab)}

# 3) Text TF-IDF (ตั้งค่าให้ใกล้เคียงของเดิม)
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    max_features=5000,
    min_df=5,
    max_df=0.95,
    strip_accents="unicode"
)
tfidf.fit(train_df[TEXT_FEATURE].astype(str))

# Save preprocessors
joblib.dump(scaler, f"{OUT_DIR}/pre_scaler.joblib")
joblib.dump(tfidf,  f"{OUT_DIR}/pre_tfidf.joblib")
with open(f"{OUT_DIR}/pre_categ_vocab.json", "w") as f:
    json.dump({"tx_vocab": tx_vocab, "ch_vocab": ch_vocab}, f, ensure_ascii=False, indent=2)

print("[OK] Saved scaler/tfidf/vocabs to", OUT_DIR)


[OK] Saved scaler/tfidf/vocabs to /Users/wysuttida/pattern-project/API-Statement-IntelliScan


In [19]:
# ==== 16B) Transform DataFrames to feature matrices (numeric + one-hot + tfidf) ====
import numpy as np
import pandas as pd
import scipy.sparse as sp

def _one_hot_from_vocab(series_str: pd.Series, index_map: dict, vocab_size: int):
    """ สร้าง one-hot (CSR) จาก vocab ที่ฟิตไว้ (ตัวนอก vocab จะเป็นแถว zero) """
    arr = series_str.astype(str).map(index_map).to_numpy()
    N = len(arr)
    rows = np.arange(N, dtype=np.int64)
    mask = ~pd.isna(arr)
    cols = arr[mask].astype(np.int64)
    data = np.ones(mask.sum(), dtype=np.float32)
    return sp.csr_matrix((data, (rows[mask], cols)), shape=(N, vocab_size), dtype=np.float32)

def transform_df_to_X(df: pd.DataFrame, scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab):
    # numeric → scale → csr
    X_num = scaler.transform(df[NUMERIC_FEATURES]).astype(np.float32)
    X_num = sp.csr_matrix(X_num)

    # categorical → one-hot csr
    X_tx = _one_hot_from_vocab(df["tx_code"].astype(str), tx_index, len(tx_vocab))
    X_ch = _one_hot_from_vocab(df["channel"].astype(str), ch_index, len(ch_vocab))

    # text → tfidf csr
    X_txt = tfidf.transform(df[TEXT_FEATURE].astype(str)).astype(np.float32)

    # hstack → csr
    X = sp.hstack([X_num, X_tx, X_ch, X_txt], format="csr", dtype=np.float32)
    return X

# แปลง train/val/test
X_train_ext = transform_df_to_X(train_df, scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)
X_val_ext   = transform_df_to_X(val_df,   scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)
X_test_ext  = transform_df_to_X(test_df,  scaler, tfidf, tx_index, ch_index, tx_vocab, ch_vocab)

print("Shapes:",
      "train", X_train_ext.shape,
      "val",   X_val_ext.shape,
      "test",  X_test_ext.shape)


Shapes: train (8309, 201) val (862, 201) test (1102, 201)


In [20]:
# ==== 16C) Keras model (no preprocessing inside) + save .h5 ====
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, average_precision_score

# แปลงเป็น dense (ถ้าเมมไม่พอ ลดฟีเจอร์ก่อน)
Xtr = X_train_ext.toarray()
Xva = X_val_ext.toarray()
Xte = X_test_ext.toarray()

# Labels
y_tr = y_train.astype(np.float32)
y_va = y_val.astype(np.float32)
y_te = y_test.astype(np.float32)

# Class weights → sample_weight (เหมือนเดิม)
classes = np.array([0,1])
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
w0, w1 = float(cw[0]), float(cw[1])
sw_tr = np.where(y_tr == 1, w1, w0).astype(np.float32)

# Build model (input = เวกเตอร์ฟีเจอร์รวม)
inp = layers.Input(shape=(Xtr.shape[1],), name="X")
x = layers.Dense(128, activation="relu")(inp)
x = layers.Dropout(0.2)(x)
out = layers.Dense(1, activation="sigmoid")(x)
model = Model(inp, out)
model.compile(optimizer="adam", loss="binary_crossentropy",
              metrics=[tf.keras.metrics.AUC(name="auc"),
                       tf.keras.metrics.Precision(name="precision"),
                       tf.keras.metrics.Recall(name="recall")])

# Train
es = tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True, monitor="val_auc", mode="max")
history = model.fit(
    Xtr, y_tr,
    validation_data=(Xva, y_va),
    epochs=20,
    batch_size=512,
    sample_weight=sw_tr,
    callbacks=[es],
    verbose=0,
)

# เลือก threshold จาก validation เพื่อ maximize F1
val_scores = model.predict(Xva, verbose=0).ravel()
ths = np.linspace(0.05, 0.95, 19)
def f1_at(t):
    return f1_score(y_va, (val_scores >= t).astype(int), zero_division=0)
best_thr = float(ths[np.argmax([f1_at(t) for t in ths])])

# ประเมินบน test
test_scores = model.predict(Xte, verbose=0).ravel()
pred_test = (test_scores >= best_thr).astype(int)
final_metrics = {
    "accuracy": float((pred_test == y_te).mean()),
    "precision": float(precision_score(y_te, pred_test, zero_division=0)),
    "recall": float(recall_score(y_te, pred_test, zero_division=0)),
    "f1": float(f1_score(y_te, pred_test, zero_division=0)),
    "roc_auc": float(roc_auc_score(y_te, test_scores)),
    "pr_auc(AP)": float(average_precision_score(y_te, test_scores)),
    "threshold": best_thr,
}
print("Test metrics:", final_metrics)

# เซฟเป็น .h5 ได้แล้ว (ไม่มี StringLookup/TextVectorization ภายในโมเดล)
model.save(f"{OUT_DIR}/model.h5")
with open(f"{OUT_DIR}/model_meta.json", "w") as f:
    json.dump({"threshold": best_thr, "metrics_test": final_metrics}, f, ensure_ascii=False, indent=2)

print("[OK] Saved model.h5 and model_meta.json at", OUT_DIR)


Test metrics: {'accuracy': 0.7577132486388385, 'precision': 0.5090361445783133, 'recall': 0.6190476190476191, 'f1': 0.5586776859504132, 'roc_auc': 0.8074735879319714, 'pr_auc(AP)': 0.6014701791949221, 'threshold': 0.5499999999999999}
[OK] Saved model.h5 and model_meta.json at /Users/wysuttida/pattern-project/API-Statement-IntelliScan
