In [1]:
import pandas as pd
from glob import glob

In [2]:
# NOTE(review): absolute, machine-specific path — point this at the local copy of the data.
folder = "C:/Users/Noon/Documents/DSDE/projectTraffy/1/test_data"

# Only the part-*.csv data files are read; anything else in the folder (e.g. .crc) is ignored.
# glob() returns matches in arbitrary order, so sort them to keep the row order
# of the concatenated frame reproducible from run to run.
files = sorted(glob(f"{folder}/part-*.csv"))
if not files:
    # Fail here, naming the folder, instead of inside pd.concat on an empty list.
    raise FileNotFoundError(f"No part-*.csv files found in {folder!r}")

dfs = [pd.read_csv(f) for f in files]

# Stack every part into a single frame with a fresh 0..n-1 index.
test_data = pd.concat(dfs, ignore_index=True)

In [3]:
# Select the distance columns by their full "dist_" prefix (underscore included).
# add_relevant_cols rewrites exactly that prefix, so a looser "dist" match could pick up
# an unrelated column (such as "district") whose name would then collide in the join.
org_dist_cols = [col for col in test_data.columns if col.startswith("dist_")]
print(len(org_dist_cols))
print(org_dist_cols)

1529
['dist_1111_ศูนย์รับเรื่องราวร้องทุกข์ของรัฐบาล', 'dist_BTS', 'dist_Bangkok_Smart_Lighting', 'dist_Bangkok_Smart_Lighting__สำนักการโยธา_กทม__', 'dist_CAT_TELECOM__Official_', 'dist_CU_iHouse___CU_Terrace', 'dist_Chula_TUN_JAI_สำนักงานจัดการทรัพย์สิน', 'dist_Chula_TUN_JAI_สำนักบริหารระบบกายภาพ', 'dist_Komgrip', 'dist_MEA', 'dist_MEA_บางขุนเทียน_บำรุงรักษาเสาสาย', 'dist_MEA_บางขุนเทียน_หม้อแปลงฯ', 'dist_MEA_บางขุนเทียน_แก้ไฟฟ้าขัดข้อง', 'dist_MEA_บางขุนเทียน_ไฟฟ้าสาธารณะ', 'dist_MRTA', 'dist_NECTEC', 'dist_NT_ฝ่ายธุรกิจบริการดิจิทัล_ส่วนพันธมิตรบริการดิจิทัล', 'dist_PTT_HO', 'dist_SCGC', 'dist_TOT_smartcity', 'dist_The_line_Jatujak_Mochit__นิติ_', 'dist__ติดตามคำขออนุญาต__ทีมงานผู้ว่าราชการกรุงเทพมหานคร', 'dist_iACO_Bangkokเครือข่ายปปช_ภาคประชาสังคม', 'dist_กกจ_1_ฝกจ_1', 'dist_กกจ_2_ฝกจ_1', 'dist_กกจ_3_ฝกจ_1', 'dist_กกจ_4_ฝกจ_1', 'dist_กกต', 'dist_กทม', 'dist_กผ3_สวจ__สจส__กทม_', 'dist_กฟน_', 'dist_กรมการขนส่งทางบก', 'dist_กรมการขนส่งทางราง', 'dist_กรมการค้าภายใน', 'dist_กรมการปกครอ

In [4]:
def add_relevant_cols(df, org_dist_cols):
    """Return ``df`` joined with one 0/1 ``relevant_*`` flag per listed column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``org_dist_cols``.
    org_dist_cols : list of str
        Columns to derive the flags from.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df`` plus the integer flag columns. ``df`` itself is not
        modified. A flag is 1 where the source value is > 0 and 0 otherwise
        (NaN compares as not greater than 0, so it yields 0).

    Raises
    ------
    ValueError
        Raised by ``DataFrame.join`` when a generated flag name already exists
        in ``df`` (for example when the function is applied twice).
    """
    prefix = "dist_"
    relevant = (df[org_dist_cols] > 0).astype(int)
    # Swap only the leading "dist_" for "relevant_". str.replace would also rewrite
    # any later "dist_" inside the name. A name without the prefix simply gets
    # "relevant_" prepended so it cannot collide with its source column.
    relevant.columns = [
        "relevant_" + (col[len(prefix):] if col.startswith(prefix) else col)
        for col in org_dist_cols
    ]
    return df.join(relevant)

In [5]:
# Attach the 0/1 relevant_* flags, then raise every value below 0 in the
# original distance columns up to 0.
flagged = add_relevant_cols(test_data, org_dist_cols)
flagged[org_dist_cols] = flagged[org_dist_cols].clip(lower=0)
test_data = flagged

In [None]:
from sklearn.cluster import KMeans
import numpy as np
# --- Time features ---------------------------------------------------------
# Parse the raw timestamp once, then derive the calendar features from it.
test_data["created_at"] = pd.to_datetime(test_data["timestamp"])

test_data["month"]      = test_data["created_at"].dt.month.astype("int16")
test_data["dayofweek"]  = test_data["created_at"].dt.dayofweek.astype("int8")  # 0=Mon
test_data["hour"]       = test_data["created_at"].dt.hour.astype("int8")
test_data["is_weekend"] = test_data["dayofweek"].isin([5, 6]).astype("int8")

# --- Region feature --------------------------------------------------------
# Fit KMeans on the test_data coordinates (rows with a missing coordinate are left out of the fit).
# NOTE(review): this fits a fresh clustering on the evaluation rows. Cluster ids from a
# new fit are not guaranteed to match the regions the saved model was trained on —
# confirm whether the training-time KMeans should be loaded and reused here instead.
coords_train = test_data[["latitude", "longitude"]].dropna()

kmeans = KMeans(n_clusters=50, random_state=42)
kmeans.fit(coords_train)

# Assign a region id to every row.
# NOTE(review): rows with a missing coordinate are excluded from the fit above but are
# still passed to predict — confirm the data contains none.
test_data["region"] = kmeans.predict(test_data[["latitude", "longitude"]])
# print(test_data["region"].value_counts().sort_index())

# Drop the raw columns only now that every feature derived from them exists.
# (Dropping them before the clustering step made the coordinate lookups fail.)
test_data = test_data.drop(columns=["timestamp", "created_at", "latitude", "longitude"])

# One-hot encode region
test_data = pd.get_dummies(test_data, columns=["region"], prefix="region")

# --- Target ----------------------------------------------------------------
# Binary label: 1 when the completion time is at most 7 days (expressed in hours).
hours_7 = 7 * 24
test_data["fast_7d"]  = (test_data["completion_time_hours"] <= hours_7).astype(int)

# --- Feature matrix / label vector -----------------------------------------
# Every column is a feature except the target and the other target-derived columns.
feature_cols = [col for col in test_data.columns if col not in ["completion_time_hours", "log_completion_hours", "time_bucket", "fast_7d"]]

X_test = test_data[feature_cols]
y_test = test_data["fast_7d"]


In [9]:
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
)

# Load the persisted model bundle; the classifier is stored under the "model" key.
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary code —
# only load artifacts that come from a trusted source.
package = joblib.load("fast_7day_model.joblib")
clf_bin = package["model"]

# Column 1 of predict_proba is used as the score for the "fast" class.
# NOTE(review): this assumes clf_bin.classes_ is [0, 1] — confirm against the saved model.
y_proba = clf_bin.predict_proba(X_test)[:, 1]
y_pred  = clf_bin.predict(X_test)

# Fixed label order for the report and the confusion matrix, so both keep the same
# two-class layout even when y_test / y_pred happen to contain a single class.
class_labels = [0, 1]

# 6) Metrics
acc = accuracy_score(y_test, y_pred)
prec, rec, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average="binary", zero_division=0
)

# ROC-AUC is undefined when y_test holds only one class; report None in that case.
try:
    auc = roc_auc_score(y_test, y_proba)
except ValueError:
    auc = None

print("\n=== Binary ≤7 days vs >7 days ===")
print("Accuracy:", acc)
print("Precision (fast<=7d):", prec)
print("Recall (fast<=7d):", rec)
print("F1 (fast<=7d):", f1)
print("ROC-AUC:", auc)

print("\nClassification report:")
print(classification_report(
    y_test,
    y_pred,
    labels=class_labels,
    target_names=[">7 days", "≤7 days"],
    zero_division=0
))

print("Confusion matrix (rows=true, cols=pred):")
print(confusion_matrix(y_test, y_pred, labels=class_labels))

# 7) Simple baseline: always predict majority class
majority_class = y_test.mode()[0]
y_baseline = np.full_like(y_test, majority_class)

acc_base = accuracy_score(y_test, y_baseline)
print("\nBaseline (always predict class", majority_class, ") accuracy:", acc_base)


=== Binary ≤7 days vs >7 days ===
Accuracy: 0.7359269316218218
Precision (fast<=7d): 0.6994835377663008
Recall (fast<=7d): 0.7714004522991875
F1 (fast<=7d): 0.733683854135548
ROC-AUC: 0.8170167422014905

Classification report:
              precision    recall  f1-score   support

     >7 days       0.78      0.70      0.74     53519
     ≤7 days       0.70      0.77      0.73     47756

    accuracy                           0.74    101275
   macro avg       0.74      0.74      0.74    101275
weighted avg       0.74      0.74      0.74    101275

Confusion matrix (rows=true, cols=pred):
[[37692 15827]
 [10917 36839]]

Baseline (always predict class 0 ) accuracy: 0.5284522340162923
