# 06 – Breach Prediction Modeling (with LLM-Inspired Features)

ในโน้ตบุ๊กนี้เราจะสร้าง **โมเดลทำนายว่าตั๋ว Traffy จะ breach SLA หรือไม่**  
โดยใช้ข้อมูลจาก:

- ข้อมูลพื้นฐานของเคส (`std_type`, `urgency`, เขต/แขวง, เวลาแจ้ง ฯลฯ)
- ข้อมูลสภาพอากาศที่เรา scrape มา (`temp_high`, `humidity_high`, ...)
- **LLM-inspired reasoning features** จากโน้ตบุ๊ก 05  
  - `rule_risk_score_raw` (คะแนนความเสี่ยงจาก rule-based reasoning)
  - `rule_risk_level` (low / medium / high)
  - `rule_risk_reason` (ข้อความอธิบายเหตุผล)

โครงหลัก:
1. Load & ตรวจข้อมูล
2. เลือก feature + เตรียมชุด train/test
3. สร้าง `ColumnTransformer + Pipeline`
4. เทรน Logistic Regression (baseline)
5. เทรน RandomForest...
6. Evaluate โมเดล
7. ใช้`Best`โมเดลกับ `df_training and df_prediction` เพื่อทำนาย pending tickets
8. Export ผลลัพธ์สำหรับใช้ต่อ (dashboard / report)

---


In [189]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    classification_report,
    roc_auc_score,
    confusion_matrix,
    precision_recall_fscore_support,
    accuracy_score, 
    precision_score, 
    recall_score, 
    f1_score

)

from sklearn.inspection import permutation_importance

# ถ้าไม่มีให้ลอง pip install ก่อนใน terminal
try:
    from xgboost import XGBClassifier
    HAS_XGB = True
except ImportError:
    HAS_XGB = False
    print("⚠️ xgboost not installed, skip XGB model")

try:
    from lightgbm import LGBMClassifier
    HAS_LGBM = True
except ImportError:
    HAS_LGBM = False
    print("⚠️ lightgbm not installed, skip LGBM model")

df_training = pd.read_csv('../data/traffy_completed_for_model_with_weather_and_llm.csv')
df_prediction = pd.read_csv('../data/traffy_pending_for_predict_with_weather_and_llm.csv')

df_training.shape, df_prediction.shape


((111210, 27), (38790, 26))

In [190]:
# 2) Inspect

print("Training columns:", df_training.columns.tolist())
print("Prediction columns:", df_prediction.columns.tolist())

print("\nTraining breach distribution:")
print(df_training["breach"].value_counts(normalize=True).rename("proportion"))


Training columns: ['ticket_id', 'std_type', 'district', 'subdistrict', 'province_clean', 'urgency', 'comment', 'comment_length', 'day_of_week', 'hour_of_day', 'day', 'month', 'year', 'lng', 'lat', 'SLA_days', 'breach', 'temp_high', 'temp_low', 'humidity_high', 'humidity_low', 'pressure_high', 'pressure_low', 'comment_clean', 'rule_risk_score_raw', 'rule_risk_reason', 'rule_risk_level']
Prediction columns: ['ticket_id', 'std_type', 'district', 'subdistrict', 'province_clean', 'urgency', 'comment', 'comment_length', 'day_of_week', 'hour_of_day', 'day', 'month', 'year', 'lng', 'lat', 'SLA_days', 'temp_high', 'temp_low', 'humidity_high', 'humidity_low', 'pressure_high', 'pressure_low', 'comment_clean', 'rule_risk_score_raw', 'rule_risk_reason', 'rule_risk_level']

Training breach distribution:
breach
0    0.741012
1    0.258988
Name: proportion, dtype: float64


In [191]:
# 3) Define Feature Set & Target

TARGET_COL = "breach"

FEATURE_COLS = [
    "std_type",
    "district",
    "subdistrict",
    "province_clean",
    "urgency",
    "comment_length",
    "day_of_week",
    "hour_of_day",
    "day",
    "month",
    "year",
    "lng",
    "lat",
    "SLA_days",
    "temp_high",
    "temp_low",
    "humidity_high",
    "humidity_low",
    "pressure_high",
    "pressure_low",
    "rule_risk_score_raw",
    "rule_risk_level",
]

CATEGORICAL_COLS = [
    "std_type",
    "district",
    "subdistrict",
    "province_clean",
    "urgency",
    "rule_risk_level",
    "day_of_week",   
    "month",        
]

NUMERIC_COLS = [
    col for col in FEATURE_COLS
    if col not in CATEGORICAL_COLS
]

print("Categorical:", CATEGORICAL_COLS)
print("Numeric:", NUMERIC_COLS)


Categorical: ['std_type', 'district', 'subdistrict', 'province_clean', 'urgency', 'rule_risk_level', 'day_of_week', 'month']
Numeric: ['comment_length', 'hour_of_day', 'day', 'year', 'lng', 'lat', 'SLA_days', 'temp_high', 'temp_low', 'humidity_high', 'humidity_low', 'pressure_high', 'pressure_low', 'rule_risk_score_raw']


In [192]:
# 4) Train/Test Split

X = df_training[FEATURE_COLS].copy()
y = df_training[TARGET_COL].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

X_train.shape[0], X_test.shape[0]


(88968, 22242)

In [193]:
# 5) Preprocessing Pipeline (ColumnTransformer)

numeric_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, NUMERIC_COLS),
        ("cat", categorical_transformer, CATEGORICAL_COLS),
    ]
)


In [194]:
def evaluate_model(name, model, X_train, X_test, y_train, y_test):
    """
    train model ที่รับเข้ามา (ต้องเป็น Pipeline แล้ว)
    คืน dict ของ metrics ต่างๆ
    """
    print(f"\n=== {name} — Train & Evaluate ===")
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    print("\nClassification report:")
    print(classification_report(y_test, y_pred, digits=4))

    roc = roc_auc_score(y_test, y_proba)
    print("ROC-AUC:", roc)

    # คำนวณ precision / recall / f1 ของ class 1
    p1, r1, f1, _ = precision_recall_fscore_support(
        y_test, y_pred, average=None, labels=[1]
    )
    metrics = {
        "model": name,
        "roc_auc": roc,
        "precision_1": p1[0],
        "recall_1": r1[0],
        "f1_1": f1[0],
    }
    return metrics, model


In [195]:
results = []
trained_models = {}

# 4.1 Logistic Regression (baseline)
log_reg_clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LogisticRegression(
            max_iter=1000,
            class_weight="balanced",
            n_jobs=-1,
        )),
    ]
)

metrics_lr, model_lr = evaluate_model(
    "Logistic Regression", log_reg_clf, X_train, X_test, y_train, y_test
)
results.append(metrics_lr)
trained_models["logreg"] = model_lr

# 4.2 RandomForest
rf_clf = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", RandomForestClassifier(
            n_estimators=300,
            max_depth=None,
            min_samples_split=10,
            min_samples_leaf=5,
            class_weight="balanced_subsample",
            n_jobs=-1,
            random_state=42,
        )),
    ]
)

metrics_rf, model_rf = evaluate_model(
    "Random Forest", rf_clf, X_train, X_test, y_train, y_test
)
results.append(metrics_rf)
trained_models["rf"] = model_rf

if HAS_XGB:
    # ช่วยบาลานซ์ class
    pos_weight = (y_train == 0).sum() / (y_train == 1).sum()

    xgb_clf = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("model", XGBClassifier(
                n_estimators=300,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                objective="binary:logistic",
                eval_metric="logloss",
                scale_pos_weight=pos_weight,
                n_jobs=-1,
                random_state=42,
                tree_method="hist",
            )),
        ]
    )

    metrics_xgb, model_xgb = evaluate_model(
        "XGBoost", xgb_clf, X_train, X_test, y_train, y_test
    )
    results.append(metrics_xgb)
    trained_models["xgb"] = model_xgb

if HAS_LGBM:
    lgbm_clf = Pipeline(
        steps=[
            ("preprocess", preprocess),
            ("model", LGBMClassifier(
                n_estimators=400,
                learning_rate=0.05,
                max_depth=-1,
                num_leaves=63,
                subsample=0.8,
                colsample_bytree=0.8,
                class_weight="balanced",
                random_state=42,
                n_jobs=-1,
            )),
        ]
    )

    metrics_lgbm, model_lgbm = evaluate_model(
        "LightGBM", lgbm_clf, X_train, X_test, y_train, y_test
    )
    results.append(metrics_lgbm)
    trained_models["lgbm"] = model_lgbm

# สรุปผลทุกโมเดล
df_results = pd.DataFrame(results).sort_values("roc_auc", ascending=False)
df_results



=== Logistic Regression — Train & Evaluate ===


  return _ForkingPickler.loads(res)



Classification report:
              precision    recall  f1-score   support

           0     0.8454    0.5953    0.6986     16482
           1     0.3728    0.6884    0.4837      5760

    accuracy                         0.6194     22242
   macro avg     0.6091    0.6418    0.5912     22242
weighted avg     0.7230    0.6194    0.6430     22242

ROC-AUC: 0.6976287947542099

=== Random Forest — Train & Evaluate ===

Classification report:
              precision    recall  f1-score   support

           0     0.8450    0.7299    0.7833     16482
           1     0.4439    0.6170    0.5163      5760

    accuracy                         0.7007     22242
   macro avg     0.6445    0.6735    0.6498     22242
weighted avg     0.7412    0.7007    0.7141     22242

ROC-AUC: 0.7495466487430732

=== XGBoost — Train & Evaluate ===

Classification report:
              precision    recall  f1-score   support

           0     0.8593    0.6596    0.7463     16482
           1     0.4150    0.69




Classification report:
              precision    recall  f1-score   support

           0     0.8614    0.6605    0.7477     16482
           1     0.4173    0.6958    0.5217      5760

    accuracy                         0.6696     22242
   macro avg     0.6393    0.6782    0.6347     22242
weighted avg     0.7464    0.6696    0.6892     22242

ROC-AUC: 0.752178649857083


Unnamed: 0,model,roc_auc,precision_1,recall_1,f1_1
3,LightGBM,0.752179,0.417326,0.695833,0.521739
1,Random Forest,0.749547,0.443917,0.617014,0.516345
2,XGBoost,0.745803,0.415016,0.690972,0.518567
0,Logistic Regression,0.697629,0.372826,0.688368,0.483684


In [196]:
# 1. สร้าง best model pipeline (ใช้ LightGBM)
best_model = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", LGBMClassifier(
            n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            class_weight="balanced"
        )),
    ]
)

# 2. เทรนบนทั้ง training set
best_model.fit(X, y)

[LightGBM] [Info] Number of positive: 28802, number of negative: 82408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1436
[LightGBM] [Info] Number of data points in the train set: 111210, number of used features: 270
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [197]:
# ใช้ X_train, X_test, y_train, y_test เดิม
y_proba = best_model.predict_proba(X_test)[:, 1]

print("ROC-AUC:", roc_auc_score(y_test, y_proba))

ROC-AUC: 0.7997433384820478




In [198]:
thresholds = np.linspace(0.1, 0.9, 17) 
rows = []

for th in thresholds:
    y_pred_th = (y_proba >= th).astype(int)
    cm = confusion_matrix(y_test, y_pred_th)
    tn, fp, fn, tp = cm.ravel()
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall    = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1        = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    rows.append({"threshold": th, "precision": precision, "recall": recall, "f1": f1})

threshold_df = pd.DataFrame(rows)
threshold_df


Unnamed: 0,threshold,precision,recall,f1
0,0.1,0.28222,0.999479,0.440154
1,0.15,0.290021,0.997049,0.449339
2,0.2,0.300373,0.993576,0.46129
3,0.25,0.315155,0.985243,0.477553
4,0.3,0.332043,0.968056,0.49448
5,0.35,0.351829,0.946701,0.513006
6,0.4,0.375986,0.910069,0.532129
7,0.45,0.406513,0.853819,0.55079
8,0.5,0.446034,0.774132,0.565971
9,0.55,0.494979,0.667535,0.568451


In [199]:
# เลือก threshold ที่ให้ f1 score สูงที่สุด
best_row = threshold_df.loc[threshold_df['f1'].idxmax()]
print(best_row["threshold"])

0.55


In [200]:
best_th = best_row['threshold']

y_pred_best = (y_proba >= best_th).astype(int)

print(classification_report(y_test, y_pred_best, digits=4))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_best))


              precision    recall  f1-score   support

           0     0.8677    0.7620    0.8114     16482
           1     0.4950    0.6675    0.5685      5760

    accuracy                         0.7375     22242
   macro avg     0.6813    0.7148    0.6899     22242
weighted avg     0.7712    0.7375    0.7485     22242

Confusion Matrix:
 [[12559  3923]
 [ 1915  3845]]


In [201]:
best_clf = LGBMClassifier(
    n_estimators=500,
            learning_rate=0.05,
            max_depth=-1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42,
            class_weight="balanced"
)

best_clf_pipe = Pipeline(
    steps=[
        ("preprocess", preprocess),
        ("model", best_clf),
    ]
)

best_clf_pipe.fit(X, y)   # train ทั้งหมด


[LightGBM] [Info] Number of positive: 28802, number of negative: 82408
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001216 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1436
[LightGBM] [Info] Number of data points in the train set: 111210, number of used features: 270
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=-0.000000
[LightGBM] [Info] Start training from score -0.000000


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,boosting_type,'gbdt'
,num_leaves,31
,max_depth,-1
,learning_rate,0.05
,n_estimators,500
,subsample_for_bin,200000
,objective,
,class_weight,'balanced'
,min_split_gain,0.0
,min_child_weight,0.001


In [202]:
train_proba   = best_clf_pipe.predict_proba(df_training[FEATURE_COLS])[:, 1]
pending_proba = best_clf_pipe.predict_proba(df_prediction[FEATURE_COLS])[:, 1]

df_training_with_prob  = df_training.copy()
df_prediction_with_prob = df_prediction.copy()

df_training_with_prob["breach_proba"] = train_proba
df_training_with_prob["breach_pred"]  = (train_proba >= best_th).astype(int)

df_prediction_with_prob["breach_proba"] = pending_proba
df_prediction_with_prob["breach_pred"]  = (pending_proba >= best_th).astype(int)




In [203]:
def evaluate_training_predictions(df):
    y_true = df['breach']
    y_pred = df['breach_pred']

    # === Metrics ===
    acc = accuracy_score(y_true, y_pred)
    precision_0 = precision_score(y_true, y_pred, pos_label=0)
    recall_0 = recall_score(y_true, y_pred, pos_label=0)
    f1_0 = f1_score(y_true, y_pred, pos_label=0)

    precision_1 = precision_score(y_true, y_pred, pos_label=1)
    recall_1 = recall_score(y_true, y_pred, pos_label=1)
    f1_1 = f1_score(y_true, y_pred, pos_label=1)

    cm = confusion_matrix(y_true, y_pred)
    
    results = {
        "accuracy": acc,
        "precision_0": precision_0,
        "recall_0": recall_0,
        "f1_0": f1_0,
        "precision_1": precision_1,
        "recall_1": recall_1,
        "f1_1": f1_1,
        "confusion_matrix": cm
    }

    return pd.Series(results)

evaluate_training_predictions(df_training_with_prob)

accuracy                                   0.739763
precision_0                                0.867779
recall_0                                   0.765435
f1_0                                       0.813401
precision_1                                0.498196
recall_1                                   0.666308
f1_1                                       0.570117
confusion_matrix    [[63078, 19330], [9611, 19191]]
dtype: object

In [204]:
SAVE_COLS = [
    "ticket_id",
    "std_type",
    "district",
    "subdistrict",
    "province_clean",
    "comment_clean",
    "comment_length",
    "day_of_week",
    "hour_of_day",
    "day",
    "month",
    "year",
    "lng",
    "lat",
    "urgency",
    "SLA_days",
    "temp_high",
    "temp_low",
    "humidity_high",
    "humidity_low",
    "pressure_high",
    "pressure_low",
    "rule_risk_score_raw",
    "rule_risk_level",
    "rule_risk_reason",
    "breach_proba",
    "breach_pred"
]

df_training_with_prob[SAVE_COLS + ["breach"]].to_csv(
    "../data/final-traffy_completed_with_predictions.csv",
    index=False,
)

df_prediction_with_prob[SAVE_COLS].to_csv(
    "../data/final-traffy_pending_with_predictions.csv",
    index=False,
)

df_training_with_prob[SAVE_COLS + ["breach"]].sample(100,random_state=42).to_csv(
    "../data_samples/05-final-traffy_completed_with_predictions.csv",
    index=False,
)

df_prediction_with_prob[SAVE_COLS].sample(100,random_state=42).to_csv(
    "../data_samples/05-final-traffy_pending_with_predictions.csv",
    index=False,
)