## LSTM + GBDT for sequence-based prediction
### Goal: Learn temporal patterns with LSTM, feed 64-D embeddings to GBDT for final prediction.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deep-cache-dataset/y_dataset1_window.npy
/kaggle/input/deep-cache-dataset/syntheticDataset_O50_properties.csv
/kaggle/input/deep-cache-dataset/syntheticDataset_O50.csv
/kaggle/input/deep-cache-dataset/X_dataset1_window.npy
/kaggle/input/ttl-dataset/request_data.csv
/kaggle/input/ttl-dataset/pred_lambda.csv


In [2]:
# Use GPU if available, otherwise fall back to CPU
import tensorflow as tf

device = "/GPU:0" if tf.config.list_physical_devices('GPU') else "/CPU:0"
print(device)

2025-11-03 12:58:49.197380: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1762174729.427179      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1762174729.493110      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


/CPU:0


2025-11-03 12:59:04.018988: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)


In [3]:
# Load data
df = pd.read_csv('/kaggle/input/deep-cache-dataset/syntheticDataset_O50.csv') 
df

Unnamed: 0,object_ID,request_time
0,5,2.961472
1,25,3.274127
2,2,3.785475
3,2,4.455687
4,4,5.288994
...,...,...
292141,39,304330.451276
292142,39,304352.296649
292143,39,304405.469075
292144,39,304442.964190


## Split Data
- To align with the paper's goal which is predicting the future characteristics of an object based on past logs, we split the dataset into 60% for training and 40% for evaluation.

In [4]:
train_cut = int(len(df) * 0.7)
val_cut = int(len(df) * 0.85)

train_df = df.iloc[:train_cut].reset_index(drop=True)
val_df   = df.iloc[train_cut:val_cut].reset_index(drop=True)
test_df  = df.iloc[val_cut:].reset_index(drop=True)

## Feature Engineering
> "For dataset 1, the probability of object $o^i$ is calculated as $Ni$ /1000, where $N^i$ represents the number of occurrences of $o^i$ in the window of past 1K objects."
- Using step=1 generated too much data, so we used step=100 to make it manageable.

In [5]:
window_size = 1000
step = 50
m, k = 20, 10 # Sequence length for input(m) and output(k)

object_ids = df['object_ID'].unique()
object_ids.sort()
num_objects = len(object_ids) # Number of unique objects: 50

In [6]:
# Generate training data for sequence-to-sequence modeling from object request logs
def build_sequence_input(df, object_ids, m, k, window_size, step):
    X, y = [], []
    for i in range(0, len(df) - window_size * (m + k), step):
        seq = df['object_ID'].iloc[i : i + window_size * (m + k)]
        x_seq, y_seq = [], []

        # Build the input sequence: m windows of past requests
        for j in range(m):
            window = seq[j * window_size : (j + 1) * window_size]
            counts = window.value_counts(normalize=True).reindex(object_ids, fill_value=0).values
            x_seq.append(counts)

        # Build the output sequence: k windows of future requests
        for j in range(k):
            window = seq[(m + j) * window_size : (m + j + 1) * window_size]
            counts = window.value_counts(normalize=True).reindex(object_ids, fill_value=0).values
            y_seq.append(counts)

        X.append(x_seq)
        y.append(y_seq)

    X = np.array(X) # (#samples, 20, d)
    y = np.array(y) # (#samples, 26, d)
    X = X.transpose(0, 2, 1).reshape(-1, m, 1)
    y = y.transpose(0, 2, 1).reshape(-1, k, 1)
    return X, y

In [7]:
import numpy as np
import pandas as pd

def build_simple_sequences(df, object_ids, m, k, window_size, step, time_col="request_time"):
    """
    df: time-sorted DataFrame with [time_col, 'object_ID']
    object_ids: unique object IDs (sorted)
    return: X_seq (N, m, 1), y_pop (N, d), y_next (N, d)
    """
    X_seq, y_pop, y_next = [], [], []

    # 1) 시간 정렬
    df = df.sort_values(time_col).reset_index(drop=True)

    # 2) 시간 배열/객체 배열 만들기
    ts = df[time_col].to_numpy().astype(float)  # seconds (float)
    obj = df['object_ID'].to_numpy()
    id2idx = {o: i for i, o in enumerate(object_ids)}
    d = len(object_ids)

    # 3) 슬라이딩 윈도우
    total = window_size * (m + k)
    for i in range(0, len(df) - total, step):
        seq = obj[i : i + total]
        ts_seq = ts[i : i + total]

        # 입력: 과거 m개 창 (각 창 길이: window_size)
        x_seq = []
        for j in range(m):
            w = seq[j*window_size : (j+1)*window_size]
            ts_w  = ts_seq[j*window_size : (j+1)*window_size]
            
            counts = np.zeros(d, dtype=float)
            unique, cnts = np.unique(w, return_counts=True)
            for u, c in zip(unique, cnts):
                counts[id2idx[u]] = c / window_size      # 비율

            # 평균 간격 — 창 전체의 평균 간격을 각 객체 위치에 broadcast
            if len(ts_w) >= 2:
                gap_mean = float(np.mean(np.diff(ts_w)))
            else:
                gap_mean = float(window_size)
            gap_mean = 0.0 if not np.isfinite(gap_mean) else gap_mean
            gap_vec = np.full_like(counts, gap_mean, dtype=float)

            # (d, 2)
            x_seq.append(np.stack([counts, gap_vec], axis=1))
            
        # (m, d) -> (d, m, 1)
        # X_seq.append(np.array(x_seq).T.reshape(-1, m, 1))
        X_seq.append(np.stack(x_seq, axis=0).transpose(1, 0, 2))  # (d, m, 2)

        # 미래 구간
        future_objs = seq[m*window_size:]
        future_ts   = ts_seq[m*window_size:]

        # 인기도 라벨: 미래 구간에 한 번이라도 등장했는지
        L = 3
        future_subset = future_objs[:L * window_size]
        y_pop.append(np.isin(object_ids, future_subset).astype(int))

        
        # y_pop.append(np.isin(object_ids, future_objs).astype(int))  # (d,)

        # (b) 다음 inter-arrival: 기준시각 이후 첫 등장까지 시간
        next_time = []
        t_ref = ts_seq[m*window_size - 1]
        for oid in object_ids:
            mask = (future_objs == oid)
            if np.any(mask):
                idx_first = np.argmax(mask)  # True가 처음인 위치
                next_time.append(max(0.0, float(future_ts[idx_first] - t_ref)))
            else:
                next_time.append(np.nan)     # 미래에 아예 안 나옴
        y_next.append(next_time)

    if not X_seq:
        return (np.zeros((0, m, 1)), np.zeros((0, d), int), np.zeros((0, d), float))

    X_seq = np.concatenate(X_seq, axis=0)   # (N, m, 1)
    y_pop = np.array(y_pop, dtype=int)      # (N, d)
    y_next = np.array(y_next, dtype=float)  # (N, d)
    return X_seq, y_pop, y_next


## Build LSTM Model
> "For our datasets, we use a two-layer depth LSTM Encoder-Decoder model with 128 and 64 as the number of hidden units. ... The loss function is chosen as mean-squared-error (MSE)."

In [8]:
from tensorflow.keras.layers import Input, LSTM, Dropout, Dense, TimeDistributed, RepeatVector
from tensorflow.keras.models import Model

In [9]:
# =========================
# 1) LSTM 인코더 + 간단한 사전학습 헤드
#    - 사전학습 타깃: next inter-arrival(또는 임의 예측 과제)
#    - 학습 후 인코더만 떼어내 임베딩 추출에 사용
# =========================
def build_encoder(m, head_units: int = 32):
    # ----- Encoder -----
    encoder_inputs = Input(shape=(m, 1), name="seq_in")  # 이름을 seq_in으로 고정해도 OK

    x = LSTM(128, return_sequences=True, name="enc_lstm_1")(encoder_inputs)
    x = Dropout(0.2, name="enc_do_1")(x)
    x = LSTM(64, name="enc_lstm_2")(x)
    emb = Dropout(0.2, name="emb_drop")(x)  # embedding

    # pretrain head: 다음 inter-arrival 회귀(양수)
    h = Dense(head_units, activation="relu", name="pre_head")(emb)
    next_hat = Dense(1, activation="linear", name="next_hat")(h)

    # 사전학습용 모델 (인코더 + 헤드)
    # pretrain_model = Model(encoder_inputs, next_hat, name="encoder_pretrain")
    # pretrain_model.compile(optimizer="adam", loss="mae")
    pretrain_model = Model(encoder_inputs, next_hat, name="encoder_pretrain")
    pretrain_model.compile(optimizer="adam", loss=tf.keras.losses.Huber())

    # 인코더 단독 모델 (입력=encoder_inputs)
    encoder = Model(encoder_inputs, emb, name="encoder_only")
    return encoder, pretrain_model

In [10]:
# =========================
# 2) 인코더 사전학습
# =========================
def pretrain_encoder(pretratin_model, X_seq_train, y_next_train, X_seq_val=None, y_next_val=None, epochs=20, batch_size=128):
    callbacks = [
        tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
    ] if X_seq_val is not None else []
    hist = pretrain_model.fit(
        X_seq_train, y_next_train,
        validation_data=(X_seq_val, y_next_val) if X_seq_val is not None else None,
        epochs=epochs, batch_size=batch_size, verbose=1, callbacks=callbacks
    )
    return hist

In [11]:
# =========================
# 3) 임베딩 추출
# =========================
def extract_embeddings(encoder, X_seq):
    return encoder.predict(X_seq, verbose=0)  # shape: (n_samples, 64)

In [12]:
# =========================
# 4) 탭울러 전처리
# =========================
def fit_transform_tab_scaler(X_tab_train, X_tab_val=None, X_tab_test=None):
    scaler = StandardScaler()
    X_tab_train_s = scaler.fit_transform(X_tab_train)
    X_tab_val_s = scaler.transform(X_tab_val) if X_tab_val is not None else None
    X_tab_test_s = scaler.transform(X_tab_test) if X_tab_test is not None else None
    return scaler, X_tab_train_s, X_tab_val_s, X_tab_test_s

In [13]:
# =========================
# 5) 2단계 GBDT 학습
# =========================
def train_gbdt_popularity(X_train, y_train, X_val, y_val, calibrate=True):
    base = LGBMClassifier(
        objective="binary", learning_rate=0.05, num_leaves=64,
        n_estimators=2000, subsample=0.8, colsample_bytree=0.8, random_state=SEED
    )
    if calibrate:
        # AUC 최적화 모델에 확률 보정 적용
        clf = CalibratedClassifierCV(base, method="isotonic", cv=5)
        clf.fit(X_train, y_train)
        return clf
    else:
        base.fit(X_train, y_train,
                 eval_set=[(X_val, y_val)],
                 eval_metric="auc",
                 early_stopping_rounds=100,
                 verbose=100)
        return base

In [14]:
def train_gbdt_next(X_train, y_train, X_val, y_val):
    reg = LGBMRegressor(
        objective="mae", learning_rate=0.05, num_leaves=64,
        n_estimators=2000, subsample=0.8, colsample_bytree=0.8, random_state=SEED
    )
    reg.fit(X_train, y_train,
            eval_set=[(X_val, y_val)],
            eval_metric="l1",
            early_stopping_rounds=100,
            verbose=100)
    return reg

In [15]:
def train_gbdt_hazard_bins(X_train, y_train_bins, X_val, y_val_bins, calibrate=True):
    """
    y_*_bins: shape (n_samples, H). 각 열이 하나의 임계 τ에 대한 0/1 라벨
    반환: 리스트 또는 dict[bin_idx] = 모델
    """
    H = y_train_bins.shape[1]
    models = []
    for j in range(H):
        y_tr = y_train_bins[:, j]
        y_va = y_val_bins[:, j]
        model = train_gbdt_popularity(X_train, y_tr, X_val, y_va, calibrate=calibrate)
        models.append(model)
    return models

In [16]:
# =========================
# 6) 평가 함수
# =========================
def evaluate_popularity(model, X, y_true):
    # 확률 예측
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X)[:, 1]
    else:
        y_prob = model.predict(X)
        if y_prob.ndim > 1:  # LGBMClassifier raw pred
            y_prob = y_prob[:, 1]
    auc = roc_auc_score(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    return {"AUC": auc, "PR-AUC": ap}

def evaluate_next(model, X, y_true):
    y_hat = model.predict(X)
    mae = mean_absolute_error(y_true, y_hat)
    return {"MAE": mae}

def evaluate_hazard(models, X, y_true_bins):
    out = []
    for j, mdl in enumerate(models):
        if hasattr(mdl, "predict_proba"):
            prob = mdl.predict_proba(X)[:, 1]
        else:
            prob = mdl.predict(X)
            if prob.ndim > 1:
                prob = prob[:, 1]
        auc = roc_auc_score(y_true_bins[:, j], prob)
        ap = average_precision_score(y_true_bins[:, j], prob)
        out.append({"bin": j, "AUC": auc, "PR-AUC": ap})
    return out

In [17]:
# =========================
# 7) 파이프라인 묶기
# =========================
def train_pipeline(
    X_seq_train, X_seq_val, X_tab_train, X_tab_val,
    y_pop_train, y_pop_val, y_next_train, y_next_val,
    y_haz_train, y_haz_val,
    use_log1p_for_next=True
):
    m = X_seq_train.shape[1]

    # encoder + pretrain
    encoder, pretrain_model = build_encoder(m)
    y_next_tr = np.log1p(y_next_train) if use_log1p_for_next else y_next_train
    y_next_va = np.log1p(y_next_val) if use_log1p_for_next else y_next_val
    pretrain_encoder(pretrain_model, X_seq_train, y_next_tr, X_seq_val, y_next_va,
                     epochs=20, batch_size=128)

    # extract embedding
    emb_tr = extract_embeddings(encoder, X_seq_train)   # (n, 64)
    emb_va = extract_embeddings(encoder, X_seq_val)

    # tabular scaling
    scaler, X_tab_tr_s, X_tab_va_s, _ = fit_transform_tab_scaler(X_tab_train, X_tab_val, None)

    X_tr = np.concatenate([emb_tr, X_tab_tr_s], axis=1)
    X_va = np.concatenate([emb_va, X_tab_va_s], axis=1)

    # train GBDT
    mdl_pop = train_gbdt_popularity(X_tr, y_pop_train, X_va, y_pop_val, calibrate=True)

    y_next_tr_target = np.log1p(y_next_train) if use_log1p_for_next else y_next_train
    y_next_va_target = np.log1p(y_next_val) if use_log1p_for_next else y_next_val
    mdl_next = train_gbdt_next(X_tr, y_next_tr_target, X_va, y_next_va_target)

    mdl_haz_bins = train_gbdt_hazard_bins(X_tr, y_haz_train, X_va, y_haz_val, calibrate=True)

    artifacts = {
        "encoder": encoder,
        "scaler": scaler,
        "mdl_pop": mdl_pop,
        "mdl_next": mdl_next,
        "mdl_haz_bins": mdl_haz_bins
    }
    return artifacts

In [18]:
# =========================
# 8) 추론 래퍼
# =========================
def predict_signals(artifacts, X_seq, X_tab, use_log1p_for_next=True):
    encoder = artifacts["encoder"]
    scaler  = artifacts["scaler"]
    mdl_pop = artifacts["mdl_pop"]
    mdl_next = artifacts["mdl_next"]
    mdl_haz_bins = artifacts["mdl_haz_bins"]

    emb = extract_embeddings(encoder, X_seq)
    X_tab_s = scaler.transform(X_tab)
    X_ = np.concatenate([emb, X_tab_s], axis=1)

    # popularity
    if hasattr(mdl_pop, "predict_proba"):
        pop_prob = mdl_pop.predict_proba(X_)[:, 1]
    else:
        pop_prob = mdl_pop.predict(X_)  # 보정 안 쓴 경우
        if pop_prob.ndim > 1:
            pop_prob = pop_prob[:, 1]

    # next inter-arrival
    next_hat = mdl_next.predict(X_)
    if use_log1p_for_next:
        next_hat = np.expm1(next_hat)

    # hazard bins
    haz_probs = []
    for mdl in mdl_haz_bins:
        if hasattr(mdl, "predict_proba"):
            haz_probs.append(mdl.predict_proba(X_)[:, 1])
        else:
            p = mdl.predict(X_)
            if p.ndim > 1:
                p = p[:, 1]
            haz_probs.append(p)
    haz_probs = np.stack(haz_probs, axis=1)  # (n, H)

    return {
        "popularity": pop_prob,            # (n,)
        "next_interarrival_sec": next_hat, # (n,)
        "hazard_proxy": haz_probs          # (n, H)
    }


In [19]:

X_seq_train, y_pop_train, y_next_train = build_simple_sequences(train_df, object_ids, m, k, window_size, step)
X_seq_val,   y_pop_val,   y_next_val   = build_simple_sequences(val_df, object_ids, m, k, window_size, step)
X_seq_test,  y_pop_test,  y_next_test  = build_simple_sequences(test_df, object_ids, m, k, window_size, step)

# 1) 시퀀스/라벨 생성
X_seq_train, y_pop_train, y_next_train = build_simple_sequences(
    train_df, object_ids, m, k, window_size, step, time_col="request_time"
)
X_seq_val,   y_pop_val,   y_next_val   = build_simple_sequences(
    val_df, object_ids, m, k, window_size, step, time_col="request_time"
)

# 2) 라벨을 X에 맞춰 "객체별"로 펼치기 (flatten)
#    y_next: (num_windows, d) -> (num_windows*d, 1)
y_next_train_flat = y_next_train.reshape(-1, 1)
y_next_val_flat   = y_next_val.reshape(-1, 1)



# 3) NaN/inf 제거 (미래에 안 나오는 객체는 next_time이 NaN일 수 있음)
def drop_nan_rows(X_seq, y_flat):
    mask = np.isfinite(y_flat[:, 0])
    return X_seq[mask], y_flat[mask]

X_seq_train_fit, y_next_train_fit = drop_nan_rows(X_seq_train, y_next_train_flat)
X_seq_val_fit,   y_next_val_fit   = drop_nan_rows(X_seq_val,   y_next_val_flat)

print("X_seq_train_fit:", X_seq_train_fit.shape)  # (N*d_filtered, m, 1)
print("y_next_train_fit:", y_next_train_fit.shape)  # (N*d_filtered, 1)

# 4) 인코더/사전학습
encoder, pretrain_model = build_encoder(m=20)
batch_size = max(32, int(len(X_seq_train_fit) * 0.1))

history = pretrain_model.fit(
    X_seq_train_fit, y_next_train_fit,
    validation_data=(X_seq_val_fit, y_next_val_fit),
    epochs=20, batch_size=batch_size, verbose=1
)

# 5) 임베딩 추출 (val/test에도 동일)
emb_train = encoder.predict(X_seq_train, verbose=0)
emb_val   = encoder.predict(X_seq_val,   verbose=0)

# 탭울러 피처가 있다면 concat해서 GBDT로 학습
# X_gbdt_train = np.concatenate([emb_train, X_tab_train], axis=1)
# X_gbdt_val   = np.concatenate([emb_val,   X_tab_val],   axis=1)

# 임베딩만 GBDT 입력으로 사용
X_gbdt_train = emb_train
X_gbdt_val   = emb_val



# Reshape X and y to fit LSTM input requirements:
# X: (samples * num_objects, m, 1)
# y: (samples * num_objects, K, 1)
print("X_seq_train:", X_seq_train.shape)
print("y_pop_train:", y_pop_train.shape, "y_next_train:", y_next_train.shape)
print("X_seq_val:  ", X_seq_val.shape)
print("X_seq_test: ", X_seq_test.shape)

X_seq_train_fit: (174080, 20, 2)
y_next_train_fit: (174080, 1)
Epoch 1/20


ValueError: Exception encountered when calling LSTMCell.call().

[1mDimensions must be equal, but are 2 and 1 for '{{node encoder_pretrain_1/enc_lstm_1_1/lstm_cell_1/MatMul}} = MatMul[T=DT_FLOAT, grad_a=false, grad_b=false, transpose_a=false, transpose_b=false](encoder_pretrain_1/enc_lstm_1_1/strided_slice_1, encoder_pretrain_1/enc_lstm_1_1/lstm_cell_1/Cast/ReadVariableOp)' with input shapes: [17408,2], [1,512].[0m

Arguments received by LSTMCell.call():
  • inputs=tf.Tensor(shape=(17408, 2), dtype=float32)
  • states=('tf.Tensor(shape=(17408, 128), dtype=float32)', 'tf.Tensor(shape=(17408, 128), dtype=float32)')
  • training=True

In [None]:
np.save("X_seq_train.npy", X_seq_train)
np.save("y_pop_train.npy", y_pop_train)
np.save("y_next_train.npy", y_next_train)

np.save("X_seq_val.npy", X_seq_val)
np.save("y_pop_val.npy", y_pop_val)
np.save("y_next_val.npy", y_next_val)

np.save("X_seq_test.npy", X_seq_test)
np.save("y_pop_test.npy", y_pop_test)
np.save("y_next_test.npy", y_next_test)

### LSTM 인코더를 사전학습한 결과
- 세로축은 MAE, 가로축은 Epoch
- 학습이 진행될수록 검증 손실이 꾸준히 감소
- -> 모델이 객체의 요청 간격 패턴을 안정적으로 학습하고 있음

In [None]:
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.title("Pretrain Loss Curve")
plt.xlabel("Epochs")
plt.ylabel("Loss (MAE)")
plt.legend()
plt.show()

In [None]:
# 1) 라벨을 객체별로 평탄화
y_next_train_flat = y_next_train.reshape(-1)  # (N*d,)
y_next_val_flat   = y_next_val.reshape(-1)

# 2) 다음간격 회귀용 마스크 (NaN/inf 제거)
mask_tr_next = np.isfinite(y_next_train_flat)
mask_va_next = np.isfinite(y_next_val_flat)

# 3) 마스크 적용한 시퀀스만으로 임베딩 추출
X_seq_train_next = X_seq_train[mask_tr_next]      # (n_tr_fit, m, 1)
X_seq_val_next   = X_seq_val[mask_va_next]        # (n_va_fit, m, 1)

emb_train_next = encoder.predict(X_seq_train_next, verbose=0)  # (n_tr_fit, 64)
emb_val_next   = encoder.predict(X_seq_val_next,   verbose=0)  # (n_va_fit, 64)

# 4) 회귀용 y도 동일 마스크 적용 + log1p 변환
y_next_train_fit = y_next_train_flat[mask_tr_next]             # (n_tr_fit,)
y_next_val_fit   = y_next_val_flat[mask_va_next]               # (n_va_fit,)
y_next_tr_t = np.log1p(y_next_train_fit)
y_next_va_t = np.log1p(y_next_val_fit)

In [None]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation

# reg = lgb.LGBMRegressor(
#     objective="mae", learning_rate=0.05, num_leaves=64, n_estimators=2000
# )
reg = lgb.LGBMRegressor(
    objective="quantile", alpha=0.5,
    learning_rate=0.05, num_leaves=64, n_estimators=2000
)
reg.fit(
    emb_train_next, y_next_tr_t,
    eval_set=[(emb_val_next, y_next_va_t)],
    eval_metric="l1",
    callbacks=[early_stopping(100), log_evaluation(100)],
)

# 6) 예측 및 역변환
y_next_val_pred = np.expm1(reg.predict(emb_val_next, num_iteration=reg.best_iteration_))

# 7) 확인 (길이 일치해야 함)
print("emb_train_next:", emb_train_next.shape,
      "y_next_tr_t:", y_next_tr_t.shape)
print("emb_val_next:", emb_val_next.shape,
      "y_next_va_t:", y_next_va_t.shape)

In [None]:
import numpy as np

def expand_for_next(emb, ynext_mat, add_object_onehot=True):
    """
    emb        : np.ndarray, (N_seq, F) 또는 (N_seq*d, F)
    ynext_mat  : np.ndarray, (N_seq, d)   각 (시퀀스, 객체) 다음 도착까지 시간(초), 미등장은 NaN
    add_object_onehot : bool, True면 객체 원-핫을 피처 뒤에 추가

    return:
      X_next : np.ndarray, (N_seq*d_filtered, F [+ d])
      y_next : np.ndarray, (N_seq*d_filtered,)   # NaN 제거됨
      obj_idx: np.ndarray, (N_seq*d_filtered,)   # 각 행의 객체 인덱스(0..d-1)
    """
    emb = np.asarray(emb)
    ynext_mat = np.asarray(ynext_mat)
    assert emb.ndim == 2 and ynext_mat.ndim == 2, "emb는 2D, ynext_mat는 2D여야 합니다."

    N_seq, d = ynext_mat.shape
    N_emb, F = emb.shape

    # 공통: 평탄화 타깃과 객체 인덱스
    y_flat = ynext_mat.reshape(-1)                         # (N_seq*d,)
    obj_idx_all = np.tile(np.arange(d, dtype=np.int32), reps=N_seq)  # (N_seq*d,)

    if N_emb == N_seq:
        # 1) 임베딩이 시퀀스 단위 → 객체축으로 확장
        X_rep = np.repeat(emb, repeats=d, axis=0)          # (N_seq*d, F)
        X_base = X_rep
    elif N_emb == N_seq * d:
        # 2) 임베딩이 이미 시퀀스×객체 단위
        X_base = emb                                       # (N_seq*d, F)
    else:
        raise ValueError(
            f"Shape mismatch: emb={emb.shape}, ynext_mat={ynext_mat.shape}. "
            "emb의 첫 차원이 N_seq 또는 N_seq*d 여야 합니다."
        )

    # NaN(미등장) 제거 마스크
    mask = ~np.isnan(y_flat)
    X_base = X_base[mask]
    y_next = y_flat[mask]
    obj_idx = obj_idx_all[mask]

    # 원-핫 추가 옵션
    if add_object_onehot:
        # 주의: d가 클 경우 메모리 사용량이 커질 수 있습니다.
        obj_oh = np.eye(d, dtype=np.float32)[obj_idx]      # (n_kept, d)
        X_next = np.concatenate([X_base.astype(np.float32, copy=False), obj_oh], axis=1)
    else:
        X_next = X_base.astype(np.float32, copy=False)

    return X_next, y_next, obj_idx


In [None]:
import numpy as np

def align_for_pop(emb, y_pop_mat, add_object_onehot=True):
    """
    emb: (N_emb, F)
    y_pop_mat: (N_seq, d)
    return:
      X_pop, y_pop, obj_idx
    """
    N_emb, F = emb.shape
    N_seq, d = y_pop_mat.shape

    if N_emb == N_seq:
        # 임베딩이 '시퀀스 단위' → 객체축으로 확장 필요
        X_pop, y_pop, obj_idx = expand_objectwise_features(
            emb, y_pop_mat, add_object_onehot=add_object_onehot
        )
        return X_pop, y_pop, obj_idx

    elif N_emb == N_seq * d:
        # 임베딩이 이미 '시퀀스×객체 단위' → 라벨만 평탄화
        X_pop = emb
        y_pop = y_pop_mat.reshape(-1).astype(np.int32, copy=False)
        obj_idx = np.tile(np.arange(d, dtype=np.int32), reps=N_seq)
        return X_pop, y_pop, obj_idx

    else:
        raise ValueError(
            f"Shape mismatch: emb={emb.shape}, y_pop_mat={y_pop_mat.shape}. "
            "임베딩이 시퀀스 단위인지(=N_seq) 또는 시퀀스×객체 단위인지(=N_seq*d) 확인하세요."
        )


In [None]:
# extract embedding
emb_tr = extract_embeddings(encoder, X_seq_train)   # (n, 64)
emb_va = extract_embeddings(encoder, X_seq_val)

# 객체 축으로 확장 (인기 분류용)
# X_pop_tr, y_pop_tr, obj_idx_pop_tr = expand_objectwise_features(emb_tr, y_pop_train, add_object_onehot=True)
# X_pop_va, y_pop_va, obj_idx_pop_va = expand_objectwise_features(emb_va, y_pop_val, add_object_onehot=True)

# 2) 모양에 맞춰 자동 정렬
X_pop_tr, y_pop_tr, obj_idx_pop_tr = align_for_pop(emb_tr, y_pop_train, add_object_onehot=True)
X_pop_va, y_pop_va, obj_idx_pop_va = align_for_pop(emb_va, y_pop_val,   add_object_onehot=True)

# 3) 학습
clf = lgb.LGBMClassifier(objective="binary", learning_rate=0.05, num_leaves=64, n_estimators=2000)
clf.fit(
    X_pop_tr, y_pop_tr,
    eval_set=[(X_pop_va, y_pop_va)],
    eval_metric="auc",
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(100)]
)

# next 회귀 입력 준비 (원-핫을 쓰지 않을 거면 False로 통일!)
X_next_tr, y_next_tr, obj_idx_tr = expand_for_next(emb_tr, y_next_train, add_object_onehot=True)
X_next_va, y_next_va, obj_idx_va = expand_for_next(emb_va, y_next_val, add_object_onehot=True)


In [None]:
#    λ̂ = 1 / (τ̂ + ε)  로 변환 후, object_id 별 중앙값으로 집계
import pandas as pd
eps = 1e-6
hazard_va = 1.0 / (y_next_val_pred + eps)  # (N_va * d,)  각 (시퀀스,객체) 행의 λ̂

df_lambda_rows = pd.DataFrame({
    "object_id": object_ids[obj_idx_va],   # obj_idx → 실제 object_id 매핑
    "pred_lambda": hazard_va                    # 초당 요청률(1/초)
})

# 객체별 중앙값(robust)으로 대표값 산출
df_lambda = (
    df_lambda_rows
    .groupby("object_id", as_index=False)["pred_lambda"]
    .median()
    .sort_values("object_id")
)

# CSV 저장
df_lambda.to_csv("pred_lambda.csv", index=False)
print("Saved pred_lambda.csv")
print(df_lambda.head())

artifacts = {
    "encoder": encoder,
    "reg_next": reg,
    "clf_pop": clf,
    "object_ids": object_ids,
    "m": m
}
print("artifacts", artifacts)

In [None]:
import os
os.listdir('.')

In [None]:
from sklearn.metrics import mean_absolute_error, roc_auc_score, average_precision_score

print("Next MAE:", mean_absolute_error(y_next_val_fit, y_next_val_pred))

uniq = np.unique(y_pop_val_fit)
if uniq.size < 2:
    pos_rate = float((y_pop_val_fit == 1).mean())
    print(f"Pop AUC: (스킵, 한 클래스만 존재)  양성비율={pos_rate:.4f}")
else:
    print("Pop AUC:", roc_auc_score(y_pop_val_fit, y_pop_val_prob))
    print("Pop PR-AUC:", average_precision_score(y_pop_val_fit, y_pop_val_prob))

In [None]:
idx = np.random.choice(len(y_next_val_fit), size=200, replace=False)
plt.scatter(y_next_val_fit[idx], y_next_val_pred[idx], s=8)
plt.xlabel("True next (sec)"); plt.ylabel("Pred next (sec)"); plt.title("Prediction vs Truth"); plt.show()

- X축: 실제 다음 도착 간격
- Y축: 모델이 예측한 다음 도착 간격

### 결과 해석
- 대부분의 점이 (0~200초) 근처에 밀집되어 있음 → 대부분의 객체는 짧은 간격 내에 재등장
- y축이 전반적으로 x축보다 낮게 분포 → 모델이 실제보다 짧은 inter-arrival time을 예측하는 경향


In [None]:
plt.hist(y_next_train_fit, bins=100)
plt.yscale('log')
plt.xlabel("next_interarrival (sec)")
plt.title("Label distribution")
plt.show()

In [None]:
y_train = y_next_train_fit.copy()
y_val   = y_next_val_fit.copy()

# 상/하한 설정
low = np.percentile(y_train[np.isfinite(y_train)],  0.5)   # 필요시 0 또는 0.1초
high = np.percentile(y_train[np.isfinite(y_train)], 99.0)  # 97~99.5% 사이로 테스트

y_tr_t = np.log1p(np.clip(y_train, low, high))
y_va_t = np.log1p(np.clip(y_val,   low, high))

In [None]:
y_hat = np.expm1(reg.predict(emb_val_next))
mae_lin = mean_absolute_error(y_val, y_hat)
mae_log = mean_absolute_error(np.log1p(y_val), np.log1p(y_hat))
print({"MAE": mae_lin, "MAE_log": mae_log})

In [None]:
bins = [0, 30, 60, 120, 300, 600, 1200, 3600, 1e9]
for a,b in zip(bins[:-1], bins[1:]):
    m = (y_val>=a)&(y_val<b)
    if m.any():
        print(f"[{a},{b}) n={m.sum():4d}",
              "MAE=", mean_absolute_error(y_val[m], y_hat[m]))

In [None]:
reg = lgb.LGBMRegressor(
    objective="regression",   # 로그라벨 회귀
    learning_rate=0.05,
    num_leaves=31,            # 과적합 완화
    min_data_in_leaf=200,     # 데이터 크기에 맞춰 100~500 시도
    n_estimators=4000,
    subsample=0.8, colsample_bytree=0.8,
)
reg.fit(
    emb_train_next, np.log1p(np.clip(y_train, low, high)),
    eval_set=[(emb_val_next, np.log1p(np.clip(y_val, low, high)))],
    eval_metric="l1",
    callbacks=[early_stopping(200), log_evaluation(200)],
)
y_hat = np.expm1(reg.predict(emb_val_next, num_iteration=reg.best_iteration_))


In [None]:
plt.scatter(np.log1p(y_val), np.log1p(y_hat), s=8)
plt.xlabel("log1p(True)"); plt.ylabel("log1p(Pred)"); plt.title("Log-Log Pred vs True"); plt.show()

import pandas as pd
df = pd.DataFrame({"y":y_val, "yhat":y_hat})
df["bin"] = pd.qcut(np.log1p(df["y"]), q=10)
print(df.groupby("bin").apply(lambda g: pd.Series({
    "n": len(g),
    "true_med": g["y"].median(),
    "pred_med": g["yhat"].median(),
    "MAE": mean_absolute_error(g["y"], g["yhat"])
})))

# TTL 추천 시스템

### input 파일 4가지

- feature_list.json : 필수! 모델에서 필요한 input 데이터 정의하는 파일
>[
  "pred_lambda",
  "lambda_mean",
  "lambda_peak",
  "lambda_cv",
  "burst_score",
  "n_req",
  "log_size",
  "hour_sin",
  "hour_cos",
  "wday_sin",
  "wday_cos",
  "tau_base"
]

- features_df : feature_list의 데이터들의 실제 입력 데이터 (지금 sample_df로 임의로 값 넣어주고 있음)

- base_policy.json : (선택) scale, size_coef, eps 값인데 기본값이 있어서 파일 안넣어도됨

- model.joblib : (선택) 잔차 보정 모델인데 이것도 안넣어도 기본 휴리스틱으로 결과값 나옴

- meta.json : (선택) tau_min, tau_max 값인데 기본값이 0.0, 300.0으로 설정되어있어서 파일 안넣어도 됨

In [None]:
# ttl_predictor_residual.py
from __future__ import annotations
import math
from datetime import datetime, timezone, timedelta
from typing import Optional, Tuple
import json, joblib, numpy as np, pandas as pd
from pathlib import Path
from typing import Tuple

KST = timezone(timedelta(hours=9))

# ==== Defaults & helpers ====
DEFAULT_FEATURE_LIST = [
    "pred_lambda","lambda_mean","lambda_peak","lambda_cv","burst_score","n_req",
    "log_size","hour_sin","hour_cos","wday_sin","wday_cos","tau_base"
]
DEFAULT_BASE_POLICY = {"scale": 30.0, "size_coef": 0.3, "eps": 1e-6}
DEFAULT_META = {"tau_min": 0.0, "tau_max": 300.0}

def _load_json_or_default(path: str | Path | None, default_obj):
    try:
        if path and Path(path).exists():
            return json.loads(Path(path).read_text(encoding="utf-8"))
    except Exception:
        pass
    return default_obj

class ZeroModel:
    def predict(self, X):
        return np.zeros((len(X),), dtype=float)


def _validate_request_df(df: pd.DataFrame) -> pd.DataFrame:
    """요청 로그 DataFrame 검증/정리: 필수 컬럼, 타입 캐스팅, NaN 제거 등."""
    required = {"object_id", "request_time", "size_bytes"}
    if not required.issubset(df.columns):
        raise ValueError(f"request_df에는 {sorted(required)} 컬럼이 필요합니다. 현재 컬럼: {sorted(df.columns)}")

    df = df.copy()
    # object_id는 float로 올 수 있으므로 문자열로 통일(그룹핑/머지 안정성)
    df["object_id"] = df["object_id"].astype(str)
    df["request_time"] = pd.to_numeric(df["request_time"], errors="coerce")
    df["size_bytes"] = pd.to_numeric(df["size_bytes"], errors="coerce").fillna(0).astype(int)
    df = df.dropna(subset=["request_time"])
    return df

def _compute_features_per_object(g: pd.DataFrame) -> dict:
    """단일 object_id 그룹에 대해 통계 피처 계산."""
    ts = np.sort(g["request_time"].values.astype(float))
    n = len(ts)

    # 간격
    if n >= 2:
        intervals = np.diff(ts)
        intervals = intervals[intervals > 0]  # 0 간격 제거
    else:
        intervals = np.array([], dtype=float)

    # lambda_mean / lambda_peak
    if intervals.size > 0:
        lam_mean = 1.0 / np.mean(intervals)
        lam_peak = 1.0 / np.min(intervals)
    else:
        lam_mean = 0.0
        lam_peak = 0.0

    # lambda_cv
    if intervals.size >= 2 and np.mean(intervals) > 0:
        lam_cv = float(np.std(intervals) / np.mean(intervals))
    else:
        lam_cv = 0.0

    # burst_score (P95 / median)
    if intervals.size >= 2 and np.median(intervals) > 0:
        p95 = float(np.percentile(intervals, 95))
        med = float(np.median(intervals))
        burst = float(p95 / med) if med > 0 else 1.0
        if not np.isfinite(burst) or burst <= 0:
            burst = 1.0
    else:
        burst = 1.0

    # n_req
    n_req = int(n)

    # size_bytes: 동일 object 내 상이할 수 있으므로 최대값 사용(정책에 따라 변경 가능)
    size_val = int(np.max(g["size_bytes"].values)) if n > 0 else 0

    # 시간 피처: 가장 최신 요청 시각 기준, KST(+09:00)
    last_ts = float(ts[-1]) if n > 0 else float("nan")
    if math.isfinite(last_ts):
        dt_kst = datetime.fromtimestamp(last_ts, tz=KST)
        hour = dt_kst.hour
        weekday = dt_kst.weekday()  # Monday=0
    else:
        hour, weekday = 0, 0

    return {
        "lambda_mean": float(lam_mean),
        "lambda_peak": float(lam_peak),
        "lambda_cv": float(lam_cv),
        "burst_score": float(burst),
        "n_req": n_req,
        "size_bytes": size_val,
        "hour": int(hour),
        "weekday": int(weekday),
    }

def build_features_from_dataframe(
    request_df: pd.DataFrame,
    pred_lambda_df: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """
    요청 로그 DataFrame(필수)와 pred_lambda DataFrame(선택)을 받아 features_df 생성.
    - request_df: columns=['object_id', 'request_time', 'size_bytes']
    - pred_lambda_df: columns=['object_id', 'pred_lambda']  (없으면 pred_lambda=lambda_mean)
    반환: features_df (object_id별 1행)
    """
    req = _validate_request_df(request_df)

    feats = (
        req.sort_values(["object_id", "request_time"])
           .groupby("object_id", as_index=False)
            .apply(lambda g: pd.Series(_compute_features_per_object(g)), include_groups=False)
            .reset_index()
           .drop(columns=["level_0"], errors="ignore")
    )

    if pred_lambda_df is not None:
        pl = pred_lambda_df.copy()
        if not {"object_id", "pred_lambda"}.issubset(pl.columns):
            raise ValueError("pred_lambda_df에는 columns=['object_id','pred_lambda'] 가 필요합니다.")
        pl["object_id"] = pl["object_id"].astype(str)
        pl["pred_lambda"] = pd.to_numeric(pl["pred_lambda"], errors="coerce")
        feats = feats.merge(pl[["object_id", "pred_lambda"]], on="object_id", how="left")
        feats["pred_lambda"] = feats["pred_lambda"].fillna(feats["lambda_mean"])
    else:
        feats["pred_lambda"] = feats["lambda_mean"]

    return feats

def build_features_from_csv(
    request_csv_path: str,
    pred_lambda_csv_path: Optional[str] = None,
) -> Tuple[pd.DataFrame, str]:
    """
    CSV를 직접 받아 features_df 생성.
    반환: (features_df, pred_lambda_source)
    """
    request_df = pd.read_csv('/kaggle/input/ttl-dataset/request_data.csv')
    pred_df = None
    src = "lambda_mean(default)"
    if pred_lambda_csv_path:
        pred_df = pd.read_csv('/kaggle/input/ttl-dataset/pred_lambda.csv')
        src = "pred_lambda.csv"

    feats = build_features_from_dataframe(request_df, pred_df)
    return feats, src
# ===== end of Data Preparation functions =====


# -------- Guardrail defaults --------
_DEFAULT_TAU_MIN = 0.0
_DEFAULT_TAU_MAX = 300.0
_EPS_RATE = 1e-6

def _cyclical_encode(series: pd.Series, period: float) -> Tuple[pd.Series, pd.Series]:
    s = pd.to_numeric(series, errors="coerce").fillna(0.0).astype(float)
    return np.sin(2*np.pi*s/period), np.cos(2*np.pi*s/period)

def _safe_num(series: pd.Series, default: float = 0.0, min_value: float | None = None) -> np.ndarray:
    """Coerce to finite float, replace NaN/inf with default, clamp to min_value if provided."""
    x = pd.to_numeric(series, errors="coerce").astype(float)
    x = x.replace([np.inf, -np.inf], np.nan).fillna(default).to_numpy()
    if min_value is not None:
        x = np.maximum(x, min_value)
    return x

def _compute_tau_base(df: pd.DataFrame, cfg: dict, tau_min: float, tau_max: float) -> np.ndarray:
    """
    Heuristic baseline (updated to match policy):
      - TTL increases with popularity (rate), decreases with size.
      - tau_base = scale * rate / (1 + size_coef * log1p(size))
      - Fallback: if BOTH pred_lambda and lambda_mean are missing -> return tau_max (default).
      - Final clipping to [tau_min, tau_max].
    """
    eps = float(cfg.get("eps", _EPS_RATE))
    scale = float(cfg.get("scale", 30.0))
    size_coef = float(cfg.get("size_coef", 0.3))

    # raw series to detect missing rows
    rate_pred_raw = df.get("pred_lambda", pd.Series([np.nan]*len(df)))
    rate_obs_raw  = df.get("lambda_mean", pd.Series([np.nan]*len(df)))
    rate_missing_mask = rate_pred_raw.isna() & rate_obs_raw.isna()

    # numeric, non-negative
    rate_pred = _safe_num(rate_pred_raw, default=0.0, min_value=0.0)
    rate_obs  = _safe_num(rate_obs_raw,  default=0.0, min_value=0.0)

    # prefer predicted if available/nonzero else observed
    rate = np.where(np.isfinite(rate_pred) & (rate_pred > 0.0), rate_pred, rate_obs)
    rate = np.maximum(rate, 0.0)

    size = _safe_num(df.get("size_bytes", pd.Series([0]*len(df))), default=0.0, min_value=0.0)
    log_size = np.log1p(size)

    # proportional to rate (NOT inverse)
    denom = (1.0 + size_coef * log_size)
    tau = scale * np.maximum(eps, rate) / denom

    # 원본 결측 플래그가 있으면 fallback -> default = tau_max
    if "_missing_rate" in df.columns:
        mask = df["_missing_rate"].to_numpy(dtype=bool)
        tau[mask] = tau_max
    else:
        # 레거시: 둘 다 NaN인 경우만 폴백
        tau[rate_missing_mask.to_numpy()] = tau_max

    return np.clip(tau, tau_min, tau_max)




class TTLResidualPredictor:
    """
    Residual policy: TTL = tau_base + residual_model(features)
    Guardrails are enforced here (clipping to [tau_min, tau_max], fallback defaults, numeric sanitation).
    """
    def __init__(self, model_path: str | None = None,
                 feature_list_path: str | None = None,
                 base_policy_path: str | None = None,
                 meta_path: str | None = None):

        # 1) feature list / base policy / meta: 경로가 없거나 파일이 없어도 내부 디폴트 사용
        self.feat_cols = _load_json_or_default(feature_list_path, DEFAULT_FEATURE_LIST)
        self.base_cfg  = _load_json_or_default(base_policy_path, DEFAULT_BASE_POLICY)
        meta = _load_json_or_default(meta_path, DEFAULT_META)
        self.tau_min = float(meta.get("tau_min", DEFAULT_META["tau_min"]))
        self.tau_max = float(meta.get("tau_max", DEFAULT_META["tau_max"]))

        # 2) residual model: 모델 경로가 없거나 로드 실패 → ZeroModel 폴백
        self.model = None
        if model_path and Path(model_path).exists():
            try:
                self.model = joblib.load(model_path)
            except Exception as e:
                import warnings
                warnings.warn(f"[warn] failed to load model '{model_path}': {e}. Falling back to ZeroModel().")
        if self.model is None:
            self.model = ZeroModel()

    def _ensure_derived(self, df: pd.DataFrame) -> pd.DataFrame:
        df = df.copy()

        # '원본' 결측 여부 캡처
        pred_raw = df.get("pred_lambda")
        obs_raw  = df.get("lambda_mean") if "lambda_mean" in df else None
        if pred_raw is None:
            pred_missing = pd.Series([True]*len(df))
        else:
            pred_missing = pd.to_numeric(pred_raw, errors="coerce").isna()
        if obs_raw is None:
            obs_missing = pd.Series([True]*len(df))
        else:
            obs_missing = pd.to_numeric(obs_raw, errors="coerce").isna()
        df["_missing_rate"] = (pred_missing & obs_missing)


        # Fill obvious missing columns with safe defaults
        for col, dflt in [
            ("pred_lambda", 0.0),
            ("lambda_mean", 0.0),
            ("size_bytes",  0.0),
            ("hour",        0.0),
            ("weekday",     0.0),
            ("lambda_peak", 0.0),
            ("lambda_cv",   0.0),
            ("burst_score", 1.0),
            ("n_req",       0.0),
        ]:
            if col not in df:
                df[col] = dflt

        # Derived features
        hs, hc = _cyclical_encode(df.get("hour"), 24.0)
        df["hour_sin"], df["hour_cos"] = hs, hc
        ws, wc = _cyclical_encode(df.get("weekday"), 7.0)
        df["wday_sin"], df["wday_cos"] = ws, wc
        df["log_size"] = np.log1p(_safe_num(df["size_bytes"], default=0.0, min_value=0.0))
        if "lambda_x_size" in self.feat_cols and "lambda_x_size" not in df:
            df["lambda_x_size"] = _safe_num(df["pred_lambda"], 0.0, 0.0) * df["log_size"]

        # tau_base (with guardrails)
        df["tau_base"] = _compute_tau_base(df, self.base_cfg, self.tau_min, self.tau_max)

        # Ensure all features exist
        for c in self.feat_cols:
            if c not in df:
                df[c] = 0.0

        # Sanitize final matrix
        X = df[self.feat_cols].apply(pd.to_numeric, errors="coerce").replace([np.inf, -np.inf], np.nan).fillna(0.0).to_numpy(dtype=float)
        return df, X

    def predict_ttl(self, features_df: pd.DataFrame, prev_ttl: float | None = None,
                    ema_ratio: float = 0.5, change_bound: float = 0.5) -> np.ndarray:
        """
        Returns TTL seconds with guardrails & optional smoothing.
        - Always clipped to [tau_min, tau_max]
        - If model outputs NaN/inf, treated as 0 residual
        - Optional smoothing & change bound applied if prev_ttl is given
        """
        df, X = self._ensure_derived(features_df)
        residual = self.model.predict(X)
        # Replace any NaN/inf residual with 0
        residual = np.where(np.isfinite(residual), residual, 0.0)

        ttl = df["tau_base"].to_numpy(dtype=float) + residual
        ttl = np.clip(ttl, self.tau_min, self.tau_max)

        if prev_ttl is not None:
            low, high = (1.0 - change_bound) * prev_ttl, (1.0 + change_bound) * prev_ttl
            ttl = np.minimum(np.maximum(ema_ratio * prev_ttl + (1 - ema_ratio) * ttl, low), high)

        # Final safety: ensure finite & within bounds
        ttl = np.where(np.isfinite(ttl), ttl, self.tau_min)
        ttl = np.clip(ttl, self.tau_min, self.tau_max)
        return ttl

if __name__ == "__main__":
    import os, json, numpy as np, pandas as pd, joblib
    from pathlib import Path

    # === Kaggle 경로 설정 ===
    BASE_DIR = Path("/kaggle/input/ttl-dataset")

    REQUEST_CSV = BASE_DIR / "request_data.csv"
    PRED_LAMBDA_CSV = BASE_DIR / "pred_lambda.csv"  # 없으면 자동 대체
    FEATURES_CSV = "features_df.csv"
    TTL_CSV = "ttl_results.csv"

    MODEL_PATH = BASE_DIR / "residual_zero_model.joblib"
    FEATURE_LIST_PATH = BASE_DIR / "feature_list.json"
    BASE_POLICY_PATH = BASE_DIR / "base_policy.json"
    META_PATH = BASE_DIR / "meta.json"

    # === 입력 체크 ===
    if not REQUEST_CSV.exists():
        raise FileNotFoundError(f"[ERROR] '{REQUEST_CSV}' 파일이 없습니다.")

    # 1) features 생성
    pred_csv_path = PRED_LAMBDA_CSV if PRED_LAMBDA_CSV.exists() else None
    feats, src_pred = build_features_from_csv(str(REQUEST_CSV), str(pred_csv_path) if pred_csv_path else None)
    feats.to_csv(FEATURES_CSV, index=False)
    print(f"[saved] {FEATURES_CSV}  (pred_lambda source: {src_pred})")

    # 2) TTL 예측
    predictor = TTLResidualPredictor(
        model_path=str(MODEL_PATH) if MODEL_PATH.exists() else None,
        feature_list_path=str(FEATURE_LIST_PATH) if FEATURE_LIST_PATH.exists() else None,
        base_policy_path=str(BASE_POLICY_PATH) if BASE_POLICY_PATH.exists() else None,
        meta_path=str(META_PATH) if META_PATH.exists() else None,
    )

    ttl = predictor.predict_ttl(feats)

    out = pd.DataFrame({
        "object_id": feats["object_id"],
        "ttl_seconds": ttl
    })
    out.to_csv(TTL_CSV, index=False)
    print(f"[saved] {TTL_CSV} (columns: object_id, ttl_seconds)")

    print("\n=== TTL 예측 샘플(상위 10개) ===")
    print(out.head(10).to_string(index=False))
    print("\n완료")