In [None]:
# ============================
# Full inference pipeline: raw -> features -> cluster -> model -> days -> date
# ============================
import numpy as np
import pandas as pd
import joblib
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from datetime import timedelta
import os
from typing import Dict, List

# -----------------------------
# 0) Config
# -----------------------------
# Seasonal Fourier terms: "period" is the cycle length in days, "K" the number
# of sine/cosine harmonics generated for that cycle.
FOURIER_CONFIG = dict(
    weekly=dict(period=7.0, K=3),
    monthly=dict(period=30.4375, K=2),
    yearly=dict(period=365.25, K=3),
)

# Base feature columns, in the exact order used to build the model matrices.
# Must match the notebook's keep_cols list; do not reorder.
KEEP_COLS = [
    'DAYS_SINCE_LAST_TX',
    'AMOUNT_LCY',
    'AMOUNT_LCY_LOG',
    'EXCHANGE_DIFF',
    'AMOUNT_RATIO',
    'CLIENT_AMOUNT_LCY_mean',
    'CLIENT_BOOKING_DATE_count',
    'GROUP_AMOUNT_LCY_mean',
    'DAYS_SINCE_ACCT_CREATION',
    'DESCRIPTION_OPERATION_ENCODED',
    'NOM_GROUPE_ENCODED',
    'DAY_OF_WEEK',
    'WEEK_OF_YEAR',
    'TX_NUM',
    'TX_REVERSE_NUM',
    'QUARTER',
    'CLIENT_DAYS_SINCE_LAST_TX_mean',
    'CLIENT_AMOUNT_LCY_std',
    # Low-cardinality categorical columns, kept as category dtype for CatBoost
    'SENS',
    'CURRENCY',
    'NATURE_CLIENT',
    'DIRECTION',
    'SEGMENTS',
]

# Columns cast to pandas "category" dtype before being passed to CatBoost.
CAT_COLS = [
    "SENS",
    "CURRENCY",
    "NATURE_CLIENT",
    "DIRECTION",
    "SEGMENTS",
    "DESCRIPTION_OPERATION_ENCODED",
    "NOM_GROUPE_ENCODED",
]

# Locations of the joblib-serialized artifacts used by the pipeline.
FRM_SCALER_PATH = "scaler.pkl"
KMEANS_PATH = "kmeans.pkl"
CATBOOST_PATH = "CatBoost.pkl"
RIDGE_PATH = "Ridge.pkl"

# -----------------------------
# 1) Utils
# -----------------------------
def _relative_days(dates, ref_date):
    """Return *dates* as float day offsets from *ref_date*.

    Datetime input is measured against ``ref_date``; timedelta input is
    treated as already relative and only converted to days. Any other
    input is parsed with ``pd.to_datetime(..., errors="coerce")`` first,
    so unparseable entries come back as NaN.
    """
    series = pd.Series(dates)
    if pd.api.types.is_timedelta64_dtype(series):
        # Already an offset: ref_date is not used on this path.
        offsets = series
    else:
        if not pd.api.types.is_datetime64_any_dtype(series):
            series = pd.to_datetime(series, errors="coerce")
        offsets = series - pd.Timestamp(ref_date)
    one_day = pd.Timedelta(days=1)
    return (offsets / one_day).to_numpy(dtype=float)

def make_fourier_df(dates, config, ref_date):
    """Build sine/cosine seasonality features for *dates*.

    ``config`` maps a name to ``{"period": P, "K": K}``. For every name and
    every harmonic ``k`` in ``1..K`` two columns are produced,
    ``{name}_sin_{k}`` and ``{name}_cos_{k}``, evaluated at
    ``2*pi*k*t/P`` where ``t`` is the day offset of each date from
    ``ref_date``. The result always carries a fresh ``RangeIndex``.
    """
    t = _relative_days(dates, ref_date)
    columns = {}
    for label, spec in config.items():
        period = spec["period"]
        harmonics = spec["K"]
        for harmonic in range(1, harmonics + 1):
            angle = 2.0 * np.pi * harmonic * t / period
            columns[f"{label}_sin_{harmonic}"] = np.sin(angle)
            columns[f"{label}_cos_{harmonic}"] = np.cos(angle)
    return pd.DataFrame(columns, index=pd.RangeIndex(len(t)))

def encode_categorical_on_full(df: pd.DataFrame):
    """Label-encode the free-text columns and cast categoricals to ``category``.

    ``DESCRIPTION_OPERATION`` and ``NOM_GROUPE`` are stringified and
    label-encoded into ``<col>_ENCODED``; those two encoded columns plus
    ``SENS``, ``CURRENCY``, ``NATURE_CLIENT``, ``DIRECTION`` and ``SEGMENTS``
    are then converted to pandas ``category`` dtype (for CatBoost).

    The frame is modified in place and also returned.

    NOTE(review): the encoders are fitted on whatever frame is passed in, so
    the integer codes depend on that data — confirm this matches how the
    models were trained.
    """
    text_columns = ('DESCRIPTION_OPERATION', 'NOM_GROUPE')
    for source in text_columns:
        encoder = LabelEncoder()
        df[source + '_ENCODED'] = encoder.fit_transform(df[source].astype(str))

    categorical = ['SENS', 'CURRENCY', 'NATURE_CLIENT', 'DIRECTION', 'SEGMENTS']
    categorical.extend(source + '_ENCODED' for source in text_columns)

    # CatBoost consumes these as pandas categoricals
    for name in categorical:
        df[name] = df[name].astype('category')
    return df

def feature_engineering(df: pd.DataFrame):
    """Derive calendar, timing, amount, aggregate and sequence features.

    Expects ``BOOKING_DATE`` and ``AJOUTE_LE`` to already be datetime columns.
    Works on a copy sorted by ``CLIENT_ID`` then ``BOOKING_DATE``; the input
    frame is left untouched. The returned frame has a fresh integer index
    (a consequence of the merges) and keeps the sorted row order.
    """
    df = df.sort_values(['CLIENT_ID', 'BOOKING_DATE']).copy()
    booked = df['BOOKING_DATE']

    # Calendar parts of the booking date
    df['YEAR'] = booked.dt.year
    df['MONTH'] = booked.dt.month
    df['DAY'] = booked.dt.day
    df['DAY_OF_WEEK'] = booked.dt.dayofweek
    df['WEEK_OF_YEAR'] = booked.dt.isocalendar().week.astype('int32')
    df['IS_MONTH_START'] = booked.dt.is_month_start.astype(int)
    df['IS_MONTH_END'] = booked.dt.is_month_end.astype(int)
    df['QUARTER'] = booked.dt.quarter

    # Elapsed-time features; the first transaction of each client gets NaN
    # for DAYS_SINCE_LAST_TX because there is no previous row to diff against.
    df['DAYS_SINCE_ACCT_CREATION'] = (booked - df['AJOUTE_LE']).dt.days
    df['DAYS_SINCE_LAST_TX'] = df.groupby('CLIENT_ID')['BOOKING_DATE'].diff().dt.days

    # Amount features (same definitions as the notebook)
    local_amount = df['AMOUNT_LCY']
    df['AMOUNT_RATIO'] = df['AMOUNT_FCY'] / local_amount
    df['EXCHANGE_DIFF'] = df['EXCHANGE_RATE'] - df['COURS_MARCHE']
    df['AMOUNT_SIGN'] = np.where(df['SENS'] == 'A', 1, -1)
    df['AMOUNT_SIZE'] = np.log1p(np.abs(local_amount))
    df['AMOUNT_LCY_LOG'] = np.log1p(local_amount)

    # Per-client statistics -> CLIENT_<column>_<stat>
    df = _merge_group_stats(df, 'CLIENT_ID', 'CLIENT_', {
        'AMOUNT_LCY': ['mean', 'std', 'sum', 'min', 'max', 'count'],
        'DAYS_SINCE_LAST_TX': ['mean', 'std', 'median'],
        'BOOKING_DATE': ['min', 'max', 'count'],
    })

    # Per-group statistics -> GROUP_<column>_<stat>
    df = _merge_group_stats(df, 'NOM_GROUPE', 'GROUP_', {
        'AMOUNT_LCY': ['mean', 'median', 'std'],
        'BOOKING_DATE': ['count'],
    })

    # Per-segment row count only (count-only to avoid target leakage)
    df = _merge_group_stats(df, 'SEGMENTS', 'SEGMENT_', {'BOOKING_DATE': ['count']})

    # Position of each transaction within its client's history, counted from
    # the start (TX_NUM) and from the end (TX_REVERSE_NUM).
    per_client = df.groupby('CLIENT_ID')
    df['TX_NUM'] = per_client.cumcount() + 1
    df['TX_REVERSE_NUM'] = per_client['BOOKING_DATE'].rank(ascending=False, method='first')

    return df


def _merge_group_stats(frame, key, prefix, spec):
    """Aggregate *frame* by *key* per *spec* and left-merge the stats back on."""
    stats = frame.groupby(key).agg(spec).reset_index()
    # Flatten the (column, stat) MultiIndex into '<prefix><column>_<stat>'
    stats.columns = [prefix + '_'.join(parts).strip('_') for parts in stats.columns.values]
    # The key column picked up the prefix as well; restore its real name
    stats = stats.rename(columns={prefix + key: key})
    return pd.merge(frame, stats, on=key, how='left')

def fill_missing_numeric(df: pd.DataFrame):
    """Fill missing values in numeric columns, in place, and return *df*.

    ``DAYS_SINCE_LAST_TX``:
      * missing values are replaced by the client's median gap, or 0 when the
        client has no known gap at all;
      * each client's first transaction is then forced to 0.

    The first transaction is identified by ``TX_NUM == 1`` when that column
    is present. ``TX_NUM`` is a position within the client's history, so it
    stays correct when *df* is only a subset of it. Ranking dates inside a
    subset would instead flag the earliest row *of the subset* (for a
    one-row-per-client frame, every row) and overwrite real gaps with 0.
    Without ``TX_NUM`` the rank within *df* is used as a fallback.

    Every other numeric column with missing values is filled with its own
    median if its name starts with ``CLIENT_``, ``GROUP_`` or ``SEGMENT_``,
    and with 0 otherwise. Categorical columns are never touched.
    """
    if 'TX_NUM' in df.columns:
        first_tx_mask = df['TX_NUM'] == 1
    else:
        first_tx_mask = df.groupby('CLIENT_ID')['BOOKING_DATE'].rank(method='first') == 1

    # Vectorized per-client median; NaN where a client has no observed gap,
    # in which case the trailing fillna(0) applies.
    client_median = df.groupby('CLIENT_ID')['DAYS_SINCE_LAST_TX'].transform('median')
    df['DAYS_SINCE_LAST_TX'] = df['DAYS_SINCE_LAST_TX'].fillna(client_median).fillna(0)
    df.loc[first_tx_mask, 'DAYS_SINCE_LAST_TX'] = 0

    # General numeric fills
    aggregate_prefixes = ('CLIENT_', 'GROUP_', 'SEGMENT_')
    for col in df.columns:
        column = df[col]
        if isinstance(column.dtype, pd.CategoricalDtype):
            continue
        if not pd.api.types.is_numeric_dtype(column):
            continue
        if not column.isnull().any():
            continue
        fill_value = column.median() if col.startswith(aggregate_prefixes) else 0
        df[col] = column.fillna(fill_value)
    return df

def build_frm_clusters(train_df: pd.DataFrame):
    """Assign every client in *train_df* to a behavioural cluster.

    One row per ``CLIENT_ID`` is built from three features: transaction count
    (frequency), ``log1p(|mean AMOUNT_LCY|)`` (monetary) and mean
    ``DAYS_SINCE_LAST_TX`` (recency proxy). The features are standardized and
    clustered with KMeans.

    Persisted, already-fitted artifacts at ``FRM_SCALER_PATH`` /
    ``KMEANS_PATH`` are reused as-is (``transform`` / ``predict``) so cluster
    ids stay stable across runs. When an artifact is missing, or the file
    holds an unfitted estimator, a new one is fitted on *train_df* and saved
    *after* fitting.

    Returns:
        ``(clusters, scaler, kmeans)`` where ``clusters`` is a DataFrame with
        columns ``CLIENT_ID`` and ``CLUSTER``.
    """
    frm = train_df.groupby('CLIENT_ID').agg({
        'BOOKING_DATE': 'count',            # Frequency
        'AMOUNT_LCY': 'mean',               # Monetary
        'DAYS_SINCE_LAST_TX': 'mean',       # Recency proxy
    }).reset_index()
    frm.columns = ['CLIENT_ID', 'TX_FREQUENCY', 'TX_AMOUNT_MEAN', 'TX_RECENCY']
    frm['TX_AMOUNT_MEAN_LOG'] = np.log1p(np.abs(frm['TX_AMOUNT_MEAN']))

    features = frm[['TX_FREQUENCY', 'TX_AMOUNT_MEAN_LOG', 'TX_RECENCY']]

    scaler = _load_fitted_estimator(FRM_SCALER_PATH, 'scale_')
    if scaler is None:
        scaler = StandardScaler().fit(features)
        joblib.dump(scaler, FRM_SCALER_PATH)
    # transform (not fit_transform): a loaded scaler must keep its statistics
    scaled = scaler.transform(features)

    kmeans = _load_fitted_estimator(KMEANS_PATH, 'cluster_centers_')
    if kmeans is None:
        kmeans = KMeans(n_clusters=2, random_state=42, n_init=10).fit(scaled)
        joblib.dump(kmeans, KMEANS_PATH)
    # predict (not fit_predict): refitting could relabel the clusters and
    # route clients to the wrong downstream model
    frm['CLUSTER'] = kmeans.predict(scaled)

    return frm[['CLIENT_ID', 'CLUSTER']], scaler, kmeans


def _load_fitted_estimator(path, fitted_attr):
    """Return the estimator stored at *path* if it exists and is fitted, else None."""
    if not os.path.exists(path):
        return None
    # NOTE: joblib.load unpickles arbitrary objects; only point these paths at
    # trusted artifacts.
    estimator = joblib.load(path)
    # Fitted scikit-learn estimators expose learned attributes ending in "_";
    # a file holding an unfitted estimator is treated as absent.
    return estimator if hasattr(estimator, fitted_attr) else None

def prepare_ridge_inputs(X_train: pd.DataFrame, X_test: pd.DataFrame):
    """Encode categoricals and standardize features for the Ridge model.

    Both inputs are copied. Every column of ``X_train`` whose dtype is
    categorical or object is stringified and label-encoded; each encoder is
    fitted on the train and test values together so the test side cannot
    contain labels unknown to it. All columns are then standardized with a
    ``StandardScaler`` fitted on the encoded training frame only.

    Returns:
        ``(train_scaled, test_scaled, scaler)``.
    """
    train_enc = X_train.copy()
    test_enc = X_test.copy()

    for name in train_enc.columns:
        dtype = train_enc[name].dtype
        needs_encoding = (
            dtype.name in ("category", "object")
            or isinstance(dtype, pd.CategoricalDtype)
        )
        if not needs_encoding:
            continue
        train_text = train_enc[name].astype(str)
        test_text = test_enc[name].astype(str)
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_text, test_text], axis=0))
        train_enc[name] = encoder.transform(train_text)
        test_enc[name] = encoder.transform(test_text)

    scaler = StandardScaler()
    train_scaled = scaler.fit_transform(train_enc)
    test_scaled = scaler.transform(test_enc)
    return train_scaled, test_scaled, scaler

# -----------------------------
# 2) Main entry
# -----------------------------
def predict_next_dates(train_raw: pd.DataFrame, test_raw: pd.DataFrame) -> pd.DataFrame:
    """Predict the next booking date for each client's most recent transaction.

    Pipeline: parse dates -> combine both frames -> feature engineering and
    categorical encoding -> split by presence of a target -> fill missing
    numerics -> FRM clustering -> per-cluster model (cluster 0: CatBoost with
    Fourier seasonality features, cluster 1: Ridge) -> days -> date.

    The split is made on the combined data: rows that have a later
    transaction for the same client (``DAYS_TO_NEXT`` known) form the
    reference/"train" part, rows without one are the rows that get scored.

    The inputs are not modified; date parsing happens on copies. A cluster
    with no rows to score is skipped, so its model file is not loaded.

    Returns:
        A DataFrame with, where available, the columns
        ``REFERENCE_OPERATION_T24``, ``CLIENT_ID``, ``BOOKING_DATE``,
        ``CLUSTER``, ``pred_days``, ``pred_days_int`` and
        ``predicted_next_date``.
    """
    # Parse datetimes on copies so the caller's frames stay untouched
    parsed = []
    for raw in (train_raw, test_raw):
        frame = raw.copy()
        frame['BOOKING_DATE'] = pd.to_datetime(frame['BOOKING_DATE'])
        frame['AJOUTE_LE'] = pd.to_datetime(frame['AJOUTE_LE'])
        parsed.append(frame)

    # Combine for FE (like the notebook), then re-split
    full = pd.concat(parsed, ignore_index=True)
    full = full.sort_values(['CLIENT_ID', 'BOOKING_DATE'])

    # Gap to the client's next booking; NaN on each client's last transaction
    full['NEXT_BOOKING_DATE'] = full.groupby('CLIENT_ID')['BOOKING_DATE'].shift(-1)
    full['DAYS_TO_NEXT'] = (full['NEXT_BOOKING_DATE'] - full['BOOKING_DATE']).dt.days

    # Feature eng + categorical encoding
    full = feature_engineering(full)
    full = encode_categorical_on_full(full)

    # Split back to train/test by presence of target
    has_target = full['DAYS_TO_NEXT'].notnull()
    train_fe = full[has_target].copy()
    test_fe = full[~has_target].copy()

    # Fill NaNs in numerics
    train_fe = fill_missing_numeric(train_fe)
    test_fe = fill_missing_numeric(test_fe)

    # FRM clustering using train only
    client_clusters, _scaler, _kmeans = build_frm_clusters(train_fe)

    # Attach cluster to both (merge also resets both indexes to a RangeIndex)
    train_fe = train_fe.merge(client_clusters, on='CLIENT_ID', how='left')
    test_fe = test_fe.merge(client_clusters, on='CLIENT_ID', how='left')

    # Unseen clients in test -> most frequent cluster
    if test_fe['CLUSTER'].isnull().any():
        most_freq_cluster = train_fe['CLUSTER'].mode()[0]
        test_fe['CLUSTER'] = test_fe['CLUSTER'].fillna(most_freq_cluster)

    # Targets (log) for consistency if we ever need train stats
    train_fe['DAYS_TO_NEXT_LOG'] = np.log1p(train_fe['DAYS_TO_NEXT'])

    # ----------------- Cluster 0 -> CatBoost (+Fourier) -----------------
    test_c0 = test_fe[test_fe['CLUSTER'] == 0].copy()
    if test_c0.empty:
        # Nothing to score: only add the (empty) prediction columns
        test_c0 = _attach_predictions(test_c0, np.empty(0, dtype=float))
    else:
        train_c0 = train_fe[train_fe['CLUSTER'] == 0].copy()

        # Design matrix in the SAME ORDER used in training: keep_cols then Fourier
        X0_train_base = train_c0[KEEP_COLS].copy()
        X0_test_base = test_c0[KEEP_COLS].copy()

        # Ensure category dtypes are intact for CatBoost
        for c in CAT_COLS:
            if c in X0_train_base.columns:
                X0_train_base[c] = X0_train_base[c].astype('category')
            if c in X0_test_base.columns:
                X0_test_base[c] = X0_test_base[c].astype('category')

        # Reference date = min training date for seasonality phase
        ref_date_c0 = pd.to_datetime(train_c0['BOOKING_DATE']).min()
        fourier_train = make_fourier_df(train_c0['BOOKING_DATE'], FOURIER_CONFIG, ref_date_c0)
        fourier_test = make_fourier_df(test_c0['BOOKING_DATE'], FOURIER_CONFIG, ref_date_c0)

        X0_train_aug = pd.concat(
            [X0_train_base.reset_index(drop=True), fourier_train.reset_index(drop=True)], axis=1)
        X0_test_aug = pd.concat(
            [X0_test_base.reset_index(drop=True), fourier_test.reset_index(drop=True)], axis=1)

        # Align test categories to the training baseline; values not seen in
        # the training rows become missing.
        cat_cols_present = [c for c in CAT_COLS if c in X0_train_aug.columns]
        for c in cat_cols_present:
            baseline = X0_train_aug[c].cat.categories
            X0_test_aug[c] = X0_test_aug[c].astype('category').cat.set_categories(baseline)

        # NOTE: joblib.load unpickles arbitrary objects; use trusted files only.
        cb_model = joblib.load(CATBOOST_PATH)
        test_c0 = _attach_predictions(test_c0, cb_model.predict(X0_test_aug))

    # ----------------- Cluster 1 -> Ridge -----------------
    test_c1 = test_fe[test_fe['CLUSTER'] == 1].copy()
    if test_c1.empty:
        test_c1 = _attach_predictions(test_c1, np.empty(0, dtype=float))
    else:
        train_c1 = train_fe[train_fe['CLUSTER'] == 1].copy()

        # Ridge preprocessing as in the notebook: label-encode every
        # categorical/object column, then standard-scale all features.
        X1_train = train_c1[KEEP_COLS].copy()
        X1_test = test_c1[KEEP_COLS].copy()
        _X1_train_sc, X1_test_sc, _scaler_ridge = prepare_ridge_inputs(X1_train, X1_test)

        ridge_model = joblib.load(RIDGE_PATH)
        test_c1 = _attach_predictions(test_c1, ridge_model.predict(X1_test_sc))

    # ----------------- Combine & return -----------------
    # Drop empty partitions from the concat, but keep one frame so the output
    # columns still exist when there is nothing to score at all.
    parts = [part for part in (test_c0, test_c1) if not part.empty] or [test_c0]
    out = pd.concat(parts, axis=0).sort_index()
    cols = [
        'REFERENCE_OPERATION_T24', 'CLIENT_ID', 'BOOKING_DATE',
        'CLUSTER', 'pred_days', 'pred_days_int', 'predicted_next_date'
    ]
    cols = [c for c in cols if c in out.columns]
    return out[cols].reset_index(drop=True)


def _attach_predictions(part, pred_log):
    """Convert log1p-scale day predictions to days/dates and add them to *part*."""
    # Models predict log1p(days): invert, then clamp negatives to zero
    pred_days = np.clip(np.expm1(np.asarray(pred_log, dtype=float).ravel()), 0.0, None)
    part['pred_days'] = pred_days
    # Round up so the predicted date is never earlier than the raw estimate
    part['pred_days_int'] = np.ceil(pred_days).astype(int)
    part['predicted_next_date'] = (
        pd.to_datetime(part['BOOKING_DATE']) + pd.to_timedelta(part['pred_days_int'], unit='D')
    )
    return part

# ============================
# Example usage
# ============================
# train = ...  # your raw train df (columns as you listed)
# test  = ...  # your raw test df
# preds = predict_next_dates(train, test)
# display(preds.head())
