User Segmentation Using RFM and debt/income/credit_score

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, PowerTransformer
import calendar
import holidays
import warnings


# Load datasets

In [2]:
DATA_PATH = Path.cwd().parent / 'merged-df.csv'
df = pd.read_csv(DATA_PATH)
print(f"Loaded dataset with {df.shape[0]:,} rows and {df.shape[1]:,} columns")

Loaded dataset with 13,305,915 rows and 36 columns


In [6]:
# sample 10% of the rows for dev
_original_n = len(df)
df = df.sample(frac=0.1, random_state=42).reset_index(drop=True)
print(f"Sampled {len(df):,} rows ({len(df)/_original_n:.2%} of original) for development") 

Sampled 1,330,592 rows (10.00% of original) for development


In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330592 entries, 0 to 1330591
Data columns (total 38 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   transaction_id              1330592 non-null  int64         
 1   date                        1330592 non-null  datetime64[ns]
 2   user_id                     1330592 non-null  int64         
 3   card_id                     1330592 non-null  int64         
 4   amount                      1330592 non-null  float64       
 5   use_chip                    1330592 non-null  object        
 6   merchant_id                 1330592 non-null  int64         
 7   merchant_city               1330592 non-null  object        
 8   merchant_state              1330592 non-null  object        
 9   zip                         1330592 non-null  object        
 10  mcc                         1330592 non-null  int64         
 11  description             

In [42]:
neg_tx = df[df['amount'] > 0].copy()
display(neg_tx[["transaction_id", "amount", "merchant_id", "category", "description", "card_card_type"]])

Unnamed: 0,transaction_id,amount,merchant_id,category,description,card_card_type
0,11978328,263.43,54850,Home & Utilities,Telecommunication Services,Debit
1,11363233,38.26,68135,Food & Dining,"Grocery Stores, Supermarkets",Debit
2,8117710,52.57,81833,Shopping & Retail,Drug Stores and Pharmacies,Debit
3,12606562,40.00,27092,Financial & Insurance,Money Transfer,Debit
4,12628171,4.58,44578,Food & Dining,Eating Places and Restaurants,Debit
...,...,...,...,...,...,...
1330587,22261028,8.19,9343,Food & Dining,Eating Places and Restaurants,Debit
1330588,12782263,38.46,61195,Transportation & Travel,Service Stations,Debit
1330589,15502502,44.75,20519,Food & Dining,Book Stores,Credit
1330590,12467135,49.03,43293,Food & Dining,Miscellaneous Food Stores,Credit


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13305915 entries, 0 to 13305914
Data columns (total 36 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   transaction_id              int64  
 1   date                        object 
 2   user_id                     int64  
 3   card_id                     int64  
 4   amount                      float64
 5   use_chip                    object 
 6   merchant_id                 int64  
 7   merchant_city               object 
 8   merchant_state              object 
 9   zip                         object 
 10  mcc                         int64  
 11  description                 object 
 12  category                    object 
 13  user_current_age            int64  
 14  user_retirement_age         int64  
 15  user_birth_year             int64  
 16  user_birth_month            int64  
 17  user_gender                 object 
 18  user_address                object 
 19  user_latitude      

# Problem definition — monthly per-customer forecasting

Goal
- Granularity: monthly per customer.
- Series: for each user_id define $amt_t$ = monthly total spend for month t.
- Look-back (LB): 12 months (t-11 … t).
- Target: $y_{t+1} = amt_{t+1}$ (next month).
- Supervised windows: for each user and cut-off t build one training row containing:
  - past-known covariates (lags, rolling windows, expanding stats computed using ≤ t)
  - future-known covariates (calendar/holiday dummies for t+1)
  - label: $amt_{t+1}$
- Data-leakage rule: Every feature for a window ending at month t must use only data ≤ t. Practically: always shift the series (e.g., `.shift(1)` or `.shift(lag)`) before computing rolling/expanding aggregates.



# Core transformations

In [None]:
# Parameters (tweakable)
Q_LOW = 0.01
Q_HIGH = 0.99
REINDEX_GLOBAL_THRESHOLD = 50_000_000  # heuristic: if users * months <= this we will expand globally
REINDEX_GLOBAL = False                 # set True to force global user x month grid (only if you understand memory)
DO_YEO_JOHNSON = False                 # if True, use PowerTransformer(method='yeo-johnson') fitted on train for amt
DO_SIGNED_LOG = True                   # if True (and YE0 not used) use sign(x)*log1p(|x|)
SCALER_TYPE = 'standard'               # options: 'standard', 'robust', 'minmax'
SCALED_FEAT_SUFFIX = '_s'

# Safety checks
required_cols = {'date', 'user_id', 'amount'}
missing = required_cols - set(df.columns)
if missing:
    raise KeyError(f"Missing required columns in df: {missing}")

# 1) monthly aggregation (amt_t, txn_cnt_t, avg_txn_amt_t)
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.to_period('M').dt.to_timestamp()

count_col = 'transaction_id' if 'transaction_id' in df.columns else None
if count_col:
    monthly = (
        df
        .groupby(['user_id', 'month'], as_index=False)
        .agg(amt_t=('amount', 'sum'), txn_cnt_t=(count_col, 'count'))
    )
else:
    monthly = (
        df
        .groupby(['user_id', 'month'], as_index=False)
        .agg(amt_t=('amount', 'sum'))
    )
    monthly['txn_cnt_t'] = (
        df
        .groupby(['user_id', 'month'])
        .size()
        .values
    )

monthly['avg_txn_amt_t'] = monthly['amt_t'] / monthly['txn_cnt_t'].clip(lower=1)

# 2) reindex to fill missing months per user (choose global or per-user min/max)
all_months = pd.date_range(monthly['month'].min(), monthly['month'].max(), freq='MS')
users = monthly['user_id'].unique()
estimated_rows = len(users) * len(all_months)
print(f"Users: {len(users):,}  Months: {len(all_months):,}  est rows (users x months): {estimated_rows:,}")

if REINDEX_GLOBAL or estimated_rows <= REINDEX_GLOBAL_THRESHOLD:
    # global full reindex (user x full date range)
    print("Performing global user x months reindex (fills every user for every month in the global span).")
    full_index = pd.MultiIndex.from_product([users, all_months], names=['user_id', 'month'])
    monthly = (
        monthly
        .set_index(['user_id', 'month'])
        .reindex(full_index, fill_value=0)
        .reset_index()
    )
else:
    # memory-saving per-user reindex: fill each user's missing months between their min and max month
    print("Large expansion avoided — reindexing each user to their local min/max months (memory-saving). This still fills missing months with zeros.")
    parts = []
    for uid, g in monthly.groupby('user_id'):
        idx = pd.date_range(g['month'].min(), g['month'].max(), freq='MS')
        g2 = (
            g
            .set_index('month')
            .reindex(idx, fill_value=0)
            .rename_axis('month')
            .reset_index()
        )
        g2['user_id'] = uid
        parts.append(g2)
    monthly = pd.concat(parts, ignore_index=True)

# ensure consistent dtypes
monthly = monthly.sort_values(['user_id', 'month']).reset_index(drop=True)
monthly['txn_cnt_t'] = monthly['txn_cnt_t'].fillna(0).astype(int)
monthly['amt_t'] = monthly['amt_t'].fillna(0).astype(float)
monthly['avg_txn_amt_t'] = monthly['avg_txn_amt_t'].fillna(0).astype(float)

print('monthly shape after reindex:', monthly.shape)
display(monthly.head())

# 3) Define time splits used to compute train-only transforms
max_month = monthly['month'].max()
test_start = max_month - pd.DateOffset(months=11)   # last 12 months -> test
val_start = test_start - pd.DateOffset(months=12)   # previous 12 months -> val
print('val_start:', val_start, 'test_start:', test_start)

train_mask = monthly['month'] < val_start
val_mask = (monthly['month'] >= val_start) & (monthly['month'] < test_start)
test_mask = monthly['month'] >= test_start

# 4) Outlier handling (winsorize) — compute thresholds on TRAIN only and apply to all splits
q_low, q_high = Q_LOW, Q_HIGH
train_monthly = monthly[train_mask]
if train_monthly.empty:
    warnings.warn('Training partition is empty — cannot compute train-only winsorization thresholds. Using global quantiles on entire data instead.')
    train_monthly = monthly

# per-user quantiles (train-only). Users without train rows will get filled with global quantiles
global_amt_q_low = train_monthly['amt_t'].quantile(q_low)
global_amt_q_high = train_monthly['amt_t'].quantile(q_high)
user_amt_q = (
    train_monthly
    .groupby('user_id')['amt_t']
    .agg(amt_q01=lambda x: x.quantile(q_low), amt_q99=lambda x: x.quantile(q_high))
)
user_amt_q['amt_q01'] = user_amt_q['amt_q01'].fillna(global_amt_q_low)
user_amt_q['amt_q99'] = user_amt_q['amt_q99'].fillna(global_amt_q_high)

monthly = monthly.merge(user_amt_q[['amt_q01', 'amt_q99']], left_on='user_id', right_index=True, how='left')
monthly['amt_q01'] = monthly['amt_q01'].fillna(global_amt_q_low)
monthly['amt_q99'] = monthly['amt_q99'].fillna(global_amt_q_high)
monthly['amt_t_wins'] = monthly['amt_t'].clip(lower=monthly['amt_q01'], upper=monthly['amt_q99'])

# avg_txn_amt winsorization (same approach)
global_avg_q_low = train_monthly['avg_txn_amt_t'].quantile(q_low)
global_avg_q_high = train_monthly['avg_txn_amt_t'].quantile(q_high)
user_avg_q = (
    train_monthly
    .groupby('user_id')['avg_txn_amt_t']
    .agg(avg_q01=lambda x: x.quantile(q_low), avg_q99=lambda x: x.quantile(q_high))
)
user_avg_q['avg_q01'] = user_avg_q['avg_q01'].fillna(global_avg_q_low)
user_avg_q['avg_q99'] = user_avg_q['avg_q99'].fillna(global_avg_q_high)

monthly = monthly.merge(user_avg_q[['avg_q01', 'avg_q99']], left_on='user_id', right_index=True, how='left')
monthly['avg_q01'] = monthly['avg_q01'].fillna(global_avg_q_low)
monthly['avg_q99'] = monthly['avg_q99'].fillna(global_avg_q_high)
monthly['avg_txn_amt_t_wins'] = monthly['avg_txn_amt_t'].clip(lower=monthly['avg_q01'], upper=monthly['avg_q99'])

# 5) Variance stabilization (optional). Default: sign-preserving log1p. Alternate: Yeo-Johnson fitted on train only.
if DO_YEO_JOHNSON:
    pt = PowerTransformer(method='yeo-johnson')
    # fit on the winsorized training values
    pt.fit(monthly.loc[train_mask, ['amt_t_wins']].values.reshape(-1, 1))
    monthly['amt_t_trans'] = pt.transform(monthly[['amt_t_wins']])
    core_transformers = {'amt_yeo': pt}
else:
    # signed log1p: preserves sign and compresses magnitude, safe for negatives/zeros
    monthly['amt_t_trans'] = np.sign(monthly['amt_t_wins']) * np.log1p(np.abs(monthly['amt_t_wins']))
    core_transformers = {}

# 6) Scaling: fit scaler on TRAIN features only and apply to all data. Keep scaler object for inverse transform.
feature_candidates = ['amt_t_trans', 'txn_cnt_t', 'avg_txn_amt_t_wins']
feature_cols = [c for c in feature_candidates if c in monthly.columns]
print('Feature columns available for scaling:', feature_cols)

scaler_cls = {'standard': StandardScaler, 'robust': RobustScaler, 'minmax': MinMaxScaler}[SCALER_TYPE]
feature_scaler = scaler_cls()
feature_scaler.fit(monthly.loc[train_mask, feature_cols].fillna(0))
scaled_cols = [c + SCALED_FEAT_SUFFIX for c in feature_cols]
monthly[scaled_cols] = feature_scaler.transform(monthly[feature_cols].fillna(0))

# Save artifacts for downstream use (winsorization thresholds, scaler, etc.)
core_transforms = {
    'q_low': q_low,
    'q_high': q_high,
    'global_amt_q': (global_amt_q_low, global_amt_q_high),
    'global_avg_q': (global_avg_q_low, global_avg_q_high),
    'user_amt_q': user_amt_q,           # DataFrame keyed by user_id with amt_q01/amt_q99
    'user_avg_q': user_avg_q,           # DataFrame keyed by user_id with avg_q01/avg_q99
    'feature_scaler': feature_scaler,
    'scaler_features': feature_cols,
    'scaler_suffix': SCALED_FEAT_SUFFIX,
    'transformers': core_transformers,
    'reindex_strategy': 'global' if (REINDEX_GLOBAL or estimated_rows <= REINDEX_GLOBAL_THRESHOLD) else 'per_user_minmax',
    'val_start': val_start,
    'test_start': test_start
}

print('\nCore transforms complete.')
print('monthly (post-core) shape:', monthly.shape)
display(monthly.head())

Users: 1,219  Months: 118  est rows (users x months): 143,842
Performing global user x months reindex (fills every user for every month in the global span).
monthly shape after reindex: (143842, 5)


Unnamed: 0,user_id,month,amt_t,txn_cnt_t,avg_txn_amt_t
0,0,2010-01-01,547.05,10,54.705
1,0,2010-02-01,864.79,11,78.617273
2,0,2010-03-01,783.05,14,55.932143
3,0,2010-04-01,857.4,8,107.175
4,0,2010-05-01,112.22,5,22.444


val_start: 2017-11-01 00:00:00 test_start: 2018-11-01 00:00:00
Feature columns available for scaling: ['amt_t_trans', 'txn_cnt_t', 'avg_txn_amt_t_wins']

Core transforms complete.
monthly (post-core) shape: (143842, 15)


Unnamed: 0,user_id,month,amt_t,txn_cnt_t,avg_txn_amt_t,amt_q01,amt_q99,amt_t_wins,avg_q01,avg_q99,avg_txn_amt_t_wins,amt_t_trans,amt_t_trans_s,txn_cnt_t_s,avg_txn_amt_t_wins_s
0,0,2010-01-01,547.05,10,54.705,-211.9624,1638.0521,547.05,-23.590975,124.533069,54.705,6.306367,0.521694,0.141389,0.371876
1,0,2010-02-01,864.79,11,78.617273,-211.9624,1638.0521,864.79,-23.590975,124.533069,78.617273,6.763642,0.72685,0.313203,1.065374
2,0,2010-03-01,783.05,14,55.932143,-211.9624,1638.0521,783.05,-23.590975,124.533069,55.932143,6.664473,0.682358,0.828644,0.407466
3,0,2010-04-01,857.4,8,107.175,-211.9624,1638.0521,857.4,-23.590975,124.533069,107.175,6.75507,0.723004,-0.202239,1.893597
4,0,2010-05-01,112.22,5,22.444,-211.9624,1638.0521,112.22,-23.590975,124.533069,22.444,4.729333,-0.185838,-0.71768,-0.563748


# Temporal & seasonal features

These features are known at forecast time and safe to include as future-known covariates. This cell will add:

- calendar one-hots (month_of_year, quarter),
- Fourier seasonality features (monthly, order K) for both t and t+1,
- holiday / event presence flags (country-specific via python-holidays when available) and a Black Friday detector,
- lead/lag event windows (month-level flags) such as "is_month_before_black_friday" and "is_week_before_black_friday",
- per-user Seasonal Index SI_{u,m} computed on TRAIN only: median(amt) for user & month / median(amt) overall for that user.

All user-level statistics (seasonal index) are computed using the training partition only to avoid leakage. The resulting features are attached for both t and t+1 so they can be joined into supervised windows later.

In [None]:
# Parameters for seasonal features
FOURIER_ORDER = 4  # K: number of harmonics to include
COUNTRY = 'US'     # country code for holidays (holidays import assumed at top)

m = 12  # monthly period

# base calendar features on the month timestamp (start of month)
monthly['month_of_year'] = monthly['month'].dt.month
monthly['quarter'] = monthly['month'].dt.quarter

# one-hot months and quarter (sparse if many users/months)
month_dummies = pd.get_dummies(monthly['month_of_year'].astype(int).astype(str), prefix='mo')
quarter_dummies = pd.get_dummies(monthly['quarter'].astype(int).astype(str), prefix='q')
monthly = pd.concat([monthly, month_dummies, quarter_dummies], axis=1)

# Fourier terms (for month index)
monthly = monthly.reset_index(drop=True)
# Create an integer month index (months since the global minimum month) — robust and avoids using unsupported np.timedelta64('M')
base = monthly['month'].min()
base_idx = base.year * 12 + base.month
monthly['month_index'] = monthly['month'].dt.year * 12 + monthly['month'].dt.month - base_idx
for k in range(1, FOURIER_ORDER + 1):
    monthly[f'sin_{k}'] = np.sin(2 * np.pi * k * monthly['month_index'] / m)
    monthly[f'cos_{k}'] = np.cos(2 * np.pi * k * monthly['month_index'] / m)

# Holidays and events (assumes holidays package available at top-level)
monthly['is_holiday'] = False
monthly['is_black_friday'] = False

yrs = range(monthly['month'].dt.year.min(), monthly['month'].dt.year.max() + 1)
hols = holidays.CountryHoliday(COUNTRY, years=yrs)
# For monthly data, mark a month as holiday-month if it contains any holiday
holiday_months = set(pd.to_datetime(list(hols.keys())).to_period('M').to_timestamp())
monthly['is_holiday'] = monthly['month'].isin(holiday_months)

# Black Friday detection: assume Black Friday = day after US Thanksgiving (4th Thu in Nov)
# Mark the month of Black Friday and a month-before flag to capture pre-event effects

def black_friday_months(year):
    # Thanksgiving: fourth Thursday in November
    nov = calendar.monthcalendar(year, 11)
    # find the Thursday index for the 4th Thursday
    thursdays = [week[calendar.THURSDAY] for week in nov if week[calendar.THURSDAY] != 0]
    if len(thursdays) >= 4:
        thanksgiving_day = thursdays[3]
    else:
        thanksgiving_day = thursdays[-1]
    bf = pd.Timestamp(year=year, month=11, day=thanksgiving_day + 1)
    return bf.to_period('M').to_timestamp()

bf_months = {black_friday_months(y) for y in range(monthly['month'].dt.year.min(), monthly['month'].dt.year.max() + 1)}
monthly['is_black_friday'] = monthly['month'].isin(bf_months)
monthly['is_month_before_black_friday'] = monthly['month'].isin([m_ - pd.DateOffset(months=1) for m_ in bf_months])

# -----------------------------
# Per-user Seasonal Index (SI_{u,m}) — VECTORISED (TRAIN ONLY)
# -----------------------------
train_monthly = monthly[train_mask]

# median per user overall (train only)
user_median_all = train_monthly.groupby('user_id')['amt_t_wins'].median().rename('user_med_all')
# median per user per month-of-year (train only)
user_median_by_month = (
    train_monthly
    .groupby(['user_id', 'month_of_year'])['amt_t_wins']
    .median()
    .rename('user_med_m')
    .reset_index()
)

# merge seasonal index inputs into monthly (vectorized join — avoids apply)
monthly = monthly.merge(user_median_all.reset_index(), on='user_id', how='left')
monthly = monthly.merge(user_median_by_month, on=['user_id', 'month_of_year'], how='left')

# guard against missing medians and zero denominators
monthly['user_med_all'] = monthly['user_med_all'].fillna(0)
monthly['seasonal_index'] = (monthly['user_med_m'] / monthly['user_med_all']).replace([np.inf, -np.inf], np.nan).fillna(1.0)

# attach SI for t+1 (month t+1) using a vectorized join on (user_id, month_tplus1_mo)
monthly['month_tplus1'] = monthly['month'] + pd.DateOffset(months=1)
monthly['month_tplus1_mo'] = monthly['month_tplus1'].dt.month
user_month_map_t1 = user_median_by_month.rename(columns={'month_of_year': 'month_tplus1_mo', 'user_med_m': 'user_med_m_tplus1'})
monthly = monthly.merge(user_month_map_t1[['user_id', 'month_tplus1_mo', 'user_med_m_tplus1']], on=['user_id', 'month_tplus1_mo'], how='left')
monthly['seasonal_index_tplus1'] = (monthly['user_med_m_tplus1'] / monthly['user_med_all']).replace([np.inf, -np.inf], np.nan).fillna(1.0)

# season tag one-hot
monthly['season'] = ((monthly['month_of_year'] % 12 + 3) // 3).map({1: 'winter', 2: 'spring', 3: 'summer', 4: 'fall'})
season_dummies = pd.get_dummies(monthly['season'], prefix='sea')
monthly = pd.concat([monthly, season_dummies], axis=1)

print('Temporal & seasonal features added. Columns now:', [c for c in monthly.columns if 'seasonal' in c or c.startswith('mo_') or c.startswith('sin_') or c.startswith('is_')][:40])


Temporal & seasonal features added. Columns now: ['mo_1', 'mo_10', 'mo_11', 'mo_12', 'mo_2', 'mo_3', 'mo_4', 'mo_5', 'mo_6', 'mo_7', 'mo_8', 'mo_9', 'mo_1', 'mo_10', 'mo_11', 'mo_12', 'mo_2', 'mo_3', 'mo_4', 'mo_5', 'mo_6', 'mo_7', 'mo_8', 'mo_9', 'sin_1', 'sin_2', 'sin_3', 'sin_4', 'is_holiday', 'is_black_friday', 'is_month_before_black_friday', 'seasonal_index', 'seasonal_index_tplus1']


Unnamed: 0,user_id,month,amt_t_wins,seasonal_index,seasonal_index_tplus1
0,0,2010-01-01,547.05,1.050831,0.683537
1,0,2010-02-01,864.79,0.683537,1.362648
2,0,2010-03-01,783.05,1.362648,1.040518
3,0,2010-04-01,857.4,1.040518,0.873485
4,0,2010-05-01,112.22,0.873485,1.395791
5,0,2010-06-01,268.53,1.395791,1.356828
6,0,2010-07-01,719.03,1.356828,1.166422
7,0,2010-08-01,535.2,1.166422,0.633069
8,0,2010-09-01,-211.9624,0.633069,0.827778
9,0,2010-10-01,262.56,0.827778,1.187411


In [17]:
monthly[['user_id','month','amt_t_wins','seasonal_index','seasonal_index_tplus1', 'is_black_friday', 'is_holiday']].head(15)

Unnamed: 0,user_id,month,amt_t_wins,seasonal_index,seasonal_index_tplus1,is_black_friday,is_holiday
0,0,2010-01-01,547.05,1.050831,0.683537,False,True
1,0,2010-02-01,864.79,0.683537,1.362648,False,True
2,0,2010-03-01,783.05,1.362648,1.040518,False,False
3,0,2010-04-01,857.4,1.040518,0.873485,False,False
4,0,2010-05-01,112.22,0.873485,1.395791,False,True
5,0,2010-06-01,268.53,1.395791,1.356828,False,False
6,0,2010-07-01,719.03,1.356828,1.166422,False,True
7,0,2010-08-01,535.2,1.166422,0.633069,False,False
8,0,2010-09-01,-211.9624,0.633069,0.827778,False,True
9,0,2010-10-01,262.56,0.827778,1.187411,False,True


# Lag features (amount & count lags, seasonal lags, seasonal diffs)

This cell computes classic lag features per user using groupby.shift to avoid leakage. By default it will create compact lag set [1,3,6,12] plus seasonal lags (12, 24) and seasonal difference s_diff_12 computed on the winsorized amount (and optionally on log-transformed values).

In [None]:
# === LAG FEATURES ===
# Configurable lag list; default compact set
LAGS = [1, 3, 6, 12]
SEASONAL_LAGS = [12, 24]  # include 24 only when at least 2 years of data per user
USE_COUNT_LAGS = True
COMPUTE_SEASONAL_DIFFS = True

# Ensure monthly is sorted
monthly = monthly.sort_values(['user_id','month']).reset_index(drop=True)

# Amount lags on winsorized amount and transformed amount (amt_t_trans)
for lag in sorted(set(LAGS + SEASONAL_LAGS)):
    monthly[f'lag_amt_{lag}'] = monthly.groupby('user_id')['amt_t_wins'].shift(lag)
    if 'amt_t_trans' in monthly.columns:
        monthly[f'lag_amt_trans_{lag}'] = monthly.groupby('user_id')['amt_t_trans'].shift(lag)

# Count lags
if USE_COUNT_LAGS:
    for lag in LAGS:
        monthly[f'lag_cnt_{lag}'] = monthly.groupby('user_id')['txn_cnt_t'].shift(lag)

# Seasonal differences (amt_t - amt_{t-12}) — also provide on transformed scale if available
if COMPUTE_SEASONAL_DIFFS:
    monthly['s_diff_12'] = monthly['amt_t_wins'] - monthly.groupby('user_id')['amt_t_wins'].shift(12)
    if 'amt_t_trans' in monthly.columns:
        monthly['s_diff_12_trans'] = monthly['amt_t_trans'] - monthly.groupby('user_id')['amt_t_trans'].shift(12)

# Feature width guard: if you want to reduce feature count, we can keep only LAGS + seasonal 12
compact_cols = []
for lag in LAGS:
    compact_cols.append(f'lag_amt_{lag}')
    if 'amt_t_trans' in monthly.columns:
        compact_cols.append(f'lag_amt_trans_{lag}')
    if USE_COUNT_LAGS:
        compact_cols.append(f'lag_cnt_{lag}')
# always include lag_12 for seasonality
compact_cols += [f'lag_amt_12']
if 'amt_t_trans' in monthly.columns:
    compact_cols += [f'lag_amt_trans_12']

print('Created lag columns; sample columns:', [c for c in monthly.columns if c.startswith('lag_')][:30])

Created lag columns; sample columns: ['lag_amt_1', 'lag_amt_trans_1', 'lag_amt_3', 'lag_amt_trans_3', 'lag_amt_6', 'lag_amt_trans_6', 'lag_amt_12', 'lag_amt_trans_12', 'lag_amt_24', 'lag_amt_trans_24', 'lag_cnt_1', 'lag_cnt_3', 'lag_cnt_6', 'lag_cnt_12']


Unnamed: 0,user_id,month,lag_amt_1,lag_amt_trans_1,lag_cnt_1,lag_amt_3,lag_amt_trans_3,lag_cnt_3,lag_amt_6,lag_amt_trans_6,lag_cnt_6,lag_amt_12,lag_amt_trans_12,lag_cnt_12,lag_amt_12.1,lag_amt_trans_12.1,s_diff_12
0,0,2010-01-01,,,,,,,,,,,,,,,
1,0,2010-02-01,547.05,6.306367,10.0,,,,,,,,,,,,
2,0,2010-03-01,864.79,6.763642,11.0,,,,,,,,,,,,
3,0,2010-04-01,783.05,6.664473,14.0,547.05,6.306367,10.0,,,,,,,,,
4,0,2010-05-01,857.4,6.75507,8.0,864.79,6.763642,11.0,,,,,,,,,
5,0,2010-06-01,112.22,4.729333,5.0,783.05,6.664473,14.0,,,,,,,,,
6,0,2010-07-01,268.53,5.59668,9.0,857.4,6.75507,8.0,547.05,6.306367,10.0,,,,,,
7,0,2010-08-01,719.03,6.579293,12.0,112.22,4.729333,5.0,864.79,6.763642,11.0,,,,,,
8,0,2010-09-01,535.2,6.284507,10.0,268.53,5.59668,9.0,783.05,6.664473,14.0,,,,,,
9,0,2010-10-01,-211.9624,-5.361116,4.0,719.03,6.579293,12.0,857.4,6.75507,8.0,,,,,,


In [20]:
# === ROLLING & EXPANDING FEATURES ===
# Fixed-width rolling windows (means, sums, stds, cv), momentum, acceleration, z-score, and EWM stats.
# Uses data ≤ t (rolling windows include current row "t").

# Parameters
ROLL_WINDOWS = [3, 6, 12]   # short, medium, long memory
EWM_ALPHAS = [0.5]          # list of alphas for EWM; default 0.5 (fast reaction)
EPS = 1e-8
ROLL_DDF = 0                # ddof for std (0 -> population std, avoids NaN for single obs)
MIN_PERIODS = 1             # allow shorter windows at start

# Ensure sorted
monthly = monthly.sort_values(['user_id', 'month']).reset_index(drop=True)

rolling_cols = []
ewm_cols = []
ewm_std_cols = []

# Rolling stats on winsorized amount
for w in ROLL_WINDOWS:
    roll_mean_col = f'roll_mean_{w}'
    roll_sum_col = f'roll_sum_{w}'
    roll_std_col = f'roll_std_{w}'
    monthly[roll_mean_col] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=w, min_periods=MIN_PERIODS).mean())
    monthly[roll_sum_col] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=w, min_periods=MIN_PERIODS).sum())
    # use ddof=0 to avoid NaN for single-value windows
    monthly[roll_std_col] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=w, min_periods=MIN_PERIODS).std(ddof=ROLL_DDF)).fillna(0)
    monthly[f'roll_cv_{w}'] = monthly[roll_std_col] / (monthly[roll_mean_col] + EPS)

    rolling_cols += [roll_mean_col, roll_sum_col, roll_std_col, f'roll_cv_{w}']

    # also compute on transformed amount if available
    if 'amt_t_trans' in monthly.columns:
        roll_mean_t = f'roll_mean_trans_{w}'
        roll_std_t = f'roll_std_trans_{w}'
        monthly[roll_mean_t] = monthly.groupby('user_id')['amt_t_trans'].transform(lambda x: x.rolling(window=w, min_periods=MIN_PERIODS).mean())
        monthly[roll_std_t] = monthly.groupby('user_id')['amt_t_trans'].transform(lambda x: x.rolling(window=w, min_periods=MIN_PERIODS).std(ddof=ROLL_DDF)).fillna(0)
        monthly[f'roll_cv_trans_{w}'] = monthly[roll_std_t] / (monthly[roll_mean_t].abs() + EPS)
        rolling_cols += [roll_mean_t, roll_std_t, f'roll_cv_trans_{w}']

# Momentum & acceleration
monthly['mom_1'] = monthly['amt_t_wins'] - monthly.groupby('user_id')['amt_t_wins'].shift(1)
monthly['mom_3'] = None
if 3 in ROLL_WINDOWS:
    # mom_3 defined as amt_t - mean(amt_{t-2..t}) --- uses the roll_mean_3 computed above
    monthly['mom_3'] = monthly['amt_t_wins'] - monthly['roll_mean_3']
else:
    monthly['mom_3'] = monthly['amt_t_wins'] - monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=3, min_periods=MIN_PERIODS).mean())

# accel = mom_1 - (amt_{t-1} - amt_{t-2})
prev_diff = monthly.groupby('user_id')['amt_t_wins'].shift(1) - monthly.groupby('user_id')['amt_t_wins'].shift(2)
monthly['accel'] = monthly['mom_1'] - prev_diff
rolling_cols += ['mom_1', 'mom_3', 'accel']

# Relative position (z-score)
if 6 in ROLL_WINDOWS:
    monthly['z_6'] = (monthly['amt_t_wins'] - monthly['roll_mean_6']) / (monthly['roll_std_6'] + EPS)
else:
    monthly['z_6'] = (monthly['amt_t_wins'] - monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=6, min_periods=MIN_PERIODS).mean())) / (
        monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.rolling(window=6, min_periods=MIN_PERIODS).std(ddof=ROLL_DDF)).fillna(0) + EPS
    )
rolling_cols += ['z_6']

# EWM stats (fast-reacting)
ewm_cols = []
ewm_std_cols = []
for alpha in EWM_ALPHAS:
    suffix = f'a{int(alpha*100):02d}'
    ewm_mean_col = f'ewm_mean_{suffix}'
    ewm_std_col = f'ewm_std_{suffix}'
    monthly[ewm_mean_col] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.ewm(alpha=alpha, adjust=False).mean())
    # ewm.std may produce NaN on first obs; fill with 0
    monthly[ewm_std_col] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.ewm(alpha=alpha, adjust=False).std()).fillna(0)
    rolling_cols += [ewm_mean_col, ewm_std_col]
    if 'amt_t_trans' in monthly.columns:
        emt = f'ewm_mean_trans_{suffix}'
        est = f'ewm_std_trans_{suffix}'
        monthly[emt] = monthly.groupby('user_id')['amt_t_trans'].transform(lambda x: x.ewm(alpha=alpha, adjust=False).mean())
        monthly[est] = monthly.groupby('user_id')['amt_t_trans'].transform(lambda x: x.ewm(alpha=alpha, adjust=False).std()).fillna(0)
        rolling_cols += [emt, est]

# Expanding stats (optional): expanding mean and count already exist in earlier steps as expanding_mean/expanding_count in supervised builder; include here if needed
monthly['expanding_mean'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.expanding(min_periods=1).mean())
monthly['expanding_count'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda x: x.expanding(min_periods=1).count())
rolling_cols += ['expanding_mean', 'expanding_count']

# Final housekeeping: ensure numeric dtypes and update core_transforms
rolling_cols = [c for c in rolling_cols if c in monthly.columns]
core_transforms['rolling_windows'] = ROLL_WINDOWS
core_transforms['ewm_alphas'] = EWM_ALPHAS
core_transforms['rolling_cols'] = rolling_cols
core_transforms['rolling_eps'] = EPS

print('Added rolling features:', len(rolling_cols), 'columns')
print(rolling_cols[:80])
monthly[['user_id','month','amt_t_wins'] + rolling_cols[:20]].head(10)

Added rolling features: 31 columns
['roll_mean_3', 'roll_sum_3', 'roll_std_3', 'roll_cv_3', 'roll_mean_trans_3', 'roll_std_trans_3', 'roll_cv_trans_3', 'roll_mean_6', 'roll_sum_6', 'roll_std_6', 'roll_cv_6', 'roll_mean_trans_6', 'roll_std_trans_6', 'roll_cv_trans_6', 'roll_mean_12', 'roll_sum_12', 'roll_std_12', 'roll_cv_12', 'roll_mean_trans_12', 'roll_std_trans_12', 'roll_cv_trans_12', 'mom_1', 'mom_3', 'accel', 'z_6', 'ewm_mean_a50', 'ewm_std_a50', 'ewm_mean_trans_a50', 'ewm_std_trans_a50', 'expanding_mean', 'expanding_count']


Unnamed: 0,user_id,month,amt_t_wins,roll_mean_3,roll_sum_3,roll_std_3,roll_cv_3,roll_mean_trans_3,roll_std_trans_3,roll_cv_trans_3,...,roll_cv_6,roll_mean_trans_6,roll_std_trans_6,roll_cv_trans_6,roll_mean_12,roll_sum_12,roll_std_12,roll_cv_12,roll_mean_trans_12,roll_std_trans_12
0,0,2010-01-01,547.05,547.05,547.05,0.0,0.0,6.306367,0.0,0.0,...,0.0,6.306367,0.0,0.0,547.05,547.05,0.0,0.0,6.306367,0.0
1,0,2010-02-01,864.79,705.92,1411.84,158.87,0.225054,6.535004,0.228638,0.034987,...,0.225054,6.535004,0.228638,0.034987,705.92,1411.84,158.87,0.225054,6.535004,0.228638
2,0,2010-03-01,783.05,731.63,2194.89,134.716218,0.184132,6.578161,0.196405,0.029857,...,0.184132,6.578161,0.196405,0.029857,731.63,2194.89,134.716218,0.184132,6.578161,0.196405
3,0,2010-04-01,857.4,835.08,2505.24,36.914258,0.044204,6.727728,0.044865,0.006669,...,0.168729,6.622388,0.186546,0.028169,763.0725,3052.29,128.752619,0.168729,6.622388,0.186546
4,0,2010-05-01,112.22,584.223333,1752.67,335.134143,0.57364,6.049625,0.93432,0.154443,...,0.449791,6.243777,0.775387,0.124186,632.902,3164.51,284.673894,0.449791,6.243777,0.775387
5,0,2010-06-01,268.53,412.716667,1238.15,320.848515,0.777406,5.693694,0.829844,0.145748,...,0.512451,6.135927,0.747782,0.121869,572.173333,3433.04,293.210748,0.512451,6.135927,0.747782
6,0,2010-07-01,719.03,366.593333,1099.78,257.25073,0.701733,5.635102,0.755732,0.134111,...,0.495518,6.181415,0.764872,0.123737,593.152857,4152.07,276.281729,0.465785,6.199265,0.709483
7,0,2010-08-01,535.2,507.586667,1522.76,184.949425,0.36437,6.153493,0.411708,0.066906,...,0.500757,6.101559,0.723827,0.11863,585.90875,4687.27,259.147611,0.4423,6.209921,0.664259
8,0,2010-09-01,-211.9624,347.422533,1042.2676,402.601536,1.158824,2.500895,5.560583,2.223437,...,0.961635,4.097295,4.284023,1.045574,497.2564,4475.3076,350.099207,0.704062,4.92425,3.68996
9,0,2010-10-01,262.56,195.265867,585.7976,308.71701,1.581009,2.165891,5.330289,2.461015,...,1.056433,3.900496,4.183349,1.072517,473.78676,4737.8676,339.514249,0.716597,4.989253,3.506031


# Expanding (cumulative) history features

Compute per-user cumulative statistics up to and including time t (no leakage):

- exp_mean, exp_sum, exp_std (cumulative mean, sum, std)
- seasonal_mean_cum (mean for same month_of_year up to t) and seasonal_ratio = seasonal_mean_cum / exp_mean
- behavioral persistence: nonzero_rate (fraction non-zero months up to t) and run_length_since_zero

These features provide a long-run baseline and capture persistence in behavior.

In [21]:
# === EXPANDING / CUMULATIVE FEATURES ===
# Cumulative (expanding) features per user up to time t.

# Reuse EPS & ROLL_DDF if defined, else fall back
EPS = globals().get('EPS', 1e-8)
ROLL_DDF = globals().get('ROLL_DDF', 0)

# Ensure sorted
monthly = monthly.sort_values(['user_id','month']).reset_index(drop=True)

# Cumulative mean, sum, std
monthly['exp_mean'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda s: s.expanding(min_periods=1).mean())
monthly['exp_sum'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda s: s.expanding(min_periods=1).sum())
monthly['exp_std'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda s: s.expanding(min_periods=1).std(ddof=ROLL_DDF)).fillna(0)

# Seasonal cumulative mean: mean of same month_of_year up to t
monthly['seasonal_mean_cum'] = monthly.groupby(['user_id','month_of_year'])['amt_t_wins'].transform(lambda s: s.expanding(min_periods=1).mean())
monthly['seasonal_ratio'] = monthly['seasonal_mean_cum'] / (monthly['exp_mean'] + EPS)
monthly['seasonal_ratio'] = monthly['seasonal_ratio'].replace([np.inf, -np.inf], np.nan).fillna(1.0)

# Behavioral persistence
monthly['nonzero_cum'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda s: s.gt(0).expanding(min_periods=1).sum())
monthly['months_seen_cum'] = monthly.groupby('user_id')['amt_t_wins'].transform(lambda s: s.expanding(min_periods=1).count())
monthly['nonzero_rate'] = monthly['nonzero_cum'] / (monthly['months_seen_cum'] + EPS)

# run_length_since_zero: months since last zero-spend month (0 if current month is zero)
def _run_length_since_zero(s):
    zm = s.eq(0)
    gid = zm.cumsum()
    out = gid.groupby(gid).cumcount().astype(int)
    # for pre-zero group (gid == 0) and non-zero values, interpret as months since start
    mask_pre_zero_nonzero = (~zm) & (gid == 0)
    out[mask_pre_zero_nonzero] = out[mask_pre_zero_nonzero] + 1
    out[zm] = 0
    return out

monthly['run_length_since_zero'] = monthly.groupby('user_id')['amt_t_wins'].transform(_run_length_since_zero)

# Housekeeping
exp_cols = ['exp_mean','exp_sum','exp_std','seasonal_mean_cum','seasonal_ratio','nonzero_cum','months_seen_cum','nonzero_rate','run_length_since_zero']
core_transforms['expanding_cols'] = exp_cols

print('Added expanding (cumulative) features:', exp_cols)
monthly[['user_id','month'] + exp_cols].head(12)

Added expanding (cumulative) features: ['exp_mean', 'exp_sum', 'exp_std', 'seasonal_mean_cum', 'seasonal_ratio', 'nonzero_cum', 'months_seen_cum', 'nonzero_rate', 'run_length_since_zero']


Unnamed: 0,user_id,month,exp_mean,exp_sum,exp_std,seasonal_mean_cum,seasonal_ratio,nonzero_cum,months_seen_cum,nonzero_rate,run_length_since_zero
0,0,2010-01-01,547.05,547.05,0.0,547.05,1.0,1.0,1.0,1.0,1
1,0,2010-02-01,705.92,1411.84,158.87,864.79,1.225054,2.0,2.0,1.0,2
2,0,2010-03-01,731.63,2194.89,134.716218,783.05,1.070281,3.0,3.0,1.0,3
3,0,2010-04-01,763.0725,3052.29,128.752619,857.4,1.123615,4.0,4.0,1.0,4
4,0,2010-05-01,632.902,3164.51,284.673894,112.22,0.17731,5.0,5.0,1.0,5
5,0,2010-06-01,572.173333,3433.04,293.210748,268.53,0.469316,6.0,6.0,1.0,6
6,0,2010-07-01,593.152857,4152.07,276.281729,719.03,1.212217,7.0,7.0,1.0,7
7,0,2010-08-01,585.90875,4687.27,259.147611,535.2,0.913453,8.0,8.0,1.0,8
8,0,2010-09-01,497.2564,4475.3076,350.099207,-211.9624,-0.426264,8.0,9.0,0.888889,9
9,0,2010-10-01,473.78676,4737.8676,339.514249,262.56,0.554173,9.0,10.0,0.9,10


# Category / mix features (top-k shares, trends, entropy, weekend bias)

If a category column exists (e.g., `category`, `mcc`), this cell will:

- pick top-K categories from the TRAIN partition,
- compute per-user-month shares for each top-K category and an 'other' share,
- compute 3-month rolling averages of shares and change vs 12-month average,
- compute category entropy per month (diversity of spending),
- compute weekend_share per month (weekend vs weekday allocation).

This is optional and skipped if no category-like column is found.

In [22]:
# === CATEGORY / MIX FEATURES ===
import re

# Candidate category-like columns in transactions
cat_candidates = ['category', 'mcc', 'merchant_id', 'merchant_city', 'description']
cat_col = next((c for c in cat_candidates if c in df.columns), None)
TOP_K = 5

if cat_col is None:
    print('No category-like column found in transactions; skipping category/mix features.')
else:
    print('Using category column:', cat_col)
    # Use TRAIN transactions to compute global top-K categories to avoid label leakage
    train_tx = df[df['month'] < core_transforms['val_start']]
    topk = train_tx[cat_col].value_counts().nlargest(TOP_K).index.tolist()
    print('Top-k categories (train):', topk)

    # per-user-month category amounts
    cat_monthly = (
        df
        .groupby(['user_id','month', cat_col], as_index=False)
        ['amount']
        .sum()
        .rename(columns={'amount':'amt_cat'})
    )

    # pivot only for top-k categories (others will be aggregated into 'other')
    cat_top = cat_monthly[cat_monthly[cat_col].isin(topk)].copy()
    cat_wide = cat_top.pivot_table(index=['user_id','month'], columns=cat_col, values='amt_cat', aggfunc='sum').fillna(0)
    cat_wide = cat_wide.reset_index()

    # merge top-k amounts into monthly
    monthly = monthly.merge(cat_wide, on=['user_id','month'], how='left')
    # fill NA for top-k columns with 0
    for c in topk:
        if c in monthly.columns:
            monthly[c] = monthly[c].fillna(0)

    # compute other amount as remainder
    top_cols = [c for c in topk if c in monthly.columns]
    monthly['amt_topk_sum'] = monthly[top_cols].sum(axis=1) if top_cols else 0.0
    monthly['amt_other'] = (monthly['amt_t'] - monthly['amt_topk_sum']).clip(lower=0.0)

    # safe denominator
    denom = monthly['amt_t'].clip(lower=EPS)

    # create sanitized share column names
    def _sanitize(x):
        s = re.sub(r'[^0-9a-zA-Z]+', '_', str(x)).lower()
        return s[:40]

    share_cols = []
    for c in top_cols:
        sc = f'share_{_sanitize(c)}'
        monthly[sc] = monthly[c] / denom
        share_cols.append(sc)
        # rolling mean and change vs 12-month average
        monthly[f'{sc}_roll3'] = monthly.groupby('user_id')[sc].transform(lambda s: s.rolling(3, min_periods=1).mean())
        monthly[f'{sc}_chg_vs12'] = monthly[sc] - monthly.groupby('user_id')[sc].transform(lambda s: s.rolling(12, min_periods=1).mean())

    # other share
    monthly['share_other'] = monthly['amt_other'] / denom

    # entropy over top_k + other
    ent_cols = share_cols + ['share_other']
    monthly[ent_cols] = monthly[ent_cols].fillna(0)
    monthly['cat_entropy'] = -(monthly[ent_cols] * np.log(monthly[ent_cols] + EPS)).sum(axis=1)

    # weekend vs weekday bias
    df['is_weekend'] = df['date'].dt.weekday >= 5
    weekend_sum = (
        df[df['is_weekend']]
        .groupby(['user_id','month'], as_index=False)['amount']
        .sum()
        .rename(columns={'amount':'amt_weekend'})
    )
    monthly = monthly.merge(weekend_sum, on=['user_id','month'], how='left')
    monthly['amt_weekend'] = monthly['amt_weekend'].fillna(0)
    monthly['weekend_share'] = monthly['amt_weekend'] / denom

    # housekeeping: collect feature names
    cat_feature_cols = share_cols + [f'{c}_roll3' for c in share_cols] + [f'{c}_chg_vs12' for c in share_cols] + ['share_other','cat_entropy','weekend_share']
    core_transforms['category_topk'] = topk
    core_transforms['category_feature_cols'] = cat_feature_cols

    print('Added category/mix features:', cat_feature_cols)
    monthly[['user_id','month','amt_t'] + cat_feature_cols].head(10)

Using category column: category
Top-k categories (train): ['Food & Dining', 'Transportation & Travel', 'Shopping & Retail', 'Financial & Insurance', 'Home & Utilities']
Added category/mix features: ['share_food_dining', 'share_transportation_travel', 'share_shopping_retail', 'share_financial_insurance', 'share_home_utilities', 'share_food_dining_roll3', 'share_transportation_travel_roll3', 'share_shopping_retail_roll3', 'share_financial_insurance_roll3', 'share_home_utilities_roll3', 'share_food_dining_chg_vs12', 'share_transportation_travel_chg_vs12', 'share_shopping_retail_chg_vs12', 'share_financial_insurance_chg_vs12', 'share_home_utilities_chg_vs12', 'share_other', 'cat_entropy', 'weekend_share']


  result = func(self.values, **kwargs)


# Static (time-invariant) user features

Add demographic and static user-level features that are repeated for every month for each user:

- age_bucket (computed from birth year or current age)
- income_bucket (quantile buckets computed on TRAIN users only)
- region (derived from coordinates via KMeans on TRAIN users or extracted from address)
- account_tenure_months (months since user start date at each t)

These are stable or monotone features that help models learn cross-sectional differences.

In [None]:
import re

# Candidate static columns to pull from df
static_candidates = [
    'user_current_age', 'user_birth_year', 'user_birth_month',
    'user_yearly_income', 'user_per_capita_income', 'user_gender',
    'user_total_debt', 'user_credit_score', 'user_num_credit_cards',
    'user_address', 'user_latitude', 'user_longitude', 'zip', 'card_acct_open_date'
]
available_static = [c for c in static_candidates if c in df.columns]
print('Available static columns:', available_static)

# Build a per-user static table (last-known values)
user_static = df.sort_values('date').drop_duplicates('user_id', keep='last')[['user_id'] + available_static].copy()

# add first-transaction date per user (useful for tenure)
user_first_tx = df.groupby('user_id', as_index=False)['date'].min().rename(columns={'date':'first_tx_date'})
user_static = user_static.merge(user_first_tx, on='user_id', how='left')

# card account open date (if available) - take earliest per user
if 'card_acct_open_date' in df.columns:
    card_dates = df[['user_id','card_acct_open_date']].dropna().copy()
    card_dates['card_acct_open_date'] = pd.to_datetime(card_dates['card_acct_open_date'], errors='coerce')
    user_card_open = card_dates.groupby('user_id', as_index=False)['card_acct_open_date'].min()
    user_static = user_static.merge(user_card_open, on='user_id', how='left')
else:
    user_static['card_acct_open_date'] = pd.NaT

# determine a conservative start date per user: earliest of first_tx_date and card_acct_open_date
# Ensure both columns are datetimelike and coerce invalid values to NaT before taking row-wise min
user_static['first_tx_date'] = pd.to_datetime(user_static.get('first_tx_date'), errors='coerce')
user_static['card_acct_open_date'] = pd.to_datetime(user_static.get('card_acct_open_date'), errors='coerce')
user_static['user_start_date'] = user_static[['first_tx_date','card_acct_open_date']].min(axis=1)

# AGE: compute age at each month if birth year available, else use user_current_age as static
if 'user_birth_year' in user_static.columns:
    # merge birth year into monthly to compute age at each month
    monthly = monthly.merge(user_static[['user_id','user_birth_year']], on='user_id', how='left')
    monthly['age_at_t'] = monthly['month'].dt.year - monthly['user_birth_year']
else:
    if 'user_current_age' in user_static.columns:
        monthly = monthly.merge(user_static[['user_id','user_current_age']], on='user_id', how='left')
        monthly['age_at_t'] = monthly['user_current_age']
    else:
        monthly['age_at_t'] = np.nan

# Age bucket (fixed bins)
AGE_BINS = [0,25,35,45,55,65,200]
AGE_LABELS = ['<25','25-34','35-44','45-54','55-64','65+']
monthly['age_bucket'] = pd.cut(monthly['age_at_t'], bins=AGE_BINS, labels=AGE_LABELS, include_lowest=True).astype(object).fillna('unknown')

# INCOME: choose yearly income then per-capita as fallback
income_col = 'user_yearly_income' if 'user_yearly_income' in user_static.columns else ('user_per_capita_income' if 'user_per_capita_income' in user_static.columns else None)
if income_col is None:
    monthly['income_bucket'] = 'income_unknown'
    user_static['income'] = np.nan
else:
    user_static['income'] = user_static[income_col]
    # compute quantile thresholds on TRAIN USERS only
    train_user_ids = monthly.loc[train_mask, 'user_id'].unique()
    train_incomes = user_static.loc[user_static['user_id'].isin(train_user_ids), 'income'].dropna()
    if train_incomes.nunique() >= 5:
        # robust quantile bins (20% increments)
        probs = [0.0, 0.2, 0.4, 0.6, 0.8, 1.0]
        quantile_vals = np.unique(train_incomes.quantile(probs).values)
        if len(quantile_vals) >= 3:
            bins = quantile_vals
            labels = [f'inc_q{i+1}' for i in range(len(bins)-1)]
            user_static['income_bucket'] = pd.cut(user_static['income'], bins=bins, labels=labels, include_lowest=True).astype(object)
        else:
            # fallback to median split
            med = train_incomes.median()
            user_static['income_bucket'] = np.where(user_static['income'] >= med, 'inc_high', 'inc_low')
    elif train_incomes.nunique() >= 2:
        med = train_incomes.median()
        user_static['income_bucket'] = np.where(user_static['income'] >= med, 'inc_high', 'inc_low')
    else:
        user_static['income_bucket'] = 'income_unknown'

    user_static['income_bucket'] = user_static['income_bucket'].fillna('income_unknown')
    monthly = monthly.merge(user_static[['user_id','income','income_bucket']], on='user_id', how='left')

# REGION: use lat/lon clustering if available, otherwise try to extract state from address, else unknown
if 'user_latitude' in user_static.columns and 'user_longitude' in user_static.columns:
    coords = user_static[['user_id','user_latitude','user_longitude']].dropna()
    # fit KMeans on TRAIN users with coordinates
    train_user_ids = monthly.loc[train_mask, 'user_id'].unique()
    coord_train = coords[coords['user_id'].isin(train_user_ids)]
    n_unique_coords = coord_train[['user_latitude','user_longitude']].drop_duplicates().shape[0]
    N_REGIONS = 4
    if n_unique_coords >= 2:
        n_clusters = min(N_REGIONS, n_unique_coords)
        kmeans = KMeans(n_clusters=n_clusters, random_state=42)
        kmeans.fit(coord_train[['user_latitude','user_longitude']])
        # assign cluster for all users with coords
        coords['region_cluster'] = kmeans.predict(coords[['user_latitude','user_longitude']])
        coords['region'] = coords['region_cluster'].apply(lambda x: f'region_{x}')
        # merge region into user_static
        user_static = user_static.merge(coords[['user_id','region']], on='user_id', how='left')
        core_transforms['region_kmeans'] = kmeans
    else:
        user_static['region'] = 'region_unknown'
else:
    # try to extract state from 'user_address' if present
    if 'user_address' in user_static.columns:
        # naive extraction: look for two-letter state code before ZIP or at the end
        def _extract_state(addr):
            if pd.isna(addr):
                return None
            # common patterns: 'City, ST 12345' or 'City, ST'
            m = re.search(r',\s*([A-Z]{2})(?:\s|$)', str(addr))
            if m:
                return m.group(1)
            return None
        user_static['region'] = user_static['user_address'].apply(_extract_state).fillna('region_unknown')
    else:
        user_static['region'] = 'region_unknown'

# Merge region into monthly (if not already)
if 'region' not in monthly.columns:
    monthly = monthly.merge(user_static[['user_id','region']], on='user_id', how='left')

# ACCOUNT TENURE (months since user start) — compute per month t
# ensure user_start_date exists
user_static['user_start_date'] = pd.to_datetime(user_static.get('user_start_date'), errors='coerce')

monthly = monthly.merge(user_static[['user_id','user_start_date']], on='user_id', how='left')
monthly['user_start_date'] = pd.to_datetime(monthly['user_start_date'], errors='coerce')
# compute months difference vectorized
start_month_num = monthly['user_start_date'].dt.year * 12 + monthly['user_start_date'].dt.month
month_num = monthly['month'].dt.year * 12 + monthly['month'].dt.month
monthly['account_tenure_months'] = (month_num - start_month_num).fillna(0).astype(int)
monthly.loc[monthly['account_tenure_months'] < 0, 'account_tenure_months'] = 0

# Housekeeping: collect static feature column names
static_feature_cols = ['age_at_t','age_bucket','income','income_bucket','region','account_tenure_months']
core_transforms['static_feature_cols'] = static_feature_cols

print('Added static user features:', static_feature_cols)
monthly[['user_id','month'] + static_feature_cols].head(12)

Available static columns: ['user_current_age', 'user_birth_year', 'user_birth_month', 'user_yearly_income', 'user_per_capita_income', 'user_gender', 'user_total_debt', 'user_credit_score', 'user_num_credit_cards', 'user_address', 'user_latitude', 'user_longitude', 'zip', 'card_acct_open_date']


TypeError: '<=' not supported between instances of 'Timestamp' and 'float'