# Feature Engineering for Financial Distress Prediction

**Purpose**: Transform transaction data into ML-ready features with temporal structure.

## Pipeline Steps:
1. **Daily Aggregations**: Transaction summaries per customer-day
2. **Rolling Windows**: 30-day moving statistics (sums, averages, ratios)
3. **Drift Features**: Recent behavior vs. baseline comparisons
4. **Label Integration**: Join with daily_labels for supervised learning
5. **Temporal Split**: Train/test split respecting time order

**Output**: ML-ready dataset with features and labels, properly split for time-series validation.


In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Load data
DATA_DIR = Path("../data")
print("Loading data...")

customers = pd.read_csv(DATA_DIR / "customers.csv", parse_dates=["signup_date"])
transactions = pd.read_csv(DATA_DIR / "transactions.csv", parse_dates=["date"])
outcomes = pd.read_csv(DATA_DIR / "outcomes.csv", parse_dates=["distress_start_date", "event_date"])
daily_labels = pd.read_csv(DATA_DIR / "daily_labels.csv", parse_dates=["date"])

print(f"Loaded: {len(customers):,} customers, {len(transactions):,} transactions")
print(f"Date range: {transactions['date'].min().date()} → {transactions['date'].max().date()}")


Loading data...
Loaded: 2,500 customers, 1,122,312 transactions
Date range: 2025-02-16 → 2025-08-20


## 1. Daily Transaction Aggregations

Create comprehensive daily summaries per customer with category breakdowns and behavioral indicators.


In [2]:
def create_daily_features(transactions_df):
    """
    Aggregate transactions into daily features per customer.
    
    Returns DataFrame with columns:
    - Basic: total_spend, total_income, net_flow, tx_count
    - Categories: spend by category (groceries, transport, etc.)
    - Risk indicators: cash_advance_amt, payday_loan_amt, bills_delayed
    - Behavioral: unique_merchants, weekend_spend_ratio
    """
    
    # Separate income vs spending
    income_tx = transactions_df[transactions_df['is_income'] == True].copy()
    spend_tx = transactions_df[transactions_df['is_income'] == False].copy()
    
    print("Aggregating daily features...")
    
    # Basic daily aggregations
    daily_basic = transactions_df.groupby(['customer_id', 'date']).agg({
        'amount': ['sum', 'count'],
        'merchant': 'nunique'
    }).round(2)
    
    daily_basic.columns = ['total_flow', 'tx_count', 'unique_merchants']
    daily_basic = daily_basic.reset_index()
    
    # Income aggregations
    daily_income = income_tx.groupby(['customer_id', 'date'])['amount'].sum().reset_index()
    daily_income.columns = ['customer_id', 'date', 'total_income']
    
    # Spending aggregations (negative amounts, so we'll make them positive)
    spend_tx['spend_amount'] = -spend_tx['amount']  # Convert to positive spending
    
    daily_spend = spend_tx.groupby(['customer_id', 'date'])['spend_amount'].sum().reset_index()
    daily_spend.columns = ['customer_id', 'date', 'total_spend']
    
    # Category breakdown (spending only)
    category_spend = spend_tx.groupby(['customer_id', 'date', 'category'])['spend_amount'].sum().unstack(fill_value=0)
    category_spend = category_spend.reset_index()
    
    # Add category prefixes to avoid confusion
    category_cols = [col for col in category_spend.columns if col not in ['customer_id', 'date']]
    category_spend = category_spend.rename(columns={col: f'spend_{col}' for col in category_cols})
    
    # Risk indicators
    risk_categories = ['cash_advance', 'payday_loan']
    risk_tx = spend_tx[spend_tx['category'].isin(risk_categories)]
    
    daily_risk = risk_tx.groupby(['customer_id', 'date', 'category'])['spend_amount'].sum().unstack(fill_value=0)
    daily_risk = daily_risk.reset_index()
    if 'cash_advance' in daily_risk.columns:
        daily_risk = daily_risk.rename(columns={'cash_advance': 'cash_advance_amt'})
    if 'payday_loan' in daily_risk.columns:
        daily_risk = daily_risk.rename(columns={'payday_loan': 'payday_loan_amt'})
    
    # Weekend indicator
    transactions_df['is_weekend'] = transactions_df['date'].dt.dayofweek >= 5
    weekend_spend = spend_tx.merge(transactions_df[['customer_id', 'date', 'is_weekend']], 
                                   on=['customer_id', 'date'], how='left')
    
    daily_weekend = weekend_spend.groupby(['customer_id', 'date']).agg({
        'spend_amount': 'sum',
        'is_weekend': 'first'
    }).reset_index()
    
    # Merge all daily features
    daily_features = daily_basic
    
    for df in [daily_income, daily_spend, category_spend, daily_risk]:
        daily_features = daily_features.merge(df, on=['customer_id', 'date'], how='left')
    
    # Fill missing values
    daily_features = daily_features.fillna(0)
    
    # Calculate net flow
    daily_features['net_flow'] = daily_features['total_income'] - daily_features['total_spend']
    
    # Add weekend flag
    daily_features['is_weekend'] = daily_features['date'].dt.dayofweek >= 5
    
    return daily_features

# Create daily features
daily_features = create_daily_features(transactions)
print(f"Created daily features: {len(daily_features):,} customer-days, {len(daily_features.columns)} features")
print("\\nFeature columns:", list(daily_features.columns))


Aggregating daily features...
Created daily features: 410,073 customer-days, 20 features
\nFeature columns: ['customer_id', 'date', 'total_flow', 'tx_count', 'unique_merchants', 'total_income', 'total_spend', 'spend_cash_advance', 'spend_dining', 'spend_ecommerce', 'spend_entertainment', 'spend_groceries', 'spend_payday_loan', 'spend_rent', 'spend_transport', 'spend_utilities', 'cash_advance_amt', 'payday_loan_amt', 'net_flow', 'is_weekend']


## 2. Rolling Window Features

Build 30-day rolling statistics to capture recent behavioral patterns and trends.


In [3]:
def add_rolling_features(daily_df, window=30):
    """
    Add rolling window features (30-day moving statistics).
    
    For each customer, compute rolling:
    - Sums: spending, income, transaction counts
    - Averages: daily spending patterns
    - Ratios: cash advance / total spend, etc.
    - Volatility: standard deviation of daily patterns
    """
    
    print(f"Computing {window}-day rolling features...")
    
    # Sort by customer and date
    df = daily_df.sort_values(['customer_id', 'date']).copy()
    
    # Key columns for rolling stats
    rolling_cols = [
        'total_spend', 'total_income', 'net_flow', 'tx_count',
        'cash_advance_amt', 'payday_loan_amt', 'unique_merchants'
    ]
    
    # Add category spending if they exist
    category_cols = [col for col in df.columns if col.startswith('spend_') and col != 'spend_amount']
    rolling_cols.extend(category_cols)
    
    # Rolling sums (30-day totals)
    rolling_sums = df.groupby('customer_id')[rolling_cols].rolling(
        window=window, min_periods=1
    ).sum().reset_index(level=0, drop=True)
    
    rolling_sums = rolling_sums.add_suffix(f'_{window}d_sum')
    
    # Rolling means (30-day averages)
    rolling_means = df.groupby('customer_id')[rolling_cols].rolling(
        window=window, min_periods=1
    ).mean().reset_index(level=0, drop=True)
    
    rolling_means = rolling_means.add_suffix(f'_{window}d_avg')
    
    # Rolling standard deviations (volatility)
    volatility_cols = ['total_spend', 'net_flow', 'tx_count']
    rolling_stds = df.groupby('customer_id')[volatility_cols].rolling(
        window=window, min_periods=2
    ).std().reset_index(level=0, drop=True)
    
    rolling_stds = rolling_stds.add_suffix(f'_{window}d_std')
    
    # Combine with original data
    result = pd.concat([df, rolling_sums, rolling_means, rolling_stds], axis=1)
    
    # Derived ratios
    result[f'cash_advance_ratio_{window}d'] = (
        result[f'cash_advance_amt_{window}d_sum'] / 
        (result[f'total_spend_{window}d_sum'] + 1e-8)  # Avoid division by zero
    )
    
    result[f'payday_loan_ratio_{window}d'] = (
        result[f'payday_loan_amt_{window}d_sum'] / 
        (result[f'total_spend_{window}d_sum'] + 1e-8)
    )
    
    result[f'income_spend_ratio_{window}d'] = (
        result[f'total_income_{window}d_sum'] / 
        (result[f'total_spend_{window}d_sum'] + 1e-8)
    )
    
    # Transaction frequency
    result[f'avg_daily_tx_{window}d'] = result[f'tx_count_{window}d_sum'] / window
    
    return result

# Add rolling features
features_with_rolling = add_rolling_features(daily_features, window=30)
print(f"Added rolling features: {len(features_with_rolling.columns)} total columns")

# Show some key rolling features
rolling_feature_cols = [col for col in features_with_rolling.columns if '_30d_' in col or '_ratio_' in col]
print(f"\\nKey rolling features ({len(rolling_feature_cols)}): {rolling_feature_cols[:10]}...")


Computing 30-day rolling features...
Added rolling features: 59 total columns
\nKey rolling features (38): ['total_spend_30d_sum', 'total_income_30d_sum', 'net_flow_30d_sum', 'tx_count_30d_sum', 'cash_advance_amt_30d_sum', 'payday_loan_amt_30d_sum', 'unique_merchants_30d_sum', 'spend_cash_advance_30d_sum', 'spend_dining_30d_sum', 'spend_ecommerce_30d_sum']...


## 3. Drift Features

Compare recent behavior (last 7-14 days) against baseline (earlier period) to detect behavioral changes.


In [4]:
def add_drift_features(df, recent_days=7, baseline_days=30):
    """
    Add drift features comparing recent vs baseline behavior.
    
    For each customer-date:
    - Recent: average of last 7 days
    - Baseline: average of days 14-44 ago (avoiding overlap)
    - Drift: recent / baseline ratios
    """
    
    print(f"Computing drift features (recent {recent_days}d vs baseline {baseline_days}d)...")
    
    df = df.sort_values(['customer_id', 'date']).copy()
    
    # Columns to analyze for drift
    drift_cols = [
        'total_spend', 'cash_advance_amt', 'payday_loan_amt', 
        'tx_count', 'unique_merchants'
    ]
    
    # Recent period (last 7 days)
    recent_stats = df.groupby('customer_id')[drift_cols].rolling(
        window=recent_days, min_periods=1
    ).mean().reset_index(level=0, drop=True)
    recent_stats = recent_stats.add_suffix('_recent_avg')
    
    # Baseline period (days 14-44 ago, shifted to avoid overlap)
    # We'll use a longer rolling window and shift it
    baseline_stats = df.groupby('customer_id')[drift_cols].rolling(
        window=baseline_days, min_periods=5  # Need at least 5 days for baseline
    ).mean().shift(14).reset_index(level=0, drop=True)  # Shift 14 days back
    baseline_stats = baseline_stats.add_suffix('_baseline_avg')
    
    # Combine
    result = pd.concat([df, recent_stats, baseline_stats], axis=1)
    
    # Calculate drift ratios
    for col in drift_cols:
        recent_col = f'{col}_recent_avg'
        baseline_col = f'{col}_baseline_avg'
        drift_col = f'{col}_drift_ratio'
        
        result[drift_col] = (
            result[recent_col] / (result[baseline_col] + 1e-8)
        )
        
        # Cap extreme ratios
        result[drift_col] = result[drift_col].clip(0, 10)
    
    # Special risk drift indicators
    result['risk_spend_drift'] = (
        (result['cash_advance_amt_recent_avg'] + result['payday_loan_amt_recent_avg']) /
        (result['cash_advance_amt_baseline_avg'] + result['payday_loan_amt_baseline_avg'] + 1e-8)
    ).clip(0, 10)
    
    # Spending acceleration (recent vs 30-day average)
    result['spend_acceleration'] = (
        result['total_spend_recent_avg'] / (result['total_spend_30d_avg'] + 1e-8)
    ).clip(0, 5)
    
    return result

# Add drift features
features_with_drift = add_drift_features(features_with_rolling)
print(f"Added drift features: {len(features_with_drift.columns)} total columns")

# Show drift feature columns
drift_feature_cols = [col for col in features_with_drift.columns if 'drift' in col or 'acceleration' in col]
print(f"\\nDrift features ({len(drift_feature_cols)}): {drift_feature_cols}")


Computing drift features (recent 7d vs baseline 30d)...
Added drift features: 76 total columns
\nDrift features (7): ['total_spend_drift_ratio', 'cash_advance_amt_drift_ratio', 'payday_loan_amt_drift_ratio', 'tx_count_drift_ratio', 'unique_merchants_drift_ratio', 'risk_spend_drift', 'spend_acceleration']


## 4. Add Customer Static Features

Merge customer demographics and account information.


In [5]:
# Add customer static features
customer_features = customers[[
    'customer_id', 'age', 'tenure_months', 'base_income', 
    'credit_limit', 'rent_amount', 'util_weekly'
]].copy()

# Derived customer features
customer_features['rent_income_ratio'] = -customer_features['rent_amount'] / customer_features['base_income']
customer_features['credit_utilization_capacity'] = customer_features['credit_limit'] / customer_features['base_income']
customer_features['monthly_util_cost'] = -customer_features['util_weekly'] * 4.3

# Merge with daily features
all_features = features_with_drift.merge(customer_features, on='customer_id', how='left')

print(f"Added customer features: {len(all_features.columns)} total columns")
print(f"Final feature dataset: {len(all_features):,} rows × {len(all_features.columns)} columns")


Added customer features: 85 total columns
Final feature dataset: 410,073 rows × 85 columns


## 5. Join with Labels

Add the target variable for supervised learning.


In [6]:
# Join with daily labels
ml_dataset = all_features.merge(
    daily_labels[['customer_id', 'date', 'label']], 
    on=['customer_id', 'date'], 
    how='inner'
)

print(f"ML dataset with labels: {len(ml_dataset):,} rows")
print(f"Label distribution: {ml_dataset['label'].value_counts().to_dict()}")
print(f"Positive label rate: {ml_dataset['label'].mean():.3f}")

# Show feature summary
feature_cols = [col for col in ml_dataset.columns if col not in 
               ['customer_id', 'date', 'label', 'is_weekend']]
print(f"\\nTotal features for modeling: {len(feature_cols)}")


ML dataset with labels: 408,749 rows
Label distribution: {0: 404939, 1: 3810}
Positive label rate: 0.009
\nTotal features for modeling: 82


## 6. Temporal Train/Test Split

Split data chronologically to respect time-series nature and avoid data leakage.


In [7]:
def temporal_train_test_split(df, test_days=30, gap_days=7):
    """
    Split dataset temporally with a gap to prevent leakage.
    
    Args:
        df: Dataset with 'date' column
        test_days: Number of days for test set (from the end)
        gap_days: Gap between train and test to prevent leakage
    
    Returns:
        train_df, test_df
    """
    
    max_date = df['date'].max()
    min_date = df['date'].min()
    
    # Define split points
    test_start = max_date - pd.Timedelta(days=test_days)
    train_end = test_start - pd.Timedelta(days=gap_days)
    
    print(f"Temporal split:")
    print(f"  Data range: {min_date.date()} → {max_date.date()}")
    print(f"  Train: {min_date.date()} → {train_end.date()}")
    print(f"  Gap: {train_end.date()} → {test_start.date()} ({gap_days} days)")
    print(f"  Test: {test_start.date()} → {max_date.date()} ({test_days} days)")
    
    # Split
    train_df = df[df['date'] <= train_end].copy()
    test_df = df[df['date'] >= test_start].copy()
    
    return train_df, test_df

# Perform temporal split
train_data, test_data = temporal_train_test_split(ml_dataset, test_days=30, gap_days=7)

print(f"\\nSplit results:")
print(f"  Train: {len(train_data):,} rows, {train_data['label'].mean():.3f} positive rate")
print(f"  Test:  {len(test_data):,} rows, {test_data['label'].mean():.3f} positive rate")

# Customer coverage
train_customers = train_data['customer_id'].nunique()
test_customers = test_data['customer_id'].nunique()
overlap_customers = len(set(train_data['customer_id']) & set(test_data['customer_id']))

print(f"\\nCustomer coverage:")
print(f"  Train: {train_customers:,} unique customers")
print(f"  Test:  {test_customers:,} unique customers")
print(f"  Overlap: {overlap_customers:,} customers in both sets")


Temporal split:
  Data range: 2025-02-16 → 2025-08-14
  Train: 2025-02-16 → 2025-07-08
  Gap: 2025-07-08 → 2025-07-15 (7 days)
  Test: 2025-07-15 → 2025-08-14 (30 days)
\nSplit results:
  Train: 324,642 rows, 0.009 positive rate
  Test:  70,633 rows, 0.010 positive rate
\nCustomer coverage:
  Train: 2,500 unique customers
  Test:  2,500 unique customers
  Overlap: 2,500 customers in both sets
