## Feature Engineering

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

### 1. Load Data

In [None]:
# Fixed snapshot date; later cells measure day offsets against it.
REFERENCE_DATE = pd.Timestamp('2024-01-01')

# Transactions: read the raw extract, then convert the date column to datetime.
df_transactions = pd.read_csv('../data/transactions.csv')
df_transactions['transaction_date'] = pd.to_datetime(df_transactions['transaction_date'])

# Offer interactions: same treatment for the send date.
df_offers = pd.read_csv('../data/offer_interactions.csv')
df_offers['sent_date'] = pd.to_datetime(df_offers['sent_date'])

# Quick row-count sanity check on both inputs.
print(f"Transactions: {len(df_transactions):,} | Offers: {len(df_offers):,}")

### 2. RFM Features (Recency, Frequency, Monetary)

In [None]:
# Per-customer purchase summary: first/last purchase dates, number of
# transactions, and spend statistics (numeric columns rounded to 2 decimals).
rfm = df_transactions.groupby('customer_id').agg(
    last_purchase=('transaction_date', 'max'),
    first_purchase=('transaction_date', 'min'),
    frequency=('transaction_id', 'count'),
    monetary_total=('amount', 'sum'),
    monetary_avg=('amount', 'mean'),
    monetary_min=('amount', 'min'),
    monetary_max=('amount', 'max'),
    monetary_std=('amount', 'std')
).round(2)

# Day offsets relative to the fixed REFERENCE_DATE snapshot, plus the span
# between a customer's first and last purchase.
rfm['recency_days'] = (REFERENCE_DATE - rfm['last_purchase']).dt.days
rfm['first_purchase_days'] = (REFERENCE_DATE - rfm['first_purchase']).dt.days
rfm['customer_tenure_days'] = (rfm['last_purchase'] - rfm['first_purchase']).dt.days

# The raw dates are no longer needed once the day counts exist.
rfm = rfm.drop(columns=['last_purchase', 'first_purchase'])
# The standard deviation is undefined (NaN) for a single transaction; use 0.
rfm['monetary_std'] = rfm['monetary_std'].fillna(0)

# n purchases are separated by n - 1 gaps, so the mean gap is
# tenure / (n - 1). Dividing by n (as before) understated the gap, e.g. two
# purchases 30 days apart produced 15 instead of 30. The clip keeps the
# divisor at 1 for one-time buyers, whose tenure is 0, so they still get 0.
rfm['avg_days_between_purchases'] = (
    rfm['customer_tenure_days'] / (rfm['frequency'] - 1).clip(lower=1)
).round(1)
rfm['is_one_time_buyer'] = (rfm['frequency'] == 1).astype(int)

rfm.head()

### 3. Flag Features

These binary flags help the model capture non-linear thresholds that drive redemption.

In [None]:
# --- Purchase-count tiers (thresholds on number of transactions) ---
rfm['is_frequent'] = rfm['frequency'].ge(8).astype(int)
rfm['is_very_frequent'] = rfm['frequency'].ge(12).astype(int)
rfm['is_vip'] = rfm['frequency'].ge(15).astype(int)

# --- Recency tiers (days since last purchase) ---
rfm['is_recent'] = rfm['recency_days'].le(60).astype(int)
rfm['is_very_recent'] = rfm['recency_days'].le(30).astype(int)
rfm['is_lapsed'] = rfm['recency_days'].gt(365).astype(int)
# Semi-lapsed covers the half-open window (180, 365] days.
rfm['is_semi_lapsed'] = (
    rfm['recency_days'].gt(180) & rfm['recency_days'].le(365)
).astype(int)

# --- Spend tiers: relative (quartiles of average spend) and absolute ---
monetary_q25, monetary_q75 = rfm['monetary_avg'].quantile([0.25, 0.75])
rfm['is_high_spender'] = rfm['monetary_avg'].gt(monetary_q75).astype(int)
rfm['is_low_spender'] = rfm['monetary_avg'].lt(monetary_q25).astype(int)
rfm['is_premium'] = rfm['monetary_avg'].gt(120).astype(int)
rfm['is_budget'] = rfm['monetary_avg'].lt(70).astype(int)

# --- Interaction flags: set only when both component flags are set ---
rfm['is_active_high_value'] = (
    rfm['is_frequent'].eq(1) & rfm['is_high_spender'].eq(1)
).astype(int)
rfm['is_recent_frequent'] = (
    rfm['is_recent'].eq(1) & rfm['is_frequent'].eq(1)
).astype(int)
rfm['is_lapsed_one_time'] = (
    rfm['is_lapsed'].eq(1) & rfm['is_one_time_buyer'].eq(1)
).astype(int)

# Every column prefixed with "is_" counts as a flag feature.
flag_cols = rfm.columns[rfm.columns.str.startswith('is_')].tolist()
print(f"Created {len(flag_cols)} flag features")

In [None]:
# Ordinal bins for each RFM dimension: output column -> (source column,
# bin edges, integer labels). Intervals are right-closed, e.g. (-1, 2].
# Recency labels run in reverse so that a more recent purchase scores higher.
_bin_specs = {
    'frequency_bin': ('frequency', [-1, 2, 5, 10, 15, np.inf], [0, 1, 2, 3, 4]),
    'recency_bin': ('recency_days', [-1, 30, 90, 180, 365, np.inf], [4, 3, 2, 1, 0]),
    'monetary_bin': ('monetary_avg', [-1, 60, 90, 120, np.inf], [0, 1, 2, 3]),
}

for _bin_name, (_source_col, _edges, _labels) in _bin_specs.items():
    rfm[_bin_name] = pd.cut(rfm[_source_col], bins=_edges, labels=_labels).astype(int)

# Composite score: plain sum of the three ordinal bins.
rfm['rfm_score'] = rfm['frequency_bin'].add(rfm['recency_bin']).add(rfm['monetary_bin'])

bin_cols = [c for c in rfm.columns if c == 'rfm_score' or c.endswith('_bin')]
print(f"Created {len(bin_cols)} binned features")

### 5. Service Preference Features

In [None]:
# How varied each customer's purchases are: distinct services, categories
# and locations seen in their transactions.
service_diversity = df_transactions.groupby('customer_id').agg(
    unique_service_count=('service_id', 'nunique'),
    unique_category_count=('service_category', 'nunique'),
    unique_location_count=('location', 'nunique'),
)

# Most frequently purchased category per customer (top entry of value_counts).
favorite_category = (
    df_transactions
    .groupby('customer_id')['service_category']
    .agg(lambda categories: categories.value_counts().index[0])
    .rename('favorite_category')
)

# Share of each customer's total spend that went to each category.
category_spending = (
    df_transactions
    .groupby(['customer_id', 'service_category'])['amount']
    .sum()
    .unstack(fill_value=0)
)
total_spend_per_customer = category_spending.sum(axis=1)
category_pct = category_spending.div(total_spend_per_customer, axis=0).round(3)
category_pct.columns = [f'pct_spent_{name.lower()}' for name in category_pct.columns]

# Assemble the service feature table, indexed by customer_id.
service_features = service_diversity.join([favorite_category, category_pct])
service_features['is_diverse_customer'] = (
    service_features['unique_service_count'].ge(5).astype(int)
)

### 6. Offer Engagement Features

In [None]:
# Number of offer interactions recorded for each customer.
offer_basic = df_offers.groupby('customer_id').agg(
    total_offers_received=('interaction_id', 'count'),
).round(3)

# Clicks per customer and offer type (one column per offer type, 0 when the
# customer never clicked that type). Computed once and reused below instead
# of repeating the same groupby/unstack.
offer_type_clicks = df_offers.groupby(['customer_id', 'offer_type'])['clicked'].sum().unstack(fill_value=0)

# Favourite offer type = the type with the most clicks. idxmax returns the
# first column when every value in a row is equal, so a customer with zero
# clicks would otherwise be assigned an arbitrary "favourite"; label those
# customers 'none' instead.
favorite_offer = offer_type_clicks.copy()
favorite_offer_type = (
    favorite_offer.idxmax(axis=1)
    .where(favorite_offer.max(axis=1) > 0, 'none')
    .rename('favorite_offer_type')
)

# Rename the click columns only after the favourite has been derived, so the
# favourite keeps the original offer-type labels.
offer_type_clicks.columns = [f'clicks_{col.lower().replace(" ", "_")}' for col in offer_type_clicks.columns]

offer_features = offer_basic.join([offer_type_clicks, favorite_offer_type])
# Fill the label column with a string sentinel first so the numeric fill
# below cannot put an integer 0 into a column of offer-type names.
offer_features['favorite_offer_type'] = offer_features['favorite_offer_type'].fillna('none')
offer_features = offer_features.fillna(0)

### 7. Combine All Features

In [None]:
# Left-join the service and offer tables onto the RFM table (all indexed by
# customer), then turn the customer index back into a regular column.
feature_store = (
    rfm.copy()
    .join(service_features)
    .join(offer_features)
    .reset_index()
    .rename(columns={'index': 'customer_id'})
)

# Customers missing from a joined table get NaN there; zero-fill numeric
# columns only, leaving non-numeric columns untouched.
numeric_cols = feature_store.select_dtypes(include=[np.number]).columns
feature_store[numeric_cols] = feature_store[numeric_cols].fillna(0)

print(f"Feature store: {len(feature_store):,} customers, {len(feature_store.columns)} features")

In [None]:
# Group the feature-store columns by family for reporting.
# The continuous RFM columns are listed explicitly; the other groups are
# matched by naming convention.
rfm_cols = [
    'frequency',
    'monetary_total',
    'monetary_avg',
    'monetary_min',
    'monetary_max',
    'monetary_std',
    'recency_days',
    'first_purchase_days',
    'customer_tenure_days',
    'avg_days_between_purchases',
]

_service_tokens = ('service', 'category', 'location', 'pct_spent')
_offer_tokens = ('offer', 'clicks_')

flag_cols = [c for c in feature_store.columns if c.startswith('is_')]
bin_cols = [c for c in feature_store.columns if c == 'rfm_score' or c.endswith('_bin')]
service_cols = [c for c in feature_store.columns if any(tok in c for tok in _service_tokens)]
offer_cols = [c for c in feature_store.columns if any(tok in c for tok in _offer_tokens)]

print(f"RFM: {len(rfm_cols)} | Flags: {len(flag_cols)} | Bins: {len(bin_cols)} | Service: {len(service_cols)} | Offer: {len(offer_cols)}")

In [None]:
# Guard against target leakage: these aggregate engagement outcomes must not
# appear as columns in the feature store.
# NOTE(review): this list is matched by exact name, so the per-offer-type
# `clicks_*` columns are not covered by it - confirm whether those should
# also be treated as leaky.
leaky_features = [
    'open_rate',
    'click_rate',
    'redemption_rate',
    'total_opens',
    'total_clicks',
    'total_redemptions',
]
found_leaky = [name for name in leaky_features if name in feature_store.columns]
assert not found_leaky, f"Leaky features found: {found_leaky}"

### 8. Save Feature Store

In [None]:
# Persist the feature store (without the positional index) and report its size.
n_rows, n_cols = feature_store.shape
feature_store.to_csv('../data/customer_features.csv', index=False)
print(f"Saved: ../data/customer_features.csv ({n_rows:,} x {n_cols})")