## Recommendation Model

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, precision_score, recall_score, roc_curve
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import lightgbm as lgb
import mlflow
import mlflow.lightgbm
import warnings
warnings.filterwarnings('ignore')


### 1. Load Data

In [None]:
# Read the two input tables: one row per customer (features) and one row per
# offer sent to a customer (interactions, including the `redeemed` label).
df_features, df_offers = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/customer_features.csv', '../data/offer_interactions.csv')
)

# Quick sanity line: table sizes and the overall share of redeemed offers.
print(f"Customers: {len(df_features):,} | Offers: {len(df_offers):,} | Redemption rate: {df_offers['redeemed'].mean()*100:.1f}%")

---
## Stage 1: Customer Clustering (Retrieval)

Group similar customers into segments using KMeans.
Each cluster gets a set of top-performing offers.

In [None]:
# Behavioural columns we would like to cluster on; only those actually present
# in df_features are kept, so the cell tolerates a reduced feature file.
requested_cluster_features = (
    'recency_days', 'frequency', 'monetary_total', 'monetary_avg',
    'unique_service_count', 'unique_category_count', 'unique_location_count',
    'customer_tenure_days', 'avg_days_between_purchases',
)
cluster_feature_names = list(
    filter(lambda name: name in df_features.columns, requested_cluster_features)
)

# Missing values are replaced by 0 before scaling.
X_cluster = df_features[cluster_feature_names].fillna(0)

# Standardise each column so no single feature dominates the distance metric.
# The fitted scaler is kept because it is persisted later in the notebook.
scaler = StandardScaler().fit(X_cluster)
X_cluster_scaled = scaler.transform(X_cluster)

### 3. Run KMeans Clustering

In [None]:
# Number of customer segments produced by Stage 1.
N_CLUSTERS = 10

# Fixed seed and 10 restarts so the segmentation is reproducible across runs.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
cluster_labels = kmeans.fit_predict(X_cluster_scaled)

# Attach the segment id to every customer row (adds/overwrites the `cluster` column).
df_features['cluster'] = cluster_labels

# Displayed as the cell output: number of customers per cluster id.
df_features['cluster'].value_counts().sort_index()

In [None]:
# Two-panel overview of the segmentation: cluster sizes (left) and a heatmap
# of per-cluster feature means (right).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_sizes, ax_profiles = axes

# Left panel: customers per cluster.
cluster_sizes = df_features['cluster'].value_counts().sort_index()
cluster_sizes.plot(kind='bar', ax=ax_sizes, color='steelblue')
ax_sizes.set_title('Customers per Cluster')
ax_sizes.set_xlabel('Cluster')
ax_sizes.set_ylabel('Count')

# Right panel: mean of every clustering feature per cluster, min-max scaled
# per feature so features with different units share one colour scale.
cluster_means = df_features.groupby('cluster')[cluster_feature_names].mean()
feature_min = cluster_means.min()
feature_range = cluster_means.max() - feature_min
cluster_normalized = (cluster_means - feature_min) / feature_range

# Transposed so features run down the y axis and clusters along the x axis.
im = ax_profiles.imshow(cluster_normalized.T, cmap='YlOrRd', aspect='auto')
ax_profiles.set_xticks(range(N_CLUSTERS))
ax_profiles.set_yticks(range(len(cluster_feature_names)))
ax_profiles.set_yticklabels(cluster_feature_names, fontsize=8)
ax_profiles.set_xlabel('Cluster')
ax_profiles.set_title('Cluster Profiles (Normalized)')
plt.colorbar(im, ax=ax_profiles)

plt.tight_layout()
plt.show()

### 4. Cluster → Offer Affinity Matrix

In [None]:
# Tag every offer interaction with the cluster of the customer who received it.
# Inner join: interactions for customers absent from df_features are dropped.
customer_clusters = df_features[['customer_id', 'cluster']]
df_offers_with_cluster = pd.merge(
    df_offers, customer_clusters, on='customer_id', how='inner'
)

# Mean of `redeemed` per (cluster, offer_type), i.e. the redemption rate,
# rounded to three decimals.
cluster_offer_affinity = pd.pivot_table(
    df_offers_with_cluster,
    values='redeemed',
    index='cluster',
    columns='offer_type',
    aggfunc='mean',
).round(3)

# Displayed as the cell output.
cluster_offer_affinity

In [None]:
# Heatmap of the redemption rate per (cluster, offer_type), with the rate
# written inside every cell.
fig, ax = plt.subplots(figsize=(10, 6))

affinity_values = cluster_offer_affinity.values
n_rows, n_cols = affinity_values.shape

im = ax.imshow(affinity_values, cmap='YlGn', aspect='auto')
ax.set_xticks(range(n_cols))
ax.set_xticklabels(cluster_offer_affinity.columns, rotation=45, ha='right', fontsize=9)
# Rows are labelled from the pivot's own index rather than range(N_CLUSTERS):
# a cluster with no offer interactions is absent from the pivot, and the tick
# count must match the number of rows actually drawn.
ax.set_yticks(range(n_rows))
ax.set_yticklabels([f'Cluster {cluster_id}' for cluster_id in cluster_offer_affinity.index])
ax.set_title('Cluster × Offer Redemption Rate (Stage 1 Retrieval Matrix)')
plt.colorbar(im, ax=ax, label='Redemption Rate')

for i in range(n_rows):
    for j in range(n_cols):
        val = affinity_values[i, j]
        if np.isnan(val):
            # No interactions for this (cluster, offer_type) pair: leave the cell blank.
            continue
        # White text on dark (high-rate) cells, black elsewhere, for legibility.
        ax.text(j, i, f'{val:.2f}', ha='center', va='center', fontsize=8,
                color='white' if val > 0.4 else 'black')

plt.tight_layout()
plt.show()

In [None]:
# Redemption rate per (cluster, offer_id); this is the retrieval matrix that
# decides which concrete offers each cluster gets as candidates.
cluster_offer_id_affinity = df_offers_with_cluster.pivot_table(
    values='redeemed',
    index='cluster',
    columns='offer_id',
    aggfunc='mean'
).round(3)

RETRIEVAL_TOP_N = 5

# Overall best offers, used to fill clusters that have too little history.
# Taking 2x TOP_N leaves room to skip offers a cluster already has.
global_top_offers = (
    df_offers_with_cluster.groupby('offer_id')['redeemed']
    .mean()
    .nlargest(RETRIEVAL_TOP_N * 2)
    .index.tolist()
)

cluster_candidates = {}
for cluster_id in range(N_CLUSTERS):
    if cluster_id in cluster_offer_id_affinity.index:
        # nlargest ignores NaN, i.e. offers never sent to this cluster.
        top_offers = (
            cluster_offer_id_affinity.loc[cluster_id]
            .nlargest(RETRIEVAL_TOP_N)
            .index.tolist()
        )
    else:
        # The cluster has no offer interactions after the inner merge, so it
        # has no row in the pivot; `.loc` would raise KeyError here.
        top_offers = []

    # Pad with globally best offers so every cluster ends up with
    # RETRIEVAL_TOP_N candidates whenever enough distinct offers exist.
    for offer_id in global_top_offers:
        if len(top_offers) >= RETRIEVAL_TOP_N:
            break
        if offer_id not in top_offers:
            top_offers.append(offer_id)

    cluster_candidates[cluster_id] = top_offers

print(f"Stage 1: {RETRIEVAL_TOP_N} candidates per cluster")

---
## Stage 2: LightGBM Ranking Model

### 5. Merge & Prepare Training Data

In [None]:
# One training row per offer interaction, enriched with the receiving
# customer's features. Inner join: interactions without a feature row are dropped.
df_train = df_offers.merge(df_features, on='customer_id', how='inner')

# One-hot encode the offer type. The dtype is pinned to uint8 because
# pandas >= 2.0 defaults to bool, and the dtype whitelist used later for
# feature selection only admits integer/float columns, so bool dummies would
# silently vanish from the model's feature set.
df_train = pd.get_dummies(df_train, columns=['offer_type'], prefix='offer', dtype='uint8')

# Every column whose name starts with 'offer_' (the dummies plus any
# pre-existing offer_* columns such as ids or names).
offer_cols = [c for c in df_train.columns if c.startswith('offer_')]

### 6. Create Interaction Features

These capture the **customer × offer** combinations that drive redemption.

In [None]:
# Offer columns to cross with customer features: everything prefixed 'offer_'
# except the known non-indicator columns.
non_indicator_offer_cols = ('offer_value', 'offer_id', 'offer_name')
offer_type_cols = [
    c for c in df_train.columns
    if c.startswith('offer_') and c not in non_indicator_offer_cols
]

# Core RFM-style crosses: always created, three per offer column.
for col in offer_type_cols:
    offer_indicator = df_train[col]
    df_train[f'freq_x_{col}'] = df_train['frequency'] * offer_indicator
    df_train[f'recency_x_{col}'] = df_train['recency_days'] * offer_indicator
    df_train[f'monetary_x_{col}'] = df_train['monetary_avg'] * offer_indicator

# Optional crosses: (customer column, output prefix). A cross is only built
# when the customer column exists in df_train. Declaration order is the order
# in which the new columns are appended.
optional_crosses = (
    ('is_frequent', 'isfreq'),
    ('is_recent', 'isrecent'),
    ('is_lapsed', 'islapsed'),
    ('is_budget', 'isbudget'),
    ('rfm_score', 'rfm'),
)
for customer_col, prefix in optional_crosses:
    if customer_col not in df_train.columns:
        continue
    for col in offer_type_cols:
        df_train[f'{prefix}_x_{col}'] = df_train[customer_col] * df_train[col]

# All crossed columns are recognisable by the '_x_' infix.
interaction_cols = [c for c in df_train.columns if '_x_' in c]

### 7. Select Features

In [None]:
# Engagement columns that are excluded to avoid target leakage.
# NOTE(review): presumably these are derived from, or recorded together with,
# the redemption outcome - confirm against the feature pipeline.
leaky_features = ['open_rate', 'click_rate', 'redemption_rate', 
                  'total_opens', 'total_clicks', 'total_redemptions',
                  'opened', 'clicked']

# Identifiers, free-text/categorical columns and the label itself.
exclude_cols = ['customer_id', 'interaction_id', 'offer_id', 'offer_name', 
                'sent_date', 'redeemed', 'favorite_category', 'favorite_offer_type']

# Dtypes accepted as model inputs. 'bool' is included because one-hot columns
# produced by pandas >= 2.0 and boolean flag columns are stored as bool; with
# the integer/float-only whitelist they were silently dropped from the
# feature set.
allowed_feature_dtypes = {'int64', 'float64', 'int32', 'float32', 'uint8', 'bool'}
blocked_cols = set(exclude_cols) | set(leaky_features)

feature_cols = [c for c in df_train.columns
                if c not in blocked_cols
                and str(df_train[c].dtype) in allowed_feature_dtypes]

In [None]:
# Binary target: whether the offer was redeemed.
y = df_train['redeemed']

# Design matrix restricted to the selected features; missing values become 0.
X = df_train[feature_cols].fillna(0)

### 8. Train/Test Split

In [None]:
# 80/20 row-level split, stratified on the label so both parts keep the same
# redemption rate; the seed is fixed for reproducibility.
# NOTE(review): rows are split per interaction, so the same customer can
# appear in both train and test - confirm this is acceptable for evaluation.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts

### 9. Train LightGBM Model with MLflow

In [None]:
# Train the Stage 2 LightGBM ranker inside an MLflow run, logging parameters,
# metrics, plots, the feature list and the model itself.
import json

mlflow.set_experiment("Offer_Recommendation_Ranking")

# Early stopping needs a monitoring set. Using the test set for that would
# pick the number of boosting rounds that maximises test AUC, making the
# reported test AUC optimistic. A validation fold is therefore carved out of
# the training data and the test set is only touched for final evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
)

with mlflow.start_run(run_name="lgbm_v2_interaction_features"):

    params = {
        'objective': 'binary',
        'metric': 'auc',
        'boosting_type': 'gbdt',
        'num_leaves': 63,
        'max_depth': 8,
        'learning_rate': 0.05,
        'feature_fraction': 0.8,
        'bagging_fraction': 0.8,
        'bagging_freq': 5,
        'min_child_samples': 100,
        'reg_alpha': 0.1,
        'reg_lambda': 0.1,
        'verbose': -1,
        'is_unbalance': True,
        'random_state': 42
    }

    # Run configuration and dataset sizes.
    mlflow.log_params(params)
    mlflow.log_param('num_features', len(feature_cols))
    mlflow.log_param('num_interaction_features', len(interaction_cols))
    mlflow.log_param('train_samples', len(X_fit))
    mlflow.log_param('val_samples', len(X_val))
    mlflow.log_param('test_samples', len(X_test))
    mlflow.log_param('positive_rate', round(y.mean(), 3))
    mlflow.log_param('n_clusters', N_CLUSTERS)
    mlflow.log_param('retrieval_top_n', RETRIEVAL_TOP_N)

    train_data = lgb.Dataset(X_fit, label=y_fit, feature_name=feature_cols, free_raw_data=True)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=True)

    # Up to 1000 rounds; stop once validation AUC has not improved for 50 rounds.
    model = lgb.train(
        params,
        train_data,
        num_boost_round=1000,
        valid_sets=[valid_data],
        valid_names=['valid'],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=0)
        ]
    )

    mlflow.log_param('best_iteration', model.best_iteration)

    # Scores on the untouched test set, the validation fold and the rows the
    # model was actually fitted on.
    y_pred_proba = model.predict(X_test)
    y_val_pred = model.predict(X_val)
    y_train_pred = model.predict(X_fit)

    test_auc = roc_auc_score(y_test, y_pred_proba)
    val_auc = roc_auc_score(y_val, y_val_pred)
    train_auc = roc_auc_score(y_fit, y_train_pred)

    mlflow.log_metric('test_auc', round(test_auc, 4))
    mlflow.log_metric('val_auc', round(val_auc, 4))
    mlflow.log_metric('train_auc', round(train_auc, 4))
    # Train-minus-test AUC as a simple overfitting indicator.
    mlflow.log_metric('auc_gap', round(train_auc - test_auc, 4))

    # Precision/recall on the test set at several decision thresholds.
    for threshold in [0.5, 0.4, 0.3, 0.2]:
        y_pred = (y_pred_proba >= threshold).astype(int)
        prec = precision_score(y_test, y_pred, zero_division=0)
        rec = recall_score(y_test, y_pred, zero_division=0)
        mlflow.log_metric(f'precision_at_{threshold}', round(prec, 4))
        mlflow.log_metric(f'recall_at_{threshold}', round(rec, 4))

    # Distribution of predicted probabilities on the test set.
    mlflow.log_metric('pred_min', round(float(y_pred_proba.min()), 4))
    mlflow.log_metric('pred_max', round(float(y_pred_proba.max()), 4))
    mlflow.log_metric('pred_mean', round(float(y_pred_proba.mean()), 4))
    mlflow.log_metric('pred_std', round(float(y_pred_proba.std()), 4))

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
    ax_roc.plot(fpr, tpr, 'b-', linewidth=2, label=f'Model (AUC = {test_auc:.3f})')
    ax_roc.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random (AUC = 0.500)')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curve')
    ax_roc.legend(loc='lower right')
    ax_roc.grid(True, alpha=0.3)
    fig_roc.tight_layout()
    fig_roc.savefig('roc_curve.png', dpi=150)
    mlflow.log_artifact('roc_curve.png')
    plt.show()

    # Feature Importance
    importance_df = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importance(importance_type='gain')
    }).sort_values('importance', ascending=False)

    fig_imp, ax_imp = plt.subplots(figsize=(10, 8))
    top_20 = importance_df.head(20)
    ax_imp.barh(range(len(top_20)), top_20['importance'].values)
    ax_imp.set_yticks(range(len(top_20)))
    ax_imp.set_yticklabels(top_20['feature'].values)
    ax_imp.set_xlabel('Importance (Gain)')
    ax_imp.set_title('Top 20 Feature Importances')
    # Most important feature at the top of the chart.
    ax_imp.invert_yaxis()
    fig_imp.tight_layout()
    fig_imp.savefig('feature_importance.png', dpi=150)
    mlflow.log_artifact('feature_importance.png')
    plt.show()

    importance_df.to_csv('feature_importance.csv', index=False)
    mlflow.log_artifact('feature_importance.csv')

    # Log the booster and register it under a fixed model name.
    mlflow.lightgbm.log_model(
        model,
        artifact_path="model",
        registered_model_name="offer_recommender"
    )

    # The ordered feature list is stored with the run so that inference can
    # rebuild the exact input layout.
    with open('feature_cols.json', 'w') as f:
        json.dump(feature_cols, f)
    mlflow.log_artifact('feature_cols.json')

    run_id = mlflow.active_run().info.run_id

# `auc` is read by the summary cell at the end of the notebook.
auc = test_auc
print(f"Test AUC: {test_auc:.4f} | Train AUC: {train_auc:.4f} | Gap: {train_auc - test_auc:.4f}")

### 10. View MLflow Results

In [None]:
# Fetch every run logged under the ranking experiment as a DataFrame.
experiment = mlflow.get_experiment_by_name("Offer_Recommendation_Ranking")
runs_df = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# Headline columns to show; any that no run has logged are skipped so the
# selection cannot fail on a missing column.
display_cols = [
    'run_id',
    'metrics.test_auc',
    'metrics.train_auc',
    'metrics.auc_gap',
    'metrics.precision_at_0.5',
    'metrics.recall_at_0.5',
]
available_cols = list(filter(lambda col: col in runs_df.columns, display_cols))

# Displayed as the cell output.
runs_df[available_cols]

### 11. Save Model & Artifacts (Local Copies)

In [None]:
import os
import json
import pickle

# Local copies of everything needed to serve recommendations outside MLflow.
os.makedirs('../models', exist_ok=True)

# Stage 2: the trained booster in LightGBM's text format.
model.save_model('../models/offer_recommender_v2.txt')

# Ordered feature list the booster was trained on.
with open('../models/feature_cols_v2.json', 'w') as features_file:
    json.dump(feature_cols, features_file)

# Stage 1: fitted clustering model and scaler, pickled in that order.
pickled_objects = (
    ('../models/kmeans_model.pkl', kmeans),
    ('../models/scaler.pkl', scaler),
)
for pickle_path, fitted_obj in pickled_objects:
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(fitted_obj, pickle_file)

# JSON object keys must be strings, so the cluster ids are stringified.
candidates_by_cluster = {str(cluster_id): offers for cluster_id, offers in cluster_candidates.items()}
with open('../models/cluster_candidates.json', 'w') as candidates_file:
    json.dump(candidates_by_cluster, candidates_file)

print("Saved: model, features, kmeans, scaler, cluster_candidates")

### 12. Load Customer Features + Clusters into Redis

In [None]:
import redis

# Connection settings for the local Redis instance. decode_responses=True
# makes reads return str instead of bytes.
redis_settings = dict(host='localhost', port=6379, db=0, decode_responses=True)
cache = redis.Redis(**redis_settings)

# Fails fast if the server is unreachable; the result is the cell output.
cache.ping()

In [None]:
# Write one JSON document per customer to Redis under the key
# "cust:<customer_id>", sending commands in pipelined batches.
pipe = cache.pipeline()
batch_size = 10000
count = 0

# to_dict(orient='records') keeps each column's own type. iterrows() would
# coerce a whole row to one dtype, so on an all-numeric frame an integer
# customer_id turns into a float and the key becomes "cust:123.0", which a
# later lookup by the integer id ("cust:123") would miss.
for record in df_features.to_dict(orient='records'):
    customer_id = record.pop('customer_id')
    # Unwrap any remaining numpy scalars (ints, floats, bools) into plain
    # Python values so json.dumps can serialise them.
    # NOTE(review): NaN values are still emitted as the non-standard JSON
    # token NaN, as before - confirm the consumer tolerates that.
    features = {k: v.item() if isinstance(v, np.generic) else v
                for k, v in record.items()}
    pipe.set(f"cust:{customer_id}", json.dumps(features))
    count += 1
    # Flush periodically so the client-side command buffer stays bounded.
    if count % batch_size == 0:
        pipe.execute()

# Flush whatever is left from the last partial batch.
pipe.execute()
print(f"Loaded {count:,} customers to Redis")

In [None]:
# Read back the first customer's document to confirm the load round-trips.
sample_id = df_features['customer_id'].iloc[0]
sample_payload = cache.get(f"cust:{sample_id}")
stored = json.loads(sample_payload)

print(f"Sample: cluster={stored.get('cluster')}, frequency={stored.get('frequency')}, recency={stored.get('recency_days')}")

### Summary

In [None]:
# Three-line recap of the pipeline: retrieval stage, ranking stage, cache load.
summary_lines = (
    f"Stage 1 (KMeans): {N_CLUSTERS} clusters, {RETRIEVAL_TOP_N} candidates each",
    f"Stage 2 (LightGBM): AUC={auc:.4f}, {len(feature_cols)} features ({len(interaction_cols)} interactions)",
    f"Redis: {count:,} customers loaded",
)
print("\n".join(summary_lines))