# Generating Synthetic Data

In [None]:
import pandas as pd
import numpy as np
import random
from datetime import datetime, timedelta
import os

## Configuration

In [None]:
# File paths: everything lives under DATA_DIR (relative to the current working directory).
# CUSTOMERS_FILE is the input; the two OUTPUT_* files are written by the save step.
DATA_DIR = '../../data'
CUSTOMERS_FILE, OUTPUT_TRANSACTIONS_FILE, OUTPUT_INTERACTIONS_FILE = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ('customers_with_truth.csv', 'transactions.csv', 'offer_interactions.csv')
)

In [None]:
# Service offerings: one (service_id, name, base_price, category) row per service.
# List order matters because services are later picked with random.choice.
services = [
    dict(zip(('service_id', 'name', 'base_price', 'category'), row))
    for row in (
        ('SRV001', 'Oil Change', 49.99, 'Maintenance'),
        ('SRV002', 'Tire Rotation', 29.99, 'Maintenance'),
        ('SRV003', 'Brake Inspection', 39.99, 'Maintenance'),
        ('SRV004', 'Full Detail', 149.99, 'Cosmetic'),
        ('SRV005', 'Windshield Repair', 79.99, 'Repair'),
        ('SRV006', 'Battery Replacement', 129.99, 'Repair'),
        ('SRV007', 'AC Service', 89.99, 'Maintenance'),
        ('SRV008', 'Transmission Flush', 179.99, 'Maintenance'),
        ('SRV009', 'Wheel Alignment', 99.99, 'Maintenance'),
        ('SRV010', 'Engine Diagnostic', 69.99, 'Diagnostic'),
    )
]

# Offers - base_rate now has WIDER spread
# `value` has no single unit: it is 0.20 for '20% Off', 2.0 for 'Points 2X'
# and 25.00 for '$25 Off', so compare it across offer types with care.
offers = [
    dict(zip(('offer_id', 'name', 'type', 'value', 'base_rate'), row))
    for row in (
        ('OFF001', 'Free Oil Change', 'Free Service', 49.99, 0.30),
        ('OFF002', '20% Off Any Service', 'Discount', 0.20, 0.25),
        ('OFF003', 'Loyalty Points 2X', 'Points', 2.0, 0.10),
        ('OFF004', '$25 Off Next Visit', 'Credit', 25.00, 0.28),
        ('OFF005', 'Free Tire Rotation', 'Free Service', 29.99, 0.32),
        ('OFF006', 'Winter Package Deal', 'Bundle', 50.00, 0.15),
        ('OFF007', 'Refer a Friend $50', 'Referral', 50.00, 0.08),
        ('OFF008', 'Birthday Special 30%', 'Discount', 0.30, 0.35),
    )
]

# Branch locations a transaction can be booked at.
locations = [
    'Edmonton South',
    'Edmonton North',
    'Calgary Downtown',
    'Calgary NE',
    'Red Deer',
    'Lethbridge',
    'Vancouver',
    'Surrey',
    'Winnipeg',
    'Saskatoon',
]

## Load Customer Data

In [None]:
# Seed both generators used below (stdlib `random` and NumPy's global state)
# so the synthetic data is reproducible from a fresh kernel.
random.seed(42)
np.random.seed(42)

# One entry per distinct true_customer_id; the generation loops iterate over these.
df_customers = pd.read_csv(CUSTOMERS_FILE)
customer_ids = df_customers.loc[:, 'true_customer_id'].unique()
n_customers = len(customer_ids)
print(f"Loaded {n_customers:,} unique customers")

### Step 1: Generate Transactions

In [None]:
# Build the synthetic transaction history, one customer at a time.
# NOTE: the output depends on the exact order of draws from the two seeded
# generators (`random` and `np.random`); reordering any call changes the data.
transactions = []
transaction_id = 0

# Assign every customer a behavioural segment with the given probabilities.
customer_segments = np.random.choice(
    ['occasional', 'regular', 'frequent', 'vip'], 
    size=n_customers, 
    p=[0.35, 0.40, 0.18, 0.07]
)
segment_map = dict(zip(customer_ids, customer_segments))

for idx, customer_id in enumerate(customer_ids):
    segment = segment_map[customer_id]
    
    # Transaction count per segment. np.random.randint excludes the upper
    # bound, so the ranges are 1-2, 4-9, 10-17 and 18-29 respectively.
    if segment == 'occasional': 
        n_trans = np.random.randint(1, 3)
    elif segment == 'regular': 
        n_trans = np.random.randint(4, 10)
    elif segment == 'frequent':
        n_trans = np.random.randint(10, 18)
    else:
        n_trans = np.random.randint(18, 30)
    
    # Per-customer preferences that bias the individual transactions below.
    preferred_location = random.choice(locations)
    preferred_category = random.choice(['Maintenance', 'Repair', 'Cosmetic', 'Diagnostic'])
    spending_tier = np.random.choice(['budget', 'standard', 'premium'], p=[0.30, 0.50, 0.20])
    
    # All transaction dates are expressed as "days before" this anchor date.
    base_date = datetime(2024, 1, 1)
    
    # Upper bound (in days) on the age of the customer's first generated
    # transaction; this caps how lapsed a customer of each segment can be.
    if segment in ['frequent', 'vip']:
        max_recency = 180
    elif segment == 'regular':
        max_recency = 400
    else:
        max_recency = 800
    
    for i in range(n_trans):
        # Only the first transaction is constrained by max_recency; the rest
        # fall anywhere in the last 1095 days (random.randint is inclusive).
        if i == 0:
            days_ago = random.randint(0, max_recency)
        else:
            days_ago = random.randint(0, 1095)
        
        trans_date = base_date - timedelta(days=days_ago)
        
        # 65% of the time pick a service from the preferred category,
        # otherwise pick uniformly from the whole catalogue.
        if random.random() < 0.65:
            preferred_services = [s for s in services if s['category'] == preferred_category]
            service = random.choice(preferred_services) if preferred_services else random.choice(services)
        else:
            service = random.choice(services)
        
        # Scale the base price by a tier-dependent random multiplier.
        if spending_tier == 'budget':
            price_mult = random.uniform(0.75, 0.95)
        elif spending_tier == 'premium':
            price_mult = random.uniform(1.05, 1.30)
        else:
            price_mult = random.uniform(0.90, 1.10)
        
        price = service['base_price'] * price_mult
        # 70% preferred location; the fallback draw may also return the preferred one.
        location = preferred_location if random.random() < 0.7 else random.choice(locations)
        
        # transaction_time: hour 08-18 and minute 00-59 (both inclusive), seconds fixed at 00.
        transactions.append({
            'transaction_id': f"TXN{transaction_id:08d}",
            'customer_id': customer_id,
            'service_id': service['service_id'],
            'service_name': service['name'],
            'service_category': service['category'],
            'amount': round(price, 2),
            'location': location,
            'transaction_date': trans_date.strftime('%Y-%m-%d'),
            'transaction_time': f"{random.randint(8,18):02d}:{random.randint(0,59):02d}:00"
        })
        transaction_id += 1

df_transactions = pd.DataFrame(transactions)
print(f"Generated {len(df_transactions):,} transactions")

### Step 2: Compute Customer Features

In [None]:
# Parse the date strings so min/max and date arithmetic work in the aggregation.
df_transactions['transaction_date'] = pd.to_datetime(df_transactions['transaction_date'])
# "Today" for recency purposes.
REFERENCE_DATE = pd.to_datetime('2024-01-01')

# One row per customer with frequency / monetary / variety aggregates.
customer_features = df_transactions.groupby('customer_id').agg(
    frequency=('transaction_id', 'count'),
    monetary_total=('amount', 'sum'),
    monetary_avg=('amount', 'mean'),
    monetary_std=('amount', 'std'),
    last_purchase=('transaction_date', 'max'),
    first_purchase=('transaction_date', 'min'),
    unique_services=('service_id', 'nunique'),
    unique_categories=('service_category', 'nunique'),
    unique_locations=('location', 'nunique')
).reset_index()

customer_features['recency_days'] = (REFERENCE_DATE - customer_features['last_purchase']).dt.days
customer_features['tenure_days'] = (customer_features['last_purchase'] - customer_features['first_purchase']).dt.days
# The sample std of a single transaction is NaN; treat it as "no variation".
customer_features['monetary_std'] = customer_features['monetary_std'].fillna(0)

# Mean gap between consecutive purchases: n purchases span n - 1 intervals,
# so divide tenure by (frequency - 1). The clip keeps one-time customers
# (tenure 0, zero intervals) at 0 instead of dividing by zero.
purchase_intervals = (customer_features['frequency'] - 1).clip(lower=1)
customer_features['avg_days_between'] = customer_features['tenure_days'] / purchase_intervals
customer_features['is_one_time'] = (customer_features['frequency'] == 1).astype(int)
# High spender = average ticket above the 75th percentile across customers.
customer_features['is_high_spender'] = (customer_features['monetary_avg'] > customer_features['monetary_avg'].quantile(0.75)).astype(int)
customer_features['is_recent'] = (customer_features['recency_days'] <= 60).astype(int)
customer_features['is_frequent'] = (customer_features['frequency'] >= 8).astype(int)

# The raw dates are no longer needed once recency and tenure are derived.
customer_features = customer_features.drop(columns=['last_purchase', 'first_purchase'])
# customer_id -> {feature_name: value}, for fast per-customer lookups.
customer_feature_dict = customer_features.set_index('customer_id').to_dict('index')

### Step 3: Generate Offers with STRONG Signal

Key: Create **wide separation** in redemption probabilities based on observable features.

In [None]:
def calculate_redemption_prob_v4(customer_id, offer, customer_feature_dict,
                                 min_prob=0.03, max_prob=0.75):
    """
    STRONG signal version - creates a wide redemption-probability spread.

    Starts from the offer's ``base_rate`` and applies fixed additive
    adjustments for purchase frequency, recency, average spend, service
    variety, offer-type specific rules and offer value. The result is then
    clamped to ``[min_prob, max_prob]`` (3% to 75% by default).

    Parameters
    ----------
    customer_id : hashable
        Key looked up in ``customer_feature_dict``.
    offer : dict
        Must contain ``'base_rate'``, ``'type'`` and ``'value'``.
    customer_feature_dict : dict
        Maps customer_id to a dict of features. The keys read are
        ``'frequency'``, ``'recency_days'``, ``'monetary_avg'`` and
        ``'unique_services'``. An unknown customer or a missing key falls
        back to frequency=1, recency_days=500, monetary_avg=80,
        unique_services=1.
    min_prob, max_prob : float, optional
        Lower and upper clamp applied to the final probability.

    Returns
    -------
    float
        Redemption probability within ``[min_prob, max_prob]``.

    Raises
    ------
    KeyError
        If ``offer`` lacks ``'base_rate'``, ``'type'`` or ``'value'``.
    """
    feat = customer_feature_dict.get(customer_id, {})

    # Start with offer base rate
    prob = offer['base_rate']

    frequency = feat.get('frequency', 1)
    recency = feat.get('recency_days', 500)
    monetary_avg = feat.get('monetary_avg', 80)
    unique_services = feat.get('unique_services', 1)
    offer_type = offer['type']
    offer_value = offer['value']

    # FREQUENCY: strongest driver. A frequency of exactly 2 is neutral.
    if frequency >= 15:
        prob += 0.30  # VIP customers: massive boost
    elif frequency >= 10:
        prob += 0.22
    elif frequency >= 6:
        prob += 0.14
    elif frequency >= 3:
        prob += 0.06
    elif frequency == 1:
        prob -= 0.12  # One-timers: significant penalty

    # RECENCY: Strong impact. 91-180 days is neutral; the > 365 test must
    # come before the > 180 test so lapsed customers get the larger penalty.
    if recency <= 14:
        prob += 0.20  # Very recent: big boost
    elif recency <= 30:
        prob += 0.15
    elif recency <= 60:
        prob += 0.10
    elif recency <= 90:
        prob += 0.05
    elif recency > 365:
        prob -= 0.15  # Lapsed: significant penalty
    elif recency > 180:
        prob -= 0.08

    # MONETARY: Moderate impact
    if monetary_avg > 130:
        prob += 0.08
    elif monetary_avg > 100:
        prob += 0.04
    elif monetary_avg < 50:
        prob -= 0.05

    # VARIETY: customers who have used more distinct services get a small boost
    if unique_services >= 6:
        prob += 0.08
    elif unique_services >= 4:
        prob += 0.04

    # OFFER-TYPE interactions: exactly one branch applies per offer
    # DISCOUNT offers
    if offer_type == 'Discount':
        if monetary_avg < 70:
            prob += 0.18
        elif monetary_avg < 90:
            prob += 0.10
        elif monetary_avg > 130:
            prob -= 0.08

        # Recent + discount = action
        if recency <= 30:
            prob += 0.08

    # FREE SERVICE offers
    elif offer_type == 'Free Service':
        if monetary_avg > 100:
            prob += 0.15
        else:
            prob += 0.08

        # Frequent customers more likely to use it
        if frequency >= 5:
            prob += 0.10

    # POINTS offers - ONLY work for frequent customers
    elif offer_type == 'Points':
        if frequency >= 12:
            prob += 0.35  # Loyalists love points
        elif frequency >= 8:
            prob += 0.20
        elif frequency >= 5:
            prob += 0.08
        else:
            prob -= 0.15

    # BUNDLE offers - need commitment
    elif offer_type == 'Bundle':
        if frequency >= 8 and monetary_avg > 90:
            prob += 0.20
        elif frequency >= 5:
            prob += 0.08
        elif frequency <= 2:
            prob -= 0.18

    # REFERRAL - only satisfied frequent customers refer
    elif offer_type == 'Referral':
        if frequency >= 10 and recency <= 90:
            prob += 0.25
        elif frequency >= 6 and recency <= 180:
            prob += 0.10
        else:
            prob -= 0.12

    # CREDIT offers
    elif offer_type == 'Credit':
        if frequency >= 4 and recency <= 120:
            prob += 0.15
        elif recency > 365:
            prob -= 0.10

    # OFFER VALUE: the thresholds are compared against the raw `value`,
    # so small values such as 0.20 or 2.0 never earn a bonus here.
    if offer_value >= 50:
        prob += 0.05
    elif offer_value >= 25:
        prob += 0.02

    return max(min_prob, min(max_prob, prob))

In [None]:
# Simulate the offer funnel (sent -> opened -> clicked -> redeemed) per customer.
# NOTE: results depend on the exact order of draws from the seeded `random`
# generator, including the short-circuited draws in the funnel below.
offer_interactions = []
interaction_id = 0
# Modelled redemption probability of every offer sent, whether or not it was
# opened or clicked; used afterwards to inspect the probability spread.
redemption_probs_log = []

for idx, customer_id in enumerate(customer_ids):
    feat = customer_feature_dict.get(customer_id, {})
    frequency = feat.get('frequency', 1)
    
    # More frequent customers receive more offers (random.randint is inclusive).
    if frequency >= 12:
        n_offers = random.randint(8, 15)
    elif frequency >= 6:
        n_offers = random.randint(5, 10)
    elif frequency >= 3:
        n_offers = random.randint(3, 7)
    else:
        n_offers = random.randint(2, 5)
    
    for _ in range(n_offers):
        # Offer chosen uniformly; sent on a day within the 730 days up to 2024-01-01.
        offer = random.choice(offers)
        days_ago = random.randint(0, 730)
        sent_date = datetime(2024, 1, 1) - timedelta(days=days_ago)
        
        redeem_prob = calculate_redemption_prob_v4(customer_id, offer, customer_feature_dict)
        redemption_probs_log.append(redeem_prob)
        
        # Open rate: 60% plus 2 points per past transaction, capped at +25 points.
        open_rate = 0.60 + min(0.25, frequency * 0.02)
        opened = random.random() < open_rate
        
        # Click rate: 45% plus 1.5 points per past transaction, capped at +20 points.
        # `and` short-circuits, so no random number is drawn for unopened offers.
        click_rate = 0.45 + min(0.20, frequency * 0.015)
        clicked = opened and random.random() < click_rate
        
        # Redemption is only possible after a click, so the realised rate is
        # lower than redeem_prob itself.
        redeemed = clicked and random.random() < redeem_prob
        
        offer_interactions.append({
            'interaction_id': f"INT{interaction_id:08d}",
            'customer_id': customer_id,
            'offer_id': offer['offer_id'],
            'offer_name': offer['name'],
            'offer_type': offer['type'],
            'offer_value': offer['value'],
            'sent_date': sent_date.strftime('%Y-%m-%d'),
            'opened': int(opened),
            'clicked': int(clicked),
            'redeemed': int(redeemed)
        })
        interaction_id += 1

df_offers = pd.DataFrame(offer_interactions)
print(f"Generated {len(df_offers):,} offer interactions")

### Step 4: Verify STRONG Signal

In [None]:
# Spread of the modelled probabilities next to the realised redemption rate.
probs_arr = np.asarray(redemption_probs_log)
overall_rate_pct = df_offers['redeemed'].mean() * 100
print(f"Redemption prob range: {probs_arr.min():.2f} - {probs_arr.max():.2f} (std={probs_arr.std():.2f})")
print(f"Redemption rate: {overall_rate_pct:.1f}%")

In [None]:
# Attach the per-customer features to every offer interaction.
df_analysis = df_offers.merge(
    customer_features[['customer_id', 'frequency', 'recency_days', 'monetary_avg']],
    on='customer_id'
)

# pd.cut intervals are right-closed: (0, 2], (2, 5], (5, 10], (10, 15], (15, 100].
# The last bin therefore starts at 16, hence the '16+' label.
df_analysis['freq_bin'] = pd.cut(
    df_analysis['frequency'],
    bins=[0, 2, 5, 10, 15, 100],
    labels=['1-2', '3-5', '6-10', '11-15', '16+']
)

# observed=False keeps empty bins in the table and makes the categorical
# groupby behaviour explicit instead of relying on the deprecated default.
freq_stats = df_analysis.groupby('freq_bin', observed=False)['redeemed'].agg(['mean', 'count'])
freq_stats['rate_%'] = (freq_stats['mean'] * 100).round(1)
freq_stats[['rate_%', 'count']]

In [None]:
# include_lowest=True closes the first interval on the left, so customers whose
# last purchase is on the reference date (recency_days == 0) land in '0-30d'
# instead of being dropped as NaN.
df_analysis['recency_bin'] = pd.cut(
    df_analysis['recency_days'],
    bins=[0, 30, 90, 180, 365, 2000],
    labels=['0-30d', '31-90d', '91-180d', '181-365d', '365d+'],
    include_lowest=True
)

# observed=False keeps empty bins in the table and makes the categorical
# groupby behaviour explicit instead of relying on the deprecated default.
recency_stats = df_analysis.groupby('recency_bin', observed=False)['redeemed'].agg(['mean', 'count'])
recency_stats['rate_%'] = (recency_stats['mean'] * 100).round(1)
recency_stats[['rate_%', 'count']]

In [None]:
# Redemption rate (in %) and volume per offer type, highest rate first.
offer_stats = df_analysis.groupby('offer_type')['redeemed'].agg(['mean', 'count'])
offer_stats['rate_%'] = offer_stats['mean'].mul(100).round(1)
offer_stats[['rate_%', 'count']].sort_values('rate_%', ascending=False)

In [None]:
# Points offers only: redemption rate (in %) per frequency bin.
points_analysis = df_analysis[df_analysis['offer_type'] == 'Points']
# observed=False keeps empty bins and makes the categorical groupby behaviour
# explicit instead of relying on the deprecated default.
points_by_freq = points_analysis.groupby('freq_bin', observed=False)['redeemed'].mean() * 100
points_by_freq.round(1)

In [None]:
# Bucket customers by average ticket; intervals are right-closed, e.g. (0, 70].
df_analysis['monetary_bin'] = pd.cut(
    df_analysis['monetary_avg'],
    bins=[0, 70, 100, 130, 500],
    labels=['<$70', '$70-100', '$100-130', '$130+']
)
# Discount offers only: redemption rate (in %) per spend bucket.
discount_analysis = df_analysis[df_analysis['offer_type'] == 'Discount']
# observed=False keeps empty bins and makes the categorical groupby behaviour
# explicit instead of relying on the deprecated default.
discount_by_monetary = discount_analysis.groupby('monetary_bin', observed=False)['redeemed'].mean() * 100
discount_by_monetary.round(1)

In [None]:
# Mean redemption rate (in %) for each offer type x frequency bin combination.
cross = pd.crosstab(
    index=df_analysis['offer_type'],
    columns=df_analysis['freq_bin'],
    values=df_analysis['redeemed'],
    aggfunc='mean',
).round(3).mul(100)
cross.round(1)

### Step 5: Save Files

In [None]:
# Format dates as 'YYYY-MM-DD' on a copy rather than overwriting the column
# in df_transactions: mutating it in place made this cell fail on a second
# run, because `.dt` does not work on a column that is already strings.
# pd.to_datetime is a no-op on datetimes and also accepts date strings.
transactions_out = df_transactions.assign(
    transaction_date=pd.to_datetime(df_transactions['transaction_date']).dt.strftime('%Y-%m-%d')
)

os.makedirs(DATA_DIR, exist_ok=True)
transactions_out.to_csv(OUTPUT_TRANSACTIONS_FILE, index=False)
df_offers.to_csv(OUTPUT_INTERACTIONS_FILE, index=False)

print(f"Saved: {OUTPUT_TRANSACTIONS_FILE}, {OUTPUT_INTERACTIONS_FILE}")

## Summary

In [None]:
# Recap: data volumes plus the min-max redemption-rate spread across bins.
n_txn, n_offer_rows = len(df_transactions), len(df_offers)
redemption_pct = df_offers['redeemed'].mean() * 100
freq_rates, recency_rates = freq_stats['rate_%'], recency_stats['rate_%']
print(f"Transactions: {n_txn:,} | Offers: {n_offer_rows:,} | Redemption: {redemption_pct:.1f}%")
print(f"Signal: freq {freq_rates.min():.1f}%-{freq_rates.max():.1f}% | recency {recency_rates.min():.1f}%-{recency_rates.max():.1f}%")