## Creating Synthetic Dataset for SAR Model Evaluation

This notebook generates a synthetic dataset that matches the structure of our production data while maintaining privacy.

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [32]:
from collections import defaultdict
import numpy as np
import pandas as pd

def generate_synthetic_data(
    n_users=10000,
    n_products=1000,
    n_interactions=100000,
    start_date='2021-08-01',
    end_date='2023-10-02'
):
    # Convert dates to timestamps
    start_ts = pd.Timestamp(start_date)
    end_ts = pd.Timestamp(end_date)
    
    # Generate user IDs with a similar range to original data
    users = np.random.randint(1000000, 40000000, size=n_users)
    
    # Generate product IDs with a similar range to original data
    products = np.random.randint(2000, 32000, size=n_products)
    
    # Create more extreme power-law distribution for user activity
    user_probs = np.random.power(0.05, n_users)  # More extreme power law
    user_probs = user_probs / user_probs.sum()
    
    # Create product categories with hierarchical structure
    n_categories = 30
    n_subcategories = 5
    product_categories = np.random.randint(0, n_categories, size=len(products))
    product_subcategories = np.random.randint(0, n_subcategories, size=len(products))
    
    # Create revenue values matching real distribution
    revenue_base = np.array([0.0, 0.1, 1.0, 5.0, 10.0, 50.0, 100.0, 500.0, 1000.0, 4050.0])
    revenue_weights = np.array([0.7, 0.1, 0.05, 0.05, 0.03, 0.02, 0.02, 0.015, 0.01, 0.005])
    
    interactions = []
    user_history = defaultdict(list)
    user_categories = defaultdict(lambda: defaultdict(int))
    
    for _ in range(n_interactions):
        # Select user based on power-law distribution
        user = np.random.choice(users, p=user_probs)
        
        # Product selection with stronger patterns
        if user_history[user] and np.random.random() < 0.6:  # 60% chance of category-based selection
            # Find user's preferred categories
            user_cat_counts = user_categories[user]
            if user_cat_counts and np.random.random() < 0.8:  # 80% chance to select from preferred category
                preferred_category = max(user_cat_counts.items(), key=lambda x: x[1])[0]
                category_products = products[product_categories == preferred_category]
                product = np.random.choice(category_products)
            else:
                # Select from related subcategory
                last_product = user_history[user][-1]
                last_subcat = product_subcategories[np.where(products == last_product)[0][0]]
                subcat_products = products[product_subcategories == last_subcat]
                product = np.random.choice(subcat_products)
        else:
            # New category exploration
            product = np.random.choice(products)
        
        # Update user history and category preferences
        user_history[user].append(product)
        product_idx = np.where(products == product)[0][0]
        user_categories[user][product_categories[product_idx]] += 1
        
        # Generate timestamp with daily and weekly patterns
        days_range = (end_ts - start_ts).days
        random_days = np.random.randint(0, days_range)
        hour = np.random.normal(14, 4)  # Business hours centered
        hour = int(np.clip(hour, 0, 23))
        timestamp = start_ts + pd.Timedelta(days=random_days, hours=hour)
        
        # Generate revenue with real distribution
        revenue = np.random.choice(revenue_base, p=revenue_weights)
        
        interactions.append([user, timestamp, revenue, product])
    
    # Create DataFrame
    df = pd.DataFrame(
        interactions,
        columns=['user_id', 'interaction_timestamp', 'interaction_revenue', 'product_id']
    )
    
    # Sort by timestamp
    df = df.sort_values('interaction_timestamp')
    
    return df

In [33]:
from collections import defaultdict
synthetic_df = generate_synthetic_data()

# Display basic statistics and sample
print("Dataset Statistics:")
print(f"Number of unique users: {synthetic_df['user_id'].nunique()}")
print(f"Number of unique products: {synthetic_df['product_id'].nunique()}")
print(f"Date range: {synthetic_df['interaction_timestamp'].min()} to {synthetic_df['interaction_timestamp'].max()}")
print(f"Revenue distribution:\n{synthetic_df['interaction_revenue'].value_counts(normalize=True)}")

print("\nSample of synthetic data:")
print(synthetic_df.head())

# Save the synthetic dataset
output_path = '../data/synthetic_interactions.csv'
synthetic_df.to_csv(output_path, index=False)
print(f"\nDataset saved to: {output_path}")

Dataset Statistics:
Number of unique users: 2564
Number of unique products: 987
Date range: 2021-08-01 02:00:00 to 2023-10-01 22:00:00
Revenue distribution:
interaction_revenue
0.0       0.69884
0.1       0.10153
5.0       0.05054
1.0       0.05037
10.0      0.03001
100.0     0.01976
50.0      0.01973
500.0     0.01515
1000.0    0.00910
4050.0    0.00497
Name: proportion, dtype: float64

Sample of synthetic data:
        user_id interaction_timestamp  interaction_revenue  product_id
28963  39309124   2021-08-01 02:00:00                  0.0       20075
13582  13243611   2021-08-01 04:00:00                  0.1       29046
96703  32143353   2021-08-01 04:00:00                  0.0       12475
92389   8744795   2021-08-01 05:00:00                  0.0       22499
86444   9571056   2021-08-01 06:00:00                 10.0       23072

Dataset saved to: ../data/synthetic_interactions.csv
