In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
# set the working directory

In [3]:
def simulate_data(n_samples=50_000, test_size=0.2):
    """
    Simulates a dataset with X (predictors), T (treatment), Y (outcome) where T affects Y
    depending on X; then splits data into training and test sets.

    Control rows (T == 0) get a flat outcome probability of 0.30. Treated rows
    (T == 1) get a probability that shifts with X1..X6, which is what creates the
    heterogeneous treatment effect. X7 is generated but does not enter the outcome.

    Side effects: reseeds NumPy's global random state with 2024 and writes two CSV
    files under data/ (relative to the current working directory), creating that
    directory if it does not exist yet.

    Parameters:
    n_samples (int): Number of samples to generate.
    test_size (float): Proportion of the dataset to include in the test split.

    Returns:
    None: Saves the generated dataset to CSV files.
    """
    # Set the random seed for reproducibility
    rs = 2024
    np.random.seed(rs)

    # Generate random values for columns X1, X2, X3, X4
    X1 = np.random.rand(n_samples)
    X2 = np.random.rand(n_samples)
    X3 = np.random.rand(n_samples)
    X4 = np.random.rand(n_samples)

    # Create column X5 to have a correlation of 0.3 with column X4
    noise = np.random.rand(n_samples)
    X5 = 0.3 * X4 + np.sqrt(1 - 0.3 ** 2) * noise

    # Generate binary columns X6 and X7 with a probability of 0.2 for 1
    X6 = np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2])
    X7 = np.random.choice([0, 1], size=n_samples, p=[0.8, 0.2])

    # Generate binary column T with a 50% probability for 0 and 1
    T = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])

    # Outcome probability for treated rows, computed for every row at once.
    # The terms are kept in the same order as the scalar formula they replace.
    prob_treated = (
        0.30
        - 0.04 * (X1 + X2)
        + 0.16 * (X3 * X4)
        - 0.02 * X5
        + 0.02 * X6
    )

    # Treated rows use the heterogeneous probability; control rows stay at the
    # 0.30 baseline, effectively creating treatment heterogeneities.
    prob_Y = np.where(T == 1, prob_treated, 0.30)

    # Ensure the probability is within the range [0, 1]
    prob_Y = np.clip(prob_Y, 0, 1)

    # Generate Y based on the calculated probability: one uniform draw per
    # sample, with Y = 1 when the draw falls in the top prob_Y share of [0, 1).
    # NOTE(review): this is intended to reproduce, for the same seed, the
    # outcomes of a per-sample np.random.choice([0, 1], p=[1 - p, p]) loop;
    # diff the CSVs against an earlier run if bit-for-bit reproducibility matters.
    draws = np.random.random_sample(n_samples)
    Y = (draws >= 1 - prob_Y).astype(int)

    # Create a DataFrame
    df = pd.DataFrame({
        'X1': X1,
        'X2': X2,
        'X3': X3,
        'X4': X4,
        'X5': X5,
        'X6': X6,
        'X7': X7,
        'T': T,
        'Y': Y
    })

    train_df, test_df = train_test_split(df, test_size=test_size, random_state=rs)

    # to_csv does not create missing parent directories, so make sure the
    # output folder exists before writing.
    os.makedirs('data', exist_ok=True)
    train_df.to_csv('data/train_data.csv', index=False)
    test_df.to_csv('data/test_data.csv', index=False)
    print("Data simulation complete. Files saved to data/train_data.csv and data/test_data.csv")


# -------------------------------------------------------
# Notes for generalizing the code above to practical business A/B tests:
#
# (1) Intercept (baseline rate) choices
# - The simulator uses a baseline probability of 0.30 (30% chance of "any return shipment").
# - In real deployments, set this intercept based on your observed baseline metric: e.g., if your true return-shipment rate is ~12%, use 0.12.
#
# (2) Customize correlation between X5 and X4 in other business cases
# - In retail/marketplace applications, features like "cart value", "number of items", "eco-label count", "weekday", or "duplicates present" are often correlated.
# - The authors deliberately make X5 correlated with X4 to mimic realistic feature dependencies.
#
# Current default (used above):
#   X5 = 0.3 * X4 + sqrt(1 - 0.3**2) * noise
# This construction targets corr(X4, X5) ≈ 0.3, assuming X4 and 'noise' are independent and have equal variance.
#
# Why this works:
# - With X5 = a*X4 + b*noise and X4 ⟂ noise, corr(X4, X5) = a / sqrt(a**2 + b**2) (if variances match).
# - Hence, to target correlation ρ, a convenient choice is: a = ρ, b = sqrt(1 - ρ**2).
#
# --- TUNE HERE IF YOU WANT A DIFFERENT CORRELATION ---
# Example A: Stronger correlation (≈ 0.7), i.e., X5 follows X4 more closely
# a = 0.7
# b = np.sqrt(1 - a**2)
# X5 = a * X4 + b * noise
#
# Example B: Weaker correlation (≈ 0.1), i.e., X5 more independent from X4
# a = 0.1
# b = np.sqrt(1 - a**2)
# X5 = a * X4 + b * noise
#
# (3) Clamping vs. using a logit/probit link
# - The simulator clamps probabilities to [0,1] with min(max(prob_Y,0),1) as a safe shortcut.
# - In applied analytics, prefer a smooth link function for probability outcomes:
#     prob_Y = 1 / (1 + np.exp(-linear_index))     # logistic link
#     linear_index = <your deterministic function of X and T>
# - Benefits:
#   * Predictions always in (0,1) without sharp truncation.
#   * Better behaved gradients for optimization and interpretation.
#
# (4) Simulating continuous business KPIs (returns value / sales)
# - Many core KPIs (returns value, AOV, sales) are continuous, skewed, and non-negative.
# - To simulate these, replace the Bernoulli draw with a skewed non-negative distribution whose mean depends on X and T. Two common choices:
#     (a) Log-normal:
#         Y_value = np.random.lognormal(mean=mu, sigma=sigma)
#         mu = <mean index as a function of X,T>         # e.g., mu = beta0 + f(X,T)
#         sigma = <spread parameter, e.g., 0.5>
#     (b) Gamma:
#         Y_value = np.random.gamma(shape, scale)
#         shape, scale = <set from desired mean/variance that depend on X,T>
# - This lets you simulate:
#     * Returns (value): non-negative, right-skewed
#     * Sales / revenue: non-negative, right-skewed
#     * AOV (average order value): similar properties
#
# Training tweak for econml CausalForestDML:
# - If you switch to a continuous outcome, set:
#     CausalForestDML(discrete_treatment=True, discrete_outcome=False, ...)
# - Off-policy evaluation (IPS) and GATE logic still apply conceptually—you're now estimating treatment effects on continuous KPIs (e.g., expected returns value or sales).
# ----------------------------------------------------------

In [4]:
# Script entry point: run the simulation with its default arguments when this
# file is executed directly rather than imported.
if __name__ == "__main__":
    simulate_data()

Data simulation complete. Files saved to data/train_data.csv and data/test_data.csv
