# Choose fixed constants for synthetic causal survival data

* This configuration uses no censoring, low proxy noise, weak confounding, and a moderate treatment effect (tau = 0.2)
* To change a setting, edit the corresponding constants:
    * proxy noise: sigma_z, sigma_v
    * strength of confounding: gamma_u, beta_u
    * strength of treatment effect: tau
    * level of censoring: target_censor_rate

In [41]:
import numpy as np

# Reproducibility: a single seeded generator is shared by every later cell
SEED = 123
rng = np.random.default_rng(SEED)


def _sparse_coef(length, support, value):
    """Return a zero vector of the given length with `value` at the `support` indices."""
    coef = np.zeros(length)
    coef[support] = value
    return coef


# Dimensions
n = 5000          # number of subjects
p = 20            # number of covariates

# -------------------------------------------------------------

# Proxy (negative control) model knobs: Z, V
#   Z = a_z * U + b_z^T X + eps_z,   eps_z ~ N(0, sigma_z^2)
#   V = a_v * U + b_v^T X + eps_v,   eps_v ~ N(0, sigma_v^2)
#
# With U ~ N(0, 1) and the X term removed (i.e. for the residualized proxy):
#   Corr(Z - b_z^T X, U) = a_z / sqrt(a_z^2 + sigma_z^2), same for V
#   -> sigma = a * sqrt(1 / Corr^2 - 1)
# The raw proxy is slightly less correlated with U, because Var(b_z^T X)
# also enters the denominator:
#   Corr(Z, U) = a_z / sqrt(a_z^2 + Var(b_z^T X) + sigma_z^2)
#
# We fix a and tune sigma. For a = 1.5 (X term ignored):
#   Corr                sigma
#   0.9                 0.73
#   0.8                 1.125
#   0.7                 1.53
#   0.6                 2.0
#   0.5                 2.6
#   0.4                 3.44
#   0.3                 4.77
#   0.2                 7.35
#   0.1                 14.93
#
# The lower the correlation, the noisier the proxy
a_z = a_v = 1.5
sigma_z = sigma_v = 0.6

# b_z, b_v are sparse (a few nonzeros) with modest magnitude.
# With p=20, 5 nonzeros give some heterogeneity (proxies aren't trivial)
# while avoiding dense, noisy relationships.
k_proxy = 5  # number of nonzero entries

# Supports are drawn for Z first, then V; this order fixes the random stream
proxy_idx_z = rng.choice(p, size=k_proxy, replace=False)
proxy_idx_v = rng.choice(p, size=k_proxy, replace=False)

# Since X_i ~ N(0, I_p), each nonzero entry adds 0.2^2 = 0.04 to Var(b^T X)
#   Var(b^T X)  = 5 * 0.04 = 0.2
#   Var(a_z U)  = a^2      = 2.25
#   Var(eps)    = sigma^2  = 0.36
# so the U signal dominates both the X signal and the noise
b_z = _sparse_coef(p, proxy_idx_z, 0.2)
b_v = _sparse_coef(p, proxy_idx_v, 0.2)

# --------------------------------------------------------------

# Treatment assignment and survival knobs:
#   P(W=1|X,U) = sigmoid(b0 + gamma_u * U + alpha^T X)
#   eta(w) = beta_t^T X + beta_u * U + tau * w
#   T(w) ~ Weibull(shape=k_weib, scale depends on eta(w))

# Confounding strength: how much U drives treatment assignment W (gamma_u)
# and the outcome T (beta_u)
#   level     gamma_u     beta_u
#   none      0.0         0.0
#   weak      0.2         0.2
#   moderate  0.5         0.5
#   strong    1.0         1.0
#   extreme   2.0         2.0       note: may need to change b0 to +- 0.3 to bring back the marginal treatment rate to ~0.5

b0 = 0.0          # intercept (controls prevalence)
gamma_u = 0.2

# alpha: sparse, small magnitude, so X matters for treatment without dominating U
k_alpha = 5
alpha_idx = rng.choice(p, size=k_alpha, replace=False)
alpha = _sparse_coef(p, alpha_idx, 0.1)

# Weibull shape k:
#   k           effect
#   1           constant hazard (exponential)
#   1.2-1.8     mildly increasing hazard (recommended range)
#   >2.5        very fast hazard growth; short survival
#   <1          decreasing hazard (not recommended)
k_weib = 1.5      # Weibull shape
beta_u = 0.2

# beta_t: sparse, small magnitude, so X matters for the outcome without dominating U
k_beta = 5
beta_idx = rng.choice(p, size=k_beta, replace=False)
beta_t = _sparse_coef(p, beta_idx, 0.1)

# Treatment effect on the log-hazard scale: hazard ratio = exp(tau)
#   tau           effect
#   0.0           no effect
#   0.05-0.15     small effect
#   0.15-0.3      moderate effect
#   0.3-0.6       large effect
tau = 0.2

# Baseline scale parameter for event times
lambda0 = 1500.0

# -------------------------------------------------------------

# Censoring knobs
target_censor_rate = 0
c_admin = 1000.0  # administrative censoring time (days); initial guess, recalibrated in the censoring cell

# Optional light random censoring (False = pure administrative censoring).
# Pure administrative censoring: subjects are censored only because the study
# ends, not because of individual-specific dropout or loss to follow-up.
use_random_censoring = False
rand_censor_dist = "exponential"  # placeholder, only used if use_random_censoring=True
rand_censor_scale = 5000.0        # placeholder

# Bundle every knob into one dictionary
params = {
    "SEED": SEED, "n": n, "p": p,
    "a_z": a_z, "a_v": a_v, "sigma_z": sigma_z, "sigma_v": sigma_v,
    "b_z": b_z, "b_v": b_v,
    "b0": b0, "gamma_u": gamma_u, "alpha": alpha,
    "k_weib": k_weib, "lambda0": lambda0, "beta_t": beta_t,
    "beta_u": beta_u, "tau": tau,
    "target_censor_rate": target_censor_rate, "c_admin": c_admin,
    "use_random_censoring": use_random_censoring,
    "rand_censor_dist": rand_censor_dist,
    "rand_censor_scale": rand_censor_scale,
}

print("Constants chosen:")
print(f"  n={n}, p={p}")
print(f"  Confounding: gamma_u={gamma_u}, beta_u={beta_u}")
print(f"  Proxy noise: a_z=a_v={a_z}, sigma_z=sigma_v={sigma_z}")
print(f"  Weibull shape k={k_weib}, baseline scale lambda0={lambda0}")
print(f"  Admin censor time c_admin={c_admin}, target censor rate={target_censor_rate}")


Constants chosen:
  n=5000, p=20
  Confounding: gamma_u=0.2, beta_u=0.2
  Proxy noise: a_z=a_v=1.5, sigma_z=sigma_v=0.6
  Weibull shape k=1.5, baseline scale lambda0=1500.0
  Admin censor time c_admin=1000.0, target censor rate=0


# Draw observed covariates X_i

In [42]:
# Observed covariates: each row X_i is an independent draw from N(0, I_p),
# giving an array of shape (n, p)
X = rng.normal(loc=0.0, scale=1.0, size=(n, p))

# Per-column sample moments; they should sit close to 0 and 1
x_col_mean = X.mean(axis=0)
x_col_std = X.std(axis=0)

print("X shape:", X.shape)
print("X mean (first 5 dims):", x_col_mean[:5])
print("X std  (first 5 dims):", x_col_std[:5])


X shape: (5000, 20)
X mean (first 5 dims): [-0.00417848 -0.00737974  0.03927416 -0.00916619  0.00587886]
X std  (first 5 dims): [0.99882706 0.99480017 1.00976172 0.99426534 1.00630381]


# Draw the unobserved confounder U_i

In [43]:
# Unobserved confounder: one standard-normal value per subject, shape (n,)
U = rng.normal(loc=0.0, scale=1.0, size=n)

# Sanity check: sample mean near 0, sample std near 1
print("U shape:", U.shape)
print("U mean:", np.mean(U))
print("U std :", np.std(U))

U shape: (5000,)
U mean: 0.0031244516699708
U std : 0.9833456171370012


# Generate proxy variables Z_i and V_i

In [44]:
# Proxy model:
#   Z_i = a_z * U_i + b_z^T X_i + eps_z,   eps_z ~ N(0, sigma_z^2)
#   V_i = a_v * U_i + b_v^T X_i + eps_v,   eps_v ~ N(0, sigma_v^2)

# Noise terms, drawn for Z first and then V (the order fixes the random stream)
eps_z = rng.normal(loc=0.0, scale=sigma_z, size=n)
eps_v = rng.normal(loc=0.0, scale=sigma_v, size=n)

# Contribution of the observed covariates to each proxy
x_signal_z = X @ b_z
x_signal_v = X @ b_v

# Proxy = loading on U + covariate signal + independent noise
Z = a_z * U + x_signal_z + eps_z
V = a_v * U + x_signal_v + eps_v

# Empirical correlation of each raw proxy with the hidden confounder
corr_zu = float(np.corrcoef(Z, U)[0, 1])
corr_vu = float(np.corrcoef(V, U)[0, 1])

print("Z shape:", Z.shape, "V shape:", V.shape)
print("Z summary (mean, std):", float(Z.mean()), float(Z.std()))
print("V summary (mean, std):", float(V.mean()), float(V.std()))
print("Corr(Z, U):", corr_zu)
print("Corr(V, U):", corr_vu)

Z shape: (5000,) V shape: (5000,)
Z summary (mean, std): -0.006995013255793667 1.6466043972342541
V summary (mean, std): 0.004072923637241014 1.657567563693127
Corr(Z, U): 0.8937167621086809
Corr(V, U): 0.8886880525407779


# Assign treatment W_i using a confounded logistic model

In [45]:
# W_i ~ Bernoulli(sigmoid(b0 + alpha^T X_i + gamma_u * U_i))

def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), applied elementwise.

    Parameters
    ----------
    x : scalar or array-like
        Log-odds. Plain Python sequences are accepted and converted to arrays.

    Returns
    -------
    NumPy scalar or ndarray with the same shape as `x`, values in [0, 1].
    """
    x = np.asarray(x)
    # For very negative x, exp(-x) overflows to inf and the quotient is
    # exactly 0.0, which is the correct limit; only the overflow warning is
    # suppressed, the computed values are unchanged.
    with np.errstate(over="ignore"):
        return 1.0 / (1.0 + np.exp(-x))

# Log-odds of treatment: intercept + observed-covariate part + confounder part
lin_treat = b0 + X @ alpha + gamma_u * U

# Convert log-odds to the propensity score P(W=1 | X, U)
p_treat = sigmoid(lin_treat)

# One Bernoulli draw per subject (a binomial with a single trial)
W = rng.binomial(n=1, p=p_treat, size=n).astype(int)

# Extremes of the propensity score, to check overlap
ps_lo, ps_hi = p_treat.min(), p_treat.max()

print("W shape:", W.shape)
print("Treatment prevalence (mean W):", W.mean())
print("Propensity score range:", (ps_lo, ps_hi))


W shape: (5000,)
Treatment prevalence (mean W): 0.5112
Propensity score range: (np.float64(0.2527443961888165), np.float64(0.7518095236861074))


# Generate potential event times T_i(0) and T_i(1) from a Weibull Cox proportional hazards model

* If $u \sim \text{Uniform}(0,1)$, then $T = F^{-1}(u)$ has cumulative distribution function (CDF) $F$. Therefore, if you know the Weibull CDF of $T(w)$, you can generate $T(w)$ by plugging the same uniform draw $u$ into the corresponding inverse CDF. (Here $u$ is the latent uniform draw, `u_latent` in the code, not the unobserved confounder $U$.)
* Using the same $u_i$ for both $T_i(0)$ and $T_i(1)$ links the two potential outcomes for the same individual: if $u_i$ is large, both $T_i(0)$ and $T_i(1)$ will be large (late event) relative to their own distributions.

In [46]:
# Cox-style linear predictor in each potential world w in {0, 1}:
#    eta_i(w) = beta_t^T X_i + beta_u * U_i + tau * w
# The part that does not depend on w is computed once and shared.
linpred_common = X @ beta_t + beta_u * U
eta0 = linpred_common + tau * 0.0
eta1 = linpred_common + tau * 1.0

# Weibull in the (shape, scale) parameterization:
#    T_i(w) | X_i, U_i ~ Weibull(shape=k_weib, scale=lambda0 * exp(-eta_i(w) / k_weib))
scale0, scale1 = (lambda0 * np.exp(-eta / k_weib) for eta in (eta0, eta1))

# Linking step: a single latent uniform per subject, i.e. that person's
# "rank" within their conditional event-time distribution. It is clipped
# away from 0 and 1 to avoid numerical issues in the log below.
u_latent = np.clip(rng.uniform(0.0, 1.0, size=n), 1e-12, 1 - 1e-12)

# Inverse-CDF sampling. The Weibull CDF is
#    F_w(t) = 1 - exp(-(t / scale_w)^k_weib)
# and solving u = F_w(t) for t gives
#    t = scale_w * (-log(1 - u))^(1 / k_weib)
neg_log_surv = -np.log(1.0 - u_latent)
base = neg_log_surv ** (1.0 / k_weib)
T0 = scale0 * base
T1 = scale1 * base

print("T0/T1 shapes:", T0.shape, T1.shape)
print("Median T0, T1:", np.median(T0), np.median(T1))

# Should be 1 due to the shared u_latent:
#   T1 = (scale1 / scale0) * T0, and scale1 / scale0 = exp(-tau / k_weib)
#   is the same constant for every subject (X and U cancel), so T1 is an
#   exact linear function of T0 up to floating-point rounding.
print("Corr(T0, T1):", np.corrcoef(T0, T1)[0, 1])

T0/T1 shapes: (5000,) (5000,)
Median T0, T1: 1157.71041711901 1013.197268240639
Corr(T0, T1): 0.9999999999999999


# Reveal the factual event time T_i under the assigned treatment

In [47]:
# Factual event time: T_i = T_i(W_i), i.e. the potential outcome that
# matches the treatment each subject actually received
is_treated = W == 1
T = np.where(is_treated, T1, T0)

t_min_med_max = (np.min(T), np.median(T), np.max(T))

print("T shape:", T.shape)
print("Observed event time summary (min, median, max):",
      t_min_med_max)
print("Median T | W=0:", np.median(T[W == 0]))
print("Median T | W=1:", np.median(T[is_treated]))


T shape: (5000,)
Observed event time summary (min, median, max): (np.float64(4.174980569630763), np.float64(1086.3393771408537), np.float64(8050.372317975076))
Median T | W=0: 1174.7753817081002
Median T | W=1: 994.7938404923486


# Add right-censoring to get observed (Y_i, Δ_i)

In [48]:
# Calibrate the administrative cutoff: placing c_admin at the
# (1 - target_censor_rate) quantile of T leaves approximately a
# target_censor_rate fraction of subjects with T > c_admin, i.e. censored.
# np.quantile itself rejects a target outside [0, 1].
c_admin = float(np.quantile(T, 1.0 - target_censor_rate))

# Keep the parameter bundle in sync: params was built with the initial guess
# for c_admin, which the calibration above has just replaced.
params["c_admin"] = c_admin

# Censoring time (pure administrative censoring): one shared cutoff
C = np.full(n, c_admin, dtype=float)

# (Optional) add light random censoring on top (usually leave off for clean control).
# To enable it, set use_random_censoring=True in the constants cell.
# NOTE: c_admin is calibrated on T alone, so with random censoring enabled the
# achieved censoring rate will exceed target_censor_rate.
if use_random_censoring:
    if rand_censor_dist == "exponential":
        C_rand = rng.exponential(scale=rand_censor_scale, size=n)
    elif rand_censor_dist == "uniform":
        C_rand = rng.uniform(low=0.0, high=rand_censor_scale, size=n)
    else:
        raise ValueError("rand_censor_dist must be 'exponential' or 'uniform'")
    # A subject is censored at whichever censoring time comes first
    C = np.minimum(C, C_rand)

# Observed follow-up time and event indicator
Y = np.minimum(T, C)
Delta = (T <= C).astype(int)  # 1=event observed, 0=censored

# Report achieved censoring
censor_rate = 1.0 - Delta.mean()

print(f"Calibrated administrative censoring time c_admin = {c_admin:.3f}")
print(f"Achieved censoring rate = {censor_rate:.3%}  (target {target_censor_rate:.0%})")
print("Y summary (min, median, max):", (float(Y.min()), float(np.median(Y)), float(Y.max())))
print("Events observed (Delta=1) count:", int(Delta.sum()), "out of", n)


Calibrated administrative censoring time c_admin = 8050.372
Achieved censoring rate = 0.000%  (target 0%)
Y summary (min, median, max): (4.174980569630763, 1086.3393771408537, 8050.372317975076)
Events observed (Delta=1) count: 5000 out of 5000


# Combine all generated components into a single DataFrame

In [49]:
import pandas as pd

# Covariates as a frame with columns X1..Xp
X_cols = [f"X{j+1}" for j in range(p)]
df_X = pd.DataFrame(X, columns=X_cols)

# Observed dataset: covariates followed by proxies, treatment and outcome
df = df_X.assign(
    Z=Z,            # proxy 1
    V=V,            # proxy 2
    W=W,            # treatment
    Y=Y,            # observed time
    Delta=Delta,    # event indicator (1=event, 0=censored)
)

# Quantities that an analyst would not see in real data
df_hidden = pd.DataFrame(dict(U=U, T0=T0, T1=T1, T=T))

print("Observed data shape:", df.shape)
print("Columns:", df.columns.tolist())
print("\nHead of observed dataset:")
display(df.head())

# Single container holding both views (observed + hidden)
full_data = dict(observed=df, hidden=df_hidden)

Observed data shape: (5000, 25)
Columns: ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10', 'X11', 'X12', 'X13', 'X14', 'X15', 'X16', 'X17', 'X18', 'X19', 'X20', 'Z', 'V', 'W', 'Y', 'Delta']

Head of observed dataset:


Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X16,X17,X18,X19,X20,Z,V,W,Y,Delta
0,-0.311795,0.337769,-2.207471,0.827921,1.54163,1.126807,0.75477,-0.145978,1.281902,1.074031,...,-2.172044,-0.370147,0.16438,0.859881,1.761661,1.321089,1.611579,0,1440.906398,1
1,0.993324,-0.291521,0.728128,-1.2616,1.429939,-0.156475,-0.673759,-0.63906,-0.061361,-0.392785,...,0.055346,-0.481563,-0.583408,-0.862161,-1.488175,-0.700023,-0.837553,0,605.35272,1
2,0.216307,0.984376,-0.543084,-0.558615,-0.316483,-0.46064,-1.43627,1.365108,0.439,-0.711695,...,1.519524,1.703909,-0.248859,-0.499749,0.099598,-0.883904,-0.878286,1,511.533413,1
3,0.128343,-0.734222,-0.620475,0.813274,1.641801,-0.226501,-0.647965,-0.283371,-0.995131,-0.272872,...,-0.148758,1.315666,-1.222346,-0.303591,-1.173689,2.03541,2.305684,0,1072.121843,1
4,0.826274,0.850322,-0.515768,1.658113,-0.297263,-1.383377,-0.281205,0.360021,-0.234392,2.265521,...,-2.728486,-0.646397,1.115104,-0.843211,-0.636688,-0.237534,0.432936,0,922.758148,1


# Compute CATE given (X, U): E[T(1)−T(0)∣X,U]

In [50]:
from scipy.special import gamma

def _min_med_max(values):
    """Return (min, median, max) of an array as plain Python floats."""
    return (float(values.min()), float(np.median(values)), float(values.max()))


# Linear predictors for each potential world, rebuilt from the shared part:
#    eta_i(w) = beta_t^T X_i + beta_u * U_i + tau * w
linpred_common = X @ beta_t + beta_u * U
eta0 = linpred_common + tau * 0.0
eta1 = linpred_common + tau * 1.0

# Mean of Weibull(shape=k, scale=s) is s * Gamma(1 + 1/k); this is the part
# of that mean which does not depend on the subject
weibull_mean_const = lambda0 * gamma(1.0 + 1.0 / k_weib)

# Conditional mean event times E[T(w) | X, U]
E_T0_XU = weibull_mean_const * np.exp(-eta0 / k_weib)
E_T1_XU = weibull_mean_const * np.exp(-eta1 / k_weib)

print("E[T(0) | X, U] summary (min, median, max):", _min_med_max(E_T0_XU))
print("E[T(1) | X, U] summary (min, median, max):", _min_med_max(E_T1_XU))

# Option 1: difference of the two conditional means
CATE_XU_direct = E_T1_XU - E_T0_XU

# Option 2: closed form, factoring out everything that does not depend on w
common_term = weibull_mean_const * np.exp(-linpred_common / k_weib)
CATE_XU_closed = common_term * (np.exp(-tau / k_weib) - 1.0)

# Sanity check: the two routes should agree up to rounding error
max_abs_diff = float(np.abs(CATE_XU_direct - CATE_XU_closed).max())

print("Max |direct - closed-form|:", max_abs_diff)
print("CATE(X,U) summary (min, median, max):", _min_med_max(CATE_XU_direct))
print("Average oracle CATE (given X,U):", float(np.mean(CATE_XU_direct)))

E[T(0) | X, U] summary (min, median, max): (608.7524498092433, 1358.366383821356, 2694.2747038118105)
E[T(1) | X, U] summary (min, median, max): (532.7639019750808, 1188.8060166053024, 2357.9573349484363)
Max |direct - closed-form|: 5.115907697472721e-13
CATE(X,U) summary (min, median, max): (-336.3173688633742, -169.56036721605358, -75.98854783416255)
Average oracle CATE (given X,U): -172.36864269422128


# Residualize proxies

In [51]:
# Strip the known covariate contribution from each proxy, leaving
# (loading * U + noise) up to rounding
Z_tilde = Z - X @ b_z
V_tilde = V - X @ b_v

# With low-noise proxies the residuals should stay strongly correlated with U
corr_z_tilde_u = float(np.corrcoef(Z_tilde, U)[0, 1])
corr_v_tilde_u = float(np.corrcoef(V_tilde, U)[0, 1])

print("Corr(Z_tilde, U):", corr_z_tilde_u)
print("Corr(V_tilde, U):", corr_v_tilde_u)


Corr(Z_tilde, U): 0.9249655839101226
Corr(V_tilde, U): 0.9268329887829447


# Compute the conditional distribution of U given observed (X, Z, V)

In [52]:
# Posterior of U given the residualized proxies (Gaussian conjugate update).
# Model: U ~ N(0, 1), Z_tilde = a_z * U + eps_z, V_tilde = a_v * U + eps_v,
# with independent noises eps_z ~ N(0, sigma_z^2), eps_v ~ N(0, sigma_v^2).
# Then U | Z_tilde, V_tilde is normal with
#   precision = 1 + a_z^2 / sigma_z^2 + a_v^2 / sigma_v^2
#   mean      = (a_z * Z_tilde / sigma_z^2 + a_v * V_tilde / sigma_v^2) / precision
# Multiplying numerator and denominator by sigma_z^2 * sigma_v^2 gives the
# expressions below.

# Precision-weighted combination of the two residualized proxies (per subject)
numerator = a_z * (sigma_v ** 2) * Z_tilde + a_v * (sigma_z ** 2) * V_tilde

# Posterior precision scaled by sigma_z^2 * sigma_v^2 (a scalar)
denominator = (a_z ** 2) * (sigma_v ** 2) + (a_v ** 2) * (sigma_z ** 2) + (sigma_z ** 2) * (sigma_v ** 2)

# Posterior mean of U for each subject
mu_U_given_ZV = numerator / denominator

# Posterior variance of U; it does not depend on the data, so it is one
# scalar shared by all subjects
var_U_given_ZV = (sigma_z ** 2) * (sigma_v ** 2) / denominator

print("mu_U_given_ZV shape:", mu_U_given_ZV.shape)
print("var_U_given_ZV:", float(var_U_given_ZV))
# Correlation between the posterior mean and the true U (should be high if proxies are informative)
print("Corr(mu_U_given_ZV, U):", float(np.corrcoef(mu_U_given_ZV, U)[0, 1]))


mu_U_given_ZV shape: (5000,)
var_U_given_ZV: 0.07407407407407408
Corr(mu_U_given_ZV, U): 0.9606790664758929


# Get a fully observed “ground-truth” effect $E[T(1)−T(0)∣X,Z,V]$

In [53]:
# Benchmark effect E[T(1) - T(0) | X, Z, V]: the oracle effect given (X, U),
#   lambda0 * Gamma(1 + 1/k) * exp(-(beta_t^T X + beta_u * U) / k) * (exp(-tau / k) - 1),
# averaged over the normal posterior of U given the proxies. For a normal
# variable, E[exp(c * U)] = exp(c * mu + c^2 * sigma^2 / 2); here c = -beta_u / k.
# `gamma` is the gamma function imported from scipy.special in an earlier cell.
const_term = lambda0 * gamma(1.0 + 1.0 / k_weib)                # lambda0 * Gamma(1 + 1/k): subject-independent part of the Weibull mean
x_term = -(1.0 / k_weib) * (X @ beta_t)                         # -(1/k) * beta_t^T X_i: observed-covariate part of the exponent
mu_term = -(beta_u / k_weib) * mu_U_given_ZV                    # -(beta_u/k) * mu_{U|Z,V}: c * posterior mean
var_term = 0.5 * (beta_u**2 / (k_weib**2)) * var_U_given_ZV     # + 1/2 * (beta_u^2/k^2) * sigma^2_{U|Z,V}: c^2 * posterior variance / 2
contrast_term = (np.exp(-tau / k_weib) - 1.0)                   # (e^{-tau/k} - 1): treated-vs-control ratio of mean times, minus 1

# Per-subject benchmark CATE, conditioning only on observed quantities
CATE_XZV = const_term * np.exp(x_term + mu_term + var_term) * contrast_term

print("CATE_XZV shape:", CATE_XZV.shape)
print("CATE(X,Z,V) summary (min, median, max):", (float(CATE_XZV.min()), float(np.median(CATE_XZV)), float(CATE_XZV.max())))
print("Average benchmark CATE (given X,Z,V):", float(np.mean(CATE_XZV)))

CATE_XZV shape: (5000,)
CATE(X,Z,V) summary (min, median, max): (-341.76154348852765, -169.29553238718745, -72.29427870974213)
Average benchmark CATE (given X,Z,V): -172.52084017001474


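* Sample average of $E[T(1)−T(0)∣X,U]$: $-172.37$
* Sample average of $E[T(1)−T(0)∣X,Z,V]$: $-172.52$
* The first conditions on the unobserved confounder $U$ (the oracle effect, unavailable in real data); the second conditions only on the observed variables $(X, Z, V)$, so it is the benchmark an estimator can actually target in practice.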

Do not apply censoring.

Compare our method on uncensored data against eq. 8, using mean squared error.

* basic version of csf, using x v z as all covariates