# **Target Trial Emulation in Python**

## 1. Setup

### a. Library and Paths Setup

In [1]:
import pandas as pd
import os
import numpy as np
import matplotlib.pyplot as plt

# Add working directory to path so the project-local `custom_modules` package
# imported below can be resolved (assumes the notebook is launched from the
# directory that contains `custom_modules/` — TODO confirm).
import sys
cwd = os.getcwd()
if cwd not in sys.path:  # skip the append when the entry is already present (e.g. cell re-run)
    sys.path.append(cwd)

# Custom modules
from custom_modules.trialemulation.trial_sequence import trial_sequence
from custom_modules.trialemulation.te_stats_glm_logit import TEStatsGLMLogit
from custom_modules.trialemulation.te_datastore import save_to_datatable

# Relative locations used by later cells: input CSVs and the per-estimand
# model folders. The values keep their trailing slash.
CSV_PATH, PP_PATH, ITT_PATH = './csv_files/', './models/PP/', './models/ITT/'
SCRIPT_PATH = os.getcwd()  # directory the kernel was started in

# Pandas console rendering: allow very wide lines and never elide columns,
# so printed frames are not wrapped.
pd.options.display.width = 1000        # adjust to fit your terminal
pd.options.display.max_columns = None  # show all columns

### b. Specify estimands

In [2]:
# Create one trial-sequence object per estimand; the string selects the estimand.
trial_pp = trial_sequence("PP")  # Per-protocol
trial_itt = trial_sequence("ITT") # Intention-to-treat

### c. Create directories

In [3]:
# One output folder per estimand, created under the current working directory.
base_dir = os.getcwd()
trial_pp_dir, trial_itt_dir = (
    os.path.join(base_dir, folder) for folder in ("trial_pp", "trial_itt")
)

# exist_ok=True makes re-running this cell a no-op when the folders already exist.
for out_dir in (trial_pp_dir, trial_itt_dir):
    os.makedirs(out_dir, exist_ok=True)

## 2. Data Preparation

### a. Load the data

In [4]:
file_path = os.path.join(CSV_PATH, "data_censored.csv")

# Read the CSV file into a DataFrame.
# A missing file is re-raised (with the same message as before) instead of
# only being printed: every later cell needs `data_censored`, so continuing
# would just fail further down with a less helpful NameError.
try:
    data_censored = pd.read_csv(file_path)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"File not found at {file_path}") from exc

# Only reached when the read succeeded.
print("Data loaded successfully!")
print(data_censored.head())  # Display the first few rows

Data loaded successfully!
   id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  censored  eligible
0   1       0          1   1  1.146148   0  0.734203   36  0.083333        0         0         1
1   1       1          1   1  0.002200   0  0.734203   37  0.166667        0         0         0
2   1       2          1   0 -0.481762   0  0.734203   38  0.250000        0         0         0
3   1       3          1   0  0.007872   0  0.734203   39  0.333333        0         0         0
4   1       4          1   1  0.216054   0  0.734203   40  0.416667        0         0         0


### b. Set the data

In [5]:
# Column-role mapping shared by both estimands: which columns of
# `data_censored` hold the id, period, treatment, outcome and eligibility flag.
column_roles = dict(
    id_col="id",
    period_col="period",
    treatment_col="treatment",
    outcome_col="outcome",
    eligible_col="eligible",
)

# Attach the same DataFrame to each trial object.
trial_pp = trial_pp.set_data(data=data_censored, **column_roles)
trial_itt = trial_itt.set_data(data=data_censored, **column_roles)

In [6]:
# Print the ITT trial object; the recorded output lists its estimand and a preview of the attached data.
trial_itt.show()

Trial Sequence Object
Estimand: Intention-to-treat

Data:
     id  period  treatment  x1        x2  x3        x4  age     age_s  outcome  censored  eligible  time_of_event  first  am_1  cumA  switch  regime_start  time_on_regime  eligible0  eligible1
0     1       0          1   1  1.146148   0  0.734203   36  0.083333        0         0         1         9999.0   True     0     1       0             0               0          1          0
1     1       1          1   1  0.002200   0  0.734203   37  0.166667        0         0         0         9999.0  False     1     2       0             1               1          0          1
2     1       2          1   0 -0.481762   0  0.734203   38  0.250000        0         0         0         9999.0  False     1     3       0             2               1          0          1
3     1       3          1   0  0.007872   0  0.734203   39  0.333333        0         0         0         9999.0  False     1     4       0             3               1

## 3. Weight models and censoring

### a. Censoring due to treatment switching

In [7]:
# Configure the treatment-switching weight model on the per-protocol trial.
# The fitter is built first and writes under PP_PATH/"switch_models".
switch_fitter = TEStatsGLMLogit(os.path.join(PP_PATH, "switch_models"))

trial_pp = trial_pp.set_switch_weight_model(
    model_fitter=switch_fitter,
    denominator="age + x1 + x3",
    numerator="age",
)

# Print the configured formulas and fitter type.
trial_pp.switch_weights.show()

 - Numerator formula: treatment ~ age
 - Denominator formula: treatment ~ age + x1 + x3
 - Model fitter type: TEStatsGLMLogit
 - Weight models not fitted. Use calculate_weights()


### b. Other informative censoring

In [8]:
# Configure the informative-censoring weight model on the per-protocol trial.
# NOTE(review): this fitter saves under the "switch_models" folder even though
# it is the censor-weight model — confirm the shared folder name is intended.
pp_censor_fitter = TEStatsGLMLogit(os.path.join(PP_PATH, "switch_models"))

trial_pp = trial_pp.set_censor_weight_model(
    model_fitter=pp_censor_fitter,
    pool_models="none",
    denominator="~ x2 + x1",
    numerator="~ x2",
    censor_event="censored",
)

# Print the configured censor-weight formulas and fitter type.
trial_pp.censor_weights.show()

 - Numerator formula: (1 - censored) ~ x2
 - Denominator formula: (1 - censored) ~ x2 + x1
 - Model fitter type: TEStatsGLMLogit
 - Weight models not fitted. Use calculate_weights()


In [9]:
# Set censor weight model for the ITT trial.
# The save path is built with os.path.join rather than string concatenation,
# so it no longer depends on ITT_PATH ending in a slash; with the current
# ITT_PATH value the resulting path is unchanged.
# NOTE(review): the folder is named "switch_models" although this is the
# censor-weight model — confirm the folder name is intended.
trial_itt = trial_itt.set_censor_weight_model(
    censor_event="censored",
    numerator="~ x2",
    denominator="~ x2 + x1",
    pool_models="numerator",
    model_fitter=TEStatsGLMLogit(save_path=os.path.join(ITT_PATH, "switch_models"))
)

# Print the configured censor-weight formulas, pooling setting and fitter type.
trial_itt.censor_weights.show()

 - Numerator formula: (1 - censored) ~ x2
 - Denominator formula: (1 - censored) ~ x2 + x1
 - Numerator model is pooled. Denominator model is not.
 - Model fitter type: TEStatsGLMLogit
 - Weight models not fitted. Use calculate_weights()


## 4. Calculate weights

In [10]:
# Fit the configured weight models and compute the weights; each call returns
# the trial object, which is rebound to the same name.
# NOTE(review): the recorded output below is a fragment of a log-likelihood
# expression, presumably the tail of a numerical warning raised during
# fitting — check the full message when re-running.

# Calculate weights for Per-protocol trial
trial_pp = trial_pp.calculate_weights()

# Calculate weights for ITT trial
trial_itt = trial_itt.calculate_weights()

  special.gammaln(n - y + 1) + y * np.log(mu / (1 - mu + 1e-20)) +
  n * np.log(1 - mu + 1e-20)) * var_weights


In [11]:
# Print the fitted weight-model summaries for the ITT trial.
# NOTE(review): the recorded summary reports "Dependent Variable: Intercept",
# a log-likelihood of ~0 and standard errors above 10,000. That pattern looks
# like a degenerate fit (e.g. a constant response) — verify that the fitter is
# given the intended outcome column before relying on these weights.
trial_itt.show_weight_models()

Weight Models for Informative Censoring
[[n]]
Model: P(censor_event = 0 | X) for numerator
Summary [tidy]:
                   Coef.      Std.Err.             z     P>|z|        [0.025        0.975]
Intercept  2.556607e+01  12161.762392  2.102168e-03  0.998323 -23811.050208  23862.182345
x2        -2.171798e-14  12434.708391 -1.746561e-18  1.000000 -24371.580605  24371.580605
Summary [glance]:
                      0                 1                2            3
0               Model:               GLM             AIC:       4.0000
1       Link Function:             Logit             BIC:   -1841.0897
2  Dependent Variable:         Intercept  Log-Likelihood:  -2.5306e-09
3                Date:  2025-03-09 23:59         LL-Null:       0.0000
4    No. Observations:               321        Deviance:   5.0622e-09
5            Df Model:                 1    Pearson chi2:     2.53e-09
6        Df Residuals:               319           Scale:       1.0000
7              Method:             

In [12]:
# Print the fitted weight-model summaries for the per-protocol trial.
# NOTE(review): the recorded summary reports "Dependent Variable: Intercept",
# a log-likelihood of ~0 and standard errors above 10,000. That pattern looks
# like a degenerate fit (e.g. a constant response) — verify that the fitter is
# given the intended outcome column before relying on these weights.
trial_pp.show_weight_models()

Weight Models for Informative Censoring
[[n0]]
Model: P(censor_event = 0 | X, previous treatment = 0) for numerator
Summary [tidy]:
                   Coef.      Std.Err.             z    P>|z|        [0.025        0.975]
Intercept  2.556607e+01  16581.144002  1.541876e-03  0.99877 -32472.878998  32524.011135
x2        -8.881784e-16  17442.895871 -5.091921e-20  1.00000 -34187.447692  34187.447692
Summary [glance]:
                      0                 1                2            3
0               Model:               GLM             AIC:       4.0000
1       Link Function:             Logit             BIC:    -862.8141
2  Dependent Variable:         Intercept  Log-Likelihood:  -1.3402e-09
3                Date:  2025-03-09 23:59         LL-Null:       0.0000
4    No. Observations:               170        Deviance:   2.6809e-09
5            Df Model:                 1    Pearson chi2:     1.34e-09
6        Df Residuals:               168           Scale:       1.0000
7            

## 5. Specify Outcome Model

In [13]:
# # Set outcome model for trial_pp with default parameters
# trial_pp.set_outcome_model()

# # Set outcome model for trial_itt with specified adjustment_terms
# trial_itt.set_outcome_model(adjustment_terms="x2")

## 6.  Expand Trials

In [14]:
# Both estimands use the same chunk size; each call receives its own
# save_to_datatable() result rather than sharing one.
expansion_chunk_size = 500

trial_pp = trial_pp.set_expansion_options(
    chunk_size=expansion_chunk_size,
    output=save_to_datatable(),
)
trial_itt = trial_itt.set_expansion_options(
    chunk_size=expansion_chunk_size,
    output=save_to_datatable(),
)

In [15]:
# TODO: not yet ported — the commented lines below use R syntax (`<-`, `@`), not Python.
# trial_pp  <- expand_trials(trial_pp)
# trial_itt <- expand_trials(trial_itt)

# trial_pp@expansion

## 7.  Load or Sample from Expanded Data

In [16]:
# trial_itt = load_expanded_data(trial_itt, seed=1234, p_control=0.5)

## 8. Fit Marginal Structural Model

In [17]:
# trial_itt = fit_msm(
#     trial_itt,
#     weight_cols=["weight", "sample_weight"],
#     modify_weights=lambda w: w.clip(upper=w.quantile(0.99))  # Winsorization of extreme weights
# )

# # Accessing the model summary
# model_summary = trial_itt.outcome_model

# TODO: not yet ported — the commented lines below use R syntax (`@`, `$`), not Python.
# trial_itt@outcome_model@fitted@model$model

# trial_itt@outcome_model@fitted@model$vcov

# trial_itt

## 9. Inference

In [18]:
# # Generate predictions
# preds = trial_itt.predict(
#     newdata=trial_itt.outcome_data[trial_itt.outcome_data['trial_period'] == 1],
#     predict_times=np.arange(0, 11),
#     type="survival"
# )

# # Plotting the results
# plt.plot(preds['difference']['followup_time'], preds['difference']['survival_diff'], label='Survival Difference', color='blue')
# plt.xlabel("Follow up")
# plt.ylabel("Survival difference")

# # Plot the confidence intervals
# plt.plot(preds['difference']['followup_time'], preds['difference']['2.5%'], color='red', linestyle='--', label='2.5% CI')
# plt.plot(preds['difference']['followup_time'], preds['difference']['97.5%'], color='red', linestyle='--', label='97.5% CI')

# plt.legend()
# plt.title("Survival Difference with Confidence Intervals")
# plt.show()