In [0]:
! pip install pytmle
! pip install hydra-core
! pip install hazardous
! pip install pandas==1.4.2

In [0]:
%cd /Workspace/Users/jannis.guski@scai.fraunhofer.de/commute-tmle
%env INPUT_CSV="./data/a_inputs/trinetx.csv"
# this reads all experiments from ./conf/experiment and transforms it to a comma-separated list
import os

experiments = os.listdir('./conf/experiment')
experiments = [os.path.splitext(f)[0] for f in experiments if f.endswith('.yaml')]
EXPERIMENTS = ','.join(experiments)
os.environ['EXPERIMENTS'] = EXPERIMENTS

Move experiments that should not be run to the `ignore` folder (e.g., the hospitalization experiments, because TriNetX contains no information on COVID hospitalization).

In [0]:
!mv ./conf/experiment/*hospital* ./conf/experiment/ignore

## Set dates

In [0]:
# Run the src.set_dates module in a subprocess, passing the comma-separated
# experiment list and the input CSV path as command-line overrides
# (presumably a Hydra multirun sweep with one job per experiment; hydra-core
# is installed at the top of the notebook).
# NOTE(review): EXPERIMENTS exists both as a Python variable and as an
# environment variable, while INPUT_CSV is only set through %env. Confirm that
# both expand as intended; the value stored by %env may keep its literal
# double quotes.
!python3 -m src.set_dates --multirun +experiment=$EXPERIMENTS general.input_csv=$INPUT_CSV

## Merge covariates

Unfortunately, it is not possible to simply call `src.merge_covariates` from this notebook, because the Spark session object cannot be shared. A solution could be to use the Databricks Connect API, but this is currently not possible because tokens are disabled on the LUCID platform.

In [0]:
from src.cohort_specific.trinetx.merge_covariates_tnx import merge_covariates_trinetx
from src.utils.utils import parse_path_for_experiment

from omegaconf import OmegaConf
import pandas as pd
import yaml


In [0]:
# Cohort-level settings; the parsed mapping is forwarded as keyword arguments
# to merge_covariates_trinetx for every experiment.
with open("./conf/cohort/trinetx.yaml", 'r') as file:
    tnx_config = yaml.safe_load(file)

for exp in os.environ["EXPERIMENTS"].split(","):
    # "".split(",") yields [""], so an empty EXPERIMENTS variable would
    # otherwise try to open "./conf/experiment/.yaml".
    if not exp:
        continue
    print(exp)

    # Load the experiment's YAML config and wrap it in an OmegaConf object,
    # which is the form passed to parse_path_for_experiment.
    with open(f"./conf/experiment/{exp}.yaml", 'r') as file:
        exp_config = yaml.safe_load(file)
    exp_config = OmegaConf.create(exp_config)

    # Read the output of the "Set dates" step for this experiment.
    input_path = parse_path_for_experiment("./data/b_dates_set", exp_config)
    df = pd.read_csv(
        input_path,
        parse_dates=["index_date"],
    )

    # `spark` is the session object provided by the notebook environment.
    df_merged = merge_covariates_trinetx(df, spark=spark, **tnx_config)
    df_merged = df_merged.set_index("patient_id")

    # Write the merged table without the listed columns (errors="ignore"
    # tolerates any of them being absent); floats are written with two
    # decimal places.
    save_path = parse_path_for_experiment(
        "./data/c_covariates_merged", exp_config
    )
    df_merged.drop(
        columns=["diagnoses", "diagnosis_dates", "drugs", "prescription_dates"],
        errors="ignore",
    ).to_csv(save_path, float_format="%.2f")


## Fit TMLE