In [None]:
import os
import pandas as pd # type: ignore
import numpy as np # type: ignore

try:
    os.chdir('/container/mount/point')
except FileNotFoundError:
    print("Warning: Directory '/container/mount/point' does not exist.")

In [34]:
def preprocess_exposure(data, target_variable, mapping, new_col, dataset_name):
    """
    Preprocesses the exposure variable in the dataset.

    Parameters:
        data (pd.DataFrame): Input DataFrame.
        target_variable (str): Name of the column to map.
        mapping (dict): Dictionary mapping original values to new values.
        new_col (str): Name of the new column to create.
        dataset_name (str): Name of the dataset (for logging).

    Returns:
        pd.DataFrame: DataFrame with the new exposure column and rows with missing values dropped.
    """
    print(f"\nRunning dataset: {dataset_name}")
    print(f"Covariates data (before): {data.shape}")
    data[new_col] = data[target_variable].map(mapping)
    data = data.dropna(subset=[new_col])
    print(f"Covariates data (after): {data.shape}")
    return data

def set_thresholds(df, column_thresholds):
    """
    Sets matching thresholds for covariates.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        column_thresholds (dict): Dictionary of column names and their threshold values.

    Returns:
        np.ndarray: Array of thresholds aligned with DataFrame columns.
    """
    thresholds = np.full(df.shape[1], np.nan)
    for col, val in column_thresholds.items():
        if col in df.columns:
            thresholds[df.columns.get_loc(col)] = val
    return thresholds

def match_and_simulate(df, target_variable, target_encoding, column_thresholds, n_col, output_prefix, dataset_name):
    """
    Performs matching and simulates outcomes for a given dataset.

    Parameters:
        df (pd.DataFrame): Input DataFrame.
        target_variable (str): Name of the exposure variable.
        target_encoding (dict): Mapping for exposure variable to string labels.
        column_thresholds (dict): Dictionary of covariate thresholds.
        n_col (int): Number of randomizations for simulation.
        output_prefix (str): Prefix for output file paths.
        dataset_name (str): Name of the dataset (for logging).

    Returns:
        pd.DataFrame: DataFrame of matched pairs.
    """
    print(f"\nMatching and simulating for dataset: {dataset_name}")
    df["W"] = df[target_variable]
    df["W_str"] = df["W"].map(target_encoding)
    df["is_treated"] = df["W"].astype(bool)
    df["pair_nb"] = np.nan

    test, control = df[df["W"] == 0], df[df["W"] == 1]
    print(f"Number of test - {len(test)}")
    print(f"Number of control - {len(control)}")

    thresholds = set_thresholds(df, column_thresholds)
    scaling = np.ones(df.shape[1], dtype=int)

    treated_units = df[df["is_treated"]]
    control_units = df[~df["is_treated"]]
    print(f"Number of treated units: {treated_units.shape[0]}")
    print(f"Number of control units: {control_units.shape[0]}")

    discrepancies = discrepancyMatrix(treated_units, control_units, thresholds, scaling)
    g, pairs_dict = construct_network(discrepancies, treated_units.shape[0], control_units.shape[0])
    matched_df = process_matched_pairs(pairs_dict, treated_units, control_units)

    print(f"Number of pairs: {len(matched_df.W)}")
    print(f"Number of test individuals: {len(matched_df[matched_df.W == 0])}")
    print(f"Number of control individuals: {len(matched_df[matched_df.W == 1])}\n")

    matched_df.to_csv(f'{output_prefix}_matched_df_{target_variable}.csv', index=True)
    simulated_outcomes = generate_simulated_outcomes(matched_df, n_col)
    simulated_outcomes.to_csv(f'{output_prefix}_simulated_outcomes_{target_variable}.csv', index=True)

    return matched_df

### AGP Dataset

In [41]:
# AGP parameters
K = 1000
agp_params = {
    "name": "AGP",
    "file": "data/AGP/agdata_smoke.csv",
    "index_col": 0,
    "target_variable": "smoking_frequency",
    "mapping": {"Daily": 0, "Never": 1},
    "new_col": "W",
    "output_prefix": "data/AGP",
    "column_thresholds": {"sex": 0, "age_cat": 0, "bmi_corrected": 4},
    "encoding": {0: "Yes", 1: "No"}
}

# AGP workflow
agp_data = pd.read_csv(agp_params["file"], index_col=agp_params["index_col"], low_memory=False)
agp_data = preprocess_exposure(agp_data, agp_params["target_variable"], agp_params["mapping"], agp_params["new_col"], agp_params["name"])
agp_data.to_csv(f"{agp_params['output_prefix']}_preprocessed.csv", index=True)

agp_matched_df = match_and_simulate(
    agp_data, agp_params["new_col"], agp_params["encoding"], agp_params["column_thresholds"], K, agp_params["output_prefix"], agp_params["name"]
)
agp_matched_df.to_csv("data/sample_df_ige_AGP.csv", index=True)


Running dataset: AGP
Covariates data (before): (12089, 660)
Covariates data (after): (12089, 660)

Matching and simulating for dataset: AGP
Number of test - 234
Number of control - 11855
Number of treated units: 11855
Number of control units: 234
Number of pairs: 468
Number of test individuals: 234
Number of control individuals: 234



### KORA Dataset

In [42]:
# KORA parameters
K = 1000
kora_params = {
    "name": "KORA",
    "file": "data/kora_full_preprocessed_masked.csv",
    "index_col": "u3_16s_id",
    "target_variable": "smoking_(cat)",
    "mapping": {1: 0, 3: 1},
    "new_col": "smoking_bin",
    "output_prefix": "data/KORA",
    "column_thresholds": {"sex": 0, "age_exm": 0, "bmi": 4},
    "encoding": {0: "Yes", 1: "No"}
}

# KORA workflow
kora_data = pd.read_csv(kora_params["file"], index_col=kora_params["index_col"], low_memory=False)
kora_data = preprocess_exposure(kora_data, kora_params["target_variable"], kora_params["mapping"], kora_params["new_col"], kora_params["name"])
kora_data.to_csv(f"{kora_params['output_prefix']}_preprocessed.csv", index=True)

kora_matched_df = match_and_simulate(
    kora_data, kora_params["new_col"], kora_params["encoding"], kora_params["column_thresholds"], K, kora_params["output_prefix"], kora_params["name"]
)
kora_matched_df.to_csv("data/sample_df_ige_KORA.csv", index=True)


Running dataset: KORA
Covariates data (before): (1938, 75)
Covariates data (after): (1084, 76)

Matching and simulating for dataset: KORA
Number of test - 271
Number of control - 813
Number of treated units: 813
Number of control units: 271
Number of pairs: 506
Number of test individuals: 253
Number of control individuals: 253

