In [18]:
import pandas as pd
from datetime import datetime

In [6]:
# Demographic node template; only its header row (the column names) is used.
template_file_path = "node_template/submission_demographic_template.tsv"

# nrows=0 makes pandas parse just the header line, yielding an empty frame
# whose columns are the template's field names.
df_template = pd.read_csv(template_file_path, nrows=0, sep="\t")
headers = list(df_template.columns)  # Template field names, in file order

In [7]:
# Observational study: load the case node and the raw subject records.
# Input locations (case node TSV and raw OBS subject CSV)
case_path_obs = "/Users/jinn/Documents/IU/ARDaC/case_obs_DCC_data_release_v2-0-0.tsv"
file_path_obs = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/OBS Final Datasets/OBS_SUBJECTS.csv"

# Every column is read as text (dtype=str) so values are carried over verbatim.
df_obs_case = pd.read_csv(case_path_obs, dtype=str, sep="\t")
df_obs_input = pd.read_csv(file_path_obs, dtype=str, sep=",")

# Empty output frame: template headers as columns, one row per input record.
df_obs_output = pd.DataFrame(columns=headers, index=df_obs_input.index)

In [20]:
# Build the DEMOGRAPHIC node for the observational study: exactly one output
# row per case in df_obs_case that has a matching record in df_obs_input.
#
# Rows are collected as dicts and turned into a frame once. Writing into a
# frame pre-sized to df_obs_input.index using case_table's row labels leaves
# placeholder rows (only "*type"/"project_id" filled) wherever the two tables
# do not line up, so the output is rebuilt from the matched cases instead.

# Step 1: Extract "*submitter_id" from df_obs_case and create case_table
case_table = pd.DataFrame()
case_table["*submitter_id"] = df_obs_case["*submitter_id"]
# Subject id is the text before the first "_"; a missing id stays missing.
case_table["usubjid"] = case_table["*submitter_id"].str.split("_").str[0]

# Step 2: Index the subject records by usubjid once. The first record per
# subject wins, matching the previous "first matching row" behaviour, and
# this avoids re-filtering the whole input frame for every case.
obs_subjects = (
    df_obs_input
    .dropna(subset=["usubjid"])
    .drop_duplicates(subset="usubjid", keep="first")
    .set_index("usubjid")
)

# Output column -> source column for values that are copied unchanged.
obs_direct_columns = {
    "age_at_index": "calc_age",
    "cause_of_death_primary": "codp",
    "cause_of_death_secondary": "cods",
    "cur_employ_stat": "employed",
    "education": "edu",
    "ethnicity": "ethnic",
    "gender": "gender",
    "marital": "maristat",
    "race": "race",
    "sex": "sex",
}
obs_vital_status = {"Y": "Alive", "N": "Dead"}

# Step 3: Build one record per matched case
obs_records = []
obs_unmatched = []
for submitter_id, usubjid in zip(case_table["*submitter_id"], case_table["usubjid"]):
    if pd.isna(usubjid) or usubjid not in obs_subjects.index:
        obs_unmatched.append(submitter_id)
        continue

    input_row = obs_subjects.loc[usubjid]

    record = {
        "*type": "demographic",
        "project_id": "ARDaC-AlcHepNet",
        "*submitter_id": f"{submitter_id}_demographic",
        "*cases.submitter_id": f"{submitter_id}",
    }
    for output_column, source_column in obs_direct_columns.items():
        record[output_column] = input_row.get(source_column, None)

    # Map "vital_status" from "ALIVE". A missing value is read as NaN (a
    # float), so only strip real strings; anything else is "Not Reported".
    alive = input_row.get("ALIVE", "")
    alive = alive.strip() if isinstance(alive, str) else ""
    record["vital_status"] = obs_vital_status.get(alive, "Not Reported")

    # Extract "year_of_birth" and "year_of_death" (text before the first "-")
    brthdtc = input_row.get("brthdtc", None)
    record["year_of_birth"] = brthdtc.split("-")[0] if pd.notna(brthdtc) else None

    dthdtc = input_row.get("dthdtc", None)
    record["year_of_death"] = dthdtc.split("-")[0] if pd.notna(dthdtc) else None

    # Calculate "days_to_death" only when both dates are present
    scdat = input_row.get("scdat", None)  # Study enrollment date
    if pd.notna(dthdtc) and pd.notna(scdat):
        try:
            death_date = datetime.strptime(dthdtc, "%Y-%m-%d")
            study_date = datetime.strptime(scdat, "%Y-%m-%d")
            record["days_to_death"] = (death_date - study_date).days
        except ValueError:
            # Partial or malformed dates cannot be differenced; leave blank.
            record["days_to_death"] = None

    obs_records.append(record)

# Step 4: Assemble the output. Template columns come first, in template
# order; any populated column that is not in the template is kept after
# them. dtype=object keeps integer day counts from being written as floats.
obs_template_columns = list(df_obs_output.columns)
obs_table = pd.DataFrame(obs_records, dtype=object)
obs_extra_columns = [c for c in obs_table.columns if c not in obs_template_columns]
df_obs_output = obs_table.reindex(columns=obs_template_columns + obs_extra_columns)

if obs_unmatched:
    print(f"{len(obs_unmatched)} case(s) had no matching subject record and were skipped")
        

In [None]:
# Display the observational demographic table for visual inspection.
df_obs_output

In [22]:
# Write the observational "DEMOGRAPHIC" node as a tab-separated file,
# with a header row and without the DataFrame index column.
obs_output_path = "demographic_obs_DCC_data_release_v2-0-0.tsv"
df_obs_output.to_csv(obs_output_path, header=True, index=False, sep="\t")
print(f"Observational patients file saved as: {obs_output_path}")

Observational patients file saved as: demographic_obs_DCC_data_release_v2-0-0.tsv


In [23]:
# Clinical trial study: load the case node and the raw subject records.
# Input locations (case node TSV and raw RCT subject CSV)
case_path_rct = "/Users/jinn/Documents/IU/ARDaC/case_rct_DCC_data_release_v2-0-0.tsv"
file_path_rct = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/RCT Final Datasets/RCT_SUBJECTS.csv"

# Every column is read as text (dtype=str) so values are carried over verbatim.
df_rct_case = pd.read_csv(case_path_rct, dtype=str, sep="\t")
df_rct_input = pd.read_csv(file_path_rct, dtype=str, sep=",")

# Empty output frame: template headers as columns, one row per input record.
df_rct_output = pd.DataFrame(columns=headers, index=df_rct_input.index)

In [24]:
# Build the DEMOGRAPHIC node for the clinical trial study: exactly one output
# row per case in df_rct_case that has a matching record in df_rct_input.
#
# Rows are collected as dicts and turned into a frame once. Writing into a
# frame pre-sized to df_rct_input.index using case_table's row labels leaves
# placeholder rows (only "*type"/"project_id" filled) wherever the two tables
# do not line up, so the output is rebuilt from the matched cases instead.

# Step 1: Extract "*submitter_id" from df_rct_case and create case_table
case_table = pd.DataFrame()
case_table["*submitter_id"] = df_rct_case["*submitter_id"]
# Subject id is the text before the first "_"; a missing id stays missing.
case_table["usubjid"] = case_table["*submitter_id"].str.split("_").str[0]

# Step 2: Index the subject records by usubjid once. The first record per
# subject wins, matching the previous "first matching row" behaviour, and
# this avoids re-filtering the whole input frame for every case.
rct_subjects = (
    df_rct_input
    .dropna(subset=["usubjid"])
    .drop_duplicates(subset="usubjid", keep="first")
    .set_index("usubjid")
)

# Output column -> source column for values that are copied unchanged.
rct_direct_columns = {
    "age_at_index": "calc_age",
    "cause_of_death_primary": "codp",
    "cause_of_death_secondary": "cods",
    "cur_employ_stat": "employed",
    "education": "edu",
    "ethnicity": "ethnic",
    "gender": "gender",
    "marital": "maristat",
    "race": "race",
    "sex": "sex",
}
rct_vital_status = {"Y": "Alive", "N": "Dead"}

# Step 3: Build one record per matched case
rct_records = []
rct_unmatched = []
for submitter_id, usubjid in zip(case_table["*submitter_id"], case_table["usubjid"]):
    if pd.isna(usubjid) or usubjid not in rct_subjects.index:
        rct_unmatched.append(submitter_id)
        continue

    input_row = rct_subjects.loc[usubjid]

    record = {
        "*type": "demographic",
        "project_id": "ARDaC-AlcHepNet",
        "*submitter_id": f"{submitter_id}_demographic",
        "*cases.submitter_id": f"{submitter_id}",
    }
    for output_column, source_column in rct_direct_columns.items():
        record[output_column] = input_row.get(source_column, None)

    # Map "vital_status" from "ALIVE". A missing value is read as NaN (a
    # float), so only strip real strings; anything else is "Not Reported".
    alive = input_row.get("ALIVE", "")
    alive = alive.strip() if isinstance(alive, str) else ""
    record["vital_status"] = rct_vital_status.get(alive, "Not Reported")

    # Extract "year_of_birth" and "year_of_death" (text before the first "-")
    brthdtc = input_row.get("brthdtc", None)
    record["year_of_birth"] = brthdtc.split("-")[0] if pd.notna(brthdtc) else None

    dthdtc = input_row.get("dthdtc", None)
    record["year_of_death"] = dthdtc.split("-")[0] if pd.notna(dthdtc) else None

    # Calculate "days_to_death" only when both dates are present
    scdat = input_row.get("scdat", None)  # Study enrollment date
    if pd.notna(dthdtc) and pd.notna(scdat):
        try:
            death_date = datetime.strptime(dthdtc, "%Y-%m-%d")
            study_date = datetime.strptime(scdat, "%Y-%m-%d")
            record["days_to_death"] = (death_date - study_date).days
        except ValueError:
            # Partial or malformed dates cannot be differenced; leave blank.
            record["days_to_death"] = None

    rct_records.append(record)

# Step 4: Assemble the output. Template columns come first, in template
# order; any populated column that is not in the template is kept after
# them. dtype=object keeps integer day counts from being written as floats.
rct_template_columns = list(df_rct_output.columns)
rct_table = pd.DataFrame(rct_records, dtype=object)
rct_extra_columns = [c for c in rct_table.columns if c not in rct_template_columns]
df_rct_output = rct_table.reindex(columns=rct_template_columns + rct_extra_columns)

if rct_unmatched:
    print(f"{len(rct_unmatched)} case(s) had no matching subject record and were skipped")

In [None]:
# Display the clinical trial demographic table for visual inspection.
df_rct_output

In [26]:
# Write the clinical trial "DEMOGRAPHIC" node as a tab-separated file,
# with a header row and without the DataFrame index column.
rct_output_path = "demographic_rct_DCC_data_release_v2-0-0.tsv"
df_rct_output.to_csv(rct_output_path, header=True, index=False, sep="\t")
print(f"Clinical Trial patients file saved as: {rct_output_path}")

Clinical Trial patients file saved as: demographic_rct_DCC_data_release_v2-0-0.tsv
