In [49]:
import pandas as pd

In [50]:
# Location of the case-node submission template (tab-separated); only its header row is used
template_file_path = "node_template/submission_case_template.tsv"

# nrows=0 parses just the header line, yielding an empty frame whose
# column labels are the template's field names.
df_template = pd.read_csv(template_file_path, sep="\t", nrows=0)

# Plain Python list of the template field names, in file order
headers = list(df_template.columns)

In [None]:
# Observational Patients

# Location of the raw observational-subjects CSV export
file_path_obs = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/OBS Final Datasets/OBS_SUBJECTS.csv"

# Load the export with every column read as a string (dtype=str)
df_obs_input = pd.read_csv(file_path_obs, sep=",", dtype=str)

# Empty output frame: one row per input row (shared index), one column per template header
df_obs_output = pd.DataFrame(index=df_obs_input.index, columns=headers)

# Values that are identical for every observational case row
obs_constants = {
    "*type": "case",
    "project_id": "ARDaC-AlcHepNet",
    "*studies.submitter_id": "obs",
    "index_date": "Study Enrollment",
}
for obs_field, obs_constant in obs_constants.items():
    # .loc broadcasts the scalar down the whole column
    df_obs_output.loc[:, obs_field] = obs_constant

# ALIVE flag -> vital_status; any value other than "Y"/"N" (including missing) becomes None
obs_vital_status = {"Y": "alive", "N": "dead"}

# Output column -> (input column, per-value transform); missing inputs map to None
obs_field_sources = {
    # Subject id with an "_obs" suffix
    "*submitter_id": ("usubjid", lambda raw: None if pd.isna(raw) else f"{raw}_obs"),
    # Keep only the text after the last ":" in obs_arm, trimmed
    "cohort": ("obs_arm", lambda raw: None if pd.isna(raw) else raw.split(":")[-1].strip()),
    # Site name with surrounding whitespace removed
    "study_site": ("site", lambda raw: None if pd.isna(raw) else raw.strip()),
    "vital_status": ("ALIVE", lambda raw: obs_vital_status.get(raw)),
}
for obs_field, (obs_source, obs_transform) in obs_field_sources.items():
    df_obs_output[obs_field] = df_obs_input[obs_source].apply(obs_transform)

# Show the first rows for a quick visual check
print("Observational Patients Output DataFrame preview:")
print(df_obs_output.head())

In [None]:
# Clinical Trial Patients
# Location of the raw RCT_SUBJECTS.csv export
file_path_rct = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/RCT Final Datasets/RCT_SUBJECTS.csv"

# Load the export with every column read as a string (dtype=str)
df_input_rct = pd.read_csv(file_path_rct, sep=",", dtype=str)

# Empty output frame: one row per input row (shared index), one column per template header
df_rct_output = pd.DataFrame(index=df_input_rct.index, columns=headers)

# Values that are identical for every clinical-trial case row
rct_constants = {
    "*type": "case",
    "project_id": "ARDaC-AlcHepNet",
    "*studies.submitter_id": "clinical",
    "index_date": "Study Enrollment",
}
for rct_field, rct_constant in rct_constants.items():
    # .loc broadcasts the scalar down the whole column
    df_rct_output.loc[:, rct_field] = rct_constant

# ALIVE flag -> vital_status; any value other than "Y"/"N" (including missing) becomes None
rct_vital_status = {"Y": "alive", "N": "dead"}

# Output column -> (input column, per-value transform); missing inputs map to None
rct_field_sources = {
    # Subject id with a "_clinical" suffix
    "*submitter_id": ("usubjid", lambda raw: None if pd.isna(raw) else f"{raw}_clinical"),
    # The remaining text fields are copied with surrounding whitespace removed
    "actarm": ("rct_arm", lambda raw: None if pd.isna(raw) else raw.strip()),
    "rct_meld_strata": ("rct_meld_strata", lambda raw: None if pd.isna(raw) else raw.strip()),
    "study_site": ("site", lambda raw: None if pd.isna(raw) else raw.strip()),
    "vital_status": ("ALIVE", lambda raw: rct_vital_status.get(raw)),
}
for rct_field, (rct_source, rct_transform) in rct_field_sources.items():
    df_rct_output[rct_field] = df_input_rct[rct_source].apply(rct_transform)

# Every column not populated above is explicitly blanked with None.
# These columns were created empty by the DataFrame constructor, so this
# only swaps their missing-value marker.
rct_populated = set(rct_constants) | set(rct_field_sources)
for col in df_rct_output.columns:
    if col in rct_populated:
        continue
    df_rct_output[col] = None

# Show the first rows for a quick visual check
print("Output RCT DataFrame preview:")
print(df_rct_output.head())

In [53]:
# Export the "CASE" node.

# The two cohorts are written to separate tab-separated files
# (relative to the current working directory).
obs_output_path = "case_obs_DCC_data_release_v2-0-0.tsv"
rct_output_path = "case_rct_DCC_data_release_v2-0-0.tsv"

# Shared writer settings: tab delimiter, header row kept, index column omitted
case_tsv_options = dict(sep="\t", index=False, header=True)

# Clinical-trial cases first, then observational cases
df_rct_output.to_csv(rct_output_path, **case_tsv_options)
print(f"RCT patients file saved as: {rct_output_path}")

df_obs_output.to_csv(obs_output_path, **case_tsv_options)
print(f"Observational patients file saved as: {obs_output_path}")

RCT patients file saved as: case_rct_DCC_data_release_v2-0-0.tsv
Observational patients file saved as: case_obs_DCC_data_release_v2-0-0.tsv
