In [60]:
import pandas as pd

In [61]:
# Template TSV whose header row defines the columns of the output node
template_file_path = "node_template/submission_audit_template.tsv"

# nrows=0 parses just the header line; no data rows are loaded
df_template = pd.read_csv(template_file_path, sep="\t", nrows=0)
headers = list(df_template.columns)  # column names as a plain Python list

In [62]:
# Observational patients: input locations
# file_path_obs -> audit dataset (comma-separated)
# case_path_obs -> case table (tab-separated)
file_path_obs = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/OBS Final Datasets/OBS_AUDIT.csv"
case_path_obs = "/Users/jinn/Documents/IU/ARDaC/case_obs_DCC_data_release_v2-0-0.tsv"

# dtype=str turns off numeric type inference, so every column is loaded as text
df_obs_input = pd.read_csv(file_path_obs, dtype=str)  # comma is the default separator
df_obs_case = pd.read_csv(case_path_obs, dtype=str, sep="\t")

In [63]:
# Build the "audit" node for Observational patients.
# Every case with a record in df_obs_input becomes one row of df_obs_output;
# cases without a record are collected in df_unmatched_obs for QC.

# Columns copied as-is from the audit dataset: "auditnd", "adt0101" ... "adt0110"
audit_columns = ["auditnd"] + [f"adt01{i:02d}" for i in range(1, 11)]

# Step 1: Extract "*submitter_id" from df_obs_case and create case_table
case_table = pd.DataFrame()
case_table["*submitter_id"] = df_obs_case["*submitter_id"]
# usubjid is the text before the first "_"; a missing "*submitter_id" stays
# missing here (and is reported as unmatched) instead of raising.
case_table["usubjid"] = case_table["*submitter_id"].str.split("_", n=1).str[0]

# Step 2: Index the first audit record of every usubjid for direct lookup.
# Rows without a usubjid can never match a case, so they are excluded.
first_audit_obs = (
    df_obs_input
    .dropna(subset=["usubjid"])
    .drop_duplicates(subset="usubjid", keep="first")
    .set_index("usubjid")
)
has_audit = case_table["usubjid"].isin(first_audit_obs.index)
matched_cases = case_table[has_audit]

# Step 3: Populate df_obs_output for the matched cases only.
# The row labels of case_table are kept, so rows can be traced back to df_obs_case.
df_obs_output = pd.DataFrame(index=matched_cases.index, columns=headers)
df_obs_output["*type"] = "audit"
df_obs_output["project_id"] = "ARDaC-AlcHepNet"
df_obs_output["*submitter_id"] = matched_cases["*submitter_id"] + "_audit"
df_obs_output["cases.submitter_id"] = matched_cases["*submitter_id"]

# One audit row per matched case, in case order (a usubjid may repeat across cases)
audit_values = first_audit_obs.reindex(matched_cases["usubjid"])
for column in audit_columns:
    if column in audit_values.columns:
        # to_numpy() assigns by position; audit_values is indexed by usubjid, not by case row
        df_obs_output[column] = audit_values[column].to_numpy()
    else:
        # Column absent from the audit dataset: leave the field empty
        df_obs_output[column] = None

# Step 4: QC table of cases that have no audit record.
# The three columns exist even when every case matched, so the QC file always has a header.
df_unmatched_obs = (
    case_table.loc[~has_audit, ["usubjid", "*submitter_id"]]
    .assign(missing_audit="Y")
    .reset_index(drop=True)
)
# Same information as a list of dicts, one per unmatched case
unmatched_records_obs = df_unmatched_obs.to_dict("records")

In [None]:
# Display the Observational audit node for a visual check before exporting
df_obs_output

In [65]:
# Write the Observational "audit" node as a TSV file (row index omitted).
obs_output_path = "audit_obs_DCC_data_release_v2-0-0.tsv"
df_obs_output.to_csv(obs_output_path, index=False, sep="\t")
print(f"Observational patients file saved as: {obs_output_path}")

# Step 4: Write the QC table (cases with no audit record) to its own TSV file.
output_path_unmatched_obs = "audit_qc_obs_DCC_data_release_v2-0-0.tsv"
df_unmatched_obs.to_csv(output_path_unmatched_obs, index=False, sep="\t")
print(f"Observational QC file saved as: {output_path_unmatched_obs}")

Observational patients file saved as: audit_obs_DCC_data_release_v2-0-0.tsv
Observational QC file saved as: audit_qc_obs_DCC_data_release_v2-0-0.tsv


In [66]:
# Clinical trial (RCT) patients: input locations
# file_path_rct -> audit dataset (comma-separated)
# case_path_rct -> case table (tab-separated)
file_path_rct = "/Users/jinn/Documents/IU/ARDaC/DCC_data_release_v2.0.0/raw_data/Data for Nanxin/RCT Final Datasets/RCT_AUDIT.csv"
case_path_rct = "/Users/jinn/Documents/IU/ARDaC/case_rct_DCC_data_release_v2-0-0.tsv"

# dtype=str turns off numeric type inference, so every column is loaded as text
df_rct_input = pd.read_csv(file_path_rct, dtype=str)  # comma is the default separator
df_rct_case = pd.read_csv(case_path_rct, dtype=str, sep="\t")

In [67]:
# Build the "audit" node for clinical trial (RCT) patients.
# Every case with a record in df_rct_input becomes one row of df_rct_output;
# cases without a record are collected in df_unmatched_rct for QC.

# Columns copied as-is from the audit dataset: "auditnd", "adt0101" ... "adt0110"
audit_columns = ["auditnd"] + [f"adt01{i:02d}" for i in range(1, 11)]

# Step 1: Extract "*submitter_id" from df_rct_case and create case_table
case_table = pd.DataFrame()
case_table["*submitter_id"] = df_rct_case["*submitter_id"]
# usubjid is the text before the first "_"; a missing "*submitter_id" stays
# missing here (and is reported as unmatched) instead of raising.
case_table["usubjid"] = case_table["*submitter_id"].str.split("_", n=1).str[0]

# Step 2: Index the first audit record of every usubjid for direct lookup.
# Rows without a usubjid can never match a case, so they are excluded.
first_audit_rct = (
    df_rct_input
    .dropna(subset=["usubjid"])
    .drop_duplicates(subset="usubjid", keep="first")
    .set_index("usubjid")
)
has_audit = case_table["usubjid"].isin(first_audit_rct.index)
matched_cases = case_table[has_audit]

# Step 3: Populate df_rct_output for the matched cases only.
# The row labels of case_table are kept, so rows can be traced back to df_rct_case.
df_rct_output = pd.DataFrame(index=matched_cases.index, columns=headers)
df_rct_output["*type"] = "audit"
df_rct_output["project_id"] = "ARDaC-AlcHepNet"
df_rct_output["*submitter_id"] = matched_cases["*submitter_id"] + "_audit"
df_rct_output["cases.submitter_id"] = matched_cases["*submitter_id"]

# One audit row per matched case, in case order (a usubjid may repeat across cases)
audit_values = first_audit_rct.reindex(matched_cases["usubjid"])
for column in audit_columns:
    if column in audit_values.columns:
        # to_numpy() assigns by position; audit_values is indexed by usubjid, not by case row
        df_rct_output[column] = audit_values[column].to_numpy()
    else:
        # Column absent from the audit dataset: leave the field empty
        df_rct_output[column] = None

# Step 4: QC table of cases that have no audit record.
# The three columns exist even when every case matched, so the QC file always has a header.
df_unmatched_rct = (
    case_table.loc[~has_audit, ["usubjid", "*submitter_id"]]
    .assign(missing_audit="Y")
    .reset_index(drop=True)
)
# Same information as a list of dicts, one per unmatched case
unmatched_records_rct = df_unmatched_rct.to_dict("records")

In [None]:
# Display the RCT audit node for a visual check before exporting
df_rct_output

In [69]:
# Write the Randomized Controlled Trial (RCT) "audit" node as a TSV file (row index omitted).
rct_output_path = "audit_rct_DCC_data_release_v2-0-0.tsv"
df_rct_output.to_csv(rct_output_path, index=False, sep="\t")
print(f"RCT patients file saved as: {rct_output_path}")

# Step 4: Write the QC table (cases with no audit record) to its own TSV file.
output_path_unmatched_rct = "audit_qc_rct_DCC_data_release_v2-0-0.tsv"
df_unmatched_rct.to_csv(output_path_unmatched_rct, index=False, sep="\t")
print(f"RCT QC file saved as: {output_path_unmatched_rct}")

RCT patients file saved as: audit_rct_DCC_data_release_v2-0-0.tsv
RCT QC file saved as: audit_qc_rct_DCC_data_release_v2-0-0.tsv
