In [1]:
import pandas as pd
from pathlib import Path
import sys

In [4]:


def collapse_to_patient(input_path, output_path, modality_name):
    """
    Groups file-level data into patient-level data.
    """
    if not Path(input_path).exists():
        print(f"Error: {input_path} not found!")
        return None

    df = pd.read_csv(input_path)
    
    # Grouping by patient ID (case_id)
    df_patient = df.groupby("case_id").agg(
        **{f"has_{modality_name}": ("local_verified", "max")},
        **{f"{modality_name}_file_count": ("file_id", "count")}
    ).reset_index()

    df_patient.to_csv(output_path, index=False)
    print(f"✅ Success: Collapsed {modality_name} data.")
    print(f"   Saved to: {output_path}")
    return df_patient

In [5]:
collapse_to_patient(
    input_path="../processed/genomics_mutations_file_to_case.csv",
    output_path="../processed/patient_mutations_status.csv",
    modality_name="mutations"
)

✅ Success: Collapsed mutations data.
   Saved to: ../processed/patient_mutations_status.csv


Unnamed: 0,case_id,has_mutations,mutations_file_count
0,"TCGA-05-4244, TCGA-05-4244",1,1
1,"TCGA-05-4249, TCGA-05-4249",1,1
2,"TCGA-05-4250, TCGA-05-4250",1,1
3,"TCGA-05-4382, TCGA-05-4382",1,1
4,"TCGA-05-4384, TCGA-05-4384",1,1
...,...,...,...
554,"TCGA-NJ-A55O, TCGA-NJ-A55O",1,1
555,"TCGA-NJ-A55R, TCGA-NJ-A55R",1,1
556,"TCGA-NJ-A7XG, TCGA-NJ-A7XG",1,1
557,"TCGA-O1-A52J, TCGA-O1-A52J",1,1


In [6]:
collapse_to_patient(
    input_path="../processed/genomics_rnaseq_file_to_case.csv",
    output_path="../processed/patient_rnaseq_status.csv",
    modality_name="rnaseq"
)

✅ Success: Collapsed rnaseq data.
   Saved to: ../processed/patient_rnaseq_status.csv


Unnamed: 0,case_id,has_rnaseq,rnaseq_file_count
0,TCGA-05-4244,1,1
1,TCGA-05-4245,1,1
2,TCGA-05-4249,1,1
3,TCGA-05-4250,1,1
4,TCGA-05-4382,1,1
...,...,...,...
513,TCGA-NJ-A55O,1,1
514,TCGA-NJ-A55R,1,1
515,TCGA-NJ-A7XG,1,1
516,TCGA-O1-A52J,1,1


In [7]:
collapse_to_patient(
    input_path="../processed/clinical_file_to_case.csv",
    output_path="../processed/patient_clinical_status.csv",
    modality_name="clinical"
)

✅ Success: Collapsed clinical data.
   Saved to: ../processed/patient_clinical_status.csv


Unnamed: 0,case_id,has_clinical,clinical_file_count
0,TCGA-05-4244,1,1
1,TCGA-05-4245,1,2
2,TCGA-05-4249,1,1
3,TCGA-05-4250,1,1
4,TCGA-05-4382,1,2
...,...,...,...
518,TCGA-NJ-A55O,1,1
519,TCGA-NJ-A55R,1,1
520,TCGA-NJ-A7XG,1,1
521,TCGA-O1-A52J,1,1


In [10]:

# 1. Define the cleaning function
def clean_tcga_id(id_val):
    if pd.isna(id_val):
        return None
    # Split by comma (handles "ID, ID"), take the first one, then take only the first 12 characters
    first_id = str(id_val).split(',')[0].strip()
    return "-".join(first_id.split("-")[:3])

# 2. Clean the IDs in your existing dataframes
mut_df['case_id'] = mut_df['case_id'].apply(clean_tcga_id)
rna_df['case_id'] = rna_df['case_id'].apply(clean_tcga_id)
clin_df['case_id'] = clin_df['case_id'].apply(clean_tcga_id)

# 3. RE-AGGREGATE: This is the most important part. 
# Since IDs were cleaned, multiple rows might now have the same case_id.
mut_final = mut_df.groupby("case_id").max().reset_index()
rna_final = rna_df.groupby("case_id").max().reset_index()
clin_final = clin_df.groupby("case_id").max().reset_index()

# 4. Final Merge
master_df = clin_final.merge(mut_final, on="case_id", how="outer") \
                      .merge(rna_final, on="case_id", how="outer") \
                      .fillna(0)

# 5. Check the result
complete_cases = master_df[
    (master_df['has_mutations'] == 1) & 
    (master_df['has_rnaseq'] == 1) & 
    (master_df['has_clinical'] == 1)
]

print(f"Standardized Unique Patients: {len(master_df)}")
print(f"Patients with Complete Multimodal Data(mutations + RNASeq + CLinical) : {len(complete_cases)}")
master_df.head()

Standardized Unique Patients: 572
Patients with Complete Multimodal Data(mutations + RNASeq + CLinical) : 508


Unnamed: 0,case_id,has_clinical,clinical_file_count,has_mutations,mutations_file_count,has_rnaseq,rnaseq_file_count
0,TCGA-05-4244,1.0,1.0,1.0,1.0,1.0,1.0
1,TCGA-05-4245,1.0,2.0,0.0,0.0,1.0,1.0
2,TCGA-05-4249,1.0,1.0,1.0,1.0,1.0,1.0
3,TCGA-05-4250,1.0,1.0,1.0,1.0,1.0,1.0
4,TCGA-05-4382,1.0,2.0,1.0,1.0,1.0,1.0
