In [14]:
import os
import pandas as pd


In [16]:
# --- CONFIGURATION ---
# All inputs and outputs of this notebook sit under one shared data root;
# the three locations below are built from that common prefix.
_fyp_data_root = r"F:\FYP_Preparation\FYP_Data"

# Folder whose immediate children are the per-patient "TCGA_CS_..." MRI folders
kaggle_folder_path = _fyp_data_root + r"\LGG\lgg-mri-segmentation\kaggle_3m"

# Tab-separated clinical patient file
clinical_data_path = _fyp_data_root + r"\lgg_tcga\data_clinical_patient.txt"

# Directory that receives the CSV files written by this notebook
output_path = _fyp_data_root + r"\FYP_Final_Dataset"

In [17]:
# Step 1: Get all Patient IDs from Kaggle Images
# Builds `kaggle_patients`: a list of unique IDs in clinical-file format
# (e.g. "TCGA-CS-4941"), derived from the MRI folder names.
print(f"Scanning folder: {kaggle_folder_path}...")
kaggle_patients = []

# Verify the path is an existing directory first; a plain file would pass an
# `exists` check and then make os.listdir raise.
if not os.path.isdir(kaggle_folder_path):
    print("ERROR: The folder path does not exist. Check your spelling!")
else:
    seen_patient_ids = set()
    folder_count = 0

    # os.listdir returns entries in arbitrary order; sorting keeps the saved
    # ID list reproducible between runs.
    for folder_name in sorted(os.listdir(kaggle_folder_path)):
        # We only want directories whose name starts with TCGA
        if not folder_name.startswith("TCGA"):
            continue
        if not os.path.isdir(os.path.join(kaggle_folder_path, folder_name)):
            continue

        # Current Name: TCGA_CS_4941_19960909
        # Target Name:  TCGA-CS-4941
        parts = folder_name.split('_')
        if len(parts) < 3:
            # Too few fields to form an ID; skip instead of raising IndexError.
            print(f"WARNING: Skipping unexpected folder name: {folder_name}")
            continue

        folder_count += 1
        patient_id = "-".join(parts[:3])

        # Several folders may map to the same patient; keep each ID once so
        # the exported ID list has no duplicates.
        if patient_id not in seen_patient_ids:
            seen_patient_ids.add(patient_id)
            kaggle_patients.append(patient_id)

    print(f"Found {folder_count} MRI Patient Folders.")
    if folder_count != len(kaggle_patients):
        print(f"Note: these map to {len(kaggle_patients)} unique patient IDs.")

Scanning folder: F:\FYP_Preparation\FYP_Data\LGG\lgg-mri-segmentation\kaggle_3m...
Found 110 MRI Patient Folders.


In [22]:
# Step 2: Load Clinical Data & Match
# Keeps the clinical rows whose PATIENT_ID appears in `kaggle_patients`, then
# writes the matched rows and the full MRI patient-ID list to `output_path`.
if len(kaggle_patients) > 0:
    print("Loading Clinical Data...")
    # Lines beginning with '#' (the descriptive header lines in cBioPortal
    # exports) are ignored via comment='#'.
    # NOTE(review): pandas also stops parsing a data line at any later '#',
    # so confirm that no clinical field contains that character.
    clinical_df = pd.read_csv(clinical_data_path, sep='\t', comment='#')

    # Filter: Keep only patients that are in our Kaggle list
    matched_data = clinical_df[clinical_df['PATIENT_ID'].isin(kaggle_patients)]

    print(f"Successfully matched {len(matched_data)} patients!")

    # Surface MRI patients that have no clinical row instead of dropping them
    # silently.
    unmatched_ids = sorted(set(kaggle_patients) - set(matched_data['PATIENT_ID']))
    if unmatched_ids:
        print(f"WARNING: {len(unmatched_ids)} MRI patients have no clinical record: {unmatched_ids}")

    # Step 3: Save the Result
    # exist_ok=True replaces the separate existence check.
    os.makedirs(output_path, exist_ok=True)

    # Save Clinical Data
    matched_data.to_csv(os.path.join(output_path, "clean_clinical_data.csv"), index=False)

    # Save the list of valid IDs (We need this for Genomics later)
    pd.DataFrame(kaggle_patients, columns=['PATIENT_ID']).to_csv(
        os.path.join(output_path, "valid_patient_ids.csv"), index=False
    )

    print(f"Files saved to: {output_path}")
else:
    print("Still found 0 patients. Please verify the 'kaggle_folder_path' again.")

Loading Clinical Data...
Successfully matched 110 patients!
Files saved to: F:\FYP_Preparation\FYP_Data\FYP_Final_Dataset
