In [2]:
import os
import pandas as pd
import cv2
import numpy as np


In [3]:
# --- CONFIGURATION ---
# Every path below is derived from a single data root so the notebook can be
# pointed at another machine/drive without editing five separate literals.
# Set the FYP_DATA_ROOT environment variable to override the default location;
# when it is unset the paths are the same as the original hard-coded ones.
DATA_ROOT = os.environ.get("FYP_DATA_ROOT", r"F:\FYP_Preparation\FYP_Data")

# Folder that holds the previously generated CSVs and receives the master CSV.
FINAL_DATASET_DIR = os.path.join(DATA_ROOT, "FYP_Final_Dataset")

# 1. Path to the Kaggle folder containing patient subfolders
kaggle_folder_path = os.path.join(DATA_ROOT, "LGG", "lgg-mri-segmentation", "kaggle_3m")

# 2. Path to your previously generated Valid Patients List
valid_ids_path = os.path.join(FINAL_DATASET_DIR, "valid_patient_ids.csv")

# 3. Path to your Clean Clinical and Genomics Data
clinical_path = os.path.join(FINAL_DATASET_DIR, "clean_clinical_data.csv")
genomics_path = os.path.join(FINAL_DATASET_DIR, "clean_genomics_data.csv")

# 4. Output Path for the MASTER CSV
output_path = os.path.join(FINAL_DATASET_DIR, "final_model_data.csv")

In [4]:
print(">>> Starting Key-Slice Extraction...")

# 1. Load the Patient Lists
# Reads the list of target patient IDs plus the clinical and genomics tables
# that are later joined to each patient's best MRI slice.
try:
    valid_patients = pd.read_csv(valid_ids_path)['PATIENT_ID'].tolist()
    clinical_df = pd.read_csv(clinical_path)
    genomics_df = pd.read_csv(genomics_path)
except Exception as e:
    print(f"CRITICAL ERROR: Could not load input CSVs. Check paths.\n{e}")
    # Re-raise instead of calling exit(): in a notebook exit() stops the kernel
    # (losing all state and the traceback), whereas an exception simply halts
    # execution at this cell and shows what went wrong.
    raise

# The extraction loop filters both tables on PATIENT_ID, so fail here with a
# clear message rather than with a KeyError in the middle of the loop.
for table_name, table in (("clinical", clinical_df), ("genomics", genomics_df)):
    if 'PATIENT_ID' not in table.columns:
        raise KeyError(f"The {table_name} CSV has no 'PATIENT_ID' column.")

print(f"Loaded {len(valid_patients)} target patients.")

# Accumulates one record (dict) per model-ready patient.
dataset_rows = []

>>> Starting Key-Slice Extraction...
Loaded 110 target patients.


In [5]:
# 2. Loop through every patient to find their BEST image
#
# For each valid patient this cell locates the patient's folder, picks the MRI
# slice whose mask has the most non-zero pixels, and appends one combined
# imaging + clinical + genomics record to `dataset_rows`.

_MASK_SUFFIX = "_mask.tif"


def _find_patient_folder(patient_id, root, folder_names):
    """Return the path of the folder under `root` that belongs to `patient_id`.

    Patient IDs use dashes ("TCGA-CS-4941") while folder names use underscores
    and usually carry a date suffix ("TCGA_CS_4941_19960909"). The converted ID
    must be the whole folder name or be followed by "_", so an ID that happens
    to be a prefix of another ID cannot match the wrong folder.

    Returns None when no name in `folder_names` matches.
    """
    folder_name_prefix = patient_id.replace("-", "_")
    for name in folder_names:
        if name == folder_name_prefix or name.startswith(folder_name_prefix + "_"):
            return os.path.join(root, name)
    return None


def _find_largest_tumor_slice(patient_folder):
    """Return `(image_path, tumor_area)` for the slice with the largest mask.

    `tumor_area` is the number of non-zero pixels in the mask. Masks that
    cannot be read, or that have no matching image file in the same folder,
    are ignored with a warning. Returns `(None, 0)` when no mask contains any
    non-zero pixel.
    """
    best_image_path = None
    max_tumor_area = 0

    # Sorted so that ties are always resolved the same way (first name wins),
    # independent of the order os.listdir happens to return.
    filenames = sorted(os.listdir(patient_folder))
    present = set(filenames)

    for filename in filenames:
        if not filename.endswith(_MASK_SUFFIX):
            continue

        # The image has the same name but without '_mask'
        image_filename = filename[:-len(_MASK_SUFFIX)] + ".tif"
        if image_filename not in present:
            print(f"Warning: No image file found for mask {filename}; slice ignored")
            continue

        # Read mask image (Grayscale); cv2.imread returns None on failure.
        mask = cv2.imread(os.path.join(patient_folder, filename), cv2.IMREAD_GRAYSCALE)
        if mask is None:
            print(f"Warning: Could not read mask {filename}; slice ignored")
            continue

        # Calculate Area (count of non-zero pixels), stored as a plain int.
        tumor_area = int(np.sum(mask > 0))
        if tumor_area > max_tumor_area:
            max_tumor_area = tumor_area
            best_image_path = os.path.join(patient_folder, image_filename)

    return best_image_path, max_tumor_area


# Start from an empty list so that re-running this cell does not append a
# second copy of every patient to the rows collected by a previous run.
dataset_rows = []

# The record below falls back to a default when a column is absent. Report
# absent columns once, up front, so a renamed column is not silently replaced
# by the same default value for every patient.
for table_name, table, expected_columns in (
    ("clinical", clinical_df, ('AGE', 'GENDER', 'HISTOLOGICAL_GRADE', 'OS_MONTHS', 'OS_STATUS')),
    ("genomics", genomics_df, ('IDH1', 'TP53', 'ATRX', 'CIC', 'PTEN')),
):
    missing_columns = [col for col in expected_columns if col not in table.columns]
    if missing_columns:
        print(f"Warning: {table_name} data has no column(s) {missing_columns}; "
              "the default value will be used for every patient.")

# List the patient folders once instead of once per patient.
patient_folder_names = sorted(
    name for name in os.listdir(kaggle_folder_path)
    if os.path.isdir(os.path.join(kaggle_folder_path, name))
)

for patient_id in valid_patients:
    patient_folder = _find_patient_folder(patient_id, kaggle_folder_path, patient_folder_names)
    if not patient_folder:
        print(f"Warning: Folder not found for {patient_id}")
        continue

    # 3. Find the Slice with the Largest Tumor Area
    best_image_path, max_tumor_area = _find_largest_tumor_slice(patient_folder)

    # 4. Prepare the Data Row
    # Only keep patients who actually have a visible tumor
    if not best_image_path or max_tumor_area <= 0:
        print(f"  -> Skipping {patient_id} (No visible tumor found in masks)")
        continue

    # Fetch Clinical and Genomics Data
    clin_row = clinical_df[clinical_df['PATIENT_ID'] == patient_id]
    gen_row = genomics_df[genomics_df['PATIENT_ID'] == patient_id]
    if clin_row.empty or gen_row.empty:
        print(f"  -> Skipping {patient_id} (No clinical or genomics record found)")
        continue

    # When an ID matches several rows, the first one is used.
    clin = clin_row.iloc[0]
    gen = gen_row.iloc[0]

    # Create a combined record
    dataset_rows.append({
        'PATIENT_ID': patient_id,
        'IMAGE_PATH': best_image_path,
        'TUMOR_AREA': max_tumor_area,
        # Clinical Features
        'AGE': clin.get('AGE', 0),
        'GENDER': clin.get('GENDER', 'Unknown'),
        'GRADE': clin.get('HISTOLOGICAL_GRADE', 'G2'),  # Default to G2 if missing
        'SURVIVAL_MONTHS': clin.get('OS_MONTHS', 0),
        'OS_STATUS': clin.get('OS_STATUS', 'LIVING'),
        # Genomics Features (Top 5 for MVP)
        'IDH1': gen.get('IDH1', 0),
        'TP53': gen.get('TP53', 0),
        'ATRX': gen.get('ATRX', 0),
        'CIC': gen.get('CIC', 0),
        'PTEN': gen.get('PTEN', 0),
    })


In [6]:
# 5. Save the Master CSV
# Builds the final table from the collected records and writes it to
# `output_path`, then prints a short summary.

# An empty result would otherwise be written out and reported as a success.
if not dataset_rows:
    raise ValueError(
        "No model-ready patients were collected; the master CSV was not written. "
        "Check the warnings printed by the extraction loop."
    )

final_df = pd.DataFrame(dataset_rows)

# The dataset holds one key slice per patient. Repeated IDs (for example from
# running the extraction cell more than once) are dropped, keeping the first.
duplicate_count = int(final_df.duplicated(subset='PATIENT_ID').sum())
if duplicate_count:
    print(f"Warning: Dropping {duplicate_count} duplicate patient row(s).")
    final_df = final_df.drop_duplicates(subset='PATIENT_ID').reset_index(drop=True)

# Create the destination folder if it does not exist yet.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

final_df.to_csv(output_path, index=False)

print("\n" + "="*40)
print("SUCCESS: Master Dataset Created!")
print(f"Total Model-Ready Patients: {len(final_df)}")
print(f"Saved to: {output_path}")
print("="*40)
print(final_df.head(2))


SUCCESS: Master Dataset Created!
Total Model-Ready Patients: 110
Saved to: F:\FYP_Preparation\FYP_Data\FYP_Final_Dataset\final_model_data.csv
     PATIENT_ID                                         IMAGE_PATH  \
0  TCGA-CS-4941  F:\FYP_Preparation\FYP_Data\LGG\lgg-mri-segmen...   
1  TCGA-CS-4942  F:\FYP_Preparation\FYP_Data\LGG\lgg-mri-segmen...   

   TUMOR_AREA  AGE   GENDER GRADE  SURVIVAL_MONTHS   OS_STATUS       IDH1  \
0        2877   67  Unknown    G2             7.69  1:DECEASED  1961.0076   
1        1539   44  Unknown    G2            43.86  1:DECEASED  1523.5294   

        TP53  ATRX        CIC  PTEN  
0  1310.5590     0  3306.0732     0  
1  1290.3114     0  1328.0277     0  
