In [2]:
import os
import pandas as pd
import numpy as np

def sample_brain_tumor_age():
    group = np.random.choice(
        ["child", "young_adult", "middle_aged", "elderly"],
        p=[0.1, 0.2, 0.5, 0.2]
    )
    
    if group == "child":
        return np.random.randint(0, 19)  # 0–18
    elif group == "young_adult":
        return np.random.randint(19, 41)  # 19–40
    elif group == "middle_aged":
        return np.random.randint(41, 71)  # 41–70
    else:  # elderly
        return np.random.randint(71, 86)  # 71–85

def sort_by_orderlist(your_list, order_list):
    # For each item, find the *first* index it appears in order_list
    order_indices = {item: idx for idx, item in enumerate(order_list) if item not in locals().get('order_indices', {})}

    # Sort based on the first appearance index
    return sorted(your_list, key=lambda x: order_indices.get(x, float('inf')))

data_folder_name = "images_registered_proc"        # Change This
data_path = os.path.join(os.getcwd(), data_folder_name)
datafiles = os.listdir(data_path)
data_formated = None

gender_bias = 0.5                           # Change This
train_val_test_split = [0.7, 0.2, 0.1]      # Change This

subjectID_vs_age = {}
sort_ids = [int(x[8:11]+x[17:20]) for x in datafiles]
datafiles = sort_by_orderlist(datafiles, sort_ids)

print("Data files in the dataset folder:")
for datafile in datafiles:
    current_file_path = os.path.join(data_path, datafile)
    img_file_path = os.path.join(current_file_path, "normalized.nii.gz")          # Change This 
    segm_file_path = os.path.join(current_file_path, "segm.nii.gz")   # Change This 
    latent_file_path = os.path.join(current_file_path, "normalized_latent.npz")     # Change This
    
    if os.path.isfile(img_file_path) and os.path.isfile(segm_file_path):# and os.path.isfile(latent_file_path):
        subject_id = datafile.split("_")[0].split("-")[1]
        image_uid = current_file_path.split("/")[-1]
        split = np.random.choice(["train", "valid", "test"], p=train_val_test_split)
        sex = np.random.choice([0, 1], 1, p = (gender_bias, 1-gender_bias))[0]
        
        if subject_id not in subjectID_vs_age:
            age = sample_brain_tumor_age()/100
            subjectID_vs_age[subject_id] = age
        else:
            age = subjectID_vs_age[subject_id]
            extra = int(datafile.split("k")[1][1:4])
            age = age + (extra / 100)
        diagnosis = 0
        last_diagnosis = 0
        image_path = img_file_path
        segm_path = segm_file_path
        latent_path = latent_file_path
        new_row = {
            "subject_id": subject_id,
            "image_uid": image_uid,
            "split": split,
            "sex": sex,
            "age": age,
            "diagnosis": diagnosis,
            "last_diagnosis": last_diagnosis,
            "image_path": image_path,
            "segm_path": segm_path,
            "latent_path": latent_path
        }
        # print(list(new_row.values()))
        if data_formated is None:
            data_formated = pd.DataFrame([new_row])
        else:
            data_formated = pd.concat([data_formated, pd.DataFrame([new_row])], ignore_index=True)
        
print("Data files processed successfully.")
print(data_formated)

# Save the DataFrame to a CSV file
data_formated.to_csv("data_formated.csv", index=False)

Data files in the dataset folder:
Data files processed successfully.
    subject_id                 image_uid  split  sex   age  diagnosis  \
0          024  Patient-024_week-018_reg  valid    1  0.26          0   
1          061  Patient-061_week-054_reg  train    1  0.18          0   
2          043  Patient-043_week-018_reg   test    0  0.80          0   
3          073  Patient-073_week-073_reg   test    1  0.59          0   
4          064  Patient-064_week-033_reg  train    1  0.76          0   
..         ...                       ...    ...  ...   ...        ...   
556        085  Patient-085_week-001_reg   test    1  0.80          0   
557        072  Patient-072_week-064_reg  valid    0  1.21          0   
558        052  Patient-052_week-089_reg  train    1  1.23          0   
559        022  Patient-022_week-053_reg   test    1  1.07          0   
560        030  Patient-030_week-067_reg  train    1  1.29          0   

     last_diagnosis                                   