In [3]:
#imports
from rbclib import RBCPath
from pathlib import Path
from typing import Iterable, List, Optional
import pandas as pd
import numpy as np
import fsspec

# filepaths
rbcdata_path = Path('/home/jovyan/shared/data/RBC')
train_filepath = rbcdata_path / 'train_participants.tsv'
test_filepath = rbcdata_path / 'test_participants.tsv'

pnc_freesurfer_path = RBCPath('rbc://PNC_FreeSurfer/freesurfer')


In [4]:
# config - dict of rows to get from each file

train_vars = {
    'participant_id',
    'study', 
    'study_site', 
    'session_id', 
    'age', 
    'sex', 
    'race', 
    'ethnicity', 
    'parent_1_education', 
    'parent_2_education', 
    'p_factor'
}

brain_measures_vars = {
    'subject_id',
    'session_id', 
    'Left_Thalamus_Proper_Volume_mm3', 
    'Left_Caudate_Volume_mm3', 
    'Left_Putamen_Volume_mm3', 
    'Left_Pallidum_Volume_mm3', 
    'Left_Hippocampus_Volume_mm3', 
    'Left_Amygdala_Volume_mm3', 
    'Left_Accumbens_area_Volume_mm3', 
    'Right_Thalamus_Proper_Volume_mm3', 
    'Right_Caudate_Volume_mm3', 
    'Right_Putamen_Volume_mm3', 
    'Right_Pallidum_Volume_mm3', 
    'Right_Hippocampus_Volume_mm3', 
    'Right_Amygdala_Volume_mm3', 
    'Right_Accumbens_area_Volume_mm3', 
    'SubCortGray_SubCortGrayVol', 
    'EstimatedTotalIntraCranialVol_eTIV'
}

region_surface_vars = {
    'subject_id', 
    'session_id', 
    'atlas', 
    'hemisphere', 
    'StructName', 
    'SurfArea', 
    'GrayVol', 
    'ThickAvg', 
    'MeanCurv',
    'GausCurv', 
    'FoldInd', 
    'CurvInd', 
    'Mean_wgpct', 
    'Mean_piallgi'
}

ATLAS_PREFIXES = ['Schaefer2018_100Parcels']
STRUCTNAME_REGEX = 'limbic'
FILTER_LOGIC = 'and'

In [5]:
# BUILD TRAIN/TEST DATAFRAMES
# Load the PNC participants TSV files...
with train_filepath.open('r') as f:
    train_data = pd.read_csv(f, sep='\t')
with test_filepath.open('r') as f:
    test_data = pd.read_csv(f, sep='\t')

# change participant_id col name to subject_id for standardization
train_data = train_data.rename(columns={'participant_id':'subject_id'})

# discover subject IDs
train_ids = train_data['subject_id']

In [None]:
# BUILD BRAIN MEASURES DATAFRAME
brain_measures_frames = []

for sub_id in train_ids:
    # directory for this subject
    sub_path = RBCPath(f"rbc://PNC_FreeSurfer/freesurfer/sub-{sub_id}")
    try:
        if not sub_path.is_dir():
                print(f"Skipping {sub_id}: directory not found")
                continue
    except FileNotFoundError:
        print(f"Skipping {sub_id}: directory not found")
        continue

    brain_measures_fp = sub_path / f"sub-{sub_id}_brainmeasures.tsv"
    try:
        if not brain_measures_fp.is_file():
                print(f"Skipping {sub_id}: file not found")
                continue
    except FileNotFoundError:
        print(f"Skipping {sub_id}: file not found")
        continue

    # read + keep only desired columns
    df = pd.read_csv(brain_measures_fp, sep="\t")
    keep = [c for c in brain_measures_vars if c in df.columns]
    df = df[keep]

    brain_measures_frames.append(df)

brain_measures_df = pd.concat(brain_measures_frames, ignore_index=True)
# save df to parquet file
region_surfaces_df.to_parquet('data/region_surfaces_100parcels.parquet')

print(brain_measures_df.shape)
brain_measures_df.head()

Skipping 1342487188: directory not found
Skipping 1649551035: directory not found
Skipping 2003542642: directory not found
Skipping 219325366: directory not found
Skipping 2249226316: directory not found


In [None]:
# save df to parquet file
region_surfaces_df.to_parquet('data/region_surfaces_100parcels.parquet')

In [10]:
# BUILD REGION SURFACES DATAFRAME

region_surfaces_frames = []

for sub_id in train_ids:
    try:
        sub_path = RBCPath(f"rbc://PNC_FreeSurfer/freesurfer/sub-{sub_id}")
        try:
            if not sub_path.is_dir():
                print(f"Skipping {sub_id}: directory not found")
                continue
        except FileNotFoundError:
            print(f"Skipping {sub_id}: directory not found")
            continue

        region_surfaces_fp = sub_path / f"sub-{sub_id}_regionsurfacestats.tsv"
        try:
            if not region_surfaces_fp.is_file():
                print(f"Skipping {sub_id}: file not found")
                continue
        except FileNotFoundError:
            print(f"Skipping {sub_id}: file not found")
            continue

        # read + select columns
        try:
            df = pd.read_csv(region_surfaces_fp, sep="\t")
        except FileNotFoundError:
            print(f"Skipping {sub_id}: file disappeared while reading")
            continue

        keep = [c for c in region_surface_vars if c in df.columns]
        df = df[keep]
        region_surfaces_frames.append(df)

    except OSError as e:
        # catches other filesystem errors from rbclib/pathlib
        print(f"Skipping {sub_id}: OS error: {e}")
        continue

# safe concat (in case none found)
if region_surfaces_frames:
    region_surfaces_df = pd.concat(region_surfaces_frames, ignore_index=True)
else:
    region_surfaces_df = pd.DataFrame(columns=["subject_id", *region_surface_vars])

# save df to parquet file
region_surfaces_df.to_parquet('data/region_surfaces_100parcels.parquet')

print(region_surfaces_df.shape)
region_surfaces_df.head()

Skipping 1342487188: directory not found
Skipping 1649551035: directory not found
Skipping 2003542642: directory not found
Skipping 219325366: directory not found
Skipping 4184549693: directory not found
Skipping 495793681: directory not found
(14561756, 14)


Unnamed: 0,session_id,FoldInd,GausCurv,subject_id,SurfArea,Mean_wgpct,CurvInd,hemisphere,StructName,MeanCurv,GrayVol,atlas,Mean_piallgi,ThickAvg
0,,18,0.027,sub-1000393599,1121,25.8475,1.6,lh,caudalanteriorcingulate,0.122,3493,aparc.DKTatlas,1.9877,2.87
1,,28,0.02,sub-1000393599,2236,23.4921,2.7,lh,caudalmiddlefrontal,0.109,7030,aparc.DKTatlas,3.3898,2.882
2,,49,0.032,sub-1000393599,2619,15.9672,5.2,lh,cuneus,0.125,5753,aparc.DKTatlas,3.2453,2.019
3,,8,0.036,sub-1000393599,549,20.8874,1.0,lh,entorhinal,0.144,2714,aparc.DKTatlas,2.671,3.655
4,,57,0.028,sub-1000393599,2822,20.8271,4.5,lh,fusiform,0.13,8180,aparc.DKTatlas,2.8272,2.738


In [1]:
#BACKUP FOR KERNEL CRASH

import pandas as pd

# Load the file directly into a DataFrame
region_surfaces_backup = pd.read_parquet('data/region_surfaces_100parcels.parquet')

# Optional: keep a safe copy
region_surfaces_backup = region_surfaces_backup.copy()

# check it loaded correctly
print(region_surfaces_backup.shape)
region_surfaces_backup.head()

(14561756, 14)


Unnamed: 0,session_id,FoldInd,GausCurv,subject_id,SurfArea,Mean_wgpct,CurvInd,hemisphere,StructName,MeanCurv,GrayVol,atlas,Mean_piallgi,ThickAvg
0,,18,0.027,sub-1000393599,1121,25.8475,1.6,lh,caudalanteriorcingulate,0.122,3493,aparc.DKTatlas,1.9877,2.87
1,,28,0.02,sub-1000393599,2236,23.4921,2.7,lh,caudalmiddlefrontal,0.109,7030,aparc.DKTatlas,3.3898,2.882
2,,49,0.032,sub-1000393599,2619,15.9672,5.2,lh,cuneus,0.125,5753,aparc.DKTatlas,3.2453,2.019
3,,8,0.036,sub-1000393599,549,20.8874,1.0,lh,entorhinal,0.144,2714,aparc.DKTatlas,2.671,3.655
4,,57,0.028,sub-1000393599,2822,20.8271,4.5,lh,fusiform,0.13,8180,aparc.DKTatlas,2.8272,2.738


In [2]:
# FLATTEN REGION SURFACES
wide = (
    region_surfaces_backup #region_surfaces_df normally
      .melt(id_vars=["subject_id", "atlas", "StructName"],
            value_vars=region_surface_vars,
            var_name="metric", value_name="value")
      .assign(col=lambda d: d["atlas"] + "_" + d["StructName"] + "_" + d["metric"])
      .pivot_table(index="subject_id", columns="col", values="value", aggfunc="first")
)

print(wide.shape)
wide.head()

NameError: name 'region_surface_vars' is not defined

In [None]:
# COMBINE ALL 3 DATAFRAMES - TBD
# on 'subject_id'

dfs = [df for df in [train_df, region_surfaces_df_wide, brain_measures_df] if df is not None and not df.empty]
if not dfs:
    combined_df = pd.DataFrame(columns=['subject_id'])
else:
    combined_df = dfs[0]
    for df in dfs[1:]:
        combined_df = pd.merge(combined_df, df, on='subject_id', how='outer', validate='one_to_one')
print(combined_df.shape)
combined_df.head()

In [4]:
# DATAFRAME BUILDER - TBD
# single function for multiple dataframes

def build_df():
        # file paths - loops through subject_id array and append to single df
        for subject in train_ids:
            # set root path
            # file name format example = 'rbc://PNC_FreeSurfer//home/jovyan/shared/data/RBC/PNC_FreeSurfer/freesurfer/sub-1000393599/sub-1000393599_regionsurfacestats.tsv'
            sub_path = RBCPath(f'rbc://PNC_FreeSurfer/freesurfer/sub-{train_ids[subject]}')
            assert sub_path.is_dir()
            # set path end
            brain_measures_fp = sub_path / f'sub-{train_ids[subject]}_brainmeasures.tsv'
            assert brain_measures_fp.is_dir()
            
        for file in brain_measures_fp.iterdir():
            region_surfaces_df.append() #add only cols in region_surface_vars to filter cols
        
        region_surfaces_df = build_df(BASE_PATH, subject_ids, pattern='regionsurfacestats', columns=region_surface_vars)
        print(region_surfaces_df.shape)
        region_surfaces_df.head()