# Setup

In [1]:
import numpy as np
import pandas as pd

# Imaging Data

## Data Concatenation

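Concatenate the metrics CSV of every sample that ran to completion into a single file (uncomment to run; it can be quite expensive). Because each per-sample CSV carries its own header row, the combined file contains repeated header rows, which are removed below.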

In [2]:
# %%bash
#
# cat /work/cadotte_lab/bids_cmri/derivatives/sct_6.5/*/*/softseg_vertebrae_metrics_c2c7.csv > "full_metrics_softseg_c2c7.csv"

In [3]:
# Load the concatenated per-sample metrics file produced by the (optional) bash cell.
# NOTE(review): the file still contains repeated header rows at this point, so the
# columns are likely read as text rather than numbers - confirm before numeric use.
metrics_csv = "full_metrics_softseg_c2c7.csv"
img_df = pd.read_csv(metrics_csv)

Remove the repeated header rows introduced by concatenating the per-sample CSV files

In [4]:
def remove_dup_headers(df):
    """Return the rows of ``df`` that are not repeated header lines.

    A row is treated as a stray header when its "Timestamp" cell holds the
    literal text "Timestamp". The input frame is not modified and the
    surviving rows keep their original index labels.
    """
    is_data_row = df["Timestamp"] != "Timestamp"
    return df.loc[is_data_row]

In [5]:
# Strip the stray header rows left behind by the CSV concatenation
img_df = img_df.pipe(remove_dup_headers)

## Indexing by Image Type

In [6]:
def parse_filenames(df):
    """Build a row MultiIndex from the 'Filename' and 'VertLevel' columns of ``df``.

    For every row, the base name of the path (text after the last '/') is split
    on '_' and the following pieces are extracted:

    * GRP        - the first component (patient ID)
    * acq        - the text after the first '-' in the second component
    * weight     - the second-to-last component (image contrast)
    * run        - the integer after the '-' in the third-to-last component when
                   that component contains "run", otherwise None
    * vert_level - the row's 'VertLevel' value, taken as-is

    Returns a ``pd.MultiIndex`` with levels named
    ['GRP', 'acq', 'weight', 'run', 'vert_level'], one entry per row of ``df``.
    """
    index_rows = []
    for path, vert_level in zip(df['Filename'], df['VertLevel']):
        # Keep only the file's base name, then break it into its '_'-separated parts
        parts = path.split('/')[-1].split('_')

        # The run number is optional; when present it sits third from the end
        run_part = parts[-3]
        if "run" in run_part:
            run = int(run_part.split('-')[1])
        else:
            run = None

        patient_id = parts[0]
        acq = parts[1].split('-')[1]
        contrast = parts[-2]
        index_rows.append((patient_id, acq, contrast, run, vert_level))

    return pd.MultiIndex.from_tuples(
        index_rows,
        names=['GRP', 'acq', 'weight', 'run', 'vert_level']
    )

In [7]:
# Derive the (GRP, acq, weight, run, vert_level) index from each row's file path
img_df_idx = parse_filenames(img_df)
# Attach it as the row index; this modifies img_df in place
img_df.index = img_df_idx
img_df.shape

(6971, 23)

## Redundant Feature Purge

Drop a number of metadata/irrelevant features before proceeding

In [8]:
# Columns removed before pivoting (bookkeeping fields, plus values that have
# already been folded into the row index)
to_drop = [
    'Timestamp',
    'SCT Version',
    'Filename',       # already parsed into the row index
    'Slice (I->S)',
    'VertLevel',      # already stored as the 'vert_level' index level
    'DistancePMJ',
]

In [9]:
# Discard the columns listed in to_drop; the remaining columns are the imaging metrics
img_df = img_df.drop(columns=to_drop)

## Vertebrae as Feature

In [10]:
def pivot_vertebrae(df):
    """Move the 'vert_level' index level of ``df`` into the columns.

    Each original column is spread into one column per vertebral level, giving
    two-level (column, vert_level) column labels. Combinations absent from the
    input come out as NaN.
    """
    wide_df = df.unstack("vert_level")
    return wide_df

In [11]:
# One row per (GRP, acq, weight, run); each metric now has a column per vertebral level
img_df = img_df.pipe(pivot_vertebrae)
img_df.shape

(1213, 102)

## Column and Index Reformatting

Flatten the two-level column index into single string labels such as `MEAN(area) [V2]`, since multi-index columns can cause major headaches in ML analysis

In [12]:
def stack_columns(df):
    """Flatten two-level column labels into single strings, in place.

    Every column label ``c`` is renamed to ``"<c[0]> [V<c[1]>]"``, for example
    ``("MEAN(area)", "2")`` becomes ``"MEAN(area) [V2]"``.

    Note that ``df`` itself is modified; the same object is returned for
    convenience.
    """
    flat_labels = []
    for c in df.columns:
        flat_labels.append(f"{c[0]} [V{c[1]}]")
    df.columns = flat_labels
    return df

In [13]:
# Rebind the result explicitly instead of relying on stack_columns modifying
# img_df as a side effect, matching the `img_df = f(img_df)` pattern used by
# the other transformation cells.
img_df = stack_columns(img_df)
img_df.shape

(1213, 102)

Move every index level except GRP into regular columns so that they are treated as "features" during data combination

In [14]:
def unstack_index(df):
    """Return a copy of ``df`` indexed by 'GRP' only.

    All index levels are first turned into ordinary columns, then 'GRP' alone
    is promoted back to the index; the other former levels stay as columns.
    """
    return df.reset_index().set_index('GRP')

In [15]:
# Leave GRP as the only index level so the frame can be joined on patient ID
img_df = img_df.pipe(unstack_index)
img_df.shape

(1213, 105)

# Data Joining and Finalization

## Load Clinical Data

In [16]:
# Read the cleaned clinical table (tab-separated) and index it by patient ID (GRP)
participants_path = '../../clinical/participants_cleaned.tsv'
final_participants_df = pd.read_csv(participants_path, sep='\t')
final_participants_df = final_participants_df.set_index('GRP')
final_participants_df.shape

(292, 132)

## Data Joining

In [17]:
# Inner join on the shared GRP index: rows are kept only for patients present in both tables
final_df = img_df.join(other=final_participants_df, how='inner')
final_df.shape

(736, 237)

In [18]:
# Preview the joined table (the notebook display truncates long/wide frames)
final_df

Unnamed: 0_level_0,acq,weight,run,MEAN(area) [V2],MEAN(area) [V3],MEAN(area) [V4],MEAN(area) [V5],MEAN(area) [V6],MEAN(area) [V7],STD(area) [V2],...,Work Status,Work Status (Category),Comorbidities: Nicotine (Smoking),Comorbidities: Nicotine (Smokeless),Comorbidities: Nicotine (Patches),Comorbidities: Nicotine (Recent Quit),mJOA initial,mJOA 12 months,HRR,Recovery Class
GRP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
sub-cMRI10248,sag,T1w,,62.83016422067748,67.9560399477234,69.8186205721783,61.21597180330931,48.864113498244045,39.35894384938024,1.2586593304437395,...,Employed but not working,NW,0.0,0.0,0.0,0.0,14.0,16.0,0.500000,good
sub-cMRI10248,sag,T2w,,56.36806938741003,60.30258074779702,57.72132432172437,55.849618550948534,54.536389475785136,43.90694496696523,0.41266660718116077,...,Employed but not working,NW,0.0,0.0,0.0,0.0,14.0,16.0,0.500000,good
sub-cMRI10361,sag,T1w,,74.01944258655115,74.64714645290408,72.21676277178518,73.94405009244882,65.66413641578889,62.225881990890855,3.635668549984042,...,*Currently working,W,0.0,0.0,0.0,0.0,11.0,14.0,0.428571,fair
sub-cMRI10361,sag,T2w,2.0,71.34129773534104,62.59972218647677,66.38362643528775,70.42225728402714,57.32548327684547,45.59413997298462,5.53957671045849,...,*Currently working,W,0.0,0.0,0.0,0.0,11.0,14.0,0.428571,fair
sub-cMRI10473,axial,T2w,2.0,50.51769101066976,53.295280906316265,51.23283609895412,53.61912613366745,49.167872327922154,40.894621886225856,0.6689603190519934,...,*Currently working,W,1.0,0.0,0.0,0.0,14.0,11.0,-0.750000,fair
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-cMRI9884,sag,T2w,,42.69559391157851,34.33745110847326,33.64007469753996,,,,2.9112956107867145,...,Other,HSR,0.0,0.0,0.0,0.0,16.0,17.0,0.500000,good
sub-cMRI9907,axial,T2w,1.0,54.94456045773034,31.321494581665142,63.82868793430342,77.88388880225219,73.2764255456561,60.55970278133378,6.320454087347112,...,*Currently working,W,0.0,0.0,0.0,0.0,12.0,18.0,1.000000,good
sub-cMRI9907,sag,T1w,,64.95568800002779,67.63331610461951,75.62896231983221,71.301980219314,60.54984336442122,51.09593560179294,6.434667543646018,...,*Currently working,W,0.0,0.0,0.0,0.0,12.0,18.0,1.000000,good
sub-cMRI9907,sag,T2w,,77.64985036927119,56.22202873614792,59.11907313804921,75.06514339324212,73.50642836698945,64.07855113537731,3.5002546049239776,...,*Currently working,W,0.0,0.0,0.0,0.0,12.0,18.0,1.000000,good


## Redundant run deletion

For each patient / acquisition / contrast combination, keep only the last run to avoid redundant entries

In [19]:
# Keep only the last run for each (GRP, acq, weight) combination.
#
# GroupBy.last() is deliberately avoided here: it returns the last *non-null*
# value of every column independently, so a metric missing from the final run
# would be silently filled in from an earlier run, mixing measurements taken
# from different scans in a single row. Selecting whole rows instead keeps
# each retained entry internally consistent.
group_keys = ['GRP', 'acq', 'weight']
final_df = (
    final_df
    .reset_index()                                     # expose GRP as a column beside acq/weight
    .sort_values('run', kind='stable')                 # ascending; rows without a run number sort last
    .drop_duplicates(subset=group_keys, keep='last')   # keep the entire last row of each group
    .set_index(group_keys)                             # same index layout groupby(...).last() produced
    .sort_index()                                      # same key ordering groupby(...) produced
    .drop(columns=['run'])
)
final_df.shape

(626, 234)

## Stratification

Initial stratification

In [20]:
# Split the data by (acq, weight) and keep, in memory, only the strata that
# contain at least `min_samples` rows. Keys look like "<acq>_<weight>".
min_samples = 50
df_map = {
    '_'.join(group_key): group_df
    for group_key, group_df in final_df.reset_index().groupby(['acq', 'weight'])
    if group_df.shape[0] >= min_samples
}

Full datasets

In [21]:
# Write every retained stratum, with all columns, to its own tab-separated file
for k, stratum_df in df_map.items():
    stratum_df.set_index('GRP').to_csv(f"full_{k}.tsv", sep='\t')

Imaging metrics only

In [22]:
# Columns to export: everything in img_df except the run number, plus the
# patient ID (GRP) and the 'Recovery Class' column
img_cols = img_df.drop(columns=['run']).columns.tolist()
img_cols += ['GRP', 'Recovery Class']

# Write the reduced column set of every retained stratum to its own tab-separated file
for k, stratum_df in df_map.items():
    img_only_df = stratum_df.loc[:, img_cols].set_index('GRP')
    img_only_df.to_csv(f"img_only_{k}.tsv", sep='\t')