# Setup

In [None]:
import numpy as np
import pandas as pd

# Imaging Data

## Data Concatenation

Grab and append the results of every sample which ran to completion into one file (uncomment to run; it can be quite expensive)

In [None]:
# %%bash
#
# cat /work/cadotte_lab/bids_cmri/derivatives/sct_6.5/*/*/softseg_vertebrae_metrics.csv > "full_metrics_softseg.csv"

In [None]:
# Load the concatenated metrics file. It is produced by `cat`-ing many CSVs
# together, so the header row of every file after the first is still present
# in the data as an ordinary row at this point.
# NOTE(review): those embedded header strings probably cause the affected
# columns to be parsed as text rather than numbers — confirm dtypes downstream.
softseg_df = pd.read_csv("full_metrics_softseg.csv")

Remove the duplicate headers introduced via this process

In [None]:
def remove_dup_headers(df):
    """Return `df` without the rows that are repeated copies of the header.

    A row is treated as a stray header when its "Timestamp" cell holds the
    literal text "Timestamp". The input frame is not modified, and the rows
    that survive keep their original index labels.
    """
    is_data_row = df["Timestamp"].ne("Timestamp")
    return df.loc[is_data_row]

In [None]:
# Strip the header rows that were repeated inside the concatenated file
softseg_df = remove_dup_headers(softseg_df)

## Indexing by Image Type

In [None]:
def parse_filenames(df):
    """Build a row MultiIndex from each row's file name and vertebral level.

    The last '/'-separated piece of every 'Filename' entry is split on '_':
      * the first piece is used as the patient ID ('GRP');
      * the text after the '-' in the second piece is the acquisition ('acq');
      * the second-to-last piece is the contrast ('weight');
      * the third-to-last piece supplies the run number when it contains
        "run" (the integer after its '-'); otherwise the run is None.
    The row's 'VertLevel' value is appended as the final level.

    Returns a pandas MultiIndex with levels
    ['GRP', 'acq', 'weight', 'run', 'vert_level'], one entry per row of `df`.
    """
    entries = []
    for path, vert_level in zip(df['Filename'], df['VertLevel']):
        # Root file name, broken into its underscore-delimited components
        parts = path.split('/')[-1].split('_')

        patient_id = parts[0]
        acq = parts[1].split('-')[1]
        contrast = parts[-2]

        # A run number only exists when the component before the contrast is a run tag
        run_tag = parts[-3]
        if "run" in run_tag:
            run = int(run_tag.split('-')[1])
        else:
            run = None

        entries.append((patient_id, acq, contrast, run, vert_level))

    return pd.MultiIndex.from_tuples(
        entries,
        names=['GRP', 'acq', 'weight', 'run', 'vert_level']
    )

In [None]:
# Derive the (GRP, acq, weight, run, vert_level) index from the file names and
# install it on the frame, then preview the first ten rows.
softseg_df_idx = parse_filenames(softseg_df)
softseg_df.index = softseg_df_idx
softseg_df.head(10)

## Redundant Feature Purge

Drop a number of metadata/irrelevant features before proceeding

In [None]:
# Columns to discard. 'VertLevel' can go because parse_filenames already
# copied it into the 'vert_level' level of the row index.
to_drop = ['Timestamp', 'SCT Version', 'Filename', 'Slice (I->S)', 'VertLevel', 'DistancePMJ']

In [None]:
# Remove the columns listed in `to_drop`
softseg_df = softseg_df.drop(columns=to_drop)

## Vertebrae as Feature

In [None]:
def pivot_vertebrae(df):
    """Move the 'vert_level' index level into the columns.

    Each existing column is expanded into one column per vertebral level, so
    the result has a two-level column index and one row per remaining index
    combination. Combinations with no value for a level are filled with NaN.
    """
    wide_df = df.unstack("vert_level")
    return wide_df

In [None]:
# Spread the vertebral levels across the columns and preview the first ten rows
softseg_df = pivot_vertebrae(softseg_df)
softseg_df.head(10)

## Column and Index Reformatting

Flatten the two-level (metric, vertebral level) column index into single string labels of the form `<metric> [V<level>]`, since multi-index columns can cause major headaches in ML analysis

In [None]:
def stack_columns(df):
    """Flatten (metric, vertebral level) column tuples into plain string labels.

    Every column label that is a tuple is rewritten as
    "<first element> [V<second element>]". Labels that are not tuples are left
    exactly as they are, so calling this on an already-flattened frame (for
    example when the cell is re-run) is a no-op instead of mangling the names
    by indexing into their characters.

    The rename is applied to `df` itself (its `columns` attribute is
    replaced), and the same frame is returned for convenience.
    """
    df.columns = [
        f"{c[0]} [V{c[1]}]" if isinstance(c, tuple) else c
        for c in df.columns
    ]
    return df

In [None]:
# stack_columns renames the columns of the frame it receives and also returns
# that frame; bind the result explicitly so this step does not depend on the
# in-place side effect, and preview only the first ten rows.
softseg_df = stack_columns(softseg_df)
softseg_df.iloc[:10, :]

Move every index level except GRP into regular columns, so that they are treated as "features" when the datasets are combined

In [None]:
def unstack_index(df):
    """Return a copy of `df` indexed by 'GRP' only.

    All index levels are first turned into ordinary columns; 'GRP' is then
    promoted back to the index, leaving the other former levels as columns.
    The input frame is not modified.
    """
    return df.reset_index().set_index('GRP')

In [None]:
# Turn every index level into a column, then index the frame by GRP alone
softseg_df = unstack_index(softseg_df)
# Display the resulting frame
softseg_df

# Clinical Data Prep

## Data Setup

Grab the participants data; uncomment to run, it can be quite time consuming

In [None]:
# %%bash

# cp "/work/cadotte_lab/bids_cmri/participants.tsv" "./participants.tsv"

Load the data with pandas, setting patient GRP as the index

In [None]:
# Read the tab-separated participants table and index it by patient GRP.
# NOTE(review): the (commented-out) copy step in this notebook writes to
# "./participants.tsv", whereas this reads "../participants.tsv" — confirm
# which location is intended.
participants_df = pd.read_csv('../participants.tsv', sep='\t').set_index('GRP')
participants_df.head(10)

## mJOA Cleanup

Isolate the initial and 1-year mJOA metrics so they are not deleted by mistake 

In [None]:
# The four mJOA columns of interest. Column labels in the participants table
# are spelled as tuple-like strings: "('<measure>', '<time point>')".
mjoa_cols = [
    "('mJOA', 'initial')",
    "('mJOA', '12 months')",
    "('mJOA; Total [CSA]', 'initial')",
    "('mJOA; Total [CSA]', '12 months')"
]
# Pull those columns out into their own frame (all rows retained)
mjoa_df = participants_df.loc[:, mjoa_cols]

Transfer the CSA dataset's values into the original columns wherever there is no original value to go off of

In [None]:
# Fill gaps in the baseline mJOA with the CSA-derived total for the same
# patient, reporting how many values are missing before and after.
primary_col = "('mJOA', 'initial')"
fallback_col = "('mJOA; Total [CSA]', 'initial')"

missing_idx = mjoa_df[primary_col].isna()
print(f"Before: {missing_idx.sum()}")

mjoa_df.loc[missing_idx, primary_col] = mjoa_df.loc[missing_idx, fallback_col]

missing_idx = mjoa_df[primary_col].isna()
print(f"After: {missing_idx.sum()}")

In [None]:
# Fill gaps in the 12-month mJOA with the CSA-derived total for the same
# patient, reporting how many values are missing before and after.
primary_col = "('mJOA', '12 months')"
fallback_col = "('mJOA; Total [CSA]', '12 months')"

missing_idx = mjoa_df[primary_col].isna()
print(f"Before: {missing_idx.sum()}")

mjoa_df.loc[missing_idx, primary_col] = mjoa_df.loc[missing_idx, fallback_col]

missing_idx = mjoa_df[primary_col].isna()
print(f"After: {missing_idx.sum()}")

In [None]:
# Inspect the mJOA frame after the fill-in steps
mjoa_df

Drop the (now redundant) columns in both datasets

In [None]:
# The CSA-derived totals are no longer needed in the mJOA frame
csa_cols = ["('mJOA; Total [CSA]', 'initial')", "('mJOA; Total [CSA]', '12 months')"]
mjoa_df = mjoa_df.drop(columns=csa_cols)

# All four mJOA columns now live in mjoa_df, so remove them from the main table
participants_df = participants_df.drop(columns=mjoa_cols)

## Timepoint isolation

Isolate data w/o a timepoint before proceeding

In [None]:
# Split the table by column position: the trailing 20 columns are set aside as
# `non_timed`, and everything else is kept as `timed`.
# NOTE(review): the 20 is positional — presumably the number of columns that
# carry no time point; confirm it still matches the participants file.
non_timed = participants_df.iloc[:, -20:]
timed = participants_df.drop(columns=non_timed.columns)
timed

Only keep values w/ an initial time point (only mJOA is needed after 1 year, as it's the only value important to calculating the target)

In [None]:
# Select the timed columns whose time point is 'initial'. The labels are
# tuple-like strings ("('<measure>', 'initial')"), so the time point is
# identified by the label's suffix. Testing the suffix (rather than splitting
# on commas) does not fail on labels without a comma and still recognises
# measures whose own name contains a comma.
initial_suffix = ", 'initial')"
keep_cols = [c for c in timed.columns if c.endswith(initial_suffix)]

# Baseline columns first, then the columns that have no time point at all
cleaned_participants_df = participants_df.loc[:, keep_cols]
cleaned_participants_df.loc[:, non_timed.columns] = non_timed
cleaned_participants_df

Drop redundant columns in the dataset

In [None]:
# Remove the baseline-timed 'Surgical' and 'BMI' columns.
# NOTE(review): a later cell filters on a column named 'Surgical', which
# presumably comes from the non-timed columns — confirm it survives this drop.
cleaned_participants_df = cleaned_participants_df.drop(columns=["('Surgical', 'initial')", "('BMI', 'initial')"])

Reformat column headers to be cleaner, namely by removing the (now redundant) time point

In [None]:
# Remove the literal text 'initial' (including its quotes) from every label.
# A label such as "('X', 'initial')" becomes "('X', )"; the leftover
# punctuation is cleaned up in a later cell.
cols = [c.replace("'initial'", "") for c in cleaned_participants_df.columns]
cleaned_participants_df.columns = cols

In [None]:
# Inspect the frame with its relabelled columns
cleaned_participants_df

## EQ5D Unusual Null Value Correction

EQ5D occasionally uses the value of `4` to indicate a null value for some reason

In [None]:
# Replace the sentinel value 4 with NaN in every column whose label contains
# 'EQ5D'. Series.mask is used instead of assigning NaN through .loc: it yields
# the same values, but lets pandas pick a NaN-capable dtype for the result
# rather than writing NaN into a (possibly integer) column in place, which
# newer pandas versions deprecate.
for c in cleaned_participants_df.columns:
    if 'EQ5D' in c:
        is_sentinel = cleaned_participants_df[c] == 4
        cleaned_participants_df[c] = cleaned_participants_df[c].mask(is_sentinel)

## Consolidation and Clean-Up

Add back in the mJOA metrics

In [None]:
# Work on a copy so cleaned_participants_df itself is left as it was
final_participants_df = cleaned_participants_df.copy()
# Attach the mJOA columns held in mjoa_df.
# NOTE(review): this relies on .loc creating column labels that do not exist
# yet when assigning with a list-like of columns — confirm with the pandas
# version in use.
final_participants_df.loc[:, mjoa_df.columns] = mjoa_df
final_participants_df

Format the column headers to be cleaner by removing characters which could cause issues with common storage methodologies (namely CSV, TSV, and SQL formats)

In [None]:
# Normalise every column label: strip quotes and commas, close up a dangling
# " )" into ")", and finally remove one pair of enclosing parentheses when the
# label both starts with "(" and ends with ")".
cols = []
for raw_label in final_participants_df.columns:
    label = raw_label.replace("'", "").replace(",", "").replace(" )", ")")
    if label[0] == "(" and label[-1] == ")":
        label = label[1:-1]
    cols.append(label)

final_participants_df.columns = cols
final_participants_df

Calculate the Hirabayashi Recovery Ratio (HRR) and whether it is a significant improvement or not (HRR >= 0.5)

In [None]:
def hrr(mjoa_init, mjoa_1year, max_score=18):
    """Compute the Hirabayashi Recovery Ratio (HRR).

    HRR = (follow-up score - initial score) / (max_score - initial score),
    i.e. the fraction of the possible improvement that was actually achieved.

    Parameters
    ----------
    mjoa_init : number or array-like (e.g. pandas Series)
        Score at baseline.
    mjoa_1year : number or array-like
        Score at follow-up.
    max_score : number, default 18
        Highest attainable score on the scale in use. The default reproduces
        the original behaviour of this function.

    Returns
    -------
    Same kind as the inputs: the element-wise recovery ratio.

    Notes
    -----
    No special handling is applied when `mjoa_init` equals `max_score` (zero
    denominator): plain Python numbers raise ZeroDivisionError, while
    NumPy/pandas inputs yield inf, -inf or NaN for those entries.
    """
    achieved_gain = mjoa_1year - mjoa_init
    possible_gain = max_score - mjoa_init
    return achieved_gain / possible_gain

In [None]:
# Recovery ratio per patient, from the baseline and 12-month mJOA scores
init_scores = final_participants_df['mJOA initial']
followup_scores = final_participants_df['mJOA 12 months']
hrr_vals = hrr(init_scores, followup_scores)
final_participants_df['HRR'] = hrr_vals

# Label each patient 'good' when HRR >= 0.5 and 'fair' otherwise; a missing HRR
# compares as False, so it is first labelled 'fair' and then blanked out.
final_participants_df['Recovery Class'] = np.where(hrr_vals >= 0.5, 'good', 'fair')
final_participants_df.loc[hrr_vals.isna(), 'Recovery Class'] = np.nan

# Keep only patients that ended up with a label
final_participants_df = final_participants_df.dropna(subset=['Recovery Class'])

Drop any patients which are missing a valid Recovery Class

In [None]:
# Displays the frame with rows lacking a 'Recovery Class' removed; the result
# is not assigned here. final_participants_df was already reassigned with the
# same dropna at the end of the preceding cell.
final_participants_df.dropna(axis=0, subset=['Recovery Class'])

Drop any patients who did not undergo surgical treatment

In [None]:
# Keep only the rows whose 'Surgical' value equals 1
final_participants_df = final_participants_df.loc[final_participants_df['Surgical'] == 1, :]

Save the results on their own for isolated ML model testing

In [None]:
# Write the clinical table (index included) as a tab-separated file in the
# current working directory
final_participants_df.to_csv('clinical_only.tsv', sep='\t')

# Data Joining and Finalization

## Data Joining

In [None]:
# Join the imaging and clinical tables on their index; the inner join keeps
# only index values present in both frames
final_softseg_df = softseg_df.join(final_participants_df, how='inner')
final_softseg_df

## Redundant run deletion

Keep only the last run of each entry remaining to avoid redundancy

In [None]:
# Keep, for every (GRP, acq, weight) combination, the single row with the
# highest run number. Rows are selected whole: GroupBy.last() was not used
# because it returns the last *non-null* value of each column separately, which
# can silently stitch together values from different runs whenever the final
# run has a missing entry. Rows without a run number sort last (pandas places
# NaN at the end), so they are treated as the most recent run.
final_softseg_df = (
    final_softseg_df
    .reset_index()
    .sort_values('run')
    .drop_duplicates(subset=['GRP', 'acq', 'weight'], keep='last')
    .set_index(['GRP', 'acq', 'weight'])
    .sort_index()
)
# The run number has served its purpose and is not a feature
final_softseg_df = final_softseg_df.drop(columns=['run'])
final_softseg_df

## Stratification

Initial stratification

In [None]:
# Split the data by (acq, weight) and keep only the groups with at least 50
# samples. Each surviving group is stored under the label "<acq>_<weight>".
df_map = {
    '_'.join(group_key): group_df
    for group_key, group_df in final_softseg_df.reset_index().groupby(['acq', 'weight'])
    if group_df.shape[0] >= 50
}

Full datasets

In [None]:
# Write each stratified dataset, indexed by GRP, to "full_<label>.tsv"
for label, group_df in df_map.items():
    group_df.set_index('GRP').to_csv(f"full_{label}.tsv", sep='\t')

Imaging metrics only

In [None]:
# Columns for the imaging-only export: every column of the imaging table except
# 'run', plus the patient identifier and the target label
img_cols = [*softseg_df.columns.drop('run'), 'GRP', 'Recovery Class']

# Write the imaging-only view of each stratified dataset to "img_only_<label>.tsv"
for label, group_df in df_map.items():
    imaging_df = group_df.loc[:, img_cols]
    imaging_df.set_index('GRP').to_csv(f"img_only_{label}.tsv", sep='\t')