# Setup

In [None]:
import numpy as np
import pandas as pd

# Imaging Data

## Data Concatenation

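Concatenate the metrics CSV of every sample that ran to completion into a single file (uncomment the cell below to run it; this can be quite expensive). Because each per-sample file is copied whole, its header row is repeated throughout the combined file.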

In [None]:
# %%bash
#
# cat /work/cadotte_lab/bids_cmri/derivatives/sct_6.5/*/*/softseg_vertebrae_metrics.csv > "full_metrics_softseg_c2c6.csv"

In [None]:
# Load the concatenated per-sample metrics (the file written by the commented-out `cat` cell above).
# NOTE(review): this cell assumes that file already exists in the working directory.
img_df = pd.read_csv("full_metrics_softseg_c2c6.csv")

Remove the duplicate headers introduced via this process

In [None]:
def remove_dup_headers(df):
    """Return `df` without the rows that are stray copies of the header line.

    A row is treated as a header copy when its "Timestamp" cell holds the
    literal text "Timestamp". The input frame itself is left untouched.
    """
    is_data_row = df["Timestamp"].ne("Timestamp")
    return df[is_data_row]

In [None]:
# Filter out the rows whose "Timestamp" cell is just the repeated column header
img_df = remove_dup_headers(img_df)

## Indexing by Image Type

In [None]:
def parse_filenames(df):
    """Build a row index from the file name stored in each row of `df`.

    Every entry of `df['Filename']` is reduced to its base name (the text
    after the last '/') and split on underscores. The pieces are then read
    by position:

    * first piece                 -> patient ID (`GRP`)
    * second piece, text after '-' -> orientation / acquisition type (`acq`)
    * second-to-last piece        -> contrast of the image (`weight`)
    * third-to-last piece         -> run number, only if it is a `run-<n>` entity

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'Filename' and 'VertLevel'.

    Returns
    -------
    pandas.MultiIndex
        One entry per row of `df`, with levels 'GRP', 'acq', 'weight', 'run'
        and 'vert_level'. `run` is left missing for files that carry no run
        entity.

    Raises
    ------
    IndexError
        If a base name has fewer than three underscore-separated pieces, or
        its second piece contains no '-'.
    ValueError
        If the text after `run-` is not an integer.
    """
    patient_ids, acqs, contrasts, runs = [], [], [], []

    for path in df['Filename']:
        # Root file name, split into its notable components
        parts = path.split('/')[-1].split('_')

        patient_ids.append(parts[0])
        acqs.append(parts[1].split('-')[1])
        contrasts.append(parts[-2])

        # Match the entity prefix rather than any substring: a component that
        # merely contains "run" (e.g. "acq-trunk") is not a run number and
        # must not be fed to int().
        run_part = parts[-3]
        runs.append(int(run_part.split('-')[1]) if run_part.startswith('run-') else None)

    return pd.MultiIndex.from_tuples(
        list(zip(patient_ids, acqs, contrasts, runs, df['VertLevel'])),
        names=['GRP', 'acq', 'weight', 'run', 'vert_level']
    )

In [None]:
# Build the (GRP, acq, weight, run, vert_level) index from the file names and
# use it as the row index of the imaging table
img_df_idx = parse_filenames(img_df)
img_df.index = img_df_idx
img_df.shape

## Redundant Feature Purge

Drop a number of metadata/irrelevant features before proceeding

In [None]:
# Columns removed in the next cell. 'VertLevel' and the parsed parts of 'Filename'
# are already carried by the row index built in parse_filenames.
to_drop = ['Timestamp', 'SCT Version', 'Filename', 'Slice (I->S)', 'VertLevel', 'DistancePMJ']

In [None]:
# Discard the metadata columns listed in `to_drop`
img_df = img_df.drop(columns=to_drop)

## Vertebrae as Feature

In [None]:
def pivot_vertebrae(df):
    """Move the `vert_level` index level into the columns.

    The result has one row per remaining index combination and one column
    per (original column, vertebral level) pair.
    """
    wide = df.unstack("vert_level")
    return wide

In [None]:
# Spread the vertebral levels across the columns, then check the resulting size
img_df = pivot_vertebrae(img_df)
img_df.shape

## Column and Index Reformatting

Flatten the two-level column index into single string labels of the form `<metric> [V<level>]`; multi-indexed columns can cause major headaches in ML analysis

In [None]:
def stack_columns(df):
    """Flatten two-part column labels into plain strings, in place.

    Each column label `c` is replaced by the text "<c[0]> [V<c[1]>]".
    The columns of `df` itself are overwritten, and the same object is
    returned for convenience.
    """
    flat_labels = []
    for c in df.columns:
        flat_labels.append(f"{c[0]} [V{c[1]}]")
    df.columns = flat_labels
    return df

In [None]:
# Rebind the result, like every other transform cell in this notebook, so the
# cell does not depend on stack_columns modifying its argument in place
img_df = stack_columns(img_df)
img_df.shape

Move every index level except GRP into regular columns, so that they are treated as "features" during data combination

In [None]:
def unstack_index(df):
    """Return a copy of `df` indexed by 'GRP' only.

    All index levels are first turned into ordinary columns; 'GRP' is then
    promoted back to be the sole index.
    """
    return df.reset_index().set_index('GRP')

In [None]:
# Keep only GRP as the index; the other index levels become ordinary columns
img_df = unstack_index(img_df)
img_df.shape

# Data Joining and Finalization

## Load Clinical Data

In [None]:
# Read the tab-separated clinical table and index it by GRP, the same key
# the imaging table is indexed by
final_participants_df = pd.read_csv('../../clinical/participants_cleaned.tsv', sep='\t').set_index('GRP')
final_participants_df.shape

## Data Joining

In [None]:
# Index-on-index join (both frames are indexed by GRP); how='inner' keeps only
# the GRP values present in both the imaging and the clinical table
final_df = img_df.join(final_participants_df, how='inner')
final_df.shape

In [None]:
# Preview the first rows only instead of rendering the whole joined table
final_df.head()

## Redundant run deletion

Keep only the last run of each entry remaining to avoid redundancy

In [None]:
# For every (GRP, acq, weight) combination keep the single row that sorts last
# by run number. Whole rows are selected on purpose: GroupBy.last() takes the
# last *non-null* value of each column separately, so a metric missing from the
# final run would silently be filled in from an earlier run.
# As before, rows without a run number sort after the numbered ones.
final_df = (
    final_df
    .reset_index()
    .sort_values('run', kind='stable')
    .drop_duplicates(subset=['GRP', 'acq', 'weight'], keep='last')
    .set_index(['GRP', 'acq', 'weight'])
    .sort_index()  # same key ordering the groupby produced
    .drop(columns=['run'])
)
final_df.shape

## Stratification

Initial stratification

In [None]:
# One sub-table per (acq, weight) pair, keyed by "<acq>_<weight>"
df_map = {}
for idx, df in final_df.reset_index().groupby(['acq', 'weight']):
    # Strata with fewer than 50 samples are left out; the rest are kept
    # for the export cells that follow
    if len(df) >= 50:
        df_map['_'.join(idx)] = df

Full datasets

In [None]:
# Write each stratum, with all of its columns, to "full_<label>.tsv" indexed by GRP
for k, df in df_map.items():
    df.set_index('GRP').to_csv(f"full_{k}.tsv", sep='\t')

Imaging metrics only

In [None]:
# Columns that came from the imaging table (minus 'run'), plus the GRP key
# and the 'Recovery Class' column
img_cols = [*img_df.drop(columns=['run']).columns, 'GRP', 'Recovery Class']

# Write the reduced view of each stratum to "img_only_<label>.tsv" indexed by GRP
for k, df in df_map.items():
    df.loc[:, img_cols].set_index('GRP').to_csv(f"img_only_{k}.tsv", sep='\t')