In [53]:
import pandas as pd
import os
import numpy as np

In [54]:
# Key into `all_binnings_dict` selecting the binning scheme; it is also used as
# the suffix of the new label column and of the output file name.
binningName = 'DDS_binning'
# Tab-separated input table whose labels are re-binned.
df_path = 'combinedDfs/CombinedTestingFrame_A2_A3_O3_egd_rss_hb_pruned.txt'

In [55]:
# Original class names mapped to their integer label; a name's position in the
# tuple below is its label value (0-based).
original_labelmap = {
    label_name: label_index
    for label_index, label_name in enumerate((
        "T1",
        "T1_c",
        "T2",
        "T2-FLAIR",
        "PD",
        "SWI",
        "GRE",
        "T2*",
        "DWI",
        "ADC",
        "BOLD",
        "angio",
        "PWI",
        "ASL",
        "DTI",
        "Other",
    ))
}

In [56]:
# Available binning schemes. Each scheme maps a new bin name to the list of
# original label names that are merged into that bin.
all_binnings_dict = dict(
    DDS_binning=dict(
        non_DDS=['SWI', 'GRE', 'T2*', 'ADC', 'BOLD', 'angio', 'ASL', 'DTI'],
    ),
    alternative_binning=dict(
        diffusion=['DWI', 'ADC', 'DTI'],
        perfusion=['BOLD', 'ASL', 'angio', 'PWI'],
        suscept=['SWI', 'GRE', 'T2*'],
    ),
)

In [57]:
def createBinning(original_labelmap, binningDict):
    """Merge groups of original labels into bins and renumber all classes.

    Bins are numbered first, in the iteration order of ``binningDict``. Every
    label that is not part of any bin then gets its own new class index, in the
    iteration order of ``original_labelmap``.

    Parameters
    ----------
    original_labelmap : dict
        Maps label name -> original integer label. The values are used as
        indices into an array of length ``len(original_labelmap)``.
    binningDict : dict
        Maps bin name -> iterable of label names merged into that bin.

    Returns
    -------
    new_labelmap : dict
        Maps bin names and unbinned label names -> new class index.
    transfer_array : numpy.ndarray
        Integer array where ``transfer_array[old_label]`` is the new class index.
    class_counter : int
        Total number of classes after binning.
    unchangedLabels : dict
        Label name -> original label for every label not placed in a bin.

    Raises
    ------
    KeyError
        If a bin lists a name that is not in ``original_labelmap`` or a name
        that has already been assigned to a bin.
    ValueError
        If a bin name equals the name of a label that stays unbinned; the two
        would otherwise share a single ``new_labelmap`` entry.
    """
    remaining_labels = original_labelmap.copy()
    new_labelmap = {}
    # Class indices are integers, so store them as such instead of float64.
    transfer_array = np.zeros(len(original_labelmap), dtype=int)
    class_counter = 0

    for bin_name, bin_items in binningDict.items():
        new_labelmap[bin_name] = class_counter
        for item in list(bin_items):
            if item not in original_labelmap:
                raise KeyError(
                    f"Label '{item}' in bin '{bin_name}' is not in the original label map"
                )
            if item not in remaining_labels:
                raise KeyError(
                    f"Label '{item}' in bin '{bin_name}' was already assigned to a bin"
                )
            # Popping marks the label as consumed so it is not numbered again below.
            transfer_array[remaining_labels.pop(item)] = class_counter
        class_counter += 1

    for name, old_label in remaining_labels.items():
        if name in new_labelmap:
            # Without this check the bin's entry would be silently overwritten,
            # leaving the bin's class index without a name.
            raise ValueError(
                f"Bin name '{name}' collides with an unbinned label of the same name"
            )
        new_labelmap[name] = class_counter
        transfer_array[old_label] = class_counter
        class_counter += 1

    unchangedLabels = remaining_labels
    return new_labelmap, transfer_array, class_counter, unchangedLabels
    

In [58]:
# Build the new label map and the old->new lookup array for the selected scheme.
(
    new_labelmap,
    transfer_array,
    class_counter,
    unchangedLabels,
) = createBinning(
    original_labelmap,
    all_binnings_dict[binningName],
)

In [59]:
def apply_binning_to_df(original_df, transfer_array, suffix=""):
    """Return a copy of ``original_df`` with an added column of binned labels.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Must contain an integer ``'label'`` column whose values are valid
        indices into ``transfer_array``. The input frame is not modified.
    transfer_array : array-like
        Lookup table where ``transfer_array[old_label]`` is the new label.
        Values are converted to integers (truncating, like ``int()``).
    suffix : str, optional
        Appended to ``'label_'`` to form the new column name. With the default
        empty string the column is named ``'label_'``.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with the extra ``label_<suffix>`` column.
    """
    # Convert once up front so the whole column is mapped with a single
    # vectorised lookup instead of a Python-level call per row.
    lookup = np.asarray(transfer_array).astype(np.int64)
    df = original_df.copy()
    df[f'label_{suffix}'] = lookup[df['label'].to_numpy()]
    return df

In [60]:
# Load the tab-separated table; column names and dtypes are supplied explicitly.
origin_df = pd.read_csv(
    df_path,
    sep='\t',
    names=['ID', 'label', 'extra'],
    dtype={'ID': str, 'label': int, 'extra': int},
)

In [61]:
# Add a `label_<binningName>` column holding the re-binned labels.
binned_df = apply_binning_to_df(
    origin_df,
    transfer_array,
    suffix=binningName,
)

In [62]:
# Display the binned frame: the original columns plus the new binned-label column.
binned_df

Unnamed: 0,ID,label,extra,label_DDS_binning
0,/trinity/home/r098375/DDS/data/ADNI/ADNI3/test...,3,0,4
1,/trinity/home/r098375/DDS/data/ADNI/ADNI3/test...,15,0,8
2,/trinity/home/r098375/DDS/data/ADNI/ADNI3/test...,3,0,4
3,/trinity/home/r098375/DDS/data/ADNI/ADNI3/test...,3,0,4
4,/trinity/home/r098375/DDS/data/ADNI/ADNI3/test...,3,0,4
...,...,...,...,...
62665,/trinity/home/r098375/DDS/data/HeartBrain/NIFT...,0,0,1
62666,/trinity/home/r098375/DDS/data/HeartBrain/NIFT...,0,0,1
62667,/trinity/home/r098375/DDS/data/HeartBrain/NIFT...,0,0,1
62668,/trinity/home/r098375/DDS/data/HeartBrain/NIFT...,0,0,1


In [63]:
# Sanity check: for every new class, count the rows per original label that were mapped into it.
(
    binned_df
    .groupby(f'label_{binningName}')['label']
    .value_counts()
)

label_DDS_binning  label
0                  7         7770
                   10        4995
                   13        2835
                   5         1665
                   11        1050
                   6          720
                   9          300
                   14         240
1                  0         7695
2                  1          600
3                  2         4545
4                  3         4455
5                  4          450
6                  8         4005
7                  12        2265
8                  15       19080
Name: count, dtype: int64

In [64]:
# Display the class-name -> new class index mapping returned by createBinning.
new_labelmap

{'non_DDS': 0,
 'T1': 1,
 'T1_c': 2,
 'T2': 3,
 'T2-FLAIR': 4,
 'PD': 5,
 'DWI': 6,
 'PWI': 7,
 'Other': 8}

In [65]:
# Output path: input path with `_<binningName>` inserted before the file extension.
basename, extension = os.path.splitext(df_path)
outname = f'{basename}_{binningName}{extension}'
# Write ID, the binned label and `extra` as tab-separated text, without index or header row.
binned_df.to_csv(
    outname,
    columns=['ID', f'label_{binningName}', 'extra'],
    sep='\t',
    index=False,
    header=False,
)