In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import os

In [2]:
# Tab-separated label file; column names are supplied here rather than read
# from the file.
FullLabelPath = 'parelsnoer/Parelsnoer_Labels_extensive2_pruned.txt'
FullLabelFile = pd.read_csv(
    FullLabelPath,
    sep='\t',
    names=['ID', 'label', 'extra'],
    low_memory=False,
)

In [3]:
# Preview the loaded label table (one row per entry in the label file).
FullLabelFile

Unnamed: 0,ID,label,extra
0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
1,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
2,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
3,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
4,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
...,...,...,...
11230,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
11231,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
11232,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0
11233,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0


In [4]:
# Number of rows per label value, to see how imbalanced the labels are.
FullLabelFile['label'].value_counts()

label
16    7305
0     2520
14     990
17     195
2       75
3       60
6       60
1       30
Name: count, dtype: int64

In [5]:
# Training-set sizes to generate. Each value is used as the number of distinct
# NiftiPath entries per label that go into the training split; one train/test
# file pair is written per value.
amountsPerClass = [1, 5, 10, 20]

In [6]:
def extractNiftiFilepathAndSlicenum(df):
    """Derive the source NIfTI path and slice number from one row's ``ID``.

    The ``ID`` string is cut at its last ``'__s'``. The part before it, with
    ``'NIFTI_SLICES'`` replaced by ``'NIFTI'`` and ``'.nii.gz'`` appended,
    becomes the volume path. The part after it, up to the first
    ``'.nii.gz'``, is parsed as the integer slice number.

    Parameters
    ----------
    df : mapping-like
        Anything supporting ``df['ID']`` that yields a string. Despite the
        name, this is a single row, not a whole DataFrame.

    Returns
    -------
    pandas.Series
        Entries ``'NiftiPath'`` (str) and ``'slicenum'`` (int).

    Raises
    ------
    IndexError
        If ``ID`` contains no ``'__s'`` separator.
    ValueError
        If the text after the separator is not an integer.
    """
    parts = df['ID'].rsplit('__s', 1)
    volume_path = parts[0].replace('NIFTI_SLICES', 'NIFTI') + '.nii.gz'
    # Indexing (rather than unpacking) keeps the IndexError for malformed IDs.
    slice_text = parts[1].split('.nii.gz')[0]
    return pd.Series({'NiftiPath': volume_path, 'slicenum': int(slice_text)})

In [7]:
# Apply the parser to every row (axis=1); the result is a frame with
# 'NiftiPath' and 'slicenum' columns sharing FullLabelFile's index.
FilepathsAndSlicenums = FullLabelFile.apply(extractNiftiFilepathAndSlicenum, axis=1)

In [8]:
# Attach the derived columns by index. Any copies left over from an earlier
# run of this cell are dropped first, so re-running it does not create
# suffixed duplicates ('NiftiPath_x', 'NiftiPath_y', ...) that would break
# later lookups of 'NiftiPath'.
FullLabelFile = (
    FullLabelFile
    .drop(columns=FilepathsAndSlicenums.columns, errors='ignore')
    .merge(FilepathsAndSlicenums, left_index=True, right_index=True)
)

In [9]:
# Check that the derived 'NiftiPath' and 'slicenum' columns are now present.
FullLabelFile

Unnamed: 0,ID,label,extra,NiftiPath,slicenum
0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,7
1,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,5
2,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,8
3,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,13
4,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,9
...,...,...,...,...,...
11230,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,5
11231,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,7
11232,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,17
11233,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,10


In [10]:
# Write one train/test label-file pair per requested training-set size.
#
# For each label, the distinct NiftiPath values are split so that
# `numTrainSamples` of them go to the training set and the rest to the test
# set. All rows that share a NiftiPath stay on the same side of the split.
# Labels with no more than `numTrainSamples` distinct NiftiPath values go
# entirely to the training set.
#
# Output files are named '<basename>_s<N>_train<ext>' and
# '<basename>_s<N>_test<ext>' next to FullLabelPath. After the loop, df_train
# and df_test hold the split for the last entry of amountsPerClass.
basename, extension = os.path.splitext(FullLabelPath)
outputColumns = ['ID', 'label', 'extra']

for numTrainSamples in amountsPerClass:
    outname_train = f'{basename}_s{numTrainSamples}_train{extension}'
    outname_test = f'{basename}_s{numTrainSamples}_test{extension}'

    # Collect per-scan row groups and concatenate once per split size instead
    # of growing a DataFrame inside the loops.
    trainParts = []
    testParts = []
    for label in FullLabelFile['label'].unique():
        LabelFrame = FullLabelFile[FullLabelFile['label'] == label]
        NiftiNames = LabelFrame['NiftiPath'].unique()

        if len(NiftiNames) <= numTrainSamples:
            # train_test_split rejects an integer train_size that is not
            # smaller than the number of samples, so handle this case
            # explicitly instead of swallowing every exception.
            train, test = NiftiNames, []
        else:
            train, test = train_test_split(
                NiftiNames,
                train_size=numTrainSamples,
                shuffle=True,
                random_state=42,
            )

        # Keep the (shuffled) scan order so the written row order is stable.
        trainParts.extend(LabelFrame[LabelFrame['NiftiPath'] == tr] for tr in train)
        testParts.extend(LabelFrame[LabelFrame['NiftiPath'] == te] for te in test)

    # Fall back to an empty frame with the right columns when a side has no
    # rows, so the column selection below cannot raise KeyError.
    df_train = pd.concat(trainParts, axis=0) if trainParts else FullLabelFile.iloc[0:0]
    df_test = pd.concat(testParts, axis=0) if testParts else FullLabelFile.iloc[0:0]

    # Write once per split size, after every label has been processed.
    df_train[outputColumns].to_csv(outname_train, index=False, header=False, sep='\t')
    df_test[outputColumns].to_csv(outname_test, index=False, header=False, sep='\t')

In [11]:
# Training split left over from the last loop iteration, i.e. for
# numTrainSamples == amountsPerClass[-1].
df_train

Unnamed: 0,ID,label,extra,NiftiPath,slicenum
3075,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,13
3076,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14
3077,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,8
3078,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,19
3079,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,18
...,...,...,...,...,...
4570,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,10
4571,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6
4572,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16
4573,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,19


In [12]:
# Test split left over from the last loop iteration, i.e. for
# numTrainSamples == amountsPerClass[-1].
df_test

Unnamed: 0,ID,label,extra,NiftiPath,slicenum
1695,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,11
1696,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16
1697,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,7
1698,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,15
1699,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,16,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,9
...,...,...,...,...,...
8515,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,6
8516,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14
8517,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,5
8518,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,14,0,/trinity/home/r098375/DDS/data/parelsnoer/NIFT...,8


In [13]:
# Reference label file used for comparison. Its name has no '_train'/'_test'
# suffix, so it is not one of the files written by the split loop in this
# notebook -- presumably produced earlier; confirm its provenance.
control = 'parelsnoer/Parelsnoer_Labels_extensive2_pruned_s10.txt'
ControlLabelFile = pd.read_csv(
    control,
    sep='\t',
    names=['ID', 'label', 'extra'],
    low_memory=False,
)

In [14]:
# Rows per label in the reference file, for comparison with the full file.
ControlLabelFile['label'].value_counts()

label
16    150
0     150
17    150
14    150
2      75
3      60
6      60
1      30
Name: count, dtype: int64