In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

In [19]:
# Candidate pyradiomics feature tables; FILE_INDEX selects the one to split.
csv_file_names = [
    "pyradiomics_extraction_box_with_correct_mask.csv",
    "pyradiomics_extraction_box_without_correct_mask.csv",
    "pyradiomics_extraction_segmentation_maskcorrect.csv",
    "pyradiomics_extraction_segmentation_no_maskcorrect.csv",
]

# Input locations, relative to the notebook's working directory.
FILE_INDEX = 2
FILENAME = f"../Data/Without Demographic Features/{csv_file_names[FILE_INDEX]}"
CLASS_LABELS = "../Data/Patient class labels.csv"

# Split settings shared by every train/test split below:
# seed passed as random_state, and the fraction of rows assigned to train.
randgen = 12345678
train_size = 0.7

In [20]:
# Load the selected feature table; the 'sequence' column is dropped when
# present (errors='ignore' makes this a no-op for files without it).
df = pd.read_csv(FILENAME).drop(columns=['sequence'], errors='ignore')

# Per-patient class labels, keyed by 'Patient ID'.
labels = pd.read_csv(CLASS_LABELS)

# Join features to labels on the patient identifier (default inner join, so
# rows without a match on either side are not kept), then discard both
# identifier columns so only features and label columns remain.
total_features = (
    df.merge(labels, left_on='patient', right_on='Patient ID')
      .drop(columns=['Patient ID', 'patient'])
)
total_features

Unnamed: 0,original_shape_Elongation,original_shape_Flatness,original_shape_LeastAxisLength,original_shape_MajorAxisLength,original_shape_Maximum2DDiameterColumn,original_shape_Maximum2DDiameterRow,original_shape_Maximum2DDiameterSlice,original_shape_Maximum3DDiameter,original_shape_MeshVolume,original_shape_MinorAxisLength,...,original_glszm_ZoneVariance,original_ngtdm_Busyness,original_ngtdm_Coarseness,original_ngtdm_Complexity,original_ngtdm_Contrast,original_ngtdm_Strength,ER,PR,HER2,Mol Subtype
0,0.807005,0.729780,23.614309,32.358102,36.073737,38.431136,39.309346,45.798426,11352.720347,26.113156,...,4.009263e+06,0.0,1000000.0,0.0,0.0,0.0,0,0,1,2
1,0.641558,0.577887,17.378564,30.072592,23.148662,29.852205,29.748492,33.784677,4051.005400,19.293327,...,6.054147e+06,0.0,1000000.0,0.0,0.0,0.0,0,0,0,3
2,0.756223,0.272616,13.050754,47.872295,40.488107,43.126056,56.606869,57.665740,11866.396187,36.202150,...,1.663605e+07,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0
3,0.718139,0.610317,8.730117,14.304230,11.420813,14.972284,14.477716,16.540987,829.333325,10.272429,...,0.000000e+00,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0
4,0.687760,0.436019,22.539193,51.693090,55.175581,49.674566,36.443449,58.274413,25572.814941,35.552459,...,4.436710e+08,0.0,1000000.0,0.0,0.0,0.0,1,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
917,0.886914,0.668151,28.553043,42.734408,45.886501,41.977361,51.383808,56.016573,28567.316541,37.901723,...,0.000000e+00,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0
918,0.757419,0.596730,12.864613,21.558522,21.066007,20.543932,25.375225,27.268354,2558.161146,16.328835,...,4.485585e+06,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0
919,0.872632,0.837806,15.199560,18.142097,19.038279,20.649190,20.505122,24.348411,2671.696974,15.831368,...,0.000000e+00,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0
920,0.855874,0.735409,30.453436,41.410193,43.076382,45.448460,48.143552,54.398662,28568.303573,35.441921,...,1.504081e+08,0.0,1000000.0,0.0,0.0,0.0,1,1,0,0


In [21]:
# ER task: keep 'ER' as the target and discard the other three label columns.
er_frame = total_features.drop(columns=['PR', 'HER2', 'Mol Subtype'])

# Split stratified on ER with the shared seed, then expose the target
# under the generic column name 'label' in both partitions.
er_train, er_test = (
    part.rename(columns={'ER': 'label'})
    for part in train_test_split(
        er_frame,
        train_size=train_size,
        stratify=total_features['ER'],
        random_state=randgen,
    )
)

In [22]:
# PR task: keep 'PR' as the target and discard the other three label columns.
pr_frame = total_features.drop(columns=['ER', 'HER2', 'Mol Subtype'])

# Split stratified on PR with the shared seed, then expose the target
# under the generic column name 'label' in both partitions.
pr_train, pr_test = (
    part.rename(columns={'PR': 'label'})
    for part in train_test_split(
        pr_frame,
        train_size=train_size,
        stratify=total_features['PR'],
        random_state=randgen,
    )
)

In [23]:
# HER2 task: keep 'HER2' as the target and discard the other three label columns.
her2_frame = total_features.drop(columns=['ER', 'PR', 'Mol Subtype'])

# Split stratified on HER2 with the shared seed, then expose the target
# under the generic column name 'label' in both partitions.
her2_train, her2_test = (
    part.rename(columns={'HER2': 'label'})
    for part in train_test_split(
        her2_frame,
        train_size=train_size,
        stratify=total_features['HER2'],
        random_state=randgen,
    )
)

In [24]:
# Molecular-subtype task: keep 'Mol Subtype' as the target and discard the
# three receptor-status label columns.
molsub_frame = total_features.drop(columns=['ER', 'PR', 'HER2'])

# Split stratified on Mol Subtype with the shared seed, then expose the
# target under the generic column name 'label' in both partitions.
molsub_train, molsub_test = (
    part.rename(columns={'Mol Subtype': 'label'})
    for part in train_test_split(
        molsub_frame,
        train_size=train_size,
        stratify=total_features['Mol Subtype'],
        random_state=randgen,
    )
)

In [28]:
# Save each task's train/test partition as <task dir>/train.csv and
# <task dir>/test.csv, relative to the current working directory.

# Output directory -> (train frame, test frame) for every prediction task.
split_outputs = {
    "ER": (er_train, er_test),
    "PR": (pr_train, pr_test),
    "HER2": (her2_train, her2_test),
    "Mol_Subtype": (molsub_train, molsub_test),
}

for out_dir, (train_part, test_part) in split_outputs.items():
    # The mode must be an octal literal. Decimal 777 equals 0o1411 (sticky
    # bit, owner read-only), which on POSIX creates a directory its owner
    # cannot write into, so the to_csv calls below would fail.
    # NOTE(review): exist_ok=True leaves an already-existing directory's
    # permissions untouched; directories created by an earlier run with the
    # bad mode may need a manual chmod or removal.
    os.makedirs(out_dir, mode=0o777, exist_ok=True)

    # index=False: the row index is not written as a column.
    train_part.to_csv(f"{out_dir}/train.csv", index=False)
    test_part.to_csv(f"{out_dir}/test.csv", index=False)