In [16]:
%load_ext autoreload
%autoreload 2
import os

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from utils.constants import DATA_DIR
from utils.vis_utils import load_patient_metadata

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [17]:
# Fix the seed of NumPy's global random state so that random draws made
# through it are repeatable.
RANDOM_SEED = 111
np.random.seed(RANDOM_SEED)


In [18]:
# Case IDs dropped from the cohort. They are matched as substrings of
# TMA_CASE, so any ID that merely contains one of these strings is dropped too.
# NOTE(review): confirm substring matching (rather than exact equality) is intended.
EXCLUDED_CASES = ["TMA2_106", "TMA2_92"]

# Join with os.path.join (as is done for TIFF_DIR() in the outlier cell) so the
# path is valid whether or not DATA_DIR() ends with a path separator.
df = pd.read_csv(os.path.join(DATA_DIR(), "patient_meta.txt"), sep="\t")

# Keep only the first row for each image file name.
df = df.drop_duplicates(subset="Image", keep="first")

# na=False treats a missing TMA_CASE as "not excluded"; without it the mask
# would contain NaN and could not be negated or used for boolean indexing.
is_excluded = df.TMA_CASE.str.contains("|".join(EXCLUDED_CASES), regex=True, na=False)
df = df[~is_excluded]


In [19]:
# Number of rows left in the patient table at this point.
df.shape[0]

440

In [20]:
from utils.vis_utils import load_tiff
from utils.constants import TIFF_DIR
from pathlib import Path
import os

# Image file names listed in meta/outliers.txt, one per line. Surrounding
# whitespace and blank lines are discarded: a blank entry would make
# os.path.join return the bare TIFF directory, and a padded name would never
# match df.Image in the filter below.
outliers = [
    line.strip()
    for line in Path("meta/outliers.txt").read_text().splitlines()
    if line.strip()
]

# Print the array shape of every listed image.
for outlier in outliers:
    path = os.path.join(TIFF_DIR(), outlier)

    # Path.stem drops the extension regardless of its length.
    print("shape for case:", Path(outlier).stem, " is ", load_tiff(path).shape)

# Remove files of different shapes
df = df[~df.Image.isin(outliers)]

shape for case: TMA14_478  is  (38, 750, 850)
shape for case: TMA3_172  is  (38, 837, 871)
shape for case: TMA12_544  is  (38, 850, 742)
shape for case: TMA3_171  is  (38, 858, 1000)
shape for case: TMA11_598  is  (38, 850, 623)
shape for case: TMA2_97re  is  (38, 700, 1030)
shape for case: TMA4_244  is  (38, 802, 850)
shape for case: TMA5_292  is  (38, 950, 630)
shape for case: TMA8_599  is  (38, 832, 816)
shape for case: TMA2_135  is  (38, 425, 850)
shape for case: TMA3_173  is  (38, 826, 973)
shape for case: TMA5_291  is  (38, 500, 750)
shape for case: TMA3_175  is  (38, 826, 1007)


In [21]:
# Display the current patient table (rich DataFrame output of the cell).
df

Unnamed: 0,CASE,nervesegments,n_stroma,n_tumor,TMA_CASE,Image,Subtype2,Subtype3,Batch,cause_of_death,follow_up_months,tumor_diameter,age_of_diagnosis,histologic grade
0,194,3,3,0,TMA14_194,TMA14_194.tiff,Luminal,Luminal B,D,0,158,25,51,2
1,219,3,2,1,TMA12_219,TMA12_219.tiff,Luminal,Luminal A,C,0,111,8,64,1
2,224,3,2,1,TMA12_224,TMA12_224.tiff,Luminal,Luminal B,C,0,109,35,57,2
3,246,3,1,2,TMA4_246,TMA4_246.tiff,Triple Negative,Triple Negative,C,0,106,35,64,3
4,252,13,5,8,TMA4_252,TMA4_252.tiff,Triple Negative,Triple Negative,C,2,10,39,53,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444,85,5,2,3,TMA2_85,TMA2_85.tiff,Triple Negative,Triple Negative,A,2,22,21,55,2
445,94,1,1,0,TMA12_94,TMA12_94.tiff,Luminal,Luminal B,C,0,140,30,60,3
446,95,2,2,0,TMA12_95,TMA12_95.tiff,Triple Negative,Triple Negative,C,2,47,30,50,3
447,123,2,2,0,TMA2_123,TMA2_123.tiff,Luminal,Luminal A,B,0,174,12,57,2


In [22]:
# List the column labels of the patient table.
df.columns

Index(['CASE', 'nervesegments', 'n_stroma', 'n_tumor', 'TMA_CASE', 'Image',
       'Subtype2', 'Subtype3', 'Batch', 'cause_of_death', 'follow_up_months',
       'tumor_diameter', 'age_of_diagnosis', 'histologic grade'],
      dtype='object')

In [23]:
# Partition the patients by their nerve-segment count: rows with a positive
# count versus rows with a count of exactly zero.
segment_counts = df["nervesegments"]
nonzero_nerves = df.loc[segment_counts.gt(0)]
no_nerves = df.loc[segment_counts.eq(0)]

In [24]:

# Explicit seed handed to every split below, so re-running this cell always
# yields the same partition no matter how many random draws happened earlier
# in the kernel session.
SPLIT_SEED = 111


def _split_train_test_val(frame, train_ratio, val_share, seed):
    """Split ``frame`` into (train, test, validation) parts.

    ``train_ratio`` is the fraction of rows kept for training. The remaining
    rows are split again, with ``val_share`` of them going to validation and
    the rest to test. ``seed`` is passed as ``random_state`` to both splits.
    """
    train_part, held_out = train_test_split(
        frame, test_size=1 - train_ratio, random_state=seed
    )
    test_part, val_part = train_test_split(
        held_out, test_size=val_share, random_state=seed
    )
    return train_part, test_part, val_part


train_test_ratio = 0.7
validation_test_ratio = 0.5

# Split the nerve and no-nerve groups separately so each of train, validation
# and test receives rows from both groups.
nerve_train, nerve_test, nerve_validation = _split_train_test_val(
    nonzero_nerves, train_test_ratio, validation_test_ratio, SPLIT_SEED
)
non_nerve_train, non_nerve_test, non_nerve_validation = _split_train_test_val(
    no_nerves, train_test_ratio, validation_test_ratio, SPLIT_SEED
)

train = pd.concat([nerve_train, non_nerve_train])
validation = pd.concat([nerve_validation, non_nerve_validation])
test = pd.concat([nerve_test, non_nerve_test])

# Every input row must land in exactly one of the three sets.
assert len(train) + len(validation) + len(test) == len(nonzero_nerves) + len(no_nerves)

# counts =pd.DataFrame(nerve_train['nervesegments'].value_counts())
# counts.sort_values(by=["count", "nervesegments"],ascending=[False, True], inplace=True)

# counts.plot(kind='bar')
# plt.xlabel('Nerve Segments')
# plt.ylabel('Count')
# plt.title('Distribution of Nerve Segments in Training Data')
# plt.show()

In [25]:
# Shuffle the rows of each split and keep only the TMA_CASE identifiers.
# An explicit random_state makes the order repeatable when this cell is
# re-run, instead of depending on the current state of the global generator.
SHUFFLE_SEED = 111
train_shuffled = train.sample(frac=1, random_state=SHUFFLE_SEED).TMA_CASE
val_shuffled = validation.sample(frac=1, random_state=SHUFFLE_SEED).TMA_CASE
test_shuffled = test.sample(frac=1, random_state=SHUFFLE_SEED).TMA_CASE

In [27]:
# Write each shuffled series of case IDs to its own file under meta/,
# leaving out the DataFrame index.
split_outputs = {
    "meta/train_files.txt": train_shuffled,
    "meta/validation_files.txt": val_shuffled,
    "meta/test_files.txt": test_shuffled,
}
for out_path, case_ids in split_outputs.items():
    case_ids.to_csv(out_path, index=False)