### Load Patient List Data

In [1]:
import pandas as pd

ID_PATH = "_info/ids.csv"

# Read the patient list, discard every row that has a missing value, and
# cast the ROI flag column to a proper boolean dtype in one chained expression.
ids_df = (
    pd.read_csv(ID_PATH)
    .dropna()
    .astype({"roi_exist": bool})
)

ids_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91 entries, 0 to 90
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          91 non-null     object
 1   subtype     91 non-null     object
 2   roi_exist   91 non-null     bool  
 3   annot_type  91 non-null     object
dtypes: bool(1), object(3)
memory usage: 2.9+ KB


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Load Patient Split Ratio (Training and Test)

In [2]:
import json

SPLIT_PATH = "_info/split_ratio.json"

# The JSON maps split name ('Train', 'Test') -> {subtype: patient count}.
with open(SPLIT_PATH, mode='r') as split_file:
    split_ratio = json.load(split_file)

# Render the ratios as a table (subtypes as rows, splits as columns).
pd.DataFrame(split_ratio)

Unnamed: 0,Train,Test
ccRCC,33,23
pRCC,15,7
CHROMO,3,3
ONCOCYTOMA,3,4


### Split Training and Test Patients

In [3]:
random_seed = 42

SAVE_ID_LIST = False
LOAD_ID_LIST = False
IDS_LIST_DIR = "_info/split_ids.json"

# Add special patients to CHROMO and ONCO to have more data
train_chromo = ['HP20002300', 'HP19012316', 'HP20.2506']
train_onco = ['HP20.5602', 'HP18005453']

# Patients that must always be placed in the training split, keyed by subtype.
forced_train_ids = {'CHROMO': train_chromo, 'ONCOCYTOMA': train_onco}

# Working pool of not-yet-assigned patients; rows are removed as they are assigned.
temp_ids_df = ids_df.copy()

if not LOAD_ID_LIST:
    print(">>> Splitting Train/Test IDs ...")
    train_test_ids = {'Train': {}, 'Test': {}}

    for subtype, count in split_ratio['Train'].items():
        # 1) Patients of this subtype without an ROI go to the test split.
        #    `.tolist()` yields plain Python scalars so the result stays JSON-serializable.
        no_roi_mask = (ids_df['subtype'] == subtype) & (~ids_df['roi_exist'])
        test_indexes = ids_df.index[no_roi_mask].tolist()
        temp_ids_df = temp_ids_df.drop(index=test_indexes)

        # 2) Hand-picked patients are forced into the training split.
        train_indexes = []
        for patient_id in forced_train_ids.get(subtype, []):
            matches = temp_ids_df.index[temp_ids_df['id'] == patient_id].tolist()
            if not matches:
                raise ValueError(
                    f"Forced training patient '{patient_id}' ({subtype}) is not available in the ID list"
                )
            train_indexes.append(matches[0])
            temp_ids_df = temp_ids_df.drop(index=matches[0])

        # 3) Fill the remaining training slots by seeded random sampling;
        #    whatever is left of this subtype becomes test data.
        st_df = temp_ids_df[temp_ids_df['subtype'] == subtype]
        st_train_indexes = st_df.sample(n=count - len(train_indexes), random_state=random_seed).index.tolist()
        sampled = set(st_train_indexes)
        st_test_indexes = [idx for idx in st_df.index.tolist() if idx not in sampled]

        train_test_ids['Train'][subtype] = train_indexes + st_train_indexes
        train_test_ids['Test'][subtype] = test_indexes + st_test_indexes

    if SAVE_ID_LIST:
        with open(IDS_LIST_DIR, 'w') as file:
            json.dump(train_test_ids, file, indent=4)
        print(f">>> IDs Saved to '{IDS_LIST_DIR}'")
else:
    with open(IDS_LIST_DIR, 'r') as file:
        train_test_ids = json.load(file)
    print(f">>> IDs Loaded from '{IDS_LIST_DIR}'!")

print(train_test_ids)

>>> Splitting Train/Test IDs ...
{'Train': {'ccRCC': [0, 5, 33, 13, 19, 50, 36, 26, 44, 12, 54, 3, 34, 30, 8, 17, 6, 4, 41, 27, 47, 46, 52, 15, 9, 16, 24, 31, 53, 48, 25, 11, 32], 'pRCC': [56, 69, 64, 57, 71, 61, 76, 67, 59, 60, 73, 68, 74, 72, 58], 'CHROMO': [83, 81, 82], 'ONCOCYTOMA': [88, 86, 87]}, 'Test': {'ccRCC': [1, 2, 7, 10, 14, 18, 20, 21, 22, 23, 28, 29, 35, 37, 38, 39, 40, 42, 43, 45, 49, 51, 55], 'pRCC': [62, 63, 65, 66, 70, 75, 77], 'CHROMO': [78, 79, 80], 'ONCOCYTOMA': [84, 85, 89, 90]}}


In [4]:
from wsi_manager.crop import CropIndexer
import os
import glob
import time

# Raw string: the backslashes are literal path separators, not escape sequences.
ROOT_PATH = r"F:\My Files\Thesis\Dataset\RCC_WSIs"

CROP_SIZE = 1000 # Crop size in pixels
LEVEL = 0 # Magnification level
OVERLAP = 1 # Overlap; use 1 for no overlap, 2 for 50% overlap, etc.
CROP_RESIZE = 112 # Desired size to resize the patches
WSI_FORMATS = ['scn', 'svs', 'tif'] # WSIs formats

train_list = []


def _crop_to_record(crop, subtype, annot_type, patient_id, slide_dir):
    """Build the metadata row stored in `train_list` for a single crop.

    SLIDE crops are always recorded as tumor of the patient's subtype.
    XML crops are tumor only when their 'label' is 'tumor'; otherwise the
    stripped label itself is used as the crop type.
    """
    if annot_type == 'SLIDE':
        is_tumor = True
        crop_type = subtype
    else:
        is_tumor = crop['label'] == 'tumor'
        crop_type = subtype if is_tumor else crop['label'].strip()

    return {
        'subtype': subtype,
        'annot_type': annot_type,
        'id': patient_id,
        'path': slide_dir,
        'is_tumor': is_tumor,
        'type': crop_type,
        'top': crop['top'],
        'left': crop['left'],
        'size': crop['size'],
    }


# Creating Training Crops DataFrame

for subtype in train_test_ids['Train'].keys():

    print(f"\n")
    print(f"--"*15)
    print(f"Preparing data for {subtype}")
    print(f"--"*15)
    print(f"\n")

    # The correspondence workbook is per subtype: read it at most once per
    # subtype, and only if a SLIDE-annotated patient actually needs it.
    patient_corr_path = f'{ROOT_PATH}\\{subtype}\\Annotations\\{subtype}_patients_correspondence.xlsx'
    patients_corr = None

    for index in train_test_ids['Train'][subtype]:

        t_start = time.time()

        # The split stores index *labels*, so look the patient up by label
        # (positional lookup breaks once dropna() removes a row).
        patient = ids_df.loc[index]
        patient_id = patient['id']
        annot_type = patient['annot_type']

        if annot_type == 'SLIDE':

            print(f'>>> Get {patient_id} Image Patches ...')

            if patients_corr is None:
                patients_corr = pd.read_excel(patient_corr_path)

            # The 'ID' cell is either a single slide number or a 'first-last' range
            annot_slide_no = str(patients_corr[patients_corr['PATIENT'] == patient_id]['ID'].values[0]).split('-')

            # Create list of annotations to read crops
            if len(annot_slide_no) > 1:
                annotations = [str(i) for i in range(int(annot_slide_no[0]), int(annot_slide_no[1]) + 1)]
            else:
                annotations = annot_slide_no

            slide_section = CropIndexer(type='SLIDE', crop_size=CROP_SIZE, overlap=OVERLAP)

            # Crop all annotations and add them to train data list
            for annot in annotations:
                candidates = [f'{ROOT_PATH}\\{subtype}\\Annotations\\{annot}.{fmt}' for fmt in WSI_FORMATS]
                slide_dir = next((path for path in candidates if os.path.exists(path)), None)

                if slide_dir is None:
                    # Skip instead of silently re-using the crops of the previous slide.
                    print(f"--------> No slide file found for annotation '{annot}'")
                    continue

                crop_indexes = slide_section.crop(slide_dir=slide_dir)
                print(f"--------> +{len(crop_indexes)} Crops")

                # Add metadata to each crop for further analysis
                train_list.extend(
                    _crop_to_record(crop, subtype, 'SLIDE', patient_id, slide_dir)
                    for crop in crop_indexes
                )

        elif annot_type == 'XML':

            print(f'>>> Get {patient_id} Image Patches ...')

            # Look in the main subtype directory first; fall back to the 'pre'
            # directory only when the main one has no slide for this patient.
            search_dirs = [
                (f'{ROOT_PATH}\\{subtype}', False),
                (f'{ROOT_PATH}\\pre\\{subtype}', True),
            ]

            for base_dir, is_pre in search_dirs:
                slide_dirs = glob.glob(f"{base_dir}\\*{patient_id}*")
                if not slide_dirs:
                    continue

                if is_pre:
                    print("---- Checking 'pre' Data ...")

                for slide_dir in slide_dirs:

                    # Creating path to corresponding XML annotation
                    # (strip the extension whatever its length is)
                    slidename = os.path.splitext(slide_dir.split("\\")[-1])[0]
                    xml_dir = f"{base_dir}\\{subtype}_xml\\{slidename}.xml"

                    # Cropping slide into patches
                    slide_section = CropIndexer(type='XML', crop_size=CROP_SIZE, overlap=OVERLAP)
                    crop_indexes = slide_section.crop(slide_dir=slide_dir, xml_dir=xml_dir)

                    print(f"--------> +{len(crop_indexes)} Crops")

                    # Add metadata to each crop for further analysis
                    train_list.extend(
                        _crop_to_record(crop, subtype, 'XML', patient_id, slide_dir)
                        for crop in crop_indexes
                    )

                break
            else:
                print(f"--------> No slide found for '{patient_id}'")

        t_end = time.time()

        print(f"+++ Finished Cropping '{patient_id}' Slides in {round(t_end-t_start, 2)}!")



------------------------------
Preparing data for ccRCC
------------------------------


>>> Get HP1213588 Image Patches ...
+++ Finished Cropping 'HP1213588' Slides in 0.0!
>>> Get HP195524 Image Patches ...
+++ Finished Cropping 'HP195524' Slides in 0.0!
>>> Get HP14.9097 Image Patches ...
+++ Finished Cropping 'HP14.9097' Slides in 0.0!
>>> Get HP11.12318 Image Patches ...
+++ Finished Cropping 'HP11.12318' Slides in 0.0!
>>> Get HP12.6691 Image Patches ...
+++ Finished Cropping 'HP12.6691' Slides in 0.0!
>>> Get HP19.7840 Image Patches ...
+++ Finished Cropping 'HP19.7840' Slides in 0.0!
>>> Get HP15.12550 Image Patches ...
+++ Finished Cropping 'HP15.12550' Slides in 0.0!
>>> Get HP14.11034 Image Patches ...
+++ Finished Cropping 'HP14.11034' Slides in 0.0!
>>> Get HP19.4075 Image Patches ...
--------> +3232 Crops
--------> +2151 Crops
--------> +3248 Crops
--------> +3335 Crops
--------> +2851 Crops
+++ Finished Cropping 'HP19.4075' Slides in 16.27!
>>> Get HP11.12277 Image Pat

In [5]:
# Location of the cached training-crops table.
DFS_LIST_DIR = "_dfs/train_df.csv"

# One row per crop record collected in `train_list`.
train_df = pd.DataFrame(data=train_list)

In [4]:
# Alternative to re-cropping: reload the previously saved training-crops table.
DFS_LIST_DIR = "_dfs/train_df.csv"

train_df = pd.read_csv(filepath_or_buffer=DFS_LIST_DIR)

In [6]:
# Persist the crops table without the index column.
train_df.to_csv(path_or_buf=DFS_LIST_DIR, index=False)

In [6]:
# Number of training crops per value of the 'type' column.
train_df['type'].value_counts()

type
pRCC          36462
ccRCC         15024
necrosis      10029
normal         7158
CHROMO         5562
fiber          4734
ONCOCYTOMA     3474
Name: count, dtype: int64

In [49]:
# Binary decision tree over tissue types: every stage separates the types
# listed under "class_0" from those under "class_1".
# (An earlier tuple-based definition was immediately overwritten by this one
# and has been removed; the labelling cell indexes by 'class_0'.)
tree_pair_dict = {
                "Root": {
                    "class_0": ['ccRCC', 'pRCC', 'CHROMO', 'ONCOCYTOMA'],
                    "class_1": ['normal', 'necrosis', 'fiber']
                },
                "Node": {
                    "class_0": ['ccRCC', 'pRCC'],
                    "class_1": ['CHROMO', 'ONCOCYTOMA']
                },
                "Leaf1": {
                    "class_0": ['CHROMO'],
                    "class_1": ['ONCOCYTOMA']
                },
                "Leaf2": {
                    "class_0": ['ccRCC'],
                    "class_1": ['pRCC']
                }
}


In [5]:
# Replace the in-notebook tree definition with the one stored on disk.
with open("_info/tree_pair_dict.json", mode='r') as tree_file:
    tree_pair_dict = json.load(tree_file)

In [13]:
from dataset.balancer import Balancer

# Undersample the training crops for the tree stages, seeded for repeatability.
# NOTE(review): the result is indexed by stage name ('Root', 'Node', ...) in
# later cells, so it is presumably a dict of DataFrames — confirm in Balancer.
data_balancer = Balancer(
    method='undersample',
    random_state=random_seed,
)
balanced_dfs = data_balancer.apply(train_df, tree_pair_dict)

In [7]:
# Number of crops per 'type' in the balanced Root-stage frame.
balanced_dfs['Root']['type'].value_counts()

type
normal        4632
necrosis      4632
fiber         4632
ccRCC         3474
pRCC          3474
CHROMO        3474
ONCOCYTOMA    3474
Name: count, dtype: int64

In [8]:
# Display the balanced Root-stage frame (before the 'label' column is added).
balanced_dfs['Root']

Unnamed: 0,subtype,annot_type,id,path,is_tumor,type,top,left,size
0,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,89744,45869,1000
1,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,141571,44777,1000
2,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,89066,42554,1000
3,ccRCC,XML,HP19.5254,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,128143,81458,1000
4,ccRCC,XML,HP19.5254,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,151143,17458,1000
...,...,...,...,...,...,...,...,...,...
27787,pRCC,XML,HP17.11714,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,95749,28194,1000
27788,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,fiber,138744,64869,1000
27789,pRCC,XML,HP17.7980,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,48557,42661,1000
27790,pRCC,XML,HP17.11714,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,90039,64131,1000


In [14]:
import numpy as np

# Root stage: non-tumor crops -> 0, tumor crops -> 1.
# NOTE(review): this polarity is driven by 'is_tumor', not by the
# tree_pair_dict 'class_0' membership used for the other stages — confirm it is intended.
_root_frame = balanced_dfs['Root']
_root_frame['label'] = np.where(_root_frame['is_tumor'] == False, 0, 1)

# Remaining stages: types listed under 'class_0' -> 0, everything else -> 1.
for _stage_name in ('Node', 'Leaf1', 'Leaf2'):
    _stage_frame = balanced_dfs[_stage_name]
    _is_class_0 = _stage_frame['type'].isin(tree_pair_dict[_stage_name]['class_0'])
    _stage_frame['label'] = np.where(_is_class_0, 0, 1)

In [24]:
# Peek at the first ONCOCYTOMA rows of the Leaf1 stage to check their label.
_leaf1_frame = balanced_dfs['Leaf1']
_leaf1_frame[_leaf1_frame['type'] == 'ONCOCYTOMA'].head()

Unnamed: 0,subtype,annot_type,id,path,is_tumor,type,top,left,size,label
3474,ONCOCYTOMA,SLIDE,HP18005453,F:\My Files\Thesis\Dataset\RCC_WSIs\ONCOCYTOMA...,True,ONCOCYTOMA,18000,25000,1000,1
3475,ONCOCYTOMA,SLIDE,HP18005453,F:\My Files\Thesis\Dataset\RCC_WSIs\ONCOCYTOMA...,True,ONCOCYTOMA,2000,6000,1000,1
3476,ONCOCYTOMA,SLIDE,HP20.5602,F:\My Files\Thesis\Dataset\RCC_WSIs\ONCOCYTOMA...,True,ONCOCYTOMA,3000,13000,1000,1
3477,ONCOCYTOMA,SLIDE,HP18005453,F:\My Files\Thesis\Dataset\RCC_WSIs\ONCOCYTOMA...,True,ONCOCYTOMA,12000,15000,1000,1
3478,ONCOCYTOMA,SLIDE,HP20.5602,F:\My Files\Thesis\Dataset\RCC_WSIs\ONCOCYTOMA...,True,ONCOCYTOMA,4000,6000,1000,1


In [10]:
# Display the balanced Root-stage frame, now including the 'label' column.
balanced_dfs['Root']

Unnamed: 0,subtype,annot_type,id,path,is_tumor,type,top,left,size,label
0,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,89744,45869,1000,1
1,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,141571,44777,1000,1
2,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,89066,42554,1000,1
3,ccRCC,XML,HP19.5254,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,128143,81458,1000,1
4,ccRCC,XML,HP19.5254,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,True,ccRCC,151143,17458,1000,1
...,...,...,...,...,...,...,...,...,...,...
27787,pRCC,XML,HP17.11714,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,95749,28194,1000,0
27788,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,fiber,138744,64869,1000,0
27789,pRCC,XML,HP17.7980,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,48557,42661,1000,0
27790,pRCC,XML,HP17.11714,F:\My Files\Thesis\Dataset\RCC_WSIs\pRCC\HP17....,False,fiber,90039,64131,1000,0


In [21]:
# Scratch loop: walks the tree stages in dict order and prints each stage name.
# NOTE(review): `pair` and `stage_df` are assigned but never used in this cell.
for stage, pair in tree_pair_dict.items():
    stage_df = pd.DataFrame()
    
    print(stage)

Root
Node
Leaf1
Leaf2


In [None]:
# NOTE(review): `df` is not defined in any visible cell of this notebook, and
# this cell was never executed — confirm the intended input frame and column.

# Step 3.1: Find the smallest class size
class_counts = df['class'].value_counts()
smallest_class_size = class_counts.min()

# Step 3.2: Sample each class to match the smallest class size.
# Collect the per-class samples and concatenate once: growing a DataFrame with
# pd.concat inside a loop is quadratic, and concatenating onto an empty frame
# can disturb the column dtypes.
class_samples = [
    df[df['class'] == class_label].sample(n=smallest_class_size, random_state=42)
    for class_label in class_counts.index
]

# pd.concat raises on an empty list, so keep the empty-frame result for that case.
undersampled_df = pd.concat(class_samples, axis=0) if class_samples else pd.DataFrame()

# Reset index of the new DataFrame
undersampled_df = undersampled_df.reset_index(drop=True)

In [None]:
# Tissue types counted as tumor.
tumor_subtypes = [
    'ccRCC',
    'pRCC',
    'CHROMO',
    'ONCOCYTOMA',
]

# Tissue types counted as non-tumor.
nontumor_subtypes = [
    'normal',
    'fiber',
    'necrosis',
]

In [11]:
# Despite its name, this holds the crop COUNT of the least-represented type.
_type_counts = train_df['type'].value_counts()
minority_label = _type_counts.min()

In [12]:
# Show the smallest per-type crop count computed in the previous cell.
minority_label

3474

In [32]:
import pandas as pd

# Reload the cached training-crops table from disk.
DFS_LIST_DIR = "_dfs/train_df.csv"

train_df = pd.read_csv(filepath_or_buffer=DFS_LIST_DIR)

In [17]:
# Preview the first rows of the reloaded training-crops table.
train_df.head()

Unnamed: 0,subtype,annot_type,id,path,is_tumor,type,top,left,size
0,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,normal,48066,46554,1000
1,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,normal,48066,47554,1000
2,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,normal,48066,48554,1000
3,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,normal,48066,49554,1000
4,ccRCC,XML,HP19.4075,F:\My Files\Thesis\Dataset\RCC_WSIs\ccRCC\HP19...,False,normal,48066,50554,1000
