In [1]:
import os
import pandas as pd
from pathlib import Path
from os.path import join
from os import walk


# Root of the project checkout. It can be overridden with the
# WSI_LEARNING_ROOT environment variable so the notebook also runs on other
# machines; when the variable is unset the original location is used.
project_root = Path(os.environ.get("WSI_LEARNING_ROOT", "/home/mahnaz/wsi-learning"))
# Training split of the raw CAMELYON17 data, and its annotation sub-directory.
root = project_root / "data" / "camelyon17" / "raw" / "training"
annotations_dir = root / "lesion_annotations"


In [2]:
# Every slide image found anywhere below `root`, expressed relative to `root`
# and sorted. A suffix test is used on purpose: a substring test would also
# pick up names that merely contain ".tif" (e.g. "x.tif.bak" or "x.tiff").
slide_paths = sorted(
    os.path.relpath(os.path.join(dirpath, filename), root)
    for dirpath, _dirnames, filenames in os.walk(root)
    for filename in filenames
    if filename.endswith(".tif")
)
slide_paths

['center_0/patient_015_node_0.tif',
 'center_0/patient_015_node_1.tif',
 'center_1/patient_019_node_2.tif']

In [3]:
# Every annotation file found anywhere below `root`, expressed relative to
# `root` and sorted. A suffix test is used on purpose: a substring test would
# also pick up names that merely contain ".xml" (e.g. "x.xml.bak").
annotation_paths = sorted(
    os.path.relpath(os.path.join(dirpath, filename), root)
    for dirpath, _dirnames, filenames in os.walk(root)
    for filename in filenames
    if filename.endswith(".xml")
)
annotation_paths

['lesion_annotations/patient_015_node_0.xml']

In [4]:
# Slide identifier = final path component, cut at its first dot
# (so "center_0/patient_015_node_0.tif" becomes "patient_015_node_0").
slide_names = [
    os.path.split(slide_path)[1].split('.')[0]
    for slide_path in slide_paths
]

slide_names

['patient_015_node_0', 'patient_015_node_1', 'patient_019_node_2']

In [5]:
# Index the annotation files by their identifier (file name cut at the first
# dot). Matching on the exact identifier avoids substring hits such as
# "..._node_1" matching "..._node_10.xml".
annotation_path_by_name = {
    os.path.basename(anno_path).split('.')[0]: anno_path
    for anno_path in annotation_paths
}

# Exactly one entry per slide, in `slide_names` order, so the list lines up
# with `slide_paths`. "" marks a slide that has no annotation file.
slides_annotations_paths = [
    annotation_path_by_name.get(name, "") for name in slide_names
]

slides_annotations_paths

['lesion_annotations/patient_015_node_0.xml', '', '']

In [6]:
# Annotation identifier = final path component, cut at its first dot.
annotation_names = [
    os.path.split(anno_path)[1].split('.')[0]
    for anno_path in annotation_paths
]
annotation_names

['patient_015_node_0']

In [7]:
# Load the stage table and keep only the rows whose `stage` value is one of
# the slide-level classes.
labels = pd.read_csv(root / 'stage_labels.csv')
slide_level_stages = ["itc", "negative", "micro", "macro"]
slides_labels_df = labels[labels["stage"].isin(slide_level_stages)]
# Plain nested list: one list of column values per kept row.
slide_level_labels = slides_labels_df.to_numpy().tolist()
slide_level_labels

[['patient_000_node_0.tif', 'negative'],
 ['patient_000_node_1.tif', 'negative'],
 ['patient_000_node_2.tif', 'negative'],
 ['patient_000_node_3.tif', 'negative'],
 ['patient_000_node_4.tif', 'negative'],
 ['patient_001_node_0.tif', 'negative'],
 ['patient_001_node_1.tif', 'negative'],
 ['patient_001_node_2.tif', 'negative'],
 ['patient_001_node_3.tif', 'negative'],
 ['patient_001_node_4.tif', 'negative'],
 ['patient_002_node_0.tif', 'negative'],
 ['patient_002_node_1.tif', 'negative'],
 ['patient_002_node_2.tif', 'negative'],
 ['patient_002_node_3.tif', 'negative'],
 ['patient_002_node_4.tif', 'negative'],
 ['patient_003_node_0.tif', 'negative'],
 ['patient_003_node_1.tif', 'negative'],
 ['patient_003_node_2.tif', 'negative'],
 ['patient_003_node_3.tif', 'negative'],
 ['patient_003_node_4.tif', 'negative'],
 ['patient_004_node_0.tif', 'negative'],
 ['patient_004_node_1.tif', 'negative'],
 ['patient_004_node_2.tif', 'negative'],
 ['patient_004_node_3.tif', 'negative'],
 ['patient_004_n

In [8]:
# Slide identifier (CSV file name cut at the first dot) -> slide-level stage.
label_by_slide = {row[0].split('.')[0]: row[1] for row in slide_level_labels}

# Fail early and by name if a slide has no label, instead of silently
# producing a shorter list that no longer lines up with the slides.
slides_without_label = [name for name in slide_names if name not in label_by_slide]
if slides_without_label:
    raise KeyError(f"no slide-level label found for: {slides_without_label}")

# One label per slide, in `slide_names` order (not CSV order), so the list
# stays aligned with `slide_paths`.
slide_labels = [label_by_slide[name] for name in slide_names]

slide_labels

['negative', 'macro', 'macro']

In [9]:
# Keep only the rows whose `stage` value is one of the patient-level pN stages.
patient_level_stages = ["pN0", "pN1", "pN2", "pN0(i+)", "pN1mi"]
patient_labels_df = labels[labels["stage"].isin(patient_level_stages)]
# Plain nested list: one list of column values per kept row.
patient_level_labels = patient_labels_df.to_numpy().tolist()
patient_level_labels



[['patient_000.zip', 'pN0'],
 ['patient_001.zip', 'pN0'],
 ['patient_002.zip', 'pN0'],
 ['patient_003.zip', 'pN0'],
 ['patient_004.zip', 'pN0(i+)'],
 ['patient_005.zip', 'pN0(i+)'],
 ['patient_006.zip', 'pN0(i+)'],
 ['patient_007.zip', 'pN1mi'],
 ['patient_008.zip', 'pN1mi'],
 ['patient_009.zip', 'pN1mi'],
 ['patient_010.zip', 'pN1mi'],
 ['patient_011.zip', 'pN1mi'],
 ['patient_012.zip', 'pN1'],
 ['patient_013.zip', 'pN1'],
 ['patient_014.zip', 'pN1'],
 ['patient_015.zip', 'pN1'],
 ['patient_016.zip', 'pN1'],
 ['patient_017.zip', 'pN1'],
 ['patient_018.zip', 'pN1'],
 ['patient_019.zip', 'pN2'],
 ['patient_020.zip', 'pN1'],
 ['patient_021.zip', 'pN2'],
 ['patient_022.zip', 'pN1'],
 ['patient_023.zip', 'pN0'],
 ['patient_024.zip', 'pN0(i+)'],
 ['patient_025.zip', 'pN0'],
 ['patient_026.zip', 'pN2'],
 ['patient_027.zip', 'pN1mi'],
 ['patient_028.zip', 'pN2'],
 ['patient_029.zip', 'pN1'],
 ['patient_030.zip', 'pN1mi'],
 ['patient_031.zip', 'pN0'],
 ['patient_032.zip', 'pN1mi'],
 ['patient_

In [10]:
# Patient identifier = `patient` column value cut at its first dot.
split_patient_files = patient_labels_df.patient.str.split('.')
patient_names = [parts[0] for parts in split_patient_files]

In [11]:
def intersection(lst1, lst2):
    """Return the distinct items present in both `lst1` and `lst2` as a list.

    The result is built from a set, so duplicates are removed and the order
    of the returned items is not defined.
    """
    common = set(lst1).intersection(set(lst2))
    return list(common)
  
# Slides that also have an annotation file (order is whatever the set gives).
slides_names_with_anno = intersection(slide_names, annotation_names)
print(slides_names_with_anno)

# The remaining slides, kept in `slide_names` order.
slides_names_with_no_anno = list(
    filter(lambda slide: slide not in slides_names_with_anno, slide_names)
)
print(slides_names_with_no_anno)


# Patient identifier (CSV file name cut at the first dot) -> patient-level stage.
stage_by_patient = {row[0].split('.')[0]: row[1] for row in patient_level_labels}
annotated_slides = set(slides_names_with_anno)

# Build exactly one tag per slide, in `slide_names` order, so the list stays
# aligned with `slide_paths`. Format: "<patient>;<stage>" with ";annotated"
# appended when the slide has an annotation file.
tags = []
for sname in slide_names:
    # Slide names look like "patient_015_node_0": everything before the
    # last "_node_" is the patient identifier.
    patient = sname.rsplit('_node_', 1)[0]
    if patient not in stage_by_patient:
        # Fail by name rather than dropping the tag and shifting later rows.
        raise KeyError(f"no patient-level label found for slide: {sname}")
    tag = patient + ';' + stage_by_patient[patient]
    if sname in annotated_slides:
        tag = tag + ';' + 'annotated'
    tags.append(tag)
tags

['patient_015_node_0']
['patient_015_node_1', 'patient_019_node_2']


['patient_015;pN1;annotated', 'patient_015;pN1', 'patient_019;pN2']

In [None]:
# Assemble the per-slide table in one step; the four lists are expected to
# be parallel (same length, same slide order).
df = pd.DataFrame(
    {
        "slide": slide_paths,
        "annotation": slides_annotations_paths,
        "label": slide_labels,
        "tags": tags,
    }
)
df


In [None]:
# From here on `root` and `slide_paths` are rebound to the testing split.
root = project_root.joinpath("data", "camelyon17", "raw", "testing")
test_slides_dir = root.joinpath("patients")
# Slide images directly inside the patients directory, relative to `root`, sorted.
slide_paths = sorted(
    tif_path.relative_to(root) for tif_path in test_slides_dir.glob("*.tif")
)
slide_paths

In [None]:
# Stage table for the testing split, read from the example submission file.
submission_csv = root / 'evaluation' / 'submission_example.csv'
labels = pd.read_csv(submission_csv)
labels

In [None]:
# Keep only the rows whose `stage` value is one of the slide-level classes.
slide_level_stages = ["itc", "negative", "micro", "macro"]
slides_labels_df = labels[labels["stage"].isin(slide_level_stages)]
# Plain nested list: one list of column values per kept row.
slide_level_labels = slides_labels_df.to_numpy().tolist()
slide_level_labels

In [None]:
# Keep only the rows whose `stage` value is one of the patient-level pN stages.
patient_level_stages = ["pN0", "pN1", "pN2", "pN0(i+)", "pN1mi"]
patient_labels_df = labels[labels["stage"].isin(patient_level_stages)]
# Plain nested list: one list of column values per kept row.
patient_level_labels = patient_labels_df.to_numpy().tolist()
patient_level_labels

In [None]:
# Final path component of every slide path; the file extension is kept here.
slide_names = [os.path.split(slide_path)[1] for slide_path in slide_paths]
slide_names

In [None]:

# Slide file name (as it appears in the CSV) -> slide-level stage.
label_by_slide = {row[0]: row[1] for row in slide_level_labels}

# Fail early and by name if a slide has no label, instead of silently
# producing a shorter list that no longer lines up with the slides.
slides_without_label = [name for name in slide_names if name not in label_by_slide]
if slides_without_label:
    raise KeyError(f"no slide-level label found for: {slides_without_label}")

# One label per slide, in `slide_names` order (not CSV order), so the list
# stays aligned with `slide_paths`.
slide_labels = [label_by_slide[name] for name in slide_names]
slide_labels

In [None]:
# Patient identifier (CSV file name cut at the first dot) -> patient-level stage.
stage_by_patient = {row[0].split('.')[0]: row[1] for row in patient_level_labels}

# Build exactly one "<patient>;<stage>" tag per slide, in `slide_paths`
# order, so the list stays aligned with the slides.
tags = []
for spath in slide_paths:
    # Slide files are named like "patient_015_node_0.tif": everything before
    # the last "_node_" in the file name is the patient identifier.
    patient = os.path.basename(spath).rsplit('_node_', 1)[0]
    if patient not in stage_by_patient:
        # Fail by name rather than dropping the tag and shifting later rows.
        raise KeyError(f"no patient-level label found for slide: {spath}")
    tags.append(patient + ';' + stage_by_patient[patient])

print(tags)


In [None]:
# Assemble the per-slide table for the testing split in one step. The
# annotation column is the same empty string on every row.
df = pd.DataFrame(
    {
        "slide": slide_paths,
        "annotation": "",
        "label": slide_labels,
        "tags": tags,
    }
)
df