In [43]:
import xml.etree.ElementTree as ET
from time import time
import numpy as np
import cv2
import os
import pandas as pd
from openslide import OpenSlide
from PIL import Image
Image.MAX_IMAGE_PIXELS=None
from skimage.morphology import closing, square, remove_small_objects, remove_small_holes
from skimage.segmentation import clear_border

In [44]:
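# xml_path = absolute file path of the annotation XML
# imsrc = folder that holds the matching .ndpi slide image
# the 2D vertex coordinates of every annotated region are read from the XML inside the function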

def xml2mask(xml_path,imsrc):
    """Rasterise the ROI annotation of one slide and relate it to its tissue area.

    The slide image is expected at ``<imsrc>/<xml stem>.ndpi``. Two PNGs are
    written into sub-folders next to the XML file:

    * ``labeledmask_20rsf/<stem>.png`` - label mask, region k is filled with k
      (1-based, in XML order) at the pyramid level chosen for a 20x downsample.
    * ``TA_20rsf/<stem>.png`` - binary tissue mask from a colour threshold on
      the same pyramid level.

    Parameters
    ----------
    xml_path : str
        Path of the annotation XML (``Annotation`` / ``Region`` / ``Vertex``
        elements with ``X`` and ``Y`` attributes).
    imsrc : str
        Folder containing the ``.ndpi`` slide that belongs to the XML.

    Returns
    -------
    list or None
        ``[fn, ROIA, TA, ratio]``: XML file name, annotated pixel count,
        tissue pixel count and ``round(ROIA / TA * 100)`` (0 when no tissue
        was detected). ``None`` when the label mask already exists, in which
        case nothing is recomputed.

    Raises
    ------
    ValueError
        If the XML contains no region with at least one vertex.
    """
    fol, fn = os.path.split(xml_path)
    # Swap only the extension; str.replace('xml', 'ndpi') would also rewrite
    # an 'xml' that happens to occur inside the file stem.
    stem = os.path.splitext(fn)[0]
    imfn = stem + '.ndpi'

    mskdst = os.path.join(fol, 'labeledmask_20rsf')
    dstfn = os.path.join(mskdst, '{}.png'.format(stem))
    TAdst = os.path.join(fol, 'TA_20rsf')
    TAdstfn = os.path.join(TAdst, '{}.png'.format(stem))

    os.makedirs(mskdst, exist_ok=True)
    os.makedirs(TAdst, exist_ok=True)

    # Already processed: skip the expensive slide read. Callers get None.
    if os.path.exists(dstfn):
        return

    print(os.path.basename(xml_path))

    # Collect one (N, 2) integer vertex array per annotated region. The
    # coordinates are in full-resolution pixels and truncated towards zero.
    root = ET.parse(xml_path).getroot()
    polygons = []
    for Annotation in root.iter('Annotation'):
        for Region in Annotation.iter('Region'):
            vertices = [[float(Vertex.get('X')), float(Vertex.get('Y'))]
                        for Vertex in Region.iter('Vertex')]
            if vertices:
                polygons.append(np.array(vertices).astype('int'))
    if not polygons:
        raise ValueError('no annotated regions found in {}'.format(xml_path))

    # Read the whole slide at the pyramid level closest to a 20x downsample and
    # release the file handle even if reading fails.
    slide = OpenSlide(os.path.join(imsrc, imfn))
    try:
        rgb_dim = slide.dimensions
        target_level = slide.get_best_level_for_downsample(20)
        target_dim = slide.level_dimensions[target_level]
        # Scale factor between full resolution and the target level (x axis).
        rsf = rgb_dim[0] / target_dim[0]
        rgba = np.array(slide.read_region(location=(0, 0), level=target_level,
                                          size=target_dim))
    finally:
        slide.close()

    # Tissue detection: threshold on channels 0 and 1 (red and green of the
    # RGBA image returned by OpenSlide), then morphological clean-up.
    bw = (150 < rgba[:, :, 0]) & (rgba[:, :, 1] < 210)
    bw = closing(bw, square(3))
    minTA = 10000
    bw = remove_small_objects(bw, min_size=minTA, connectivity=2)
    minTAhole = 4000
    bw = remove_small_holes(bw, area_threshold=minTAhole)
    TA = np.sum(bw)

    # Label mask: target_dim is (width, height), numpy wants (rows, cols).
    mask = np.zeros(target_dim[::-1], dtype=np.uint8)
    for label, polygon in enumerate(polygons, start=1):
        # cv2.fillPoly only accepts int32 points; plain `int` is int64 on many
        # platforms.
        scaled = (polygon / rsf).astype(np.int32)
        mask = cv2.fillPoly(mask, pts=[scaled], color=label)

    # NOTE(review): the int8 cast is kept so new files match the ones already
    # on disk; it cannot represent labels above 127 - confirm that is enough.
    # save roi mask
    Image.fromarray(mask.astype('int8')).save(dstfn)

    # save TA mask
    Image.fromarray(bw.astype('int8')).save(TAdstfn)

    # Area is the number of labelled pixels; summing the mask would weight
    # region k by its label value k.
    ROIA = np.count_nonzero(mask)
    # Guard against slides on which no tissue passed the threshold.
    ratio = round(ROIA / TA * 100) if TA else 0
    return [fn, ROIA, TA, ratio]


In [45]:
# Folder with the ROI annotation XMLs and folder with the slide images.
src = r'\\fatherserverdw\kyuex\clue images\annotations\roi'
imsrc = r"\\fatherserverdw\kyuex\clue images"

# Slide look-up table; keep the file names of rows whose 'student score' is
# above 1 and whose 'Block or Slide?' entry is "Both".
LUT = pd.read_excel(r"\\fatherserverdw\kyuex\imlist_all.xlsx")
scored_above_one = LUT['student score'] > 1
block_and_slide = LUT['Block or Slide?'] == "Both"
xmlist = LUT.loc[scored_above_one & block_and_slide, 'filename']
xmlist

74      2022-06-07 16.05.45.ndpi
87      2022-06-07 16.10.49.ndpi
96      2022-06-07 17.10.29.ndpi
102     2022-06-07 17.15.33.ndpi
110     2022-06-07 17.43.53.ndpi
                  ...           
1145    2022-07-07 19.56.55.ndpi
1150    2022-07-07 20.05.53.ndpi
1155    2022-07-07 20.21.33.ndpi
1156    2022-07-07 20.54.02.ndpi
1172    2022-07-07 22.11.00.ndpi
Name: filename, Length: 165, dtype: object

In [46]:
start=time()
# Turn every slide file name into the path of its annotation XML. Only a
# trailing '.ndpi' is swapped (str.replace would also rewrite 'ndpi' inside the
# stem); other names are left as they are and dropped by the '.xml' filter.
xmlist = [os.path.join(src, (name[:-len('.ndpi')] + '.xml') if name.endswith('.ndpi') else name)
          for name in xmlist]
# One [fn, ROIA, TA, ratio] row per XML; xml2mask returns None instead when the
# label mask of a slide already exists.
# NOTE(review): `list` shadows the built-in; the name is kept because the
# following cells read the results under it.
list = [xml2mask(xmlpth, imsrc) for xmlpth in xmlist if xmlpth.endswith('.xml')]
print('readxml took {:.2f} sec'.format(time() - start))
list

2022-06-07 16.05.45.xml
2022-06-07 16.10.49.xml
2022-06-07 17.10.29.xml
2022-06-07 17.15.33.xml
2022-06-07 17.43.53.xml
2022-06-07 17.48.30.xml
2022-06-08 16.52.37.xml
2022-06-08 17.04.20.xml
2022-06-08 17.14.34.xml
2022-06-08 17.25.47.xml
2022-06-08 17.33.23.xml
2022-06-08 17.38.55.xml
2022-06-08 17.54.57.xml
2022-06-08 18.08.15.xml
2022-06-08 18.13.05.xml
2022-06-08 18.23.12.xml
2022-06-08 18.27.20.xml
2022-06-09 11.32.55.xml
2022-06-09 11.36.47.xml
2022-06-09 11.46.15.xml
2022-06-09 11.51.43.xml
2022-06-09 12.15.00.xml
2022-06-09 12.20.35.xml
2022-06-09 12.48.49.xml
2022-06-09 15.59.56.xml
2022-06-09 16.05.39.xml
2022-06-09 16.46.04.xml
2022-06-09 16.53.12.xml
2022-06-09 16.56.40.xml
2022-06-09 17.17.37.xml
2022-06-09 17.29.25.xml
2022-06-09 17.39.52.xml
2022-06-09 17.54.10.xml
2022-06-09 18.02.27.xml
2022-06-09 18.09.31.xml
2022-06-09 18.31.45.xml
2022-06-09 18.34.58.xml
2022-06-09 18.52.30.xml
2022-06-09 19.01.26.xml
2022-06-09 19.13.25.xml
2022-06-09 19.37.42.xml
2022-06-09 19.42

[['2022-06-07 16.05.45.xml', 3552318, 4081561, 87],
 ['2022-06-07 16.10.49.xml', 424640, 1883590, 23],
 ['2022-06-07 17.10.29.xml', 238360, 3230201, 7],
 ['2022-06-07 17.15.33.xml', 2082174, 4319359, 48],
 ['2022-06-07 17.43.53.xml', 208008, 2765810, 8],
 ['2022-06-07 17.48.30.xml', 435544, 2701326, 16],
 ['2022-06-08 16.52.37.xml', 160063, 4435618, 4],
 ['2022-06-08 17.04.20.xml', 626975, 5156250, 12],
 ['2022-06-08 17.14.34.xml', 457731, 3515819, 13],
 ['2022-06-08 17.25.47.xml', 1577180, 3497291, 45],
 ['2022-06-08 17.33.23.xml', 570024, 1204007, 47],
 ['2022-06-08 17.38.55.xml', 518096, 1820837, 28],
 ['2022-06-08 17.54.57.xml', 610164, 2291952, 27],
 ['2022-06-08 18.08.15.xml', 2517843, 3910579, 64],
 ['2022-06-08 18.13.05.xml', 2199510, 4013789, 55],
 ['2022-06-08 18.23.12.xml', 1085163, 2446031, 44],
 ['2022-06-08 18.27.20.xml', 1187989, 3320791, 36],
 ['2022-06-09 11.32.55.xml', 149663, 2137182, 7],
 ['2022-06-09 11.36.47.xml', 2306883, 4055865, 57],
 ['2022-06-09 11.46.15.xml'

In [47]:
# Sanity check: number of requested XML files next to the shape of the result table.
len(xmlist), np.shape(list)

(165, (165, 4))

In [48]:
# Show the results as one array; mixing names and numbers makes every entry a string.
np.asarray(list)

array([['2022-06-07 16.05.45.xml', '3552318', '4081561', '87'],
       ['2022-06-07 16.10.49.xml', '424640', '1883590', '23'],
       ['2022-06-07 17.10.29.xml', '238360', '3230201', '7'],
       ['2022-06-07 17.15.33.xml', '2082174', '4319359', '48'],
       ['2022-06-07 17.43.53.xml', '208008', '2765810', '8'],
       ['2022-06-07 17.48.30.xml', '435544', '2701326', '16'],
       ['2022-06-08 16.52.37.xml', '160063', '4435618', '4'],
       ['2022-06-08 17.04.20.xml', '626975', '5156250', '12'],
       ['2022-06-08 17.14.34.xml', '457731', '3515819', '13'],
       ['2022-06-08 17.25.47.xml', '1577180', '3497291', '45'],
       ['2022-06-08 17.33.23.xml', '570024', '1204007', '47'],
       ['2022-06-08 17.38.55.xml', '518096', '1820837', '28'],
       ['2022-06-08 17.54.57.xml', '610164', '2291952', '27'],
       ['2022-06-08 18.08.15.xml', '2517843', '3910579', '64'],
       ['2022-06-08 18.13.05.xml', '2199510', '4013789', '55'],
       ['2022-06-08 18.23.12.xml', '1085163', '244603

In [49]:
# Write the per-slide ROI area, tissue area and their ratio to a CSV file.
roi_table = pd.DataFrame(data=np.array(list), columns=['fn','ROIA','TA','ratio'])
roi_table.to_csv(r"\\fatherserverdw\kyuex\ROITA_ratio.csv")

In [50]:
# from matplotlib import pyplot as plt
# impth = r"\\fatherserverdw\kyuex\clue images\2022-06-07 16.10.49.ndpi"
# slide = OpenSlide(impth)
# rgb_dim = slide.dimensions
# target_level = slide.get_best_level_for_downsample(20)
# target_dim = slide.level_dimensions[target_level]
# rsf = [x/y for x,y in zip(rgb_dim,target_dim)][0]
# TA = slide.read_region(location=(0,0),level=target_level,size=slide.level_dimensions[target_level])
# TA = np.array(TA)
# bw = (150 < np.array(TA)[:, :, 0]) & (np.array(TA)[:, :, 1] < 210)
# bw = closing(bw, square(3))
# minTA = 10000
# bw = remove_small_objects(bw, min_size=minTA, connectivity=2)
# minTAhole = 4000
# bw = remove_small_holes(bw, area_threshold=minTAhole)
# # bw = clear_border(bw)
# plt.imshow(bw)