In [2]:
import os
import numpy as np 
import pandas as pd
# import pandas_read_xml as pdx

In [2]:
def process_xml(xml_path):
    """
    Turn one xml annotation file into a list of flat annotation rows.

    input: path to a single xml annotation file
    output: list of rows, one per <label> tag of the file. Each row is
    [date, patientid, patientPid, diagnosis] followed by the fields that
    process_label returns for that tag.

    NOTE: role of diagnosis column is unclear
    NOTE(review): relies on a module-level `pdx` (presumably `pandas_read_xml`,
    whose import is commented out in the imports cell - confirm it is available)
    and on `process_label`.
    """

    # parse the file; the second argument presumably selects the root element
    study = pdx.read_xml(xml_path, ["study_query"])["study"]

    # study-level fields, shared by every row produced for this file
    study_fields = [
        study["@date"],
        study["patient"]["@id"],
        study["patient"]["@pid"],
        [entry["id"] for entry in study["diagnosis"]],
    ]

    # per-label fields, extracted for every <label> tag before rows are built
    label_fields = [process_label(label) for label in study["label"]]

    # one row per label: shared study fields followed by the label fields
    return [study_fields + fields for fields in label_fields]


In [3]:
def process_label(label):
    """
    Process one <label> tag of an xml annotation file.

    input: <label> tag as a dict-like mapping (e.g. an OrderedDict)
    output: list with the following fields, in this order:
    sessionid, type_anot, annotation, scope, unit, timestamp, imageuid, seriesuid, studyuid, tag, points

    - tag: list with the "@name" attribute of every <value> found under <tags>
    - points: list of [x, y, z] floats, or None when <point> is empty or does
      not hold exactly 12 values

    NOTE: each xml file has several <label> tag
    """

    sessionid = label["@sessionId"]
    type_anot = label["@type"]
    annotation = label["@annotation"]
    scope = label["@scope"]
    unit = label["@pointUnit"]
    timestamp = label["@createTimestamp"]
    imageuid = label["@imageUid"]
    seriesuid = label["@seriesUid"]
    studyuid = label["@studyUid"]

    # NOTE: some files have noisy <tag> entries. Restricting them to the
    # brain-tumor labels was prototyped here but is disabled, so every tag
    # name is kept as-is.
    tag_values = label["tags"]["value"]
    # A lone <value> arrives as a single mapping rather than a sequence of
    # mappings. Test the type instead of the length: a mapping with several
    # attributes also has len() > 1 and must not be iterated key by key.
    if isinstance(tag_values, dict):
        tag_values = [tag_values]
    tag = [sub_tag["@name"] for sub_tag in tag_values]

    # NOTE: files 95, 111, 120 have erroneous values in <point>; any annotation
    # that does not carry exactly 12 points is treated as having no points.
    points = None
    point = label["point"]
    if point is not None:
        point_values = point["value"]
        if len(point_values) == 12:
            points = [[float(p["@x"]), float(p["@y"]), float(p["@z"])] for p in point_values]

    return [sessionid, type_anot, annotation, scope, unit, timestamp, imageuid, seriesuid, studyuid, tag, points]

In [4]:
def process_dataset_xml(root_folder_path, to_csv = False, to_pickle = False, file_name = ""):
    """
    Combine the annotations of every xml file under root_folder_path into one dataframe.

    input:
    - root_folder_path: directory whose sub-folders contain the xml files
    - to_csv, to_pickle: set to True to also write the dataframe to
      <file_name>.csv / <file_name>.pkl
    - file_name: name of csv/pickle file in string without .csv/.pkl
      (nothing is written when it is empty)
    output: dataframe with one row per kept annotation. "local" annotations
    without points are dropped and the index is renumbered from 0.
    """
    print("Tags that has multiple value:")

    # Collect every file that sits one level below the root folder. Listings are
    # sorted so the row order is reproducible, and stray files placed directly
    # in the root folder are skipped instead of crashing os.listdir.
    xml_paths = []
    for study_name in sorted(os.listdir(root_folder_path)):
        subdir = os.path.join(root_folder_path, study_name)
        if not os.path.isdir(subdir):
            continue
        for label_name in sorted(os.listdir(subdir)):
            xml_paths.append(os.path.join(subdir, label_name))

    # gather the rows of every xml file
    df_rows = []
    for xml_path in xml_paths:
        df_rows.extend(process_xml(xml_path))

    # convert to dataframe
    columns = [
        "date", "patientid", "patientPid", "diagnosis",
        "sessionid", "type_anot", "annotation", "scope", "unit", "timestamp",
        "imageuid", "seriesuid", "studyuid", "tag", "points",
    ]
    df_out = pd.DataFrame(df_rows, columns = columns)

    # Drop local annotations that have no points. reset_index returns a new
    # frame, so its result must be assigned for the renumbering to take effect.
    missing_points = df_out["points"].isna() & (df_out["type_anot"] == "local")
    df_out = df_out[~missing_points].reset_index(drop = True)

    # summary output
    # NOTE: the lesion count is taken before the filtering above, so it can be
    # larger than the sum of the global and local counts.
    print("\n")
    print("Number of xml files(study):", len(xml_paths))
    print("Number of annotated lession:", len(df_rows))
    print("Number of global annotation:", len(df_out[df_out.type_anot == 'global']))
    print("Number of local annotation:", len(df_out[df_out.type_anot == 'local']))

    # write to file (only when a file name was given)
    if file_name != "":
        if to_csv:
            df_out.to_csv(file_name + ".csv")
        if to_pickle:
            df_out.to_pickle(file_name + ".pkl")

    return df_out

In [15]:
# Build the summary table for the whole dataset and also save it as
# summary_anot.csv / summary_anot.pkl (paths relative to the working directory).
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
root_folder_path = "/home/single1/BACKUP/tintrung/brain-mri-tumor-xml"
out = process_dataset_xml(
    root_folder_path,
    to_csv=True,
    to_pickle=True,
    file_name="summary_anot",
)

Tags that has multiple value:


Number of xml files(study): 148
Number of annotated lession: 515
Number of global annotation: 155
Number of local annotation: 321


In [11]:
# Brain-tumor label names. NOTE(review): no active code visible in this
# notebook reads this set; it only appears in a commented-out filter.
check_label = {
    "Other tumor",
    "Glioma",
    "Meningioma",
    "Pituitary adenoma",
    "Cerebral tumor",
    "Neurinoma",
    "Cavernoma",
    "Lymphoma",
    "Chordoma",
}

# raise the pandas display limits so wide/long frames are shown untruncated
for option_name, option_value in (
    ("display.max_rows", 500),
    ("display.max_columns", 500),
    ("display.width", 1000),
    ("display.max_colwidth", None),
):
    pd.set_option(option_name, option_value)

# series uid of the annotation at position 14;
# inspect all annotations of that series with: out[out.seriesuid == uid]
uid = out["seriesuid"].iloc[14]

In [8]:
# Drop "local" annotations that have no points (same rule as in
# process_dataset_xml) and renumber the index.
missing_points = out["points"].isna() & (out["type_anot"] == "local")
# reset_index returns a new frame; the result must be assigned to take effect
out = out[~missing_points].reset_index(drop = True)
print(len(out))

# count how often each tag name occurs among the local annotations
local_tag_names = [
    name
    for tags in out.loc[out["type_anot"] == "local", "tag"]
    for name in tags
]
localtag = pd.Series(local_tag_names).value_counts()
print(localtag)

# distinct tag names, in the order produced by value_counts
localtagunique = list(localtag.index)

476
Mass/Nodule                   155
Cerebral edema                 74
Sinus lesion                   24
Midline shift                  21
Cyst component                 14
Ischemia                       12
Ventricular dilation            8
Hemorrhagic component           7
Cavernoma                       6
Demyelination                   5
Subdural effusion               4
CSF-like lesion                 4
Bone lesion                     3
Mass effect                     2
Other lesion                    2
Intracranial herniation         1
Arteriovenous malformation      1
dtype: int64


In [9]:
# first entry of the distinct local tag names (taken from the keys of the value_counts result)
localtagunique[0]

'Mass/Nodule'

In [None]:
# display the annotation dataframe
# NOTE(review): with display.max_rows raised to 500 this renders every row; out.head() is enough for a quick look
out

In [166]:
# print the position of every distinct local tag name
for position, _tag_name in enumerate(localtagunique):
    print(position)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16


In [3]:
# reload the summary pickle written by process_dataset_xml(..., to_pickle=True, file_name="summary_anot")
# NOTE(review): the path is relative to the working directory; the recorded traceback names
# 'summary_annot.pkl', so it appears to come from an earlier version of this cell
df = pd.read_pickle("summary_anot.pkl")

FileNotFoundError: [Errno 2] No such file or directory: 'summary_annot.pkl'