In [5]:
import lxml.etree as etree
import pandas as pd
import os

def analyze_xml(xml_root):
    """Collect per-image nodule bounding boxes from an annotation XML file.

    The file is expected to contain ``readingSession`` elements (in the
    ``http://www.nih.gov`` namespace) directly under the root, each holding
    ``unblindedReadNodule`` -> ``roi`` -> ``edgeMap`` elements.

    Args:
        xml_root: Path of the XML file; passed straight to ``etree.parse``.

    Returns:
        dict: Maps the text of each ROI's ``imageSOP_UID`` to a list of
        ``{"xmax", "ymax", "xmin", "ymin"}`` dicts, one per ROI that has at
        least one edge point. Empty when the file has no reading sessions
        or no ROI with edge points.

    Raises:
        IndexError: If an ROI lacks ``imageSOP_UID`` or an edge point lacks
            ``xCoord``/``yCoord``.
        ValueError: If a coordinate is not an integer literal.
    """
    xmlns = '{http://www.nih.gov}'
    tree = etree.parse(xml_root)

    dcm_nodules = {}
    for doctor in tree.findall(xmlns + 'readingSession'):
        # A session without nodules contributes nothing, but it must not
        # discard the boxes already gathered from the other sessions.
        for nodule in doctor.findall(xmlns + 'unblindedReadNodule'):
            for roi in nodule.findall(xmlns + 'roi'):
                sop = roi.findall(xmlns + 'imageSOP_UID')[0].text
                edge_x = []
                edge_y = []
                for point in roi.findall(xmlns + 'edgeMap'):
                    edge_x.append(int(point.findall(xmlns + 'xCoord')[0].text))
                    edge_y.append(int(point.findall(xmlns + 'yCoord')[0].text))

                # An ROI without edge points has no extent; max()/min() on an
                # empty list would raise, so skip it.
                if not edge_x:
                    continue

                edge = {
                    "xmax": max(edge_x),
                    "ymax": max(edge_y),
                    "xmin": min(edge_x),
                    "ymin": min(edge_y),
                }
                dcm_nodules.setdefault(sop, []).append(edge)
    return dcm_nodules

In [8]:
# Dump the mapping returned by analyze_xml (imageSOP_UID -> list of bounding boxes).
print(dcm_nodules)

{'1.3.6.1.4.1.14519.5.2.1.6279.6001.110383487652933113465768208719': [{'xmax': 328, 'ymax': 379, 'xmin': 299, 'ymin': 355}, {'xmax': 326, 'ymax': 374, 'xmin': 307, 'ymin': 359}, {'xmax': 328, 'ymax': 377, 'xmin': 304, 'ymin': 357}], '1.3.6.1.4.1.14519.5.2.1.6279.6001.499837844441581448374672853475': [{'xmax': 331, 'ymax': 386, 'xmin': 298, 'ymin': 346}, {'xmax': 330, 'ymax': 386, 'xmin': 299, 'ymin': 354}, {'xmax': 332, 'ymax': 386, 'xmin': 302, 'ymin': 351}, {'xmax': 335, 'ymax': 391, 'xmin': 301, 'ymin': 349}], '1.3.6.1.4.1.14519.5.2.1.6279.6001.299410838455281419536742634793': [{'xmax': 334, 'ymax': 387, 'xmin': 297, 'ymin': 346}, {'xmax': 340, 'ymax': 384, 'xmin': 298, 'ymin': 352}, {'xmax': 335, 'ymax': 386, 'xmin': 299, 'ymin': 348}, {'xmax': 334, 'ymax': 384, 'xmin': 299, 'ymin': 346}], '1.3.6.1.4.1.14519.5.2.1.6279.6001.824843590991776411530080688091': [{'xmax': 335, 'ymax': 385, 'xmin': 298, 'ymin': 340}, {'xmax': 334, 'ymax': 386, 'xmin': 298, 'ymin': 343}, {'xmax': 336, 'yma

In [6]:
# Build the sample annotation path with os.path.join so it resolves on POSIX
# as well as Windows (the previous literal mixed "/" with "\\" separators).
xml_root = os.path.join(
    "..",
    "Data",
    "LIDC-IDRI-0001",
    "1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288633453246975630178",
    "000000",
    "069.xml",
)
# Per-image bounding boxes extracted from that file.
dcm_nodules = analyze_xml(xml_root)

readingSession: 4
Nodules: 4
rois: 8
rois: 1
rois: 1
rois: 1
Nodules: 1
rois: 7
Nodules: 4
rois: 1
rois: 1
rois: 1
rois: 8
Nodules: 4
rois: 9
rois: 1
rois: 1
rois: 1


In [7]:
import json
# Persist the bounding-box mapping as JSON in the working directory.
with open("data.json", "w") as out_file:
    out_file.write(json.dumps(dcm_nodules))

********
Gemini-enhanced code

In [None]:
import xml.etree.ElementTree as ET
import pandas as pd 
from tqdm import tqdm

def analyze_xml(xml_root):
  """Parses an XML file and extracts bounding boxes of unblinded read nodules.

  Every ROI of every nodule in every reading session contributes one
  bounding box, computed from the ROI's ``edgeMap`` points.

  Args:
      xml_root (str): Path to the XML file.

  Returns:
      dict: A dictionary with SOP instance UIDs as keys and lists of edge
            dictionaries (``xmax``, ``ymax``, ``xmin``, ``ymin``) as values.
            An empty dictionary is returned when the file is missing,
            cannot be parsed, or contains no ROI with valid edge points.

  Raises:
      IndexError: If an ROI has no ``imageSOP_UID`` element.
  """

  xmlns = '{http://www.nih.gov}'
  try:
    tree = ET.parse(xml_root)
  except FileNotFoundError:
    print(f"Error: XML file not found at '{xml_root}'.")
    return {}
  except Exception as e:
    print(f"Error parsing XML: {e}")
    return {}

  dcm_nodules = {}
  for reading_session in tree.findall(xmlns + 'readingSession'):
    for nodule in reading_session.findall(xmlns + 'unblindedReadNodule'):
      for roi in nodule.findall(xmlns + 'roi'):
        sop = roi.findall(xmlns + 'imageSOP_UID')[0].text

        edge_x = []
        edge_y = []
        for point in roi.findall(xmlns + 'edgeMap'):
          try:
            x = int(point.findall(xmlns + 'xCoord')[0].text)
            y = int(point.findall(xmlns + 'yCoord')[0].text)
          except (IndexError, TypeError, ValueError):
            # Missing element, empty element, or non-integer text.
            print(f"Warning: Invalid coordinates in ROI for SOP '{sop}'.")
            continue
          edge_x.append(x)
          edge_y.append(y)

        # The box is recorded per ROI, inside the ROI loop; otherwise only
        # the last ROI of each nodule would be kept.
        if not edge_x:
          continue
        edge = {
          "xmax": max(edge_x),
          "ymax": max(edge_y),
          "xmin": min(edge_x),
          "ymin": min(edge_y)
        }
        dcm_nodules.setdefault(sop, []).append(edge)

  # Return only after all reading sessions have been visited, and always a
  # dict (never None) so callers can use ``in`` on the result.
  return dcm_nodules

if __name__ =='__main__':
  # Adds a 'target' column to the CSV: for each row, the list of bounding
  # boxes whose SOP UID matches row['SOP'] in the XML named by row['xml'],
  # or None when there is no match. The CSV is rewritten in place.
  csv_root = '../datafile.csv'
  df = pd.read_csv(csv_root)

  targets = []
  xml_root = None
  dcm_nodules = {}
  # Iterate over the two needed columns instead of iterrows(): writing rows
  # back with df.iloc[index] confuses index labels with positions and
  # pushes a dtype-coerced copy of every row into the frame being iterated.
  for xml_path, sop in tqdm(zip(df['xml'], df['SOP']), total=len(df)):
    # Re-parse only when the XML path changes between consecutive rows.
    if xml_root != xml_path:
      xml_root = xml_path
      # Guard against a parser that returns None instead of a dict.
      dcm_nodules = analyze_xml(xml_root) or {}
    targets.append(dcm_nodules.get(sop))

  # Explicit object Series keeps each cell as a list (or None) rather than
  # letting pandas try to interpret the nested lists as 2-D data.
  df['target'] = pd.Series(targets, index=df.index, dtype=object)
  df.to_csv(csv_root,index=False)