In [14]:
import xml.etree.ElementTree as ET
from xml.dom import minidom
from tqdm import tqdm
import os

## Extract Data from XML

In [15]:
# Function to get the data from XML Annotation
def extract_info_from_xml(xml_file):
    """Read one XML annotation and collect its contents in a dict.

    Only the direct children of the root element are inspected:

    * ``filename`` -- its text is stored under ``'filename'``.
    * ``size``     -- every child is converted with ``int`` and the values
      are stored, in document order, as a tuple under ``'image_size'``.
    * ``object``   -- yields one dict holding ``'class'`` (text of the
      ``name`` child) plus one integer entry per child of ``bndbox``,
      keyed by that child's tag.

    Args:
        xml_file: Anything accepted by ``ET.parse`` (a path or a file object).

    Returns:
        dict: always has ``'bboxes'`` (a list, possibly empty); ``'filename'``
        and ``'image_size'`` are present only if the matching elements exist.
    """
    info_dict = {'bboxes': []}

    for node in ET.parse(xml_file).getroot():
        tag = node.tag

        if tag == "filename":
            # Name of the image this annotation belongs to
            info_dict['filename'] = node.text

        elif tag == "size":
            # Image dimensions, kept in the order they appear in the file
            info_dict['image_size'] = tuple(int(dim.text) for dim in node)

        elif tag == "object":
            # One annotated object: its class name and box corners
            bbox = {}
            for child in node:
                if child.tag == "name":
                    bbox["class"] = child.text
                elif child.tag == "bndbox":
                    bbox.update((coord.tag, int(coord.text)) for coord in child)
            info_dict['bboxes'].append(bbox)

    return info_dict


## Convert the data to YOLO v8 annotations

In [16]:
# Dictionary that maps class names to IDs
class_name_to_id_mapping = {"RBC": 0,
                           "WBC": 1,
                           "Platelets": 2}

# Convert the info dict to the required yolo format and write it to disk
def convert_to_yolov5(info_dict, setVal):
    """Write the bounding boxes of one image as a YOLO-format label file.

    Every box becomes one line ``<class_id> <cx> <cy> <w> <h>``, where the
    centre and the size are divided by the image width/height and printed
    with three decimals. The file is written to
    ``data/<setVal>/TextAnnotations/<image name without extension>.txt``;
    the directory is created if it does not exist yet.

    Args:
        info_dict: dict with ``'bboxes'`` (list of dicts holding ``'class'``,
            ``'xmin'``, ``'ymin'``, ``'xmax'``, ``'ymax'``), ``'image_size'``
            (sequence starting with width, height) and ``'filename'``.
        setVal: name of the data split, used as the sub-directory of ``data``.

    Boxes whose class is not in ``class_name_to_id_mapping`` are reported on
    stdout and left out of the label file.
    """
    print_buffer = []

    # For each bounding box
    for b in info_dict["bboxes"]:
        try:
            class_id = class_name_to_id_mapping[b["class"]]
        except KeyError:
            print("Invalid Class. Must be one from ", class_name_to_id_mapping.keys())
            # Skip the box: falling through would either hit an unbound
            # class_id or silently reuse the id of the previous box.
            continue

        # Transform the bbox co-ordinates into centre / size form
        b_center_x = (b["xmin"] + b["xmax"]) / 2
        b_center_y = (b["ymin"] + b["ymax"]) / 2
        b_width    = (b["xmax"] - b["xmin"])
        b_height   = (b["ymax"] - b["ymin"])

        # Normalise the co-ordinates by the dimensions of the image.
        # Only width and height are needed; further entries are ignored.
        image_w, image_h = info_dict["image_size"][:2]
        b_center_x /= image_w
        b_center_y /= image_h
        b_width    /= image_w
        b_height   /= image_h

        # Queue the bbox details for writing
        print_buffer.append("{} {:.3f} {:.3f} {:.3f} {:.3f}".format(class_id, b_center_x, b_center_y, b_width, b_height))

    # Name of the file which we have to save. Swap only the extension:
    # str.replace("jpg", "txt") would also rewrite "jpg" inside the stem and
    # would leave names with any other extension unchanged.
    save_dir = "data/" + setVal + "/TextAnnotations"
    stem = os.path.splitext(info_dict["filename"])[0]
    save_file_name = os.path.join(save_dir, stem + ".txt")

    # Make sure the target directory is there before writing
    os.makedirs(save_dir, exist_ok=True)

    # Save the annotation to disk; the context manager closes the handle
    with open(save_file_name, "w") as label_file:
        print("\n".join(print_buffer), file=label_file)

### Prepare the YOLO data set for Training Set

In [17]:
# Collect the XML annotation files of the Training split, in sorted order
annotations = sorted(
    os.path.join('data/Training/Annotations', name)
    for name in os.listdir('data/Training/Annotations')
    if name.endswith("xml")
)

# Parse every annotation and write its YOLO label file to disk
for ann in tqdm(annotations):
    info_dict = extract_info_from_xml(ann)
    convert_to_yolov5(info_dict, "Training")

# List the label files now present in the output directory
annotations = [
    os.path.join('data/Training/TextAnnotations', name)
    for name in os.listdir('data/Training/TextAnnotations')
    if name.endswith("txt")
]
annotations

100%|██████████| 300/300 [00:00<00:00, 608.31it/s]


['data/Training/TextAnnotations/BloodImage_00000.txt',
 'data/Training/TextAnnotations/BloodImage_00001.txt',
 'data/Training/TextAnnotations/BloodImage_00002.txt',
 'data/Training/TextAnnotations/BloodImage_00003.txt',
 'data/Training/TextAnnotations/BloodImage_00004.txt',
 'data/Training/TextAnnotations/BloodImage_00005.txt',
 'data/Training/TextAnnotations/BloodImage_00006.txt',
 'data/Training/TextAnnotations/BloodImage_00007.txt',
 'data/Training/TextAnnotations/BloodImage_00008.txt',
 'data/Training/TextAnnotations/BloodImage_00009.txt',
 'data/Training/TextAnnotations/BloodImage_00010.txt',
 'data/Training/TextAnnotations/BloodImage_00011.txt',
 'data/Training/TextAnnotations/BloodImage_00012.txt',
 'data/Training/TextAnnotations/BloodImage_00013.txt',
 'data/Training/TextAnnotations/BloodImage_00014.txt',
 'data/Training/TextAnnotations/BloodImage_00015.txt',
 'data/Training/TextAnnotations/BloodImage_00016.txt',
 'data/Training/TextAnnotations/BloodImage_00017.txt',
 'data/Tra

### Prepare the YOLO data set for Testing Set

In [18]:
# Collect the XML annotation files of the Testing split, in sorted order
annotations = sorted(
    os.path.join('data/Testing/Annotations', name)
    for name in os.listdir('data/Testing/Annotations')
    if name.endswith("xml")
)

# Parse every annotation and write its YOLO label file to disk
for ann in tqdm(annotations):
    info_dict = extract_info_from_xml(ann)
    convert_to_yolov5(info_dict, "Testing")

# List the label files now present in the output directory
annotations = [
    os.path.join('data/Testing/TextAnnotations', name)
    for name in os.listdir('data/Testing/TextAnnotations')
    if name.endswith("txt")
]
annotations

100%|██████████| 60/60 [00:00<00:00, 523.06it/s]


['data/Testing/TextAnnotations/BloodImage_00339.txt',
 'data/Testing/TextAnnotations/BloodImage_00340.txt',
 'data/Testing/TextAnnotations/BloodImage_00343.txt',
 'data/Testing/TextAnnotations/BloodImage_00344.txt',
 'data/Testing/TextAnnotations/BloodImage_00345.txt',
 'data/Testing/TextAnnotations/BloodImage_00346.txt',
 'data/Testing/TextAnnotations/BloodImage_00347.txt',
 'data/Testing/TextAnnotations/BloodImage_00348.txt',
 'data/Testing/TextAnnotations/BloodImage_00349.txt',
 'data/Testing/TextAnnotations/BloodImage_00351.txt',
 'data/Testing/TextAnnotations/BloodImage_00352.txt',
 'data/Testing/TextAnnotations/BloodImage_00353.txt',
 'data/Testing/TextAnnotations/BloodImage_00354.txt',
 'data/Testing/TextAnnotations/BloodImage_00355.txt',
 'data/Testing/TextAnnotations/BloodImage_00356.txt',
 'data/Testing/TextAnnotations/BloodImage_00357.txt',
 'data/Testing/TextAnnotations/BloodImage_00359.txt',
 'data/Testing/TextAnnotations/BloodImage_00360.txt',
 'data/Testing/TextAnnotatio

### Prepare the YOLO data set for Validation Set

In [19]:
# Collect the XML annotation files of the Validation split, in sorted order
annotations = sorted(
    os.path.join('data/Validation/Annotations', name)
    for name in os.listdir('data/Validation/Annotations')
    if name.endswith("xml")
)

# Parse every annotation and write its YOLO label file to disk
for ann in tqdm(annotations):
    info_dict = extract_info_from_xml(ann)
    convert_to_yolov5(info_dict, "Validation")

# List the label files now present in the output directory
annotations = [
    os.path.join('data/Validation/TextAnnotations', name)
    for name in os.listdir('data/Validation/TextAnnotations')
    if name.endswith("txt")
]
annotations

100%|██████████| 60/60 [00:00<00:00, 296.55it/s]


['data/Validation/TextAnnotations/BloodImage_00002.txt',
 'data/Validation/TextAnnotations/BloodImage_00004.txt',
 'data/Validation/TextAnnotations/BloodImage_00007.txt',
 'data/Validation/TextAnnotations/BloodImage_00011.txt',
 'data/Validation/TextAnnotations/BloodImage_00017.txt',
 'data/Validation/TextAnnotations/BloodImage_00019.txt',
 'data/Validation/TextAnnotations/BloodImage_00024.txt',
 'data/Validation/TextAnnotations/BloodImage_00028.txt',
 'data/Validation/TextAnnotations/BloodImage_00031.txt',
 'data/Validation/TextAnnotations/BloodImage_00039.txt',
 'data/Validation/TextAnnotations/BloodImage_00045.txt',
 'data/Validation/TextAnnotations/BloodImage_00046.txt',
 'data/Validation/TextAnnotations/BloodImage_00048.txt',
 'data/Validation/TextAnnotations/BloodImage_00049.txt',
 'data/Validation/TextAnnotations/BloodImage_00050.txt',
 'data/Validation/TextAnnotations/BloodImage_00058.txt',
 'data/Validation/TextAnnotations/BloodImage_00063.txt',
 'data/Validation/TextAnnotatio