In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below
# (all under /content/drive/MyDrive) can be read and written.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
import zipfile
import xml.etree.ElementTree as ET

In [None]:
# Unpack the downloaded Roboflow export into the working folder on Drive.
archive_path = '/content/drive/MyDrive/Roboflow Renaming/RockPredict2.v1i.voc (1).zip'
extract_dir = '/content/drive/MyDrive/Roboflow Renaming'

with zipfile.ZipFile(archive_path, 'r') as archive:
  archive.extractall(extract_dir)

In [None]:
# Folder on Drive that the archive was extracted into; the dataset
# subfolders processed below are resolved relative to it.
base_folder = '/content/drive/MyDrive/Roboflow Renaming'

This function will inspect an .xml file, and rename the label if it exists. This is necessary for many publicly available Roboflow datasets, where the labels are in other languages, placeholders etc.

In [None]:
def rename_labels_in_xml(xml_file_path, old_label='top_cast', new_label='House'):
  """Rename a class label inside one annotation .xml file, in place.

  Every <object> element directly under the document root whose <name>
  text equals ``old_label`` gets that text replaced with ``new_label``.
  Objects without a <name> child, or with a different label, are left
  untouched.

  Args:
    xml_file_path: Path of the annotation file to read and overwrite.
    old_label: Label text to look for. Defaults to 'top_cast'.
    new_label: Label text to write instead. Defaults to 'House'.

  Raises:
    xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    OSError: If the file cannot be read or written.
  """
  tree = ET.parse(xml_file_path)
  root = tree.getroot()

  changed = False
  for obj in root.findall('object'):
    name = obj.find('name')
    if name is not None and name.text == old_label:
      name.text = new_label
      changed = True

  # Only touch the file when a label was actually renamed; rewriting
  # unchanged annotations is needless I/O.
  if changed:
    tree.write(xml_file_path)

Every image has a corresponding .xml file, and the following function accounts for Roboflow datasets which come with multiple unwanted classes along with the target class. This function removes every < object > element whose label is not the target class. If an image contains no objects of the target class, its .xml file is left without any < object > entries. We will later delete these images along with their object-less .xml files:

In [None]:
def filter_annotations_for_label(xml_file_path, target_label):
  """Strip every labelled object except the target class from an .xml file.

  Each <object> element directly under the document root is inspected.
  Objects that carry a <name> whose text differs from ``target_label``
  are removed; objects with the target label, and objects that have no
  <name> child at all, are kept. The file is then overwritten in place.

  Args:
    xml_file_path: Path of the annotation file to read and overwrite.
    target_label: The only label text that should survive filtering.
  """
  document = ET.parse(xml_file_path)
  annotation = document.getroot()

  # findall() returns a list, so removing children of the root while
  # looping over that list is safe.
  for entry in annotation.findall('object'):
    label = entry.find('name')
    if label is None or label.text == target_label:
      continue
    annotation.remove(entry)

  # Persist the filtered tree over the original file.
  document.write(xml_file_path)

A regular Roboflow zipfile contains up to three subfolders (train, test, valid). For each subfolder listed below, every .xml file is passed through the 'rename_labels_in_xml' function:

In [None]:
# Subfolders of base_folder whose .xml files will have their labels renamed.
subfolders = ['TOPhouse_train', 'TOPhouse_valid']

In [None]:
for split_name in subfolders:
  split_dir = os.path.join(base_folder, split_name)

  # Only the .xml annotation files are relabelled; every other file in the
  # subfolder is ignored.
  annotation_names = [entry for entry in os.listdir(split_dir) if entry.endswith('.xml')]
  for annotation_name in annotation_names:
    rename_labels_in_xml(os.path.join(split_dir, annotation_name))

The image folders are each passed through the 'filter_annotations_for_label' function which removes information for all labels except the target class:

In [None]:
# Subfolders of base_folder to filter down to a single class.
subfolders = ['CLIFFStrain', 'CLIFFStest', 'CLIFFSvalid']
# Same root folder as defined earlier; repeated so this cell can run on its own.
base_folder = '/content/drive/MyDrive/Roboflow Renaming'
# The only label to keep; objects with any other label are removed.
target_label = 'cliff'

In [None]:
for split_name in subfolders:
  split_dir = os.path.join(base_folder, split_name)

  # Walk the subfolder and filter each .xml annotation file in place.
  for entry in os.listdir(split_dir):
    if not entry.endswith('.xml'):
      continue
    filter_annotations_for_label(os.path.join(split_dir, entry), target_label)

As the 'filter_annotations_for_label' function often leaves an .xml file with no < object > entries, we will need to remove each such file along with its image, which is left without annotations.

The following function returns True if the .xml file contains at least one < object > element, and False otherwise:

In [None]:
def has_annotations(xml_file_path):
  """Tell whether an annotation .xml file still contains any object.

  Args:
    xml_file_path: Path of the annotation file to inspect.

  Returns:
    True if the document root has at least one direct <object> child,
    False otherwise.
  """
  annotation = ET.parse(xml_file_path).getroot()

  # find() yields the first direct <object> child, or None when there is none.
  return annotation.find('object') is not None

If the .xml file does not contain any < object > details, i.e. it was processed earlier to remove unwanted classes, then the following code removes this file along with its associated image file:

In [None]:
# Image extensions that may accompany an annotation file (compared
# case-insensitively). The image is looked up by file stem rather than
# assumed to be '<stem>.jpg', so a missing or differently named image no
# longer aborts the cleanup with FileNotFoundError.
image_extensions = ('.jpg', '.jpeg', '.png', '.bmp')

for subfolder in subfolders:
  folder_path = os.path.join(base_folder, subfolder)
  entries = os.listdir(folder_path)

  # Index the images by file stem once per folder so each annotation can
  # find its image(s) without rescanning the directory listing.
  images_by_stem = {}
  for entry in entries:
    stem, extension = os.path.splitext(entry)
    if extension.lower() in image_extensions:
      images_by_stem.setdefault(stem, []).append(entry)

  # Process each annotation file in the folder
  for file_name in entries:
    if not file_name.endswith('.xml'):
      continue

    xml_path = os.path.join(folder_path, file_name)
    if has_annotations(xml_path):
      continue

    # Delete the image(s) before the annotation: if a removal fails midway,
    # the object-less .xml is still present and the cell can simply be re-run.
    stem = os.path.splitext(file_name)[0]
    for image_filename in images_by_stem.get(stem, []):
      os.remove(os.path.join(folder_path, image_filename))
    os.remove(xml_path)