# Build data

The objective of this notebook is to take the raw data and organize it in a way that is easier to work with, creating two folders containing the images and the corresponding labels (if they have a swimming pool).

In [15]:
# imports

import os
import shutil
import xml.etree.ElementTree as ET

In [4]:
# Constants

ROOT = os.getcwd()

# Raw Dataset 1
CANNES_IMAGES = ROOT + "\\Cannes\\images"
CANNES_LABELS = ROOT + "\\Cannes\\labels"

# Raw Dataset 2
SWIMMING_POOL_TRAINING_IMAGES = ROOT + "\\swimmingPool\\training\\images"
SWIMMING_POOL_TRAINING_LABELS = ROOT + "\\swimmingPool\\training\\labels"
SWIMMING_POOL_TESTING_IMAGES = ROOT + "\\swimmingPool\\testing\\images"
SWIMMING_POOL_TESTING_LABELS = ROOT + "\\swimmingPool\\testing\\labels"

# New Dataset
DATASET = ROOT + "\\dataset"
DATASET_IMAGES = DATASET + "\\images"
DATASET_LABELS = DATASET + "\\labels"

### 1. Create the folders

In [5]:
if not(os.path.exists(DATASET)):
    os.makedirs(DATASET)
if not(os.path.exists(DATASET_IMAGES)):
    os.makedirs(DATASET_IMAGES)
if not(os.path.exists(DATASET_LABELS)):
    os.makedirs(DATASET_LABELS)

### 2. Info about the raw data

In [6]:

list_cannes_images = os.listdir(CANNES_IMAGES)
list_cannes_labels = os.listdir(CANNES_LABELS)

list_sp_training_images = os.listdir(SWIMMING_POOL_TRAINING_IMAGES)
list_sp_testing_images = os.listdir(SWIMMING_POOL_TESTING_IMAGES)
list_sp_training_labels = os.listdir(SWIMMING_POOL_TRAINING_LABELS)
list_sp_testing_labels = os.listdir(SWIMMING_POOL_TESTING_LABELS)

print("The Cannes dataset has %s images" % len(list_cannes_images))
print("And %s are labeled" % len(list_cannes_labels))
dif_cannes = len(list_cannes_images) - len(list_cannes_labels)

print()

print("The swimming pool dataset has %s images" % (len(list_sp_testing_images) + len(list_sp_training_images)))
print("And %s are labeled" % (len(list_sp_training_labels) + len(list_sp_testing_labels)))
dif_sp = len(list_sp_testing_images) + len(list_sp_training_images) - len(list_sp_training_labels) - len(list_sp_testing_labels)

The Cannes dataset has 2401 images
And 1224 are labeled

The swimming pool dataset has 4696 images
And 2517 are labeled


### 3. Cannes raw data

In [8]:
cannes_labels_names = list(map(lambda v: v.split(".")[1], list_cannes_labels))

for val in list_cannes_images:
    src  = CANNES_IMAGES + "\\" + val
    
    shutil.copy(src, DATASET_IMAGES + "\\" + val)

    if val.split(".")[1] in cannes_labels_names:

        new_val = ".".join(val.split(".")[0:2]) + ".xml"
        src_2 = CANNES_LABELS + "\\" + new_val
        
        shutil.copy(src_2, DATASET_LABELS + "\\" + new_val)
        
print("Cannes dataset copied")

### 4. Swimming pool data

In [9]:
# it will be necessary to rename them to merge
# it is better to do it in 2 steps, one for training and one for testing
prefix = "sp_"
counter = 0

# for the training folder
sp_training_labels_names = list(map(lambda v: v.split(".")[0], list_sp_training_labels))

for val in list_sp_training_images:
    src  = SWIMMING_POOL_TRAINING_IMAGES + "\\" + val
    new_name = prefix + str(counter)

    shutil.copy(src, DATASET_IMAGES + "\\" + new_name + ".jpg")

    if val.split(".")[0] in sp_training_labels_names:
        
        src_2 = SWIMMING_POOL_TRAINING_LABELS + "\\" + val.split(".")[0] + ".xml"
        shutil.copy(src_2, DATASET_LABELS + "\\" + new_name + ".xml")

    counter +=1

print("Training folder done!")

# for the testing folder
sp_testing_labels_names = list(map(lambda v: v.split(".")[0], list_sp_testing_labels))

for val in list_sp_testing_images:
    src  = SWIMMING_POOL_TESTING_IMAGES + "\\" + val
    new_name = prefix + str(counter)

    shutil.copy(src, DATASET_IMAGES + "\\" + new_name + ".jpg")

    if val.split(".")[0] in sp_testing_labels_names:

        src_2 = SWIMMING_POOL_TESTING_LABELS + "\\" + val.split(".")[0] + ".xml"
        shutil.copy(src_2, DATASET_LABELS + "\\" + new_name + ".xml")


    counter +=1

print("Testing folder done!")
print("Swimming pool dataset copied")

Training folder done!
Testing folder done!
Swimming pool dataset copied


In [14]:
# Confirmations 
sil = len(list_cannes_labels) + len(list_sp_training_labels) + len(list_sp_testing_labels)
sinl = dif_cannes + dif_sp

print("---- CONFIRMATIONS ----")
print("We should have %d images, check = %s" % (sil, len(os.listdir(DATASET_IMAGES))== sil + sinl))
print("We should habe %d labels, check = %s" % (sil, len(os.listdir(DATASET_LABELS))== sil))

---- CONFIRMATIONS ----
We should have 3741 images, check = True
We should habe 3741 labels, check = True

Modifying xmls...


### 5. Modify the labels

Because we changed the names of the images, we need to modify the labels to match the new names.

In [16]:
# Auxiliar functions

def getImagePath(xml_name):
    splitted = xml_name.split(".")
    final = ""
    if len(splitted) > 2:
        final = ".".join(splitted[0:2]) + ".png"
    else:
        final = splitted[0] + ".jpg"

    return DATASET_IMAGES + "\\" + final

def change_name_to_img(file):
    splitted = file.split(".")
    final = ""
    if len(splitted) > 2:
        final = ".".join(splitted[0:2]) + ".png"
    else:
        final = splitted[0] + ".jpg"

    return final

In [17]:
for filename in os.listdir(DATASET_LABELS):
    xml_file = os.path.join(DATASET_LABELS, filename)

    # Parse the XML file
    tree = ET.parse(xml_file)
    root = tree.getroot()

    # Find the filename and path elements and update their text
    for filename_element in root.iter('filename'):
        filename_element.text = change_name_to_img(filename)

    for path_element in root.iter('path'):
        path_element.text = getImagePath(filename)

    # Write the modified XML back to the file
    tree.write(xml_file)
