<a href="https://colab.research.google.com/github/PavanDaniele/drone-person-detection/blob/main/data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run this every time you start a new Colab session
from google.colab import drive
drive.mount('/content/drive') # mount Google Drive at /content/drive so the dataset paths used below can be read

Mounted at /content/drive


In [None]:
import os
import xml.etree.ElementTree as ET
import pandas as pd
import re  # To import the module for the regular expression
import shutil # To operate on files and folders

In [None]:
# Useful functions

def read_file_csv(folder_path, filename): # Load one CSV file stored inside a folder
  """
  Load a CSV file located in a given folder into a DataFrame.

  Args:
    folder_path (str): Directory that contains the CSV file.
    filename (str): Name of the CSV file inside that directory.

  Returns:
    pd.DataFrame: The parsed content of the CSV file.
  """
  return pd.read_csv(os.path.join(folder_path, filename))



# Data Exploration
In this notebook, I explore the dataset used for the drone-based person detection project. My goal is to analyze the content of the dataset, understand its characteristics, and apply the necessary transformations (such as resizing, normalization, and the creation of training and testing sets). This step is crucial to prepare the data so that it can be used for training deep learning models.

The chosen dataset includes images of people taken by drones at different altitudes, and it is essential to analyze it to identify the labels, classes, and any anomalies in the data.

## Dataset SARD:

In [None]:
# os.chdir('../') # would move up to the parent of the current working directory
os.chdir('/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD') # make the SARD dataset folder the working directory (affects every later cell)
print(os.getcwd()) # show the resolved working directory
# print(os.listdir()) # to see what there is in the current directory (add the path inside the () to see the content of another directory)

/content/drive/.shortcut-targets-by-id/1LQbD7p_iS5KLqGNdfrYEvsAx0i_bgB0h/projectUPV/datasets/SARD_dataset/SARD


In [None]:
def analyze_dataset_annotations(dataset_path): # Basic XML-based analysis of the dataset (a later cell redefines this name with a CSV-based version)
  """
  Analyze the XML annotations inside a folder: collect every (image, label) pair
  and find the images that have no labelled object.

  Args:
    dataset_path (str): Path to the dataset folder containing .jpg and .xml files.

  Returns:
    tuple: (all_images, images_with_labels, df, images_without_labels) where
      - all_images (set): every .jpg filename found in the folder.
      - images_with_labels (set): image filenames with at least one <object>.
      - df (pd.DataFrame): one row per object, columns ['image_filename', 'label'].
      - images_without_labels (set): all_images - images_with_labels.
  """
  data = []
  images_with_labels = set()
  all_images = set()

  # One directory listing is enough: images are collected and XML files parsed in the same pass
  for filename in os.listdir(dataset_path):
    if filename.endswith('.jpg'):
      all_images.add(filename)
      continue
    if not filename.endswith('.xml'):
      continue

    xml_path = os.path.join(dataset_path, filename)
    try:
      root = ET.parse(xml_path).getroot()
    except ET.ParseError:
      print(f"Warning: Failed to parse {xml_path}")
      continue

    # findtext returns None when the tag is missing, so a malformed annotation cannot crash the scan
    image_filename = (root.findtext('filename') or '').strip()
    if not image_filename:
      print(f"Warning: Missing <filename> in {xml_path}")
      continue

    for obj in root.findall('object'):
      label = (obj.findtext('name') or '').strip()
      if not label:
        print(f"Warning: Object without <name> in {xml_path}")
        continue
      data.append([image_filename, label])
      images_with_labels.add(image_filename)

  # Load labels in a DataFrame
  df = pd.DataFrame(data, columns=['image_filename', 'label'])

  # Images that are on disk but have no labelled object in any XML file
  images_without_labels = all_images - images_with_labels

  return all_images, images_with_labels, df, images_without_labels


dataset_path = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'

all_images, images_with_labels, df, images_without_labels = analyze_dataset_annotations(dataset_path)

# Summary counters; images without any labelled object are candidates for removal
for caption, value in (
  ("Total images:", len(all_images)),
  ("Images with at least one label:", len(images_with_labels)),
  ("Images without any label:", len(images_without_labels)),
  ("The images without any label are: ", images_without_labels),
):
  print(caption, value)

# Total number of bounding boxes (annotations) per class
print("\n Total labels count: ")
print(df['label'].value_counts())

# Number of distinct images that contain at least one instance of each class
print("\n Unique images per label:")
print(df.groupby('label')['image_filename'].nunique())


Total images: 1983
Images with at least one label: 1981
Images without any label: 2
The images without any label are:  {'gss1128.jpg', 'gss0806.jpg'}

 Total labels count: 
label
stands         1896
laying_down    1591
Walking        1303
not_defined    1039
seated          624
Running          79
Name: count, dtype: int64

 Unique images per label:
label
Running          68
Walking         776
laying_down    1115
not_defined     759
seated          603
stands         1001
Name: image_filename, dtype: int64


In [None]:
def find_really_empty_images_strict(dataset_path, threshold=3): # To find all the empty images
  """
  Find images whose annotations all look like placeholders: every bounding box of
  the image has at least one coordinate (xmin, ymin, xmax or ymax) <= threshold.

  Images without any <object> are not reported by this function.

  Args:
    dataset_path (str): Path to dataset containing .jpg and .xml files.
    threshold (int): Largest coordinate value still considered suspicious (default: 3).

  Returns:
    really_empty_images (set): Set of image filenames considered truly empty.
  """
  really_empty_images = set()

  for filename in os.listdir(dataset_path):
    if not filename.endswith('.xml'):
      continue
    xml_path = os.path.join(dataset_path, filename)

    try:
      root = ET.parse(xml_path).getroot()
    except ET.ParseError:
      print(f"Warning: Failed to parse {xml_path}")
      continue

    # One flag per bounding box: True when any of its four coordinates is <= threshold
    suspicious_bboxes = []
    for obj in root.findall('object'):
      bndbox = obj.find('bndbox')
      coords = [int(bndbox.find(tag).text.strip()) for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
      suspicious_bboxes.append(any(value <= threshold for value in coords))

    # The image is reported only if it has boxes and every one of them is suspicious
    if suspicious_bboxes and all(suspicious_bboxes):
      really_empty_images.add(root.find('filename').text.strip())

  return really_empty_images

# Scan the SARD folder on Drive for images whose bounding boxes are all suspicious
sard_dataset_path = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'
really_empty_images = find_really_empty_images_strict(sard_dataset_path)

# Report how many such images were found and which ones
print("There are ", len(really_empty_images), " empty images")
print("Really empty images:", really_empty_images)

There are  4  empty images
Really empty images: {'gss1138.jpg', 'gss1129.jpg', 'gss0326.jpg', 'gss0327.jpg'}


In [None]:
# Read one of the SARD annotation CSV files (folder_path_sard is reused by the cells below)
folder_path_sard = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'

# The returned DataFrame is the last expression of the cell, so it is displayed as the cell output
read_file_csv(folder_path_sard, 'sard_labels.csv')
# read_file_csv(folder_path_sard, 'sard_person_labels.csv')

In [None]:
# As we see, the file is not sorted. We want to order it but first we should rename also the images and .xml files

def rename_images_and_xml_files(folder_path, prefix="gss"): # To rename a .jpg images and xml files with a prefix
  """
  Rename image and XML files named <prefix><number>.jpg / .xml so that the number
  is zero-padded to at least 4 digits (e.g. gss12.jpg -> gss0012.jpg).

  Files whose name does not match that pattern exactly are left untouched, and a
  file is never renamed onto an already existing file (it is skipped with a message).

  Args:
    folder_path (str): Path to dataset containing images
    prefix (str): Prefix of the filenames to normalize
  """
  # re.escape: the prefix is literal text, not a regular expression.
  # fullmatch: names such as 'gss12.jpg.xml' must not be treated as 'gss12.jpg'.
  pattern = re.compile(rf'{re.escape(prefix)}(\d+)\.(jpg|xml)')

  for filename in sorted(os.listdir(folder_path)):
    match = pattern.fullmatch(filename)
    if match is None:
      continue

    new_name = f"{prefix}{int(match.group(1)):04d}.{match.group(2)}"
    if new_name == filename:
      continue  # already in the target format

    new_path = os.path.join(folder_path, new_name)
    if os.path.exists(new_path):
      # os.rename may silently replace an existing file, which would lose data
      print(f"Skipped {filename}: {new_name} already exists")
      continue

    os.rename(os.path.join(folder_path, filename), new_path)

  print("Files renamed successfully!")

# Renames the matching .jpg/.xml files of the SARD folder in place (this modifies the dataset on Drive)
rename_images_and_xml_files(folder_path_sard)

Files renamed successfully!


In [None]:
# Modify the names in gssNNNN.jpg format
def format_filename(old_name, prefix="gss"):
  """
  Build a filename in the <prefix>NNNN.jpg format from the first number found in old_name.

  Args:
    old_name (str): Original filename (e.g. 'gss12.jpg').
    prefix (str): Prefix of the new filename (default: 'gss').

  Returns:
    The new filename with the number zero-padded to at least 4 digits and a .jpg
    extension; old_name unchanged when it is not a string (e.g. None or NaN) or
    contains no digits.
  """
  # Missing values (None from an empty XML tag, NaN from a CSV) cannot be searched with a regex
  if not isinstance(old_name, str):
    return old_name

  num = re.search(r'\d+', old_name)
  if num:
    return f"{prefix}{int(num.group()):04d}.jpg"
  return old_name  # no number in the name: nothing to reformat

In [None]:
def update_csv_filenames(folder_path, csv_file, prefix="gss"): # to change the filename column in a csv file
  """
  Rewrite the 'filename' column of a CSV file in the <prefix>NNNN.jpg format,
  sort the rows by filename and overwrite the CSV file with the result.

  Args:
    folder_path (str): Path to dataset containing csv
    csv_file (str): Name of the CSV file
    prefix (str): Prefix used for the new filenames
  """
  csv_path = os.path.join(folder_path, csv_file)
  df = pd.read_csv(csv_path)

  # Forward the requested prefix (otherwise format_filename would always use its own default)
  df['filename'] = df['filename'].apply(lambda name: format_filename(name, prefix))

  # Sort before saving so that the file on disk is ordered too;
  # a stable sort keeps the rows of one image in their original order
  df = df.sort_values(by='filename', kind='stable').reset_index(drop=True)
  df.to_csv(csv_path, index=False)

  # Print the updated DataFrame to verify the changes
  print(f"Updated CSV '{csv_file}':")
  print(df.head())  # Print first few rows to verify

# update_csv_filenames(folder_path_sard, 'sard_labels.csv')
# update_csv_filenames(folder_path_sard, 'sard_person_labels.csv')

# read_file_csv(folder_path_sard, 'sard_labels.csv')
# read_file_csv(folder_path_sard, 'sard_person_labels.csv')

Updated CSV 'sard_labels.csv':
      filename  width  height    class  xmin  ymin  xmax  ymax
0  gss0001.jpg   1920    1080  walking  1110   358  1134   424
1  gss0001.jpg   1920    1080   stands  1077   367  1100   428
2  gss0001.jpg   1920    1080   seated  1041   144  1061   173
3  gss0002.jpg   1920    1080   seated  1041   142  1062   173
4  gss0002.jpg   1920    1080   stands  1079   365  1101   428
Updated CSV 'sard_person_labels.csv':
      filename  width  height   class  xmin  ymin  xmax  ymax
0  gss0001.jpg   1920    1080  person  1110   358  1134   424
1  gss0001.jpg   1920    1080  person  1077   367  1100   428
2  gss0001.jpg   1920    1080  person  1041   144  1061   173
3  gss0002.jpg   1920    1080  person  1041   142  1062   173
4  gss0002.jpg   1920    1080  person  1079   365  1101   428


Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,gss1299.jpg,1920,1080,person,473,667,519,733
1,gss1604.jpg,1920,1080,person,927,317,953,347
2,gss1604.jpg,1920,1080,person,999,589,1033,636
3,gss0801.jpg,1920,1080,person,315,394,391,456
4,gss0501.jpg,1920,1080,person,1412,337,1453,376
...,...,...,...,...,...,...,...,...
6527,gss0955.jpg,1920,1080,person,164,777,220,857
6528,gss0955.jpg,1920,1080,person,1094,664,1199,702
6529,gss0955.jpg,1920,1080,person,1491,460,1516,489
6530,gss0955.jpg,1920,1080,person,1668,595,1720,625


In [None]:
def update_xml_filenames(folder_path, prefix="gss"): # To update the filename tag in each xml file in a folder
  """
  Update the <filename> tag in each XML file in the folder to follow the <prefix>NNNN.jpg format.
  Only the XML files whose tag actually changes are written back to disk.

  Args:
    folder_path (str): Path to the folder containing XML files.
    prefix (str): Prefix for the filenames (default: 'gss').
  """
  updated = 0
  for file in os.listdir(folder_path):
    if not file.endswith('.xml'):
      continue
    xml_path = os.path.join(folder_path, file)

    try:
      tree = ET.parse(xml_path)
    except ET.ParseError:
      print(f"Failed to parse {file}")
      continue

    filename_tag = tree.getroot().find('filename')
    if filename_tag is None:
      continue
    old_name = filename_tag.text
    if not old_name:
      # An empty <filename/> has text None: there is nothing to reformat
      print(f"Skipped {file}: empty <filename> tag")
      continue

    new_name = format_filename(old_name, prefix)
    if new_name != old_name:
      filename_tag.text = new_name
      tree.write(xml_path)
      updated += 1

  print(f"Updated <filename> in {updated} XML files.")

# Rewrites the <filename> tag of the XML files in the SARD folder (changed files are overwritten in place)
update_xml_filenames(folder_path_sard)

Updated <filename> in 0 XML files.


In [None]:
def map_class_name(label):
  """
  Translate a class name used in the CSV into the label used in the XML files.
  Names without a known translation are returned unchanged.
  """
  csv_to_xml = dict(
    [
      ("Standing", "stands"),
      ("Not-defined", "not_defined"),
      ("Lying", "laying_down"),
      ("Sitting", "seated"),
      ("Walking", "Walking"),
      ("Running", "Running"),
    ]
  )
  if label in csv_to_xml:
    return csv_to_xml[label]
  return label

def normalize_csv_annotations(folder_path, csv_filename): # To normalize class name in a csv file to match XML annotation format
  """
  Rewrite a CSV file so that its class names follow the XML annotation naming
  (mapped through map_class_name, then lower-cased and stripped) and its rows are
  sorted by filename. The CSV file is overwritten.

  Args:
    folder_path (str): Path to the folder containing the CSV file.
    csv_filename (str): Name of the CSV file.

  Returns:
    pd.DataFrame: The normalized and sorted annotations that were written to disk.
  """
  csv_path = os.path.join(folder_path, csv_filename)

  normalized = (
    pd.read_csv(csv_path)
    # 'class' is a Python keyword, hence the dict form of assign
    .assign(**{'class': lambda frame: frame['class'].apply(map_class_name).str.lower().str.strip()})
    .sort_values(by='filename')
    .reset_index(drop=True)
  )

  normalized.to_csv(csv_path, index=False)
  print(f"CSV '{csv_filename}' normalized and saved.")
  return normalized

# Overwrites sard_labels.csv with normalized class names; the returned DataFrame is displayed as the cell output
normalize_csv_annotations(folder_path_sard, 'sard_labels.csv')

CSV 'sard_labels.csv' normalized and saved.


Unnamed: 0,filename,width,height,class,xmin,ymin,xmax,ymax
0,gss0001.jpg,1920,1080,walking,1110,358,1134,424
1,gss0001.jpg,1920,1080,stands,1077,367,1100,428
2,gss0001.jpg,1920,1080,seated,1041,144,1061,173
3,gss0002.jpg,1920,1080,seated,1041,142,1062,173
4,gss0002.jpg,1920,1080,stands,1079,365,1101,428
...,...,...,...,...,...,...,...,...
6527,gss2104.jpg,1920,1080,stands,1174,604,1226,672
6528,gss2104.jpg,1920,1080,stands,1226,1040,1266,1080
6529,gss2104.jpg,1920,1080,stands,1391,931,1458,982
6530,gss2104.jpg,1920,1080,stands,1548,1032,1608,1071


In [None]:
def parse_xml_annotations(xml_folder): # Read all the XML files in a folder and return a pandas DataFrame with the annotations in them
  """
  Parses annotation data from all XML files in a given folder and returns a DataFrame containing object labels and bounding boxes.

  Args:
    xml_folder (str): Path to the folder containing XML annotation files.

  Returns:
    pandas.DataFrame: A DataFrame with columns ['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
    containing all annotations extracted from the XML files. The columns are present
    even when no annotation is found (the DataFrame is then empty).
  """
  columns = ['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
  annotations = []

  for file in os.listdir(xml_folder):
    if not file.endswith('.xml'):
      continue
    xml_path = os.path.join(xml_folder, file)

    try:
      root = ET.parse(xml_path).getroot()
    except ET.ParseError:
      print(f"Warning: Failed to parse {xml_path}")
      continue

    filename = root.find('filename').text.strip()
    for obj in root.findall('object'):
      bbox = obj.find('bndbox')
      annotations.append({
        'filename': filename,
        'class': obj.find('name').text.strip().lower(),  # labels are compared in lower case
        'xmin': int(bbox.find('xmin').text),
        'ymin': int(bbox.find('ymin').text),
        'xmax': int(bbox.find('xmax').text),
        'ymax': int(bbox.find('ymax').text)
      })

  # Passing the columns explicitly keeps the schema stable when `annotations` is empty
  return pd.DataFrame(annotations, columns=columns)

In [None]:
def check_csv_vs_xml_annotations(folder_path, csv_filename): # Check if the csv annotations match the xml files
  """
  Cross-check the annotations of a CSV file against those stored in the XML files
  of the same folder. Two annotations match when filename, class and the four
  bounding-box coordinates are all equal.

  Args:
    folder_path (str): Path to the folder containing both the CSV and XML files.
    csv_filename (str): Name of the CSV file to validate.

  Returns:
    dict: Summary with:
      - 'matched': Number of annotation rows present in both sources.
      - 'only_in_csv': Number of rows found only in the CSV.
      - 'only_in_xml': Number of rows found only in the XML files.
      - 'files_only_in_csv': Sorted list of filenames having rows only in the CSV.
      - 'files_only_in_xml': Sorted list of filenames having rows only in the XML files.
  """
  df_csv = pd.read_csv(os.path.join(folder_path, csv_filename))
  df_xml = parse_xml_annotations(folder_path)

  # Bring the CSV keys to the same textual form used for the XML annotations
  df_csv['filename'] = df_csv['filename'].astype(str).str.strip()
  df_csv['class'] = df_csv['class'].astype(str).str.strip().str.lower()

  key_columns = ['filename', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
  merged = df_csv.merge(df_xml, how='outer', on=key_columns, indicator=True)

  # '_merge' tells, for each row, which side(s) it comes from
  origin = merged['_merge']
  origin_counts = origin.value_counts()

  csv_only_files = set(merged[origin == 'left_only']['filename'])
  xml_only_files = set(merged[origin == 'right_only']['filename'])

  summary = {}
  summary['matched'] = origin_counts.get('both', 0)
  summary['only_in_csv'] = origin_counts.get('left_only', 0)
  summary['only_in_xml'] = origin_counts.get('right_only', 0)
  summary['files_only_in_csv'] = sorted(csv_only_files)
  summary['files_only_in_xml'] = sorted(xml_only_files)
  return summary

# Choose which CSV file to validate against the XML annotations
csv_filename = 'sard_labels.csv'
# csv_filename = 'sard_person_labels.csv'

# Use the selected file (the name was previously hard-coded in the call, so the switch above had no effect)
result = check_csv_vs_xml_annotations(folder_path_sard, csv_filename)

print("Correct Annotations:", result['matched'])
print("Annotations only in CSV:", result['only_in_csv'])
print("Annotazioni only in XML:", result['only_in_xml'])

# if result['files_only_in_csv']:
#   print("Images with annotations only in CSV:")
#   for f in result['files_only_in_csv']:
#     print("  -", f)

# if result['files_only_in_xml']:
#   print("Images with annotations only in XML:")
#   for f in result['files_only_in_xml']:
#     print("  -", f)

Correct Annotations: 6532
Annotations only in CSV: 0
Annotazioni only in XML: 0


In [None]:
# To compare two csv files: The purpose is to ensure that both CSV files are consistent when the class labels are normalized to "person"
def compare_personified_csvs(folder_path, full_csv='sard_labels.csv', person_csv='sard_person_labels.csv'):
  """
  Compares two CSV files where one contains full class labels and the other has all labels replaced with 'person'.

  Args:
    folder_path (str): Path to the folder containing the CSV files.
    full_csv (str): CSV file with full class names.
    person_csv (str): CSV file with all classes replaced by 'person'.

  Returns:
    bool: True when both files contain the same boxes once every class of
    full_csv is replaced by 'person', False otherwise.
  """
  df_full = pd.read_csv(os.path.join(folder_path, full_csv))
  df_person = pd.read_csv(os.path.join(folder_path, person_csv))

  # Normalize the full CSV to have 'person' as the class
  df_full_personified = df_full.copy()
  df_full_personified['class'] = 'person'

  # Sort both for consistent comparison
  sort_keys = ['filename', 'xmin', 'ymin', 'xmax', 'ymax']
  cols_to_compare = sort_keys + ['class']
  left = df_full_personified.sort_values(by=sort_keys).reset_index(drop=True)[cols_to_compare]
  right = df_person.sort_values(by=sort_keys).reset_index(drop=True)[cols_to_compare]

  comparison = left.equals(right)

  if comparison:
    print("The CSV files match after normalizing class names to 'person'.")
  else:
    print("Differences found between the two CSVs after normalizing classes.")
    if len(left) != len(right):
      # DataFrame.compare only accepts identically-labeled frames, so it cannot be used here
      print(f"Row counts differ: {len(left)} rows in '{full_csv}' vs {len(right)} rows in '{person_csv}'.")
    else:
      mismatches = left.compare(right)
      print("Sample differences:")
      print(mismatches.head(10))

  return comparison


# Compare the two default CSV files of the SARD folder; the returned value is displayed as the cell output
compare_personified_csvs(folder_path_sard)

The CSV files match after normalizing class names to 'person'.


True

In [None]:
def analyze_dataset_annotations(file_csv, dataset_folder):  # Basic Analysis of the dataset
  """
  Summarize the annotations of a CSV file and detect the images (.jpg) of a
  folder that have no annotation row in that CSV.

  Args:
    file_csv (str): Path to the CSV file (columns 'filename' and 'class' are used).
    dataset_folder (str): Folder containing the .jpg images.

  Returns:
    dict: label counts, distribution of bounding boxes per image, number of
    annotated images, number of images in the folder, and the number and the
    set of images without annotations.
  """
  annotations = pd.read_csv(file_csv)

  # How many annotation rows each class has
  per_class = annotations['class'].value_counts()

  # For each box count k: how many images have exactly k boxes
  boxes_histogram = annotations.groupby('filename').size().value_counts().sort_index()

  # Images referenced by the CSV
  annotated = set(annotations['filename'].unique())

  # Images physically present in the folder
  on_disk = set()
  for entry in os.listdir(dataset_folder):
    if entry.endswith('.jpg'):
      on_disk.add(entry)

  # Present in the folder but never mentioned in the CSV
  not_annotated = on_disk - annotated

  summary = {}
  summary["label_count"] = per_class
  summary["distribution_of_bbox"] = boxes_histogram
  summary["number_images_annotated"] = len(annotated)
  summary["number_all_images"] = len(on_disk)
  summary["number_images_without_annotations"] = len(not_annotated)
  summary["images_without_annotations"] = not_annotated  # useful for debugging or for removing them
  return summary



folder_path_sard = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'
csv_path_sard = os.path.join(folder_path_sard, 'sard_labels.csv')

risultati = analyze_dataset_annotations(csv_path_sard, folder_path_sard)

# Number of annotations per class
print("How many images for each label:")
print(risultati["label_count"])

# Histogram: number of images having a given number of bounding boxes
bbox_distribution = risultati["distribution_of_bbox"]
print("\nDistribution of bbox for image:")
print(bbox_distribution)

for n_bbox, n_images in bbox_distribution.items():
  print(f"There are {n_images} images with {n_bbox} bbox")

# Coverage of the annotations over the images found in the folder
for caption, key in (
  ("\nImages with at least one label (bounding box):", "number_images_annotated"),
  ("Total images:", "number_all_images"),
  ("Images without any label:", "number_images_without_annotations"),
  ("The images without any label are:", "images_without_annotations"),
):
  print(caption, risultati[key])


How many images for each label:
class
stands         1896
laying_down    1591
walking        1303
not_defined    1039
seated          624
running          79
Name: count, dtype: int64

Distribution of bbox for image:
{1: 557, 2: 755, 3: 468, 4: 155, 5: 45, 6: 1}
There are 557 images with 1 bbox
There are 755 images with 2 bbox
There are 468 images with 3 bbox
There are 155 images with 4 bbox
There are 45 images with 5 bbox
There are 1 images with 6 bbox

Images with at least one label (bounding box): 1981
Total images: 1983
Images without any label: 2
The images without any label are: {'gss1128.jpg', 'gss0806.jpg'}


In [None]:
def check_image_dimensions_consistency(file_csv): # Check if all the images have the same dimensions
  """
  Report which (width, height) pairs appear in a CSV file and how many distinct
  images use each of them.

  Args:
    file_csv (str): Path to a CSV file with the columns 'filename', 'width' and 'height'.

  Returns:
    pd.DataFrame: One row per distinct size, with columns 'width', 'height' and 'image_count'.
  """
  df = pd.read_csv(file_csv)

  # Non-numeric sizes become NaN instead of raising
  df['width'] = pd.to_numeric(df['width'], errors='coerce')
  df['height'] = pd.to_numeric(df['height'], errors='coerce')

  images_per_size = df.groupby(['width', 'height'])['filename'].nunique()
  dimension_counts = images_per_size.reset_index(name='image_count')

  if len(dimension_counts) != 1:
    print(f"Found {len(dimension_counts)} different image sizes:")
    print(dimension_counts)
  else:
    print(f"All images have the same size: {dimension_counts.iloc[0]['width']}x{dimension_counts.iloc[0]['height']}")

  return dimension_counts

# Paths of the two SARD annotation CSV files
folder_path_sard = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'
csv_sard_labels_path = os.path.join(folder_path_sard, 'sard_labels.csv')
csv_sard_person_labels_path = os.path.join(folder_path_sard, 'sard_person_labels.csv')
# Both calls print their own summary; only the DataFrame returned by the last call is displayed as the cell output
check_image_dimensions_consistency(csv_sard_labels_path)
check_image_dimensions_consistency(csv_sard_person_labels_path)

All images have the same size: 1920x1080
All images have the same size: 1920x1080


Unnamed: 0,width,height,image_count
0,1920,1080,1981


In [None]:
def check_bboxes_out_of_bounds_from_csv(file_csv): # Check if the bbox aren't outside the image frame
  """
  List the bounding boxes of a CSV file that are not valid: boxes that leave the
  image frame, boxes with zero or negative width/height, and rows whose values
  are not numeric.

  Args:
    file_csv (str): Path to a CSV file with the columns 'filename', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height'.

  Returns:
    pd.DataFrame: The rows with invalid bounding boxes.
  """
  df = pd.read_csv(file_csv)

  # Make sure the fields are numbers (anything else becomes NaN)
  for numeric_col in ('xmin', 'ymin', 'xmax', 'ymax', 'width', 'height'):
    df[numeric_col] = pd.to_numeric(df[numeric_col], errors='coerce')

  inside_frame = (
    (df['xmin'] >= 0) &
    (df['ymin'] >= 0) &
    (df['xmax'] <= df['width']) &
    (df['ymax'] <= df['height'])
  )
  positive_size = (df['xmax'] > df['xmin']) & (df['ymax'] > df['ymin'])

  # Negating the "valid" mask (instead of testing the opposite conditions) also
  # flags rows containing NaN, because every comparison with NaN is False
  valid_mask = inside_frame & positive_size
  invalid_rows = df[~valid_mask].copy()

  print(f"Checked all entries. Found {len(invalid_rows)} invalid bounding boxes.")
  return invalid_rows


# `folder_path` is not defined at notebook level (it is only a function parameter),
# so the dataset folder variable `folder_path_sard` must be used here
csv_sard_labels_path = os.path.join(folder_path_sard, 'sard_labels.csv')
csv_sard_person_labels_path = os.path.join(folder_path_sard, 'sard_person_labels.csv')

# Check both CSV files and show the invalid rows of each one
# (previously the result of the first check was overwritten before being printed)
for csv_path in (csv_sard_labels_path, csv_sard_person_labels_path):
  invalid_bboxes = check_bboxes_out_of_bounds_from_csv(csv_path)
  print(invalid_bboxes[['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height']])


Checked all entries. Found 0 invalid bounding boxes.
Checked all entries. Found 0 invalid bounding boxes.
Empty DataFrame
Columns: [filename, xmin, ymin, xmax, ymax, width, height]
Index: []


In [None]:
# Note on the "not_defined" label:
# In the SARD dataset, the "not_defined" label is used when a person is clearly present in the image, but their activity or posture
#  cannot be reliably classified (e.g., due to occlusion or ambiguity).
# Since my task is person detection (i.e., detecting the presence of a person, regardless of their behavior), I decided to treat
#  all "not_defined" annotations as valid person instances.
# This avoids undercounting people in cases where the action is unclear but their presence is certain.

def classify_image_annotation_quality(file_csv, dataset_folder, threshold=3): # To find suspicious images
  """
  Sort images into 3 categories, where a bounding box is "suspicious" when at
  least one of xmin, ymin, xmax, ymax is <= threshold:
    1. suspected error annotation (every bounding box of the image is suspicious)
    2. partially suspected annotation (some, but not all, bounding boxes are suspicious)
    3. no annotation (the image is in the folder but has no row in the CSV)

  Args:
    file_csv (str): Path to the CSV file with the annotations.
    dataset_folder (str): Folder containing the .jpg images.
    threshold (int): Largest coordinate value still considered suspicious.

  Returns:
    dict: Three sets of image filenames, under the keys 'case1_all_suspicious',
    'case2_some_suspicious' and 'case3_no_annotations'.
  """
  df = pd.read_csv(file_csv)

  # Coordinates must be numeric (anything else becomes NaN and is never suspicious)
  coordinate_cols = ['xmin', 'ymin', 'xmax', 'ymax']
  for coordinate in coordinate_cols:
    df[coordinate] = pd.to_numeric(df[coordinate], errors='coerce')

  # One boolean per bounding box
  df['suspicious'] = (df[coordinate_cols] <= threshold).any(axis=1)

  # Aggregate the flags image by image
  flags_per_image = df.groupby('filename')['suspicious']
  every_box_suspicious = flags_per_image.all()
  any_box_suspicious = flags_per_image.any()
  only_some_suspicious = any_box_suspicious & ~every_box_suspicious

  # Images known to the CSV versus images present in the folder
  images_in_csv = set(df['filename'].unique())
  images_in_folder = set()
  for entry in os.listdir(dataset_folder):
    if entry.endswith('.jpg'):
      images_in_folder.add(entry)

  categories = {}
  categories["case1_all_suspicious"] = set(every_box_suspicious[every_box_suspicious].index)  # treated like empty images
  categories["case2_some_suspicious"] = set(only_some_suspicious[only_some_suspicious].index)
  categories["case3_no_annotations"] = images_in_folder - images_in_csv  # these are empty images
  return categories

# Paths of the SARD folder and of its two annotation CSV files
folder_path_sard = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/SARD'
csv_sard_labels_path = os.path.join(folder_path_sard, 'sard_labels.csv')
csv_sard_person_labels_path = os.path.join(folder_path_sard, 'sard_person_labels.csv')

# `result` is reused by a later cell to build the list of images to remove
result = classify_image_annotation_quality(csv_sard_labels_path, folder_path_sard)
# result = classify_image_annotation_quality(csv_sard_person_labels_path, folder_path_sard)

print(f"Case 1 (only bbox suspicious): {len(result['case1_all_suspicious'])} and they are: {result['case1_all_suspicious']}")
print(f"Case 2 (bbox mix): {len(result['case2_some_suspicious'])}")
print(f"Case 3 (no annotation): {len(result['case3_no_annotations'])} and they are: {result['case3_no_annotations']}")

# I consider the images of cases 1 and 3 useless in this unbalanced dataset, so I am going to delete them.

Case 1 (only bbox suspicious): 4 and they are: {'gss1138.jpg', 'gss0326.jpg', 'gss0327.jpg', 'gss1129.jpg'}
Case 2 (bbox mix): 107
Case 3 (no annotation): 2 and they are: {'gss1128.jpg', 'gss0806.jpg'}


In [None]:
def clean_dataset(image_folder, xml_folder, csv_files, images_to_remove): # To delete images and xml file (also in the csv)
  """
  Remove a list of images from the dataset: delete each image and its XML file,
  then drop the rows of those images from every CSV file (overwritten in place).

  Args:
    image_folder (str): Path to folder containing .jpg files.
    xml_folder (str): Path to folder containing .xml files.
    csv_files (list of str): List of paths to CSV files to clean.
    images_to_remove (list or set): Filenames (e.g., 'gss1128.jpg') to remove.
  """
  # Step 1: delete the files from disk, reporting what was (not) found
  for image in images_to_remove:
    image_path = os.path.join(image_folder, image)
    stem, _extension = os.path.splitext(image)
    xml_path = os.path.join(xml_folder, stem + '.xml')

    if not os.path.exists(image_path):
      print(f"Image not found: {image}")
    else:
      os.remove(image_path)
      print(f"Deleted image: {image}")

    if not os.path.exists(xml_path):
      print(f"XML not found: {os.path.basename(xml_path)}")
    else:
      os.remove(xml_path)
      print(f"Deleted XML: {os.path.basename(xml_path)}")

  # Step 2: rewrite each CSV file without the rows of the removed images
  for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    initial_len = len(df)

    # isin() marks the rows whose filename is in images_to_remove; those rows are dropped
    to_drop = df['filename'].isin(images_to_remove)
    df = df[~to_drop]

    # Overwrite the original CSV file
    df.to_csv(csv_file, index=False)

    print(f"Updated {csv_file}: {initial_len - len(df)} rows removed")


# CSV files that must be kept in sync with the deleted images
csv_sard_labels_path = os.path.join(folder_path_sard, 'sard_labels.csv')
csv_sard_person_labels_path = os.path.join(folder_path_sard, 'sard_person_labels.csv')
csv_files = [csv_sard_labels_path, csv_sard_person_labels_path]
# Images to delete: those whose boxes are all suspicious plus those without annotations
images_to_remove = result['case1_all_suspicious'].union(result['case3_no_annotations'])
# Destructive step, left disabled on purpose (images and XML files live in the same SARD folder):
# clean_dataset(folder_path_sard, folder_path_sard, csv_files, images_to_remove)

In [None]:
# TODO: check the CSV file of the images with visual effects: what is it for? Cross-check it against the annotations as well.
# Does that file contain all the images with effects? I am only interested in fog, snow and motion_blur.
# Should the images with effects be added to the dataset as if they belonged to a separate dataset?

In [None]:
"""
The SARD dataset is unbalanced: it contains very few (almost no) empty images (= images without people),
  so the model would get used to seeing only situations in which people are actually present.
I think it is a good idea to have a certain number of empty images instead.
Open question: what should the ratio between empty images and images with people be (rough estimate)?
This is also why I was thinking of merging SARD with some other dataset that does contain empty images; however, the ratio
 will still be in favour of the images with people. Is that acceptable?
Moreover, if I also add the modified images to the SARD dataset (the same SARD images, but with
 special effects applied), the gap between the number of empty images and the number of images with people will become even larger.
"""


In [None]:
# Folders and filename prefixes used to rename the image files with visual effects
folder_path_fog = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/Corr/fog'
folder_path_snow = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/Corr/snow'
folder_path_motion_blur = '/content/drive/MyDrive/projectUPV/datasets/SARD_dataset/Corr/motion_blur'

# One prefix per effect; rename_files builds names of the form <prefix>_NNNN<ext>
prefix_fog = 'fog'
prefix_snow = 'snow'
prefix_motion_blur = 'motion_blur'

def rename_files(folder_path, prefix):
  """
  Rename images with effects (fog, snow, motion blur) to '<prefix>_NNNN<ext>'.

  Images (.jpg, .jpeg, .png, matched case-insensitively) are numbered from 1
  following the lexicographic order of their current names; each file keeps
  its original extension. Files that already have their target name are left
  untouched.

  The rename is done in two passes (current name -> temporary name -> final
  name). A direct os.rename would silently replace an existing destination on
  POSIX systems, so an image whose current name is another image's target
  name (e.g. 'fog_0001.jpg' already present next to not-yet-renamed files)
  would be overwritten and lost.

  Args:
    folder_path (str): Path to dataset containing images
    prefix (str): Prefix to add to the new filenames
  """
  import uuid  # local import: only needed to build collision-free temporary names

  files = sorted(f for f in os.listdir(folder_path) if f.lower().endswith((".jpg", ".jpeg", ".png"))) # to order files by name

  # Plan every rename first, skipping files that already carry their final name
  planned = []
  for i, filename in enumerate(files, start=1):
    ext = os.path.splitext(filename)[1] # "namefile.jpg" -> ".jpg"
    new_name = f"{prefix}_{i:04d}{ext}" # to build the new name
    if new_name != filename:
      planned.append((filename, new_name, ext))

  # Pass 1: move every file to a unique temporary name so no target name stays occupied
  token = uuid.uuid4().hex
  staged = []
  for n, (filename, new_name, ext) in enumerate(planned):
    tmp_name = f".renaming_{token}_{n:05d}{ext}"
    os.rename(os.path.join(folder_path, filename), os.path.join(folder_path, tmp_name))
    staged.append((tmp_name, new_name))

  # Pass 2: move the temporary files to their final names
  for tmp_name, new_name in staged:
    os.rename(os.path.join(folder_path, tmp_name), os.path.join(folder_path, new_name))

# Rename the images of each effect folder in place, using the matching prefix.
# NOTE: this modifies the files on disk every time the cell is executed.
rename_files(folder_path_fog, prefix_fog)
rename_files(folder_path_snow, prefix_snow)
rename_files(folder_path_motion_blur, prefix_motion_blur)

In [None]:

fog_mapping = {
  'fog_0001.jpg': 'gss2.jpg', 'fog_0002.jpg': 'gss4.jpg', 'fog_0003.jpg': 'gss7.jpg', 'fog_0004.jpg': 'gss9.jpg', 'fog_0005.jpg': 'gss12.jpg', 'fog_0006.jpg': 'gss14.jpg', 'fog_0007.jpg': 'gss17.jpg', 'fog_0008.jpg': 'gss19.jpg', 'fog_0009.jpg': 'gss22.jpg', 'fog_0010.jpg': 'gss24.jpg', 'fog_0011.jpg': 'gss27.jpg', 'fog_0012.jpg': 'gss29.jpg', 'fog_0013.jpg': 'gss32.jpg', 'fog_0014.jpg': 'gss34.jpg', 'fog_0015.jpg': 'gss37.jpg', 'fog_0016.jpg': 'gss39.jpg', 'fog_0017.jpg': 'gss42.jpg', 'fog_0018.jpg': 'gss44.jpg', 'fog_0019.jpg': 'gss47.jpg', 'fog_0020.jpg': 'gss49.jpg',
  'fog_0021.jpg': 'gss52.jpg', 'fog_0022.jpg': 'gss54.jpg', 'fog_0023.jpg': 'gss57.jpg', 'fog_0024.jpg': 'gss59.jpg', 'fog_0025.jpg': 'gss66.jpg', 'fog_0026.jpg': 'gss83.jpg', 'fog_0027.jpg': 'gss87.jpg', 'fog_0028.jpg': 'gss89.jpg', 'fog_0029.jpg': 'gss92.jpg', 'fog_0030.jpg': 'gss94.jpg', 'fog_0031.jpg': 'gss97.jpg', 'fog_0032.jpg': 'gss99.jpg', 'fog_0033.jpg': 'gss102.jpg', 'fog_0034.jpg': 'gss104.jpg', 'fog_0035.jpg': 'gss107.jpg', 'fog_0036.jpg': 'gss109.jpg', 'fog_0037.jpg': 'gss112.jpg', 'fog_0038.jpg': 'gss114.jpg', 'fog_0039.jpg': 'gss117.jpg', 'fog_0040.jpg': 'gss119.jpg',
  'fog_0041.jpg': 'gss122.jpg', 'fog_0042.jpg': 'gss124.jpg', 'fog_0043.jpg': 'gss127.jpg', 'fog_0044.jpg': 'gss129.jpg', 'fog_0045.jpg': 'gss132.jpg', 'fog_0046.jpg': 'gss134.jpg', 'fog_0047.jpg': 'gss137.jpg', 'fog_0048.jpg': 'gss139.jpg', 'fog_0049.jpg': 'gss142.jpg', 'fog_0050.jpg': 'gss144.jpg', 'fog_0051.jpg': 'gss147.jpg', 'fog_0052.jpg': 'gss149.jpg', 'fog_0053.jpg': 'gss152.jpg', 'fog_0054.jpg': 'gss154.jpg', 'fog_0055.jpg': 'gss157.jpg', 'fog_0056.jpg': 'gss159.jpg', 'fog_0057.jpg': 'gss162.jpg', 'fog_0058.jpg': 'gss164.jpg', 'fog_0059.jpg': 'gss167.jpg', 'fog_0060.jpg': 'gss169.jpg',
  'fog_0061.jpg': 'gss172.jpg', 'fog_0062.jpg': 'gss174.jpg', 'fog_0063.jpg': 'gss177.jpg', 'fog_0064.jpg': 'gss179.jpg', 'fog_0065.jpg': 'gss182.jpg', 'fog_0066.jpg': 'gss184.jpg', 'fog_0067.jpg': 'gss187.jpg', 'fog_0068.jpg': 'gss189.jpg', 'fog_0069.jpg': 'gss192.jpg', 'fog_0070.jpg': 'gss194.jpg', 'fog_0071.jpg': 'gss197.jpg', 'fog_0072.jpg': 'gss199.jpg', 'fog_0073.jpg': 'gss202.jpg', 'fog_0074.jpg': 'gss204.jpg', 'fog_0075.jpg': 'gss207.jpg', 'fog_0076.jpg': 'gss209.jpg', 'fog_0077.jpg': 'gss212.jpg', 'fog_0078.jpg': 'gss214.jpg', 'fog_0079.jpg': 'gss217.jpg', 'fog_0080.jpg': 'gss219.jpg',
  'fog_0081.jpg': 'gss222.jpg', 'fog_0082.jpg': 'gss224.jpg', 'fog_0083.jpg': 'gss227.jpg', 'fog_0084.jpg': 'gss229.jpg', 'fog_0085.jpg': 'gss232.jpg', 'fog_0086.jpg': 'gss234.jpg', 'fog_0087.jpg': 'gss237.jpg', 'fog_0088.jpg': 'gss239.jpg', 'fog_0089.jpg': 'gss242.jpg', 'fog_0090.jpg': 'gss244.jpg', 'fog_0091.jpg': 'gss247.jpg', 'fog_0092.jpg': 'gss249.jpg', 'fog_0093.jpg': 'gss252.jpg', 'fog_0094.jpg': 'gss254.jpg', 'fog_0095.jpg': 'gss257.jpg', 'fog_0096.jpg': 'gss259.jpg', 'fog_0097.jpg': 'gss262.jpg', 'fog_0098.jpg': 'gss264.jpg', 'fog_0099.jpg': 'gss267.jpg', 'fog_0100.jpg': 'gss269.jpg',
  'fog_0101.jpg': 'gss272.jpg', 'fog_0102.jpg': 'gss274.jpg', 'fog_0103.jpg': 'gss277.jpg', 'fog_0104.jpg': 'gss279.jpg', 'fog_0105.jpg': 'gss282.jpg', 'fog_0106.jpg': 'gss288.jpg', 'fog_0107.jpg': 'gss291.jpg', 'fog_0108.jpg': 'gss293.jpg', 'fog_0109.jpg': 'gss296.jpg', 'fog_0110.jpg': 'gss298.jpg', 'fog_0111.jpg': 'gss301.jpg', 'fog_0112.jpg': 'gss303.jpg', 'fog_0113.jpg': 'gss306.jpg', 'fog_0114.jpg': 'gss309.jpg', 'fog_0115.jpg': 'gss312.jpg', 'fog_0116.jpg': 'gss314.jpg', 'fog_0117.jpg': 'gss317.jpg', 'fog_0118.jpg': 'gss319.jpg', 'fog_0119.jpg': 'gss322.jpg', 'fog_0120.jpg': 'gss324.jpg',
  'fog_0121.jpg': 'gss327.jpg', 'fog_0122.jpg': 'gss331.jpg', 'fog_0123.jpg': 'gss334.jpg', 'fog_0124.jpg': 'gss336.jpg', 'fog_0125.jpg': 'gss339.jpg', 'fog_0126.jpg': 'gss341.jpg', 'fog_0127.jpg': 'gss344.jpg', 'fog_0128.jpg': 'gss346.jpg', 'fog_0129.jpg': 'gss349.jpg', 'fog_0130.jpg': 'gss351.jpg', 'fog_0131.jpg': 'gss354.jpg', 'fog_0132.jpg': 'gss356.jpg', 'fog_0133.jpg': 'gss359.jpg', 'fog_0134.jpg': 'gss361.jpg', 'fog_0135.jpg': 'gss364.jpg', 'fog_0136.jpg': 'gss366.jpg', 'fog_0137.jpg': 'gss369.jpg', 'fog_0207.jpg': 'gss544.jpg', 'fog_0208.jpg': 'gss546.jpg', 'fog_0209.jpg': 'gss549.jpg',
  'fog_0210.jpg': 'gss551.jpg', 'fog_0211.jpg': 'gss554.jpg', 'fog_0212.jpg': 'gss556.jpg', 'fog_0213.jpg': 'gss559.jpg', 'fog_0214.jpg': 'gss561.jpg', 'fog_0215.jpg': 'gss564.jpg', 'fog_0216.jpg': 'gss566.jpg', 'fog_0217.jpg': 'gss569.jpg', 'fog_0218.jpg': 'gss571.jpg', 'fog_0219.jpg': 'gss574.jpg', 'fog_0220.jpg': 'gss576.jpg', 'fog_0221.jpg': 'gss579.jpg', 'fog_0222.jpg': 'gss581.jpg', 'fog_0223.jpg': 'gss584.jpg', 'fog_0224.jpg': 'gss586.jpg', 'fog_0225.jpg': 'gss589.jpg', 'fog_0226.jpg': 'gss591.jpg', 'fog_0227.jpg': 'gss594.jpg', 'fog_0237.jpg': 'gss619.jpg', 'fog_0238.jpg': 'gss621.jpg',
  'fog_0239.jpg': 'gss624.jpg', 'fog_0240.jpg': 'gss626.jpg', 'fog_0241.jpg': 'gss629.jpg', 'fog_0242.jpg': 'gss631.jpg', 'fog_0243.jpg': 'gss634.jpg', 'fog_0244.jpg': 'gss636.jpg', 'fog_0245.jpg': 'gss639.jpg', 'fog_0246.jpg': 'gss641.jpg', 'fog_0247.jpg': 'gss644.jpg', 'fog_0248.jpg': 'gss646.jpg', 'fog_0249.jpg': 'gss649.jpg', 'fog_0250.jpg': 'gss651.jpg', 'fog_0251.jpg': 'gss654.jpg', 'fog_0252.jpg': 'gss656.jpg', 'fog_0253.jpg': 'gss659.jpg', 'fog_0254.jpg': 'gss661.jpg', 'fog_0255.jpg': 'gss664.jpg', 'fog_0256.jpg': 'gss666.jpg', 'fog_0257.jpg': 'gss669.jpg', 'fog_0258.jpg': 'gss671.jpg',
  'fog_0259.jpg': 'gss674.jpg', 'fog_0260.jpg': 'gss676.jpg', 'fog_0261.jpg': 'gss679.jpg', 'fog_0262.jpg': 'gss681.jpg', 'fog_0263.jpg': 'gss684.jpg', 'fog_0264.jpg': 'gss686.jpg', 'fog_0265.jpg': 'gss689.jpg', 'fog_0266.jpg': 'gss691.jpg', 'fog_0267.jpg': 'gss694.jpg', 'fog_0268.jpg': 'gss696.jpg', 'fog_0269.jpg': 'gss699.jpg', 'fog_0270.jpg': 'gss701.jpg', 'fog_0271.jpg': 'gss704.jpg', 'fog_0272.jpg': 'gss706.jpg', 'fog_0273.jpg': 'gss709.jpg', 'fog_0274.jpg': 'gss711.jpg', 'fog_0275.jpg': 'gss714.jpg', 'fog_0276.jpg': 'gss716.jpg', 'fog_0277.jpg': 'gss719.jpg', 'fog_0278.jpg': 'gss721.jpg',
  'fog_0279.jpg': 'gss724.jpg', 'fog_0280.jpg': 'gss726.jpg', 'fog_0281.jpg': 'gss729.jpg', 'fog_0282.jpg': 'gss731.jpg', 'fog_0283.jpg': 'gss734.jpg', 'fog_0284.jpg': 'gss736.jpg', 'fog_0285.jpg': 'gss739.jpg', 'fog_0286.jpg': 'gss741.jpg', 'fog_0287.jpg': 'gss748.jpg', 'fog_0288.jpg': 'gss750.jpg', 'fog_0289.jpg': 'gss753.jpg', 'fog_0290.jpg': 'gss755.jpg', 'fog_0291.jpg': 'gss758.jpg', 'fog_0292.jpg': 'gss760.jpg', 'fog_0293.jpg': 'gss763.jpg', 'fog_0294.jpg': 'gss765.jpg', 'fog_0295.jpg': 'gss768.jpg', 'fog_0296.jpg': 'gss770.jpg', 'fog_0297.jpg': 'gss773.jpg', 'fog_0298.jpg': 'gss775.jpg',
  'fog_0299.jpg': 'gss778.jpg', 'fog_0300.jpg': 'gss780.jpg', 'fog_0301.jpg': 'gss783.jpg', 'fog_0302.jpg': 'gss785.jpg', 'fog_0303.jpg': 'gss788.jpg', 'fog_0304.jpg': 'gss790.jpg', 'fog_0305.jpg': 'gss793.jpg', 'fog_0306.jpg': 'gss795.jpg', 'fog_0307.jpg': 'gss798.jpg', 'fog_0308.jpg': 'gss800.jpg', 'fog_0309.jpg': 'gss803.jpg', 'fog_0310.jpg': 'gss805.jpg', 'fog_0311.jpg': 'gss809.jpg', 'fog_0312.jpg': 'gss811.jpg', 'fog_0313.jpg': 'gss814.jpg', 'fog_0314.jpg': 'gss816.jpg', 'fog_0315.jpg': 'gss819.jpg', 'fog_0316.jpg': 'gss821.jpg', 'fog_0317.jpg': 'gss824.jpg', 'fog_0318.jpg': 'gss826.jpg',
  'fog_0319.jpg': 'gss829.jpg', 'fog_0320.jpg': 'gss831.jpg', 'fog_0321.jpg': 'gss834.jpg', 'fog_0322.jpg': 'gss837.jpg', 'fog_0323.jpg': 'gss840.jpg', 'fog_0324.jpg': 'gss842.jpg', 'fog_0325.jpg': 'gss845.jpg', 'fog_0326.jpg': 'gss847.jpg', 'fog_0327.jpg': 'gss850.jpg', 'fog_0328.jpg': 'gss852.jpg', 'fog_0329.jpg': 'gss855.jpg', 'fog_0330.jpg': 'gss857.jpg', 'fog_0331.jpg': 'gss860.jpg', 'fog_0332.jpg': 'gss862.jpg', 'fog_0333.jpg': 'gss865.jpg', 'fog_0334.jpg': 'gss867.jpg', 'fog_0335.jpg': 'gss870.jpg', 'fog_0336.jpg': 'gss872.jpg', 'fog_0337.jpg': 'gss875.jpg', 'fog_0338.jpg': 'gss877.jpg',
  'fog_0339.jpg': 'gss880.jpg', 'fog_0340.jpg': 'gss882.jpg', 'fog_0341.jpg': 'gss885.jpg', 'fog_0342.jpg': 'gss887.jpg', 'fog_0343.jpg': 'gss892.jpg', 'fog_0344.jpg': 'gss894.jpg', 'fog_0345.jpg': 'gss897.jpg', 'fog_0346.jpg': 'gss899.jpg', 'fog_0347.jpg': 'gss902.jpg', 'fog_0348.jpg': 'gss904.jpg', 'fog_0349.jpg': 'gss907.jpg', 'fog_0350.jpg': 'gss909.jpg', 'fog_0351.jpg': 'gss912.jpg', 'fog_0352.jpg': 'gss914.jpg', 'fog_0353.jpg': 'gss917.jpg', 'fog_0354.jpg': 'gss919.jpg', 'fog_0355.jpg': 'gss922.jpg', 'fog_0356.jpg': 'gss924.jpg', 'fog_0357.jpg': 'gss927.jpg', 'fog_0358.jpg': 'gss929.jpg',
  'fog_0359.jpg': 'gss932.jpg', 'fog_0360.jpg': 'gss934.jpg', 'fog_0361.jpg': 'gss937.jpg', 'fog_0362.jpg': 'gss939.jpg', 'fog_0363.jpg': 'gss942.jpg', 'fog_0364.jpg': 'gss944.jpg', 'fog_0365.jpg': 'gss947.jpg', 'fog_0366.jpg': 'gss949.jpg', 'fog_0367.jpg': 'gss952.jpg', 'fog_0368.jpg': 'gss954.jpg', 'fog_0369.jpg': 'gss957.jpg', 'fog_0370.jpg': 'gss959.jpg', 'fog_0371.jpg': 'gss962.jpg', 'fog_0372.jpg': 'gss964.jpg', 'fog_0373.jpg': 'gss967.jpg', 'fog_0374.jpg': 'gss969.jpg', 'fog_0375.jpg': 'gss972.jpg', 'fog_0376.jpg': 'gss974.jpg', 'fog_0377.jpg': 'gss1008.jpg', 'fog_0378.jpg': 'gss1010.jpg',
  'fog_0379.jpg': 'gss1013.jpg', 'fog_0380.jpg': 'gss1015.jpg', 'fog_0381.jpg': 'gss1018.jpg', 'fog_0382.jpg': 'gss1020.jpg', 'fog_0383.jpg': 'gss1023.jpg', 'fog_0384.jpg': 'gss1025.jpg', 'fog_0385.jpg': 'gss1028.jpg', 'fog_0386.jpg': 'gss1030.jpg', 'fog_0387.jpg': 'gss1033.jpg', 'fog_0388.jpg': 'gss1035.jpg', 'fog_0389.jpg': 'gss1038.jpg', 'fog_0390.jpg': 'gss1040.jpg', 'fog_0391.jpg': 'gss1043.jpg', 'fog_0392.jpg': 'gss1045.jpg', 'fog_0393.jpg': 'gss1048.jpg', 'fog_0394.jpg': 'gss1050.jpg', 'fog_0395.jpg': 'gss1053.jpg', 'fog_0396.jpg': 'gss1055.jpg', 'fog_0397.jpg': 'gss1058.jpg', 'fog_0398.jpg': 'gss1060.jpg',
  'fog_0399.jpg': 'gss1063.jpg', 'fog_0400.jpg': 'gss1065.jpg', 'fog_0401.jpg': 'gss1068.jpg', 'fog_0402.jpg': 'gss1070.jpg', 'fog_0403.jpg': 'gss1073.jpg', 'fog_0404.jpg': 'gss1075.jpg', 'fog_0405.jpg': 'gss1078.jpg', 'fog_0406.jpg': 'gss1080.jpg', 'fog_0407.jpg': 'gss1083.jpg', 'fog_0408.jpg': 'gss1085.jpg', 'fog_0409.jpg': 'gss1088.jpg', 'fog_0410.jpg': 'gss1090.jpg', 'fog_0411.jpg': 'gss1093.jpg', 'fog_0412.jpg': 'gss1095.jpg', 'fog_0413.jpg': 'gss1098.jpg', 'fog_0414.jpg': 'gss1100.jpg', 'fog_0415.jpg': 'gss1103.jpg', 'fog_0416.jpg': 'gss1105.jpg', 'fog_0417.jpg': 'gss1108.jpg', 'fog_0418.jpg': 'gss1110.jpg',
  'fog_0419.jpg': 'gss1113.jpg', 'fog_0420.jpg': 'gss1115.jpg', 'fog_0421.jpg': 'gss1118.jpg', 'fog_0422.jpg': 'gss1120.jpg', 'fog_0423.jpg': 'gss1123.jpg', 'fog_0424.jpg': 'gss1125.jpg', 'fog_0425.jpg': 'gss1129.jpg', 'fog_0426.jpg': 'gss1139.jpg', 'fog_0427.jpg': 'gss1142.jpg', 'fog_0428.jpg': 'gss1144.jpg', 'fog_0429.jpg': 'gss1147.jpg', 'fog_0430.jpg': 'gss1149.jpg', 'fog_0431.jpg': 'gss1152.jpg', 'fog_0432.jpg': 'gss1154.jpg', 'fog_0433.jpg': 'gss1157.jpg', 'fog_0434.jpg': 'gss1159.jpg', 'fog_0435.jpg': 'gss1162.jpg', 'fog_0436.jpg': 'gss1164.jpg', 'fog_0437.jpg': 'gss1167.jpg', 'fog_0438.jpg': 'gss1169.jpg',
  'fog_0439.jpg': 'gss1172.jpg', 'fog_0440.jpg': 'gss1174.jpg', 'fog_0441.jpg': 'gss1177.jpg', 'fog_0442.jpg': 'gss1179.jpg', 'fog_0443.jpg': 'gss1182.jpg', 'fog_0444.jpg': 'gss1184.jpg', 'fog_0445.jpg': 'gss1187.jpg', 'fog_0446.jpg': 'gss1189.jpg', 'fog_0447.jpg': 'gss1197.jpg', 'fog_0448.jpg': 'gss1199.jpg', 'fog_0449.jpg': 'gss1202.jpg', 'fog_0450.jpg': 'gss1204.jpg', 'fog_0451.jpg': 'gss1207.jpg', 'fog_0452.jpg': 'gss1209.jpg', 'fog_0453.jpg': 'gss1212.jpg', 'fog_0454.jpg': 'gss1214.jpg', 'fog_0455.jpg': 'gss1217.jpg', 'fog_0456.jpg': 'gss1219.jpg', 'fog_0457.jpg': 'gss1222.jpg', 'fog_0458.jpg': 'gss1224.jpg',
  'fog_0459.jpg': 'gss1227.jpg', 'fog_0460.jpg': 'gss1229.jpg', 'fog_0461.jpg': 'gss1232.jpg', 'fog_0462.jpg': 'gss1234.jpg', 'fog_0463.jpg': 'gss1259.jpg', 'fog_0464.jpg': 'gss1261.jpg', 'fog_0465.jpg': 'gss1264.jpg', 'fog_0466.jpg': 'gss1266.jpg', 'fog_0467.jpg': 'gss1269.jpg', 'fog_0468.jpg': 'gss1271.jpg', 'fog_0469.jpg': 'gss1291.jpg', 'fog_0470.jpg': 'gss1293.jpg', 'fog_0471.jpg': 'gss1296.jpg', 'fog_0472.jpg': 'gss1298.jpg', 'fog_0473.jpg': 'gss1301.jpg', 'fog_0474.jpg': 'gss1303.jpg', 'fog_0475.jpg': 'gss1306.jpg', 'fog_0476.jpg': 'gss1308.jpg', 'fog_0477.jpg': 'gss1311.jpg', 'fog_0478.jpg': 'gss1313.jpg',
  'fog_0479.jpg': 'gss1316.jpg', 'fog_0480.jpg': 'gss1318.jpg', 'fog_0481.jpg': 'gss1321.jpg', 'fog_0482.jpg': 'gss1323.jpg', 'fog_0483.jpg': 'gss1326.jpg', 'fog_0484.jpg': 'gss1328.jpg', 'fog_0485.jpg': 'gss1331.jpg', 'fog_0486.jpg': 'gss1333.jpg', 'fog_0487.jpg': 'gss1336.jpg', 'fog_0488.jpg': 'gss1338.jpg', 'fog_0489.jpg': 'gss1341.jpg', 'fog_0490.jpg': 'gss1343.jpg', 'fog_0491.jpg': 'gss1346.jpg', 'fog_0492.jpg': 'gss1348.jpg', 'fog_0493.jpg': 'gss1351.jpg', 'fog_0494.jpg': 'gss1353.jpg', 'fog_0495.jpg': 'gss1356.jpg', 'fog_0496.jpg': 'gss1358.jpg', 'fog_0497.jpg': 'gss1361.jpg', 'fog_0498.jpg': 'gss1363.jpg',
  'fog_0499.jpg': 'gss1366.jpg', 'fog_0500.jpg': 'gss1368.jpg', 'fog_0501.jpg': 'gss1371.jpg', 'fog_0502.jpg': 'gss1373.jpg', 'fog_0503.jpg': 'gss1376.jpg', 'fog_0504.jpg': 'gss1378.jpg', 'fog_0505.jpg': 'gss1381.jpg', 'fog_0506.jpg': 'gss1383.jpg', 'fog_0507.jpg': 'gss1386.jpg', 'fog_0508.jpg': 'gss1388.jpg', 'fog_0509.jpg': 'gss1391.jpg', 'fog_0510.jpg': 'gss1393.jpg', 'fog_0511.jpg': 'gss1396.jpg', 'fog_0512.jpg': 'gss1398.jpg', 'fog_0513.jpg': 'gss1401.jpg', 'fog_0514.jpg': 'gss1403.jpg', 'fog_0515.jpg': 'gss1406.jpg', 'fog_0516.jpg': 'gss1408.jpg', 'fog_0517.jpg': 'gss1411.jpg', 'fog_0518.jpg': 'gss1413.jpg',
  'fog_0519.jpg': 'gss1416.jpg', 'fog_0520.jpg': 'gss1418.jpg', 'fog_0521.jpg': 'gss1421.jpg', 'fog_0522.jpg': 'gss1423.jpg', 'fog_0523.jpg': 'gss1426.jpg', 'fog_0524.jpg': 'gss1428.jpg', 'fog_0525.jpg': 'gss1431.jpg', 'fog_0526.jpg': 'gss1433.jpg', 'fog_0527.jpg': 'gss1436.jpg', 'fog_0528.jpg': 'gss1438.jpg', 'fog_0529.jpg': 'gss1441.jpg', 'fog_0530.jpg': 'gss1443.jpg', 'fog_0531.jpg': 'gss1446.jpg', 'fog_0532.jpg': 'gss1448.jpg', 'fog_0533.jpg': 'gss1451.jpg', 'fog_0534.jpg': 'gss1453.jpg', 'fog_0535.jpg': 'gss1456.jpg', 'fog_0536.jpg': 'gss1458.jpg', 'fog_0537.jpg': 'gss1461.jpg', 'fog_0538.jpg': 'gss1463.jpg',
  'fog_0539.jpg': 'gss1466.jpg', 'fog_0540.jpg': 'gss1468.jpg', 'fog_0541.jpg': 'gss1471.jpg', 'fog_0542.jpg': 'gss1473.jpg', 'fog_0543.jpg': 'gss1476.jpg', 'fog_0544.jpg': 'gss1478.jpg', 'fog_0545.jpg': 'gss1481.jpg', 'fog_0546.jpg': 'gss1483.jpg', 'fog_0547.jpg': 'gss1486.jpg', 'fog_0548.jpg': 'gss1488.jpg', 'fog_0549.jpg': 'gss1491.jpg', 'fog_0550.jpg': 'gss1493.jpg', 'fog_0551.jpg': 'gss1496.jpg', 'fog_0552.jpg': 'gss1498.jpg', 'fog_0553.jpg': 'gss1501.jpg', 'fog_0554.jpg': 'gss1503.jpg', 'fog_0555.jpg': 'gss1506.jpg', 'fog_0556.jpg': 'gss1508.jpg', 'fog_0557.jpg': 'gss1511.jpg', 'fog_0558.jpg': 'gss1513.jpg',
  'fog_0559.jpg': 'gss1516.jpg', 'fog_0560.jpg': 'gss1518.jpg', 'fog_0561.jpg': 'gss1521.jpg', 'fog_0562.jpg': 'gss1523.jpg', 'fog_0563.jpg': 'gss1526.jpg', 'fog_0564.jpg': 'gss1528.jpg', 'fog_0565.jpg': 'gss1531.jpg', 'fog_0566.jpg': 'gss1533.jpg', 'fog_0567.jpg': 'gss1536.jpg', 'fog_0568.jpg': 'gss1538.jpg', 'fog_0569.jpg': 'gss1541.jpg', 'fog_0570.jpg': 'gss1543.jpg', 'fog_0571.jpg': 'gss1546.jpg', 'fog_0572.jpg': 'gss1548.jpg', 'fog_0573.jpg': 'gss1551.jpg', 'fog_0574.jpg': 'gss1553.jpg', 'fog_0575.jpg': 'gss1556.jpg', 'fog_0576.jpg': 'gss1558.jpg', 'fog_0577.jpg': 'gss1561.jpg', 'fog_0578.jpg': 'gss1563.jpg',
  'fog_0579.jpg': 'gss1566.jpg', 'fog_0580.jpg': 'gss1568.jpg', 'fog_0581.jpg': 'gss1571.jpg', 'fog_0582.jpg': 'gss1573.jpg', 'fog_0583.jpg': 'gss1576.jpg', 'fog_0584.jpg': 'gss1578.jpg', 'fog_0585.jpg': 'gss1581.jpg', 'fog_0586.jpg': 'gss1583.jpg', 'fog_0587.jpg': 'gss1586.jpg', 'fog_0588.jpg': 'gss1588.jpg', 'fog_0589.jpg': 'gss1591.jpg', 'fog_0590.jpg': 'gss1593.jpg', 'fog_0591.jpg': 'gss1596.jpg', 'fog_0592.jpg': 'gss1598.jpg', 'fog_0593.jpg': 'gss1601.jpg', 'fog_0594.jpg': 'gss1603.jpg', 'fog_0595.jpg': 'gss1606.jpg', 'fog_0596.jpg': 'gss1608.jpg', 'fog_0597.jpg': 'gss1611.jpg', 'fog_0598.jpg': 'gss1613.jpg',
  'fog_0599.jpg': 'gss1616.jpg', 'fog_0600.jpg': 'gss1618.jpg', 'fog_0601.jpg': 'gss1621.jpg', 'fog_0602.jpg': 'gss1623.jpg', 'fog_0603.jpg': 'gss1626.jpg', 'fog_0604.jpg': 'gss1628.jpg', 'fog_0605.jpg': 'gss1631.jpg', 'fog_0606.jpg': 'gss1633.jpg', 'fog_0607.jpg': 'gss1636.jpg', 'fog_0608.jpg': 'gss1638.jpg', 'fog_0609.jpg': 'gss1641.jpg', 'fog_0610.jpg': 'gss1643.jpg', 'fog_0611.jpg': 'gss1646.jpg', 'fog_0612.jpg': 'gss1648.jpg', 'fog_0613.jpg': 'gss1651.jpg', 'fog_0614.jpg': 'gss1653.jpg', 'fog_0615.jpg': 'gss1656.jpg', 'fog_0616.jpg': 'gss1658.jpg', 'fog_0617.jpg': 'gss1661.jpg', 'fog_0618.jpg': 'gss1663.jpg',
  'fog_0619.jpg': 'gss1666.jpg', 'fog_0620.jpg': 'gss1668.jpg', 'fog_0621.jpg': 'gss1671.jpg', 'fog_0622.jpg': 'gss1673.jpg', 'fog_0623.jpg': 'gss1676.jpg', 'fog_0624.jpg': 'gss1678.jpg', 'fog_0625.jpg': 'gss1681.jpg', 'fog_0626.jpg': 'gss1683.jpg', 'fog_0627.jpg': 'gss1686.jpg', 'fog_0628.jpg': 'gss1688.jpg', 'fog_0629.jpg': 'gss1691.jpg', 'fog_0630.jpg': 'gss1693.jpg', 'fog_0631.jpg': 'gss1696.jpg', 'fog_0632.jpg': 'gss1699.jpg', 'fog_0633.jpg': 'gss1701.jpg', 'fog_0634.jpg': 'gss1703.jpg', 'fog_0635.jpg': 'gss1706.jpg', 'fog_0636.jpg': 'gss1708.jpg', 'fog_0637.jpg': 'gss1711.jpg', 'fog_0638.jpg': 'gss1713.jpg',
  'fog_0639.jpg': 'gss1716.jpg', 'fog_0640.jpg': 'gss1718.jpg', 'fog_0641.jpg': 'gss1721.jpg', 'fog_0642.jpg': 'gss1723.jpg', 'fog_0643.jpg': 'gss1726.jpg', 'fog_0644.jpg': 'gss1728.jpg', 'fog_0645.jpg': 'gss1731.jpg', 'fog_0646.jpg': 'gss1733.jpg', 'fog_0647.jpg': 'gss1736.jpg', 'fog_0648.jpg': 'gss1738.jpg', 'fog_0649.jpg': 'gss1741.jpg', 'fog_0650.jpg': 'gss1743.jpg', 'fog_0651.jpg': 'gss1746.jpg', 'fog_0652.jpg': 'gss1748.jpg', 'fog_0653.jpg': 'gss1751.jpg', 'fog_0654.jpg': 'gss1753.jpg', 'fog_0655.jpg': 'gss1756.jpg', 'fog_0656.jpg': 'gss1758.jpg', 'fog_0657.jpg': 'gss1761.jpg', 'fog_0658.jpg': 'gss1763.jpg',
  'fog_0659.jpg': 'gss1766.jpg', 'fog_0660.jpg': 'gss1768.jpg', 'fog_0661.jpg': 'gss1771.jpg', 'fog_0662.jpg': 'gss1773.jpg', 'fog_0663.jpg': 'gss1776.jpg', 'fog_0664.jpg': 'gss1778.jpg', 'fog_0665.jpg': 'gss1781.jpg', 'fog_0666.jpg': 'gss1783.jpg', 'fog_0667.jpg': 'gss1786.jpg', 'fog_0668.jpg': 'gss1788.jpg', 'fog_0669.jpg': 'gss1791.jpg', 'fog_0670.jpg': 'gss1793.jpg', 'fog_0671.jpg': 'gss1796.jpg', 'fog_0672.jpg': 'gss1798.jpg', 'fog_0673.jpg': 'gss1801.jpg', 'fog_0674.jpg': 'gss1803.jpg', 'fog_0675.jpg': 'gss1806.jpg', 'fog_0676.jpg': 'gss1808.jpg', 'fog_0677.jpg': 'gss1811.jpg', 'fog_0678.jpg': 'gss1813.jpg',
  'fog_0679.jpg': 'gss1816.jpg', 'fog_0680.jpg': 'gss1818.jpg', 'fog_0681.jpg': 'gss1821.jpg', 'fog_0682.jpg': 'gss1823.jpg', 'fog_0683.jpg': 'gss1826.jpg', 'fog_0684.jpg': 'gss1828.jpg', 'fog_0685.jpg': 'gss1831.jpg', 'fog_0686.jpg': 'gss1833.jpg', 'fog_0687.jpg': 'gss1836.jpg', 'fog_0688.jpg': 'gss1838.jpg', 'fog_0689.jpg': 'gss1841.jpg', 'fog_0690.jpg': 'gss1843.jpg', 'fog_0691.jpg': 'gss1846.jpg', 'fog_0692.jpg': 'gss1848.jpg', 'fog_0693.jpg': 'gss1851.jpg', 'fog_0694.jpg': 'gss1853.jpg', 'fog_0695.jpg': 'gss1856.jpg', 'fog_0696.jpg': 'gss1858.jpg', 'fog_0697.jpg': 'gss1861.jpg', 'fog_0698.jpg': 'gss1863.jpg',
  'fog_0699.jpg': 'gss1866.jpg', 'fog_0700.jpg': 'gss1868.jpg', 'fog_0701.jpg': 'gss1871.jpg', 'fog_0702.jpg': 'gss1873.jpg', 'fog_0703.jpg': 'gss1876.jpg', 'fog_0704.jpg': 'gss1878.jpg', 'fog_0705.jpg': 'gss1881.jpg', 'fog_0706.jpg': 'gss1883.jpg', 'fog_0707.jpg': 'gss1886.jpg', 'fog_0708.jpg': 'gss1888.jpg', 'fog_0709.jpg': 'gss1891.jpg', 'fog_0710.jpg': 'gss1893.jpg', 'fog_0711.jpg': 'gss1896.jpg', 'fog_0712.jpg': 'gss1898.jpg', 'fog_0713.jpg': 'gss1901.jpg', 'fog_0714.jpg': 'gss1903.jpg', 'fog_0715.jpg': 'gss1906.jpg', 'fog_0716.jpg': 'gss1908.jpg', 'fog_0717.jpg': 'gss1911.jpg', 'fog_0718.jpg': 'gss1913.jpg',
  'fog_0719.jpg': 'gss1916.jpg', 'fog_0720.jpg': 'gss1918.jpg', 'fog_0721.jpg': 'gss1921.jpg', 'fog_0722.jpg': 'gss1923.jpg', 'fog_0723.jpg': 'gss1926.jpg', 'fog_0724.jpg': 'gss1928.jpg', 'fog_0725.jpg': 'gss1931.jpg', 'fog_0726.jpg': 'gss1933.jpg', 'fog_0727.jpg': 'gss1936.jpg', 'fog_0728.jpg': 'gss1938.jpg', 'fog_0729.jpg': 'gss1941.jpg', 'fog_0730.jpg': 'gss1943.jpg', 'fog_0731.jpg': 'gss1946.jpg', 'fog_0732.jpg': 'gss1948.jpg', 'fog_0733.jpg': 'gss1951.jpg', 'fog_0734.jpg': 'gss1953.jpg', 'fog_0735.jpg': 'gss1956.jpg', 'fog_0736.jpg': 'gss1958.jpg', 'fog_0737.jpg': 'gss1961.jpg', 'fog_0738.jpg': 'gss1963.jpg',
  'fog_0739.jpg': 'gss1966.jpg', 'fog_0740.jpg': 'gss1968.jpg', 'fog_0741.jpg': 'gss1971.jpg', 'fog_0742.jpg': 'gss1973.jpg', 'fog_0743.jpg': 'gss1980.jpg', 'fog_0744.jpg': 'gss1982.jpg', 'fog_0745.jpg': 'gss1985.jpg', 'fog_0746.jpg': 'gss1987.jpg', 'fog_0747.jpg': 'gss1990.jpg', 'fog_0748.jpg': 'gss1992.jpg', 'fog_0749.jpg': 'gss1995.jpg', 'fog_0750.jpg': 'gss1997.jpg', 'fog_0751.jpg': 'gss2000.jpg', 'fog_0752.jpg': 'gss2002.jpg', 'fog_0753.jpg': 'gss2005.jpg', 'fog_0754.jpg': 'gss2007.jpg', 'fog_0755.jpg': 'gss2010.jpg', 'fog_0756.jpg': 'gss2012.jpg', 'fog_0757.jpg': 'gss2015.jpg', 'fog_0758.jpg': 'gss2017.jpg',
  'fog_0759.jpg': 'gss2020.jpg', 'fog_0760.jpg': 'gss2022.jpg', 'fog_0761.jpg': 'gss2025.jpg', 'fog_0762.jpg': 'gss2027.jpg', 'fog_0763.jpg': 'gss2030.jpg', 'fog_0764.jpg': 'gss2032.jpg', 'fog_0765.jpg': 'gss2035.jpg', 'fog_0766.jpg': 'gss2037.jpg', 'fog_0767.jpg': 'gss2040.jpg', 'fog_0768.jpg': 'gss2042.jpg', 'fog_0769.jpg': 'gss2045.jpg', 'fog_0770.jpg': 'gss2047.jpg', 'fog_0771.jpg': 'gss2050.jpg', 'fog_0772.jpg': 'gss2052.jpg', 'fog_0773.jpg': 'gss2055.jpg', 'fog_0774.jpg': 'gss2057.jpg', 'fog_0775.jpg': 'gss2060.jpg', 'fog_0776.jpg': 'gss2062.jpg', 'fog_0777.jpg': 'gss2065.jpg', 'fog_0778.jpg': 'gss2067.jpg',
  'fog_0779.jpg': 'gss2070.jpg', 'fog_0780.jpg': 'gss2072.jpg', 'fog_0781.jpg': 'gss2075.jpg', 'fog_0782.jpg': 'gss2077.jpg', 'fog_0783.jpg': 'gss2080.jpg', 'fog_0784.jpg': 'gss2082.jpg', 'fog_0785.jpg': 'gss2085.jpg', 'fog_0786.jpg': 'gss2087.jpg', 'fog_0787.jpg': 'gss2090.jpg', 'fog_0788.jpg': 'gss2092.jpg', 'fog_0789.jpg': 'gss2095.jpg', 'fog_0790.jpg': 'gss2097.jpg', 'fog_0791.jpg': 'gss2100.jpg', 'fog_0792.jpg': 'gss2102.jpg'
  }

This dataset was built for detecting casualties and persons in search and rescue scenarios in drone images and videos. The actors in the footage simulated exhausted and injured persons as well as "classic" types of movement of people in nature, such as running, walking, standing, sitting, or lying down. The shots include persons on macadam roads, in quarries, low and high grass, forest shade, and the like.

### Reflection on the "Not-Defined" label:

The image annotation consists of the position of the bounding box around each object of interest, the size of the bounding box in terms of width and height, and the corresponding class designation (Standing, Walking, Running, Sitting, Lying, Not Defined) for the person.

We want to implement a person detection system and are not interested in understanding the movement that the person is making. Therefore, the first step we will take is to unify all the labels into a single class, 'person'. But what should we do with the 'not-defined' label? What does it represent?

After analyzing the dataset, I noticed that there are many images containing the 'not-defined' label, so removing these images cannot be considered. In all of these, the subject is clearly a person. Therefore, the 'not-defined' class does not identify the presence or absence of a person, but rather the movement of the person in the image, which, for various reasons, is undefined.

We have therefore decided to include the 'not-defined' class in the unified 'person' class.

To increase the robustness of the SARD data, an extension of the SARD set, called Corr, was created; it includes images that further simulate different weather conditions that may occur in actual search and rescue situations, such as fog, snow, and ice. The Corr set also includes blurred images, which occur in real conditions as a result of camera movement and aerial shooting in motion.

## Dataset Herida

In [None]:
# Make the Herida training folder the current working directory
# os.chdir('../') # (disabled) move one level up from the current directory
os.chdir('/content/drive/MyDrive/projectUPV/datasets/Herida_dataset/train') # the directory with the dataset
print(os.getcwd()) # show the directory we are now in
# print(os.listdir()) # (disabled) list the current directory; pass a path to list another directory

/content/drive/.shortcut-targets-by-id/1LQbD7p_iS5KLqGNdfrYEvsAx0i_bgB0h/projectUPV/datasets/Herida_dataset/train


In [None]:
# Folder holding the Herida images and XML annotations; passed to the Herida helper functions below
folder_path_herida = "/content/drive/MyDrive/projectUPV/datasets/Herida_dataset/train"

In [None]:
# Check whether there are .jpg images without an associated .xml file (or vice versa); such files should be deleted
# NOTE: later, run this check on the SARD dataset too, at the very beginning of the pipeline

def check_image_xml_consistency(folder_path):
  """
  Report the .jpg images lacking a same-named .xml file, and the .xml files lacking a same-named .jpg.

  Extensions are matched case-insensitively; files are compared by their name
  without extension. A short summary is printed.

  Args:
    folder_path (str): path of the folder containing images and xml annotations.

  Returns:
    dict: {"images_without_xml": [...], "xml_without_images": [...]}, each a
      sorted list of names without extension.
  """
  image_stems, annotation_stems = set(), set()
  for entry in os.listdir(folder_path):
    stem = os.path.splitext(entry)[0]
    lowered = entry.lower()
    if lowered.endswith('.jpg'):
      image_stems.add(stem)
    if lowered.endswith('.xml'):
      annotation_stems.add(stem)

  report = {
    "images_without_xml": sorted(image_stems.difference(annotation_stems)),
    "xml_without_images": sorted(annotation_stems.difference(image_stems))
  }

  print(f"Total images: {len(image_stems)}")
  print(f"Total XML files: {len(annotation_stems)}")
  print(f"Images without matching XML: {len(report['images_without_xml'])}")
  print(f"XML files without matching image: {len(report['xml_without_images'])}")

  return report

# Run the image/XML consistency check on the Herida folder
inconsistencies = check_image_xml_consistency(folder_path_herida)

# Let's see which files aren't matching
total_unmatched = len(inconsistencies["images_without_xml"]) + len(inconsistencies["xml_without_images"])
print("Total unmatched files:", total_unmatched)

# Entries are names without extension, so the extension is re-appended for display
print("\nImages without XML:")
for f in inconsistencies['images_without_xml']:
  print(f"  - {f}.jpg")

print("\nXML files without image:")
for f in inconsistencies['xml_without_images']:
  print(f"  - {f}.xml")


Total images: 1546
Total XML files: 1546
Images without matching XML: 0
XML files without matching image: 0
🔍 Total unmatched files: 0

🖼️ Images without XML:

📄 XML files without image:


In [None]:
# Rename every image (.jpg) and XML annotation file, using a common prefix (default: hda)

def rename_paired_images_and_xml(folder_path, prefix="hda"):
  """
  Rename the pairs .jpg and .xml files in the 'prefixNNNN.jpg/xml' format.

  Pairs are files sharing the same name without extension; extensions are
  matched case-insensitively and the renamed files always get lowercase
  '.jpg' / '.xml'. Pairs are numbered from 1 in sorted order of their names.
  Orphan files (image without XML or vice versa) are skipped.

  The actual on-disk names are tracked, so a pair such as 'x.JPG' + 'x.xml'
  is renamed as a whole instead of only its XML. The rename goes through
  temporary names (two passes) because os.rename silently replaces an existing
  destination on POSIX, which would destroy a pair whose current name is
  another pair's target name.

  Args:
    folder_path (str): Path to the folder containing the images and XML files.
    prefix (str): Prefix of the new names (default: 'hda').

  Raises:
    FileExistsError: if a target name is already used by a file that is not
      itself being renamed (e.g. an orphan 'hda0001.jpg'). Nothing is renamed
      in that case.
  """
  import uuid  # local import: only needed to build collision-free temporary names

  listing = sorted(os.listdir(folder_path))

  # Map "name without extension" -> actual file name on disk, for each kind
  jpg_by_base, xml_by_base = {}, {}
  for f in listing:
    base = os.path.splitext(f)[0]
    for ext, by_base in (('.jpg', jpg_by_base), ('.xml', xml_by_base)):
      # If both 'x.JPG' and 'x.jpg' exist, the exact lowercase name wins
      if f.lower().endswith(ext) and (base not in by_base or f.endswith(ext)):
        by_base[base] = f

  # Find the couples
  paired = sorted(set(jpg_by_base) & set(xml_by_base))

  print(f"Found {len(paired)} pairs to rename \n")

  # Plan every rename first, skipping files that already carry their final name
  moves = []
  for i, base_name in enumerate(paired, start=1):
    new_base = f"{prefix}{str(i).zfill(4)}"
    for old_name, new_name in ((jpg_by_base[base_name], new_base + '.jpg'),
                               (xml_by_base[base_name], new_base + '.xml')):
      if old_name != new_name:
        moves.append((old_name, new_name))

  # Refuse to overwrite files that are not part of the rename (e.g. orphans)
  existing = set(listing)
  sources = {old_name for old_name, _ in moves}
  blocked = sorted(new_name for _, new_name in moves if new_name in existing and new_name not in sources)
  if blocked:
    raise FileExistsError(f"Target names already used by files that are not being renamed: {blocked[:5]}")

  # Pass 1: move every file to a unique temporary name so no target name stays occupied
  token = uuid.uuid4().hex
  staged = []
  for n, (old_name, new_name) in enumerate(moves):
    tmp_name = f".renaming_{token}_{n:05d}"
    os.rename(os.path.join(folder_path, old_name), os.path.join(folder_path, tmp_name))
    staged.append((tmp_name, new_name))

  # Pass 2: move the temporary files to their final names
  for tmp_name, new_name in staged:
    os.rename(os.path.join(folder_path, tmp_name), os.path.join(folder_path, new_name))

  print("\n All file renamed with success")

# Let's try: rename the Herida image/XML pairs in place with the "hda" prefix
# NOTE: this modifies the files on disk every time the cell is executed.
rename_paired_images_and_xml(folder_path_herida, prefix="hda")



Found 1546 pairs to rename 


 All file renamed with success


In [None]:
# Update the 'filename' stored inside every XML annotation file

def update_xml_filenames(xml_folder): # To rename all the 'filename' in the xml files in a specific directory
  """
  Update <filename> and <path> tags in XML files to match renamed .jpg files.
  Assumes each XML file has a matching .jpg with the same basename.

  Both tags (when present as direct children of the root) are set to
  '<xml basename>.jpg'. XML files are matched case-insensitively ('.xml',
  '.XML', ...), consistently with the other helpers of this notebook. Every
  successfully parsed file is rewritten in place; files that cannot be parsed
  are reported and skipped.

  Args:
      xml_folder (str): Path to the folder containing renamed XML and JPG files.
  """
  updated = 0
  for file in sorted(os.listdir(xml_folder)): # sorted: deterministic processing/report order
    if not file.lower().endswith('.xml'):
      continue

    xml_path = os.path.join(xml_folder, file)
    try:
      tree = ET.parse(xml_path)
    except ET.ParseError:
      print(f"Error parsing {file}, skipping.")
      continue

    root = tree.getroot()
    new_filename = os.path.splitext(file)[0] + '.jpg'

    # Update <filename> and <path> tags
    for tag in ('filename', 'path'):
      element = root.find(tag)
      if element is not None:
        element.text = new_filename  # <path> gets the bare file name, not a full path

    # Save the updated XML
    tree.write(xml_path)
    updated += 1

  print(f"Updated {updated} XML files with correct filename/path.")

# Let's try: rewrite the filename/path tags of the Herida XML files in place
update_xml_filenames(folder_path_herida)

Updated 1546 XML files with correct filename/path.


In [None]:
# Change all labels from 'human' to 'person' --> in the Herida dataset objects are labelled 'human', not 'person'

def update_xml_class(folder_path, new_class="person"): # To change all class from 'human' to 'person' in the Herida dataset
  """
  Update all <name> tags in each <object> of XML files to a new class name.

  Only <object> elements that are direct children of the root are considered.
  A file is rewritten in place only if at least one label actually changed.
  XML files are matched case-insensitively; files that cannot be parsed are
  reported and skipped.

  Args:
    folder_path (str): Path to folder with .xml files.
    new_class (str): New class name to replace current ones.
  """
  updated = 0

  for file in os.listdir(folder_path):
    if not file.lower().endswith('.xml'):
      continue

    xml_path = os.path.join(folder_path, file)

    try:
      tree = ET.parse(xml_path)
    except ET.ParseError:
      print(f"Failed to parse {file}")
      continue

    root = tree.getroot()

    modified = False
    for obj in root.findall('object'):
      name_tag = obj.find('name')
      if name_tag is not None and name_tag.text != new_class:
        # Assign the requested class (the previous code used an undefined variable here)
        name_tag.text = new_class
        modified = True

    if modified:
      tree.write(xml_path)
      updated += 1

  print(f"Updated <name> in {updated} XML files.")



# Relabel the Herida annotations in place, relying on the default new_class ("person")
update_xml_class(folder_path_herida)
# Equivalent explicit form:
#update_xml_class(folder_path_herida, new_class="person")


In [None]:
"""
Ora che hai rinominato tutto (quindi è più visibile), elimina le immagini relative a sfondi urbani MANUALMENTE ed i file xml associati
 (perché poi creerai il file csv, e se non lo fai ora ma più avanti dovrai eliminare anche le righe del file csv)
"""

In [None]:
# TODO: look into what happens when an image does not contain any <object>

# Build a CSV file with (filename, width, height, class, xmin, ymin, xmax, ymax) from the information in the XML files

def create_csv_from_xml(xml_folder, output_csv_path):
  """
  Create a CSV from the XML files in a folder and save it inside the same folder.
  The CSV contains the following columns: filename, width, height, class, xmin, ymin, xmax, ymax.

  Each <object> produces one row. An XML file without any <object> produces a single
  row with empty class/xmin/ymin/xmax/ymax. A file that cannot be read completely is
  reported and contributes no rows at all.

  Args:
    xml_folder (str): Path to folder with .xml files.
    output_csv_path (str): Name of the output CSV file (e.g. 'annotations.csv').
      It is joined onto xml_folder, so an absolute path is used as given.

  Returns:
    str: Path to the created CSV file.
  """
  columns = ['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
  herida_person_labels = []

  # Sorted so that the row order of the CSV does not depend on the directory listing order
  for file in sorted(os.listdir(xml_folder)):
    if not file.endswith('.xml'):
      continue

    xml_path = os.path.join(xml_folder, file)

    try:
      root = ET.parse(xml_path).getroot()

      filename = root.find('filename').text.strip()
      size = root.find('size')
      width = int(size.find('width').text)
      height = int(size.find('height').text)

      # Rows are collected per file and added only once the whole file was read,
      # so a failure on a later <object> does not leave partial rows behind
      file_rows = []
      for obj in root.findall('object'):
        bbox = obj.find('bndbox')
        file_rows.append({
          'filename': filename,
          'width': width,
          'height': height,
          'class': obj.find('name').text.strip(),
          'xmin': int(bbox.find('xmin').text),
          'ymin': int(bbox.find('ymin').text),
          'xmax': int(bbox.find('xmax').text),
          'ymax': int(bbox.find('ymax').text)
        })

      if not file_rows:
        # Image without objects: keep one placeholder row so the image still appears in the CSV
        file_rows.append({
          'filename': filename,
          'width': width,
          'height': height,
          'class': None,
          'xmin': None,
          'ymin': None,
          'xmax': None,
          'ymax': None
        })

      herida_person_labels.extend(file_rows)

    except Exception as e:
      # Best effort: report the problematic file and continue with the others
      print(f"Error parsing {file}: {e}")

  # Explicit columns keep the header even when no row was collected
  df = pd.DataFrame(herida_person_labels, columns=columns)

  # Save the CSV in the same folder (the previous code read an undefined variable here)
  output_csv_path = os.path.join(xml_folder, output_csv_path)
  df.to_csv(output_csv_path, index=False)

  print(f"CSV created: {output_csv_path} ({len(df)} righe)")
  return output_csv_path

# Build the Herida annotations CSV; the function returns the path of the file it wrote
csv_herida = create_csv_from_xml(folder_path_herida, "herida_person_labels.csv")


# Reload the CSV from disk to check what was actually saved
df = pd.read_csv(csv_herida)
print(df.head()) # Print first few rows to verify
# print(df.tail()) # Print last few rows to verify


In [None]:
# Check that all images have the same dimensions

##################### NB: This function is identical to the one used for SARD,
##################### so keep a single definition and change the SARD code so that the function is only defined there, not executed
def check_image_dimensions_consistency(file_csv): # Report the distinct image sizes listed in the CSV
  """
  Tell whether every image listed in the CSV shares one (width, height) pair.

  Args:
    file_csv (str): path to a csv which contains the columns 'filename', 'width' and 'height'.

  Returns:
    pd.DataFrame: one row per distinct (width, height) with the number of
      different filenames having that size in the column 'image_count'.
  """
  annotations = pd.read_csv(file_csv)

  # Non-numeric sizes become NaN
  annotations = annotations.assign(
    width=pd.to_numeric(annotations['width'], errors='coerce'),
    height=pd.to_numeric(annotations['height'], errors='coerce'),
  )

  # Count distinct filenames, since an image appears once per bounding box
  images_per_size = annotations.groupby(['width', 'height'])['filename'].nunique()
  dimension_counts = images_per_size.reset_index(name='image_count')

  if len(dimension_counts) != 1:
    print(f"Found {len(dimension_counts)} different image sizes:")
    print(dimension_counts)
  else:
    only_size = dimension_counts.iloc[0]
    print(f"All images have the same size: {only_size['width']}x{only_size['height']}")

  return dimension_counts


# Herida training folder; the annotations CSV is looked up inside it
folder_path_herida = '/content/drive/MyDrive/projectUPV/datasets/Herida_dataset/train'
csv_herida_person_labels_path = os.path.join(folder_path_herida, 'herida_person_labels.csv')
# Last expression of the cell, so the returned table of distinct sizes is displayed
check_image_dimensions_consistency(csv_herida_person_labels_path)

In [None]:
# Check that every bbox lies within the boundaries of its image

##################### NB: This function is identical to the one used for SARD,
##################### so keep a single definition and change the SARD code so that the function is only defined there, not executed
def check_bboxes_out_of_bounds_from_csv(file_csv): # Check if the bbox aren't outside the image frame
  """
  Check if the bounding boxes in the CSV are within the image boundaries.

  A box is valid when 0 <= xmin < xmax <= width and 0 <= ymin < ymax <= height.
  Rows whose four coordinates are all empty describe an image without objects
  (no bounding box at all), so they are not reported as invalid boxes.
  Rows with a missing or non-numeric value in only some fields are still reported.

  Args:
    file_csv (str): path to a csv which contains the columns 'filename', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height'

  Returns:
    pd.DataFrame: rows with invalid bounding boxes
  """
  df = pd.read_csv(file_csv)

  bbox_cols = ['xmin', 'ymin', 'xmax', 'ymax']

  # Evaluated before the numeric coercion, so a row holding unreadable text is still treated as a box
  has_bbox = df[bbox_cols].notna().any(axis=1)

  # Ensure that the fields are numbers (anything else becomes NaN and fails every comparison below)
  for col in bbox_cols + ['width', 'height']:
    df[col] = pd.to_numeric(df[col], errors='coerce')

  # Build mask for bbox inside the boundaries and with a positive area
  inside_bounds_mask = (
    (df['xmin'] >= 0) &
    (df['ymin'] >= 0) &
    (df['xmax'] <= df['width']) &
    (df['ymax'] <= df['height']) &
    (df['xmax'] > df['xmin']) &
    (df['ymax'] > df['ymin'])
  )

  invalid_rows = df[has_bbox & ~inside_bounds_mask].copy()

  print(f"Checked all entries. Found {len(invalid_rows)} invalid bounding boxes.")
  return invalid_rows


# Validate the bounding boxes stored in the Herida annotations CSV
csv_herida_person_labels_path = os.path.join(folder_path_herida, 'herida_person_labels.csv')
invalid_bboxes = check_bboxes_out_of_bounds_from_csv(csv_herida_person_labels_path)

# Show only the fields needed to inspect each reported row
print(invalid_bboxes[['filename', 'xmin', 'ymin', 'xmax', 'ymax', 'width', 'height']])


In [None]:
# Find the suspicious bboxes

##################### NB: This function is identical to the one used for SARD,
##################### so keep a single definition and change the SARD code so that the function is only defined there, not executed
def classify_image_annotation_quality(file_csv, dataset_folder, threshold=3): # To find suspicious images
  """
  Classify images in 3 categories:
    1. suspected error annotation (images with all bbox with one of ymin, ymax, xmin, xmax parameters <= threshold)
    2. partially suspected annotation (images with at least one, but not all, bbox with ymin, ymax, xmin, xmax parameters <= threshold)
    3. no annotation (no bounding box)

  CSV rows without any numeric coordinate (e.g. the placeholder row of an image
  without objects) are not bounding boxes: they are ignored, so an image having
  only such rows falls in category 3 when its .jpg file is in the folder.

  Args:
    file_csv (str): path to a csv which contains the columns 'filename', 'xmin', 'ymin', 'xmax', 'ymax'.
    dataset_folder (str): folder whose '.jpg' files form the full list of images.
    threshold (int or float): a coordinate less than or equal to this value marks the bbox as suspicious.

  Returns:
    dict: three sets of filenames under the keys 'case1_all_suspicious',
      'case2_some_suspicious' and 'case3_no_annotations'.
  """
  df = pd.read_csv(file_csv)

  bbox_cols = ['xmin', 'ymin', 'xmax', 'ymax']

  # Conversion of numerical coordinates
  for col in bbox_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

  # Keep only the rows that really describe a bounding box
  boxes = df[df[bbox_cols].notna().any(axis=1)].copy()

  # Boolean column for bbox sus (a NaN coordinate never satisfies the comparison)
  boxes['suspicious'] = (boxes[bbox_cols] <= threshold).any(axis=1)

  # Assemble images
  grouped = boxes.groupby('filename')['suspicious']

  case1_all_suspicious = grouped.all()  # All boxes are sus
  case2_some_suspicious = grouped.any() & ~case1_all_suspicious  # just some boxes are sus

  # All images with at least one bounding box
  annotated_images = set(boxes['filename'].unique())

  # All images in the folder
  all_images = {f for f in os.listdir(dataset_folder) if f.endswith('.jpg')}

  # Images not annotated (no bounding box row in the csv)
  case3_no_annotations = all_images - annotated_images

  return {
    "case1_all_suspicious": set(case1_all_suspicious[case1_all_suspicious].index), # I'll treat these as if they were empty images
    "case2_some_suspicious": set(case2_some_suspicious[case2_some_suspicious].index),
    "case3_no_annotations": case3_no_annotations # these are empty images
  }



# Herida training folder: it is scanned for the .jpg images and holds the annotations CSV
folder_path_herida = '/content/drive/MyDrive/projectUPV/datasets/Herida_dataset/train'
csv_herida_person_labels_path = os.path.join(folder_path_herida, 'herida_person_labels.csv')
# threshold is left at its default value (3)
result = classify_image_annotation_quality(csv_herida_person_labels_path, folder_path_herida)

print(f"Case 1 (only bbox suspicious): {len(result['case1_all_suspicious'])} and they are: {result['case1_all_suspicious']}")
print(f"Case 2 (bbox mix): {len(result['case2_some_suspicious'])}")
print(f"Case 3 (no annotation): {len(result['case3_no_annotations'])} and they are: {result['case3_no_annotations']}")

# I think the images in cases 1 and 3 are useless in this unbalanced dataset, so I am going to delete them.

In [None]:
# Delete the suspicious images --->> (only if there are any, check first)

#### NB: this dataset contained unsuitable images because they showed urban environments; they must be deleted
## DO IT MANUALLY (AND REMEMBER TO DELETE THE RELATED XML FILES TOO)

In [None]:
# Generic analysis of the dataset

def analyze_dataset_annotations(file_csv, dataset_folder): # Basic Analysis of the dataset
  """
  Analyze the csv file and verify if there are images (.jpg) without annotation.

  CSV rows with an empty 'class' (e.g. the placeholder row of an image without
  objects) are not annotations: they are left out of every count, so such an
  image is reported as not annotated when its .jpg file is in the folder.

  Args:
    file_csv (str): path to a csv which contains the columns 'filename' and 'class'.
    dataset_folder (str): folder whose '.jpg' files form the full list of images.

  Returns:
    dict: with the keys
      'label_count' (pd.Series): number of annotation rows for each class,
      'distribution_of_bbox' (pd.Series): number of images (values) for each number of bbox per image (index),
      'number_images_annotated' (int): filenames in the csv with at least one annotation,
      'number_all_images' (int): '.jpg' files in the folder,
      'number_images_without_annotations' (int): size of the next entry,
      'images_without_annotations' (set): '.jpg' files in the folder with no annotation in the csv.
  """
  df = pd.read_csv(file_csv)

  # Only rows carrying a class describe a real bounding box
  labelled = df[df['class'].notna()]

  # Count the number of annotations for each class
  count_labels = labelled['class'].value_counts()

  # Number of bounding box per image
  bboxes_per_image = labelled.groupby('filename').size()
  number_bboxes_per_image = bboxes_per_image.value_counts().sort_index()

  # All the images annotated in the csv
  annotated_images = set(labelled['filename'].unique())

  # All images in the folder
  all_images = {f for f in os.listdir(dataset_folder) if f.endswith('.jpg')}

  # Images without any annotation in csv file
  images_without_annotations = all_images - annotated_images

  return {
    "label_count": count_labels,
    "distribution_of_bbox": number_bboxes_per_image,
    "number_images_annotated": len(annotated_images),
    "number_all_images": len(all_images),
    "number_images_without_annotations": len(images_without_annotations),
    "images_without_annotations": images_without_annotations # Useful for debug or to remove them
  }



# Herida training folder: it is scanned for the .jpg images and holds the annotations CSV
folder_path_herida = '/content/drive/MyDrive/projectUPV/datasets/Herida_dataset/train'
csv_herida_person_labels_path = os.path.join(folder_path_herida, 'herida_person_labels.csv')
results = analyze_dataset_annotations(csv_herida_person_labels_path, folder_path_herida)

# NOTE: label_count comes from value_counts() on the 'class' column, i.e. it counts annotation rows per class, not images
print("How many images for each label:")
print(results["label_count"])

print("\nDistribution of bbox for image:")
print(results["distribution_of_bbox"]) # index = number of bbox per image, value = number of images with that many bbox

# Same information as the Series printed above, one sentence per line
for n_bbox, n_images in results["distribution_of_bbox"].items():
  print(f"There are {n_images} images with {n_bbox} bbox")

print("\nImages with at least one label (bounding box):", results["number_images_annotated"])
print("Total images:", results["number_all_images"])
print("Images without any label:", results["number_images_without_annotations"])
print("The images without any label are:", results["images_without_annotations"])


In [None]:
# It would be interesting to see/count the number of images whose XML files contain <pose>Unspecificated</pose> etc. (and other tags)




In [None]:
# Resize the images? --->> if you do, remember to rescale (normalize) the bboxes as well, because they are not resized automatically