<a href="https://colab.research.google.com/github/PavanDaniele/drone-person-detection/blob/main/dataset_preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set up: mount drive + import libraries

In [1]:
# Run this every time you start a new session
from google.colab import drive
drive.mount('/content/drive') # mounts Google Drive at /content/drive so the notebook can see/access its files

Mounted at /content/drive


In [None]:
# Run this snippet Just one time, to install packages
!pip install imagehash
!pip install pillow

In [None]:
from PIL import Image
import imagehash
import os
from itertools import combinations

# Dataset Preparation

In this notebook, I'm going to prepare the dataset for fine-tuning multiple deep learning models (e.g. YOLO, EfficientDet, SSD + MobileNetV2).
The steps include a similarity check, dataset splitting (train/val/test), optional image resizing, and bounding-box adaptation.
The goal is to generate separate, clean and model-ready datasets for each architecture to enable fair training and evaluation.

In [None]:
image_folder_path = "/content/drive/MyDrive/projectUPV/datasets/AERALIS"  # folder that is scanned for .jpg images

HASH_METHODS = ['phash', 'ahash', 'dhash']  # hash methods computed for every image
HAMMING_THRESHOLD = 5  # two images are reported as similar when their hash distance is <= this value

In [None]:
def get_image_paths(folder_path): # To extract the image files (.jpg) and ignore the .xml and .csv files
  """
  List the JPEG images stored directly inside a folder.

  Args:
    folder_path: path to folder containing images
  Returns:
    list of paths to the `.jpg` files found in `folder_path` (the extension
    is matched case-insensitively), sorted by file name so that the result
    is the same on every run
  """
  # Use the `folder_path` argument, not the notebook-level `image_folder_path`
  # global, so the function works for any folder and in a fresh kernel.
  return [os.path.join(folder_path, f) for f in sorted(os.listdir(folder_path))
          if f.lower().endswith('.jpg')]

In [None]:
def compute_hash(img_path, method):
  """
  Compute one perceptual hash of an image.

  Args:
    img_path: path to image
    method: hash method to use ('phash', 'ahash' or 'dhash')
  Returns:
    hash of image, as returned by the matching `imagehash` function
  Raises:
    ValueError: if `method` is not one of the supported names
  """
  # Reject unknown methods up front, so no file is opened or decoded for nothing.
  if method not in ('phash', 'ahash', 'dhash'):
    raise ValueError(f"Hash method not supported: {method}")

  # The context manager closes the image file deterministically; convert()
  # returns a separate grayscale copy that stays usable after the block.
  with Image.open(img_path) as source:
    img = source.convert("L")  # Grayscale (the hashes are computed on the black-and-white image)

  if method == 'phash':
    return imagehash.phash(img)
  if method == 'ahash':
    return imagehash.average_hash(img)
  return imagehash.dhash(img)

In [None]:
def compute_all_hashes(image_paths, methods): # Hash every image with every requested method
  """
  Build, for each hash method, a lookup from image path to its hash.

  Args:
    image_paths: list of paths to images
    methods: list of hash methods to use
  Returns:
    dictionary keyed by method name; each value is a dictionary that maps an
    image path to the hash computed with that method. Images whose hash
    computation raised are reported on stdout and left out.
  """
  hashes = dict((name, {}) for name in methods)  # one empty table per method

  for name in methods:
    print(f"\nCalculation {name} for all images")
    per_method = hashes[name]

    for img_path in image_paths:
      try:
        per_method[img_path] = compute_hash(img_path, name)
      except Exception as err:  # best effort: report the failure and move on to the next image
        print(f"Error with {img_path}: {err}")

  return hashes

In [None]:
def compare_hashes(hashes, threshold): # Pairwise comparison of the hashed images
  """
  Collect every pair of images whose hashes differ by at most `threshold`.

  Args:
    hashes: dictionary keyed by method name; each value maps an image path
      to its hash for that method
    threshold: distance threshold to consider images as similar
  Returns:
    list of dictionaries, one per similar pair and method, with the keys
    'method', 'image1', 'image2' (file names without folder) and 'distance'
  """
  similar_images = []

  for method, table in hashes.items():
    print(f"\nRisultats with {method.upper()}:")
    entries = list(table.items())

    # Visit each unordered pair exactly once: every entry against the ones after it.
    for idx, (first_path, first_hash) in enumerate(entries):
      for second_path, second_hash in entries[idx + 1:]:
        distance = first_hash - second_hash  # subtracting two hashes yields their distance
        if distance > threshold:
          continue
        similar_images.append({
          'method': method,
          'image1': os.path.basename(first_path),
          'image2': os.path.basename(second_path),
          'distance': distance
        })
  return similar_images

In [1]:
# Full pipeline: list the images, hash them with every method, compare all pairs.
image_paths = get_image_paths(image_folder_path)
hashes = compute_all_hashes(image_paths, HASH_METHODS)
similar_images = compare_hashes(hashes, HAMMING_THRESHOLD)

# Count how many distinct images take part in at least one similar pair:
img_set = set()
for entry in similar_images:
    img_set.update((entry['image1'], entry['image2']))

print(f"Number of similar distinct images: {len(img_set)}")
print(f"Number of All images: {len(image_paths)}")

NameError: name 'get_image_paths' is not defined

In [None]:
image_folder_path = "/content/drive/MyDrive/projectUPV/datasets/AERALIS"


# Parameters
HASH_METHODS = ['phash', 'ahash', 'dhash']  # to use pHash, aHash and dHash
HAMMING_THRESHOLD = 5  # Threshold for considering two images similar

# To extract the image files (.jpg) and ignore the .xml and .csv files
image_paths = [os.path.join(image_folder_path, f) for f in os.listdir(image_folder_path)
               if f.lower().endswith(('.jpg'))]

# Hash calculation for each image
hashes = {method: {} for method in HASH_METHODS} # one empty sub-dictionary per method
for method in HASH_METHODS:
  print(f"\nCalculation {method} for all images")
  for path in image_paths: # cycles over each image path in the image_paths list
    try:
      # The context manager closes the image file deterministically; convert()
      # returns a separate grayscale copy that stays usable after the block.
      with Image.open(path) as source:
        img = source.convert("L")  # Grayscale (the hashes are computed on the black-and-white image)
      if method == 'phash':
        h = imagehash.phash(img)
      elif method == 'ahash':
        h = imagehash.average_hash(img)
      elif method == 'dhash':
        h = imagehash.dhash(img)
      else:
        # Without this branch an unknown method would silently store the
        # stale `h` left over from a previous iteration.
        raise ValueError(f"Hash method not supported: {method}")
      hashes[method][path] = h # saves the calculated hash in the dictionary structure
    except Exception as e:
      print(f"Errore con {path}: {e}")




# Comparison of images in pairs
for method in HASH_METHODS:
    print(f"\nRisultats with {method.upper()}:") # .upper() converts the method name to uppercase
    pairs = combinations(hashes[method].items(), 2) # all possible pairs, without repetitions
    for (path1, hash1), (path2, hash2) in pairs:
      dist = hash1 - hash2  # subtracting two hashes yields their distance
      if dist <= HAMMING_THRESHOLD:
        print(f"\nSIMILAR (distance = {dist}):")
        print(f" - {os.path.basename(path1)}")
        print(f" - {os.path.basename(path2)}")


In [None]:
# 📦 Installa la libreria necessaria
!pip install ImageHash

# ⚙️ Import delle librerie
import os
from PIL import Image
import imagehash
from collections import defaultdict
import pandas as pd

# 📂 Folder containing the images
image_folder = "/content/drive/MyDrive/tuo_dataset/images"  # change this path

# 🔧 Parameters
hash_function = imagehash.phash      # can be swapped for another imagehash function (ahash, dhash)
hamming_threshold = 5                # maximum distance for two images to be considered "similar"

# 🔍 Function that computes the hashes
def compute_image_hashes(folder, hash_func):
    """
    Hash every image file found directly inside `folder`.

    Args:
        folder: path of the folder to scan
        hash_func: callable that receives the opened image and returns its hash
    Returns:
        dictionary mapping each file name (.jpg, .jpeg or .png, matched
        case-insensitively) to its hash, in sorted file-name order. Files
        that cannot be opened or hashed are reported on stdout and skipped.
    """
    hashes = {}
    # Sort the listing: os.listdir() order is arbitrary, and the order decides
    # which image of a pair is later reported first.
    for fname in sorted(os.listdir(folder)):
        if not fname.lower().endswith((".jpg", ".jpeg", ".png")):
            continue
        path = os.path.join(folder, fname)
        try:
            # Close the file as soon as the hash is computed, even on failure.
            with Image.open(path) as img:
                hashes[fname] = hash_func(img)
        except Exception as e:  # best effort: report and continue with the next file
            print(f"Errore su {fname}: {e}")
    return hashes

# 🔎 Function that compares the images
def find_similar_images(hashes, threshold):
    """
    Group, under each image, the later images whose hash is close to it.

    Args:
        hashes: dictionary mapping an image name to its hash
        threshold: largest distance at which two images count as similar
    Returns:
        defaultdict(list) mapping an image name to a list of
        (other image name, distance) tuples; only images with at least one
        match appear as keys
    """
    matches = defaultdict(list)
    entries = list(hashes.items())
    # Compare every entry with the ones that follow it, so each pair is seen once.
    for idx, (ref_name, ref_hash) in enumerate(entries):
        for other_name, other_hash in entries[idx + 1:]:
            gap = ref_hash - other_hash  # subtracting two hashes yields their distance
            if gap <= threshold:
                matches[ref_name].append((other_name, gap))
    return matches

# 🚀 Run
hashes = compute_image_hashes(image_folder, hash_function)
similar_images = find_similar_images(hashes, hamming_threshold)

# 📊 Result as a table: one row per similar pair
rows = [
    {"Image 1": base, "Image 2": sim_name, "Hamming Distance": dist}
    for base, similars in similar_images.items()
    for sim_name, dist in similars
]

# Declare the columns explicitly: with no similar pairs `rows` is empty, and a
# DataFrame built from an empty list has no columns, so sorting by
# "Hamming Distance" would raise KeyError.
df = pd.DataFrame(rows, columns=["Image 1", "Image 2", "Hamming Distance"])
df.sort_values("Hamming Distance")
