# 0. Imports

In [None]:
# importing necessary libraries
import os
import shutil
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import cv2
from icrawler.builtin import BingImageCrawler
import zipfile
import urllib.request
import tarfile
from ultralytics import YOLO
from tqdm import tqdm

In [None]:
# Download and extract the FGVC-Aircraft dataset (skipped when already present)

# Create dataset directory
os.makedirs("fgvc_data", exist_ok=True)

# URL to download
url = "http://www.robots.ox.ac.uk/~vgg/data/fgvc-aircraft/fgvc-aircraft-2013b.tar.gz"
zip_path = "fgvc_data/fgvc-aircraft-2013b.tar.gz"

# Download to a temporary name and rename only on success, so an interrupted
# download never leaves a truncated archive that a later run mistakes for a
# complete one (the existence check below would otherwise skip the download).
if not os.path.exists(zip_path):
    partial_path = zip_path + ".part"
    urllib.request.urlretrieve(url, partial_path)
    os.replace(partial_path, zip_path)
    print("Download complete")

# Extract
if not os.path.exists("fgvc_data/fgvc-aircraft-2013b"):
    with tarfile.open(zip_path, "r:gz") as tar:
        # The archive is fetched over plain HTTP, so use the "data" extraction
        # filter (rejects absolute paths / path traversal) when this Python
        # version provides it; older versions fall back to the plain call.
        if hasattr(tarfile, "data_filter"):
            tar.extractall("fgvc_data/", filter="data")
        else:
            tar.extractall("fgvc_data/")
    print("Extracted to fgvc_data/")

In [None]:
# define paths
base_path = "fgvc_data/fgvc-aircraft-2013b"
image_path = os.path.join(base_path, "data", "images")

# The extracted archive stores its annotation lists next to the images inside
# "data/". Prefer that location and fall back to the archive root so an
# existing layout with the files directly under base_path keeps working.
annotation_files = []
for annotation_name in ("images_family_trainval.txt", "images_family_test.txt"):
    annotation_file = os.path.join(base_path, "data", annotation_name)
    if not os.path.exists(annotation_file):
        annotation_file = os.path.join(base_path, annotation_name)
    annotation_files.append(annotation_file)

# 1. Join all pictures by family

In [None]:
# aircraft families that are kept for the rest of the pipeline
relevant_families = [
    "A320", "A330", "A380", "A340",
    "ATR-72",
    "Boeing 737", "Boeing 747", "Boeing 757", "Boeing 767", "Boeing 777",
    "CRJ-700", "Dash 8", "Embraer E-Jet"
]


# collect (image id, family) pairs from every annotation file;
# each line is "<image id> <family name, possibly several words>"
data = []
for annotation_file in annotation_files:
    with open(annotation_file, "r") as f:
        for raw_line in f.readlines():
            tokens = raw_line.strip().split()
            data.append((tokens[0], " ".join(tokens[1:])))

# one row per annotated image
df = pd.DataFrame(data, columns=["image", "family"])

# keep only the rows whose family is in the list above
is_relevant = df["family"].isin(relevant_families)
df_filtered = df[is_relevant]
print(f"Found images: {len(df_filtered)}")

# folder that receives a copy of every image belonging to a relevant family
filtered_images = os.path.join(base_path, "2_filtered_images")
os.makedirs(filtered_images, exist_ok=True)

# copy each listed image that actually exists on disk; missing ones are skipped
copied = 0
for img_name in df_filtered["image"]:
    file_name = img_name + ".jpg"
    src = os.path.join(image_path, file_name)
    if not os.path.exists(src):
        continue
    shutil.copyfile(src, os.path.join(filtered_images, file_name))
    copied += 1

print(f"Copied {copied} images to '{filtered_images}'.")

# 2. Organizing of files

#### Renaming files from numeric IDs (e.g. 1234567.jpg) to family-based names (e.g. A320_0001.jpg)

In [None]:
# root folder that will hold one sub-folder per aircraft family
organized_images = os.path.join(base_path, "3_organized_images")
os.makedirs(organized_images, exist_ok=True)

# map every annotated image id to its family name
image_to_family = {}
for annotation_file in annotation_files:
    with open(annotation_file, "r") as f:
        for line in f:
            tokens = line.strip().split()
            image_to_family[tokens[0]] = " ".join(tokens[1:])

# running number per family, used to build unique file names
family_counters = defaultdict(int)

# copy every image found in the filtered folder into <family>/<family>_NNNN.jpg
for image_id, family in image_to_family.items():
    src_path = os.path.join(filtered_images, f"{image_id}.jpg")

    # only images that were copied by the filtering step are organized
    if os.path.exists(src_path):
        # spaces are replaced so the family name is usable as folder/file name
        family_folder = family.replace(" ", "_")
        dest_folder = os.path.join(organized_images, family_folder)
        os.makedirs(dest_folder, exist_ok=True)

        family_counters[family] += 1
        sequence_no = family_counters[family]
        dest_path = os.path.join(dest_folder, f"{family_folder}_{sequence_no:04d}.jpg")

        shutil.copyfile(src_path, dest_path)

print("Images copied and renamed into organized folders.")

### Check class balance and sample images per class for a visual diversity check

Create a dataframe first

In [None]:
# collect one record (full image path + family folder name) per organized image
image_extensions = (".jpg", ".jpeg", ".png")
data_organized = []

for family in os.listdir(organized_images):
    family_path = os.path.join(organized_images, family)
    if not os.path.isdir(family_path):
        continue  # ignore stray files next to the family folders

    for img_file in os.listdir(family_path):
        if not img_file.lower().endswith(image_extensions):
            continue
        record = {
            "image_path": os.path.join(family_path, img_file),
            "family": family,
        }
        data_organized.append(record)

df_organized = pd.DataFrame(data_organized)
print(df_organized.head())

Check class balance

In [None]:
# Number of images per family, most frequent family first (value_counts order)
class_counts = df_organized["family"].value_counts()

# Plotting the distribution of images per family
# (the figure is created first so the bar plot is drawn into it)
plt.figure(figsize=(12, 6))
class_counts.plot(kind="bar")
plt.title("Number of images per airplane family")
plt.ylabel("Image count")
# rotate the family labels so they do not overlap
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

Sample images to visually check diversity

In [None]:
# display up to 3 random sample images from each family
#
# Shuffle all rows, then keep the first 3 rows of every family. This avoids
# groupby(...).apply(...), which is deprecated for functions that operate on
# the grouping column and would drop the "family" column needed below.
sampled_df = (
    df_organized.sample(frac=1)
    .groupby("family")
    .head(3)
    .sort_values("family", kind="stable")
    .reset_index(drop=True)
)

for family in sampled_df["family"].unique():
    images = sampled_df[sampled_df["family"] == family]["image_path"].values
    plt.figure(figsize=(10, 3))
    for i, img_path in enumerate(images):
        img = cv2.imread(img_path)
        plt.subplot(1, len(images), i+1)
        plt.axis("off")
        if img is None:
            # cv2.imread returns None for unreadable files; keep the slot
            # empty instead of crashing in cvtColor
            plt.title(f"{family} (unreadable)")
            continue
        # OpenCV loads BGR, matplotlib expects RGB
        plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
        plt.title(family)
    plt.suptitle(f"Sample images: {family}")
    plt.tight_layout()
    plt.show()

In [None]:
# dictionary to track faulty images per class
broken_per_class = defaultdict(int)

# list to collect the paths of all broken images
broken_images = []

# Check every organized image. The paths live in df_organized ("image_path",
# "family"); the earlier df only has "image"/"family" columns and therefore
# cannot be used here.
for img_path, family in zip(df_organized["image_path"], df_organized["family"]):
    # cv2.imread returns None when the file cannot be decoded
    if cv2.imread(img_path) is None:
        broken_per_class[family] += 1
        broken_images.append(img_path)

# display summary
print("Faulty image count per class:\n")
for cls, count in broken_per_class.items():
    print(f"{cls:20} | {count} faulty images")

print(f"\nTotal broken images: {len(broken_images)}")

# Optional: Remove broken images
#for path in broken_images:
#    os.remove(path)

Result: the dataset contained faulty images, all of which came from the new images added for the extra classes (A350, Boeing 737 MAX, Boeing 787); these were removed. The dataset was also heavily skewed: A320 was overrepresented (over 500 images), followed by A340, B747 and B737 (around 300 each), while all other classes had fewer than 200.

Next course of action: align all classes to an image count of 200, i.e. downsample the overrepresented classes and add images to the underrepresented ones.

# 3. Balance classes to 200 image counts

1. Remove manually all photos of airplane models, Blueprints, Renders / packaging, Toys, etc.
2. Identify which classes need to be downsampled/upsampled, and by how much.
3. Use downloader (icrawler with license + size filters) to add the amount of missing images for underrepresented classes
4. Check if all classes have 200 images, non-corrupt files, diverse shots (sample-wise)
5. Come up with an augmentation strategy (randomly crop the images before training, so that the model gets used to the cropping done by YOLO)

In [None]:
#1: Manually remove the unfitting images

In [None]:
#2: Identify the classes to be downsampled/upsampled

balanced_images = os.path.join(base_path, "4_balanced_images")
os.makedirs(balanced_images, exist_ok=True)

# Define the target count for each class
TARGET_COUNT = 200

image_data = []
# family -> number of images still missing to reach TARGET_COUNT
missing_count = {}

for family in sorted(os.listdir(organized_images)):
    class_dir = os.path.join(organized_images, family)

    if not os.path.isdir(class_dir):
        continue

    # os.listdir returns names in arbitrary order; sort them so the seeded
    # sample below picks the same files on every run and every machine.
    images = sorted(
        img for img in os.listdir(class_dir)
        if img.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    if len(images) >= TARGET_COUNT:
        # downsample: fixed seed keeps the selection reproducible
        selected = pd.Series(images).sample(n=TARGET_COUNT, random_state=42)
    else:
        # too few images: keep all of them and record the shortfall
        selected = images
        missing_count[family] = TARGET_COUNT - len(images)

    target_class_dir = os.path.join(balanced_images, family)
    os.makedirs(target_class_dir, exist_ok=True)

    for img in selected:
        src = os.path.join(class_dir, img)
        dst = os.path.join(target_class_dir, img)
        shutil.copyfile(src, dst)

print(f"Balancing complete. Classes under {TARGET_COUNT}:")
for family, missing in missing_count.items():
    print(f"{family:20} needs {missing} more images")

In [None]:
# 3: Download missing images and move to temp folder

# Root folder for temporary downloads
additional_pics = os.path.join(base_path, "3_1_add_pics")
os.makedirs(additional_pics, exist_ok=True)

# Dictionary with classes that need images: class name -> number of images to add.
# The keys mix two spellings ("Boeing 787" with spaces, "Boeing_767" with
# underscores); the code below derives folder names with replace(" ", "_"),
# so both spellings end up as underscore folder names.
# NOTE(review): the numbers are hard-coded, presumably copied from the output
# of the balancing step — confirm they are still current before re-running.
missing_images = {
    "A330": 72,
    "Airbus A350": 161,
    "Boeing 737 MAX": 164,
    "Boeing_767": 6,
    "ATR-72": 134,
    "Boeing_757": 73,
    "CRJ-700": 68,
    "Boeing_777": 69,
    "Boeing 787": 157,
    "A380": 136,
    "Dash_8": 70,
    "Embraer_E-Jet": 5
}

def download_aircraft_images(class_name, output_dir, num_images):
    """Download up to ``num_images`` Bing images for one aircraft class.

    The images are stored in ``<output_dir>/<class_name with spaces replaced
    by underscores>``; the folder is created if necessary.

    Args:
        class_name: Class label, written with spaces or underscores
            (e.g. "Boeing 787" or "Boeing_767").
        output_dir: Root folder that holds one sub-folder per class.
        num_images: Maximum number of images to fetch. Values <= 0 skip the
            crawl entirely.

    Returns:
        None.
    """
    class_folder = class_name.replace(" ", "_")
    target_dir = os.path.join(output_dir, class_folder)
    os.makedirs(target_dir, exist_ok=True)

    # Nothing left to fetch: do not start a crawler with a zero/negative limit.
    if num_images <= 0:
        print(f"Nothing to download for: {class_name}")
        return

    print(f" Attempting to download {num_images} images for: {class_name}")

    # Class names may use the underscore folder spelling; search with spaces
    # so the query reads like the real aircraft name.
    search_term = class_name.replace("_", " ") + " airplane"

    crawler = BingImageCrawler(storage={'root_dir': target_dir})
    crawler.crawl(
        keyword=search_term,
        # restrict to reusable, large images
        filters={"license": "commercial,modify", "size": "large"},
        max_num=num_images
    )

    print(f"Download complete for: {class_name}")

# Report, per class, how many downloaded images exist and how many are still needed
for class_name, target_amount in missing_images.items():
    class_folder = os.path.join(additional_pics, class_name.replace(" ", "_"))
    os.makedirs(class_folder, exist_ok=True)

    # number of image files currently in the class folder
    current_count = sum(
        1
        for f in os.listdir(class_folder)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    # difference between the wanted amount and what is already there
    remaining = target_amount - current_count
    print(f"{class_name}: {current_count}/{target_amount} images already present. Need {remaining} more.")

    # The actual download is disabled; re-enable the lines below to fetch images.
    #if remaining <= 0:
    #    print(f"✅ {class_name}: already has {current_count}/{target_amount} images.")
    #    continue

    #download_aircraft_images(class_name, additional_pics, remaining)

# 4. Enhancing the folders to match image count.

Reached limit with iCrawler:
+ A330: 47/72 images already present. Need 25 more.
+ Airbus A350: 56/161 images already present. Need 105 more.
+ Boeing 737 MAX: 29/164 images already present. Need 135 more.
+ Boeing_767: 6/6 images already present. Need 0 more.
+ ATR-72: 98/134 images already present. Need 36 more.
+ Boeing_757: 47/73 images already present. Need 26 more.
+ CRJ-700: 68/68 images already present. Need 0 more.
+ Boeing_777: 54/69 images already present. Need 15 more.
+ Boeing 787: 53/157 images already present. Need 104 more.
+ A380: 24/136 images already present. Need 112 more.
+ Dash_8: 53/70 images already present. Need 17 more.
+ Embraer_E-Jet: 5/5 images already present. Need 0 more.
TOTAL: 575 images to be added.

**Now filter out the unsuitable images that were downloaded with iCrawler, then hand-pick the remaining aircraft images from www.jetphotos.com.**

After hand-picking out the broken/irrelevant images:
+ A330: 33/72 images already present. Need 39 more.
+ Airbus A350: 37/161 images already present. Need 124 more.
+ Boeing 737 MAX: 4/164 images already present. Need 160 more.
+ Boeing_767: 6/6 images already present. Need 0 more.
+ ATR-72: 75/134 images already present. Need 59 more.
+ Boeing_757: 32/73 images already present. Need 41 more.
+ CRJ-700: 65/68 images already present. Need 3 more.
+ Boeing_777: 32/69 images already present. Need 37 more.
+ Boeing 787: 30/157 images already present. Need 127 more.
+ A380: 7/136 images already present. Need 129 more.
+ Dash_8: 32/70 images already present. Need 38 more.
+ Embraer_E-Jet: 1/5 images already present. Need 4 more.
TOTAL: 761 images to be added.

In [None]:
# Validate images in 3_1_add_pics (detect broken or unreadable images)

def validate_images_in_folder(base_folder):
    """Return the paths of all image files under ``base_folder`` that OpenCV cannot read.

    The folder is walked recursively; only ``.jpg``, ``.jpeg`` and ``.png``
    files (case-insensitive) are checked. Each unreadable file is reported on
    stdout, followed by a final summary line.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    broken_images = []

    for root, _, files in os.walk(base_folder):
        candidates = (
            os.path.join(root, name)
            for name in files
            if name.lower().endswith(image_exts)
        )
        for path in candidates:
            # cv2.imread signals an undecodable file by returning None
            if cv2.imread(path) is None:
                broken_images.append(path)
                print(f"❌ Corrupt: {path}")

    print(f"\n🔍 Found {len(broken_images)} broken images.")
    return broken_images

# Scan the balanced dataset for unreadable images.
# NOTE(review): the comment at the top of this cell mentions 3_1_add_pics, but
# the call scans balanced_images — confirm which folder is intended.
broken = validate_images_in_folder(balanced_images)
print(f"{len(broken)} broken images found.")

In [None]:
# delete balanced_images folder for clean rebuild

# shutil.rmtree("/Users/williamhutchinson/Local_Docs/M.Sc. WINFO/SEM3/DAT255/notebooks/assets/fgvc-aircraft/data/4_balanced_images", ignore_errors=True)
#print("🗑️  Deleted 4_balanced_images for clean rebuild.")

# 5. Moving the images to the final target folder.

Moved the additional pictures from 3_1_add_pics to 4_balanced_images to match the 200 image count per class.

In [None]:
# Renumber the images in every class folder to <class>_NNNN.<ext>

def safe_rename_images(base_dir="4_balanced_images"):
    """Renumber the images of every class folder as ``<class>_NNNN.<ext>``.

    For each sub-folder of ``base_dir`` the image files (``.jpg``, ``.jpeg``,
    ``.png``, case-insensitive) are sorted by name and renamed in two passes:
    first to temporary ``__temp_NNNN`` names, then to their final names. The
    intermediate step prevents a final name from clashing with a file that has
    not been renamed yet. Original extensions are kept; other files are left
    untouched.
    """
    image_exts = ('.jpg', '.jpeg', '.png')

    for cls in sorted(os.listdir(base_dir)):
        cls_path = os.path.join(base_dir, cls)
        if os.path.isdir(cls_path):
            originals = sorted(
                name for name in os.listdir(cls_path)
                if name.lower().endswith(image_exts)
            )

            # pass 1: move every image out of the way under a temporary name
            placeholders = []
            for index, original in enumerate(originals, start=1):
                suffix = os.path.splitext(original)[1]
                placeholder = f"__temp_{index:04d}{suffix}"
                os.rename(
                    os.path.join(cls_path, original),
                    os.path.join(cls_path, placeholder),
                )
                placeholders.append(placeholder)

            # pass 2: assign the final, consecutively numbered names
            for index, placeholder in enumerate(placeholders, start=1):
                _, suffix = os.path.splitext(placeholder)
                final_name = f"{cls}_{index:04d}{suffix}"
                os.rename(
                    os.path.join(cls_path, placeholder),
                    os.path.join(cls_path, final_name),
                )

            print(f"Safely renamed {len(originals)} files in {cls}")

# Renumber all images in the balanced dataset so every class folder has
# consecutive <class>_NNNN file names
safe_rename_images(balanced_images)

In [None]:
# Build a dataframe of the balanced dataset and show the image count per class
image_extensions = ('.jpg', '.jpeg', '.png')
data_balanced = []

for family in sorted(os.listdir(balanced_images)):
    family_path = os.path.join(balanced_images, family)
    if os.path.isdir(family_path):
        data_balanced.extend(
            {"image_path": os.path.join(family_path, img), "family": family}
            for img in os.listdir(family_path)
            if img.lower().endswith(image_extensions)
        )

df_balanced = pd.DataFrame(data_balanced)

# last expression of the cell: displays the per-family counts, sorted by family name
df_balanced["family"].value_counts().sort_index()

# 6. Run YOLOv8 Cropping on images

In [None]:
# Output root: one sub-folder per class containing the airplane crops
classFolder = os.path.join(base_path, "classFolder")
os.makedirs(classFolder, exist_ok=True)

yolo_model = YOLO("yolov8n.pt")
# class id of "airplane" in the COCO label set used by the pretrained weights
AIRPLANE_CLASS_ID = 4

# Loop through each class in the balanced images
for class_name in os.listdir(balanced_images):
    input_class_dir = os.path.join(balanced_images, class_name)
    output_class_dir = os.path.join(classFolder, class_name)

    if not os.path.isdir(input_class_dir):
        continue

    os.makedirs(output_class_dir, exist_ok=True)

    for img_file in tqdm(os.listdir(input_class_dir), desc=f"Processing {class_name}"):
        if not img_file.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        img_path = os.path.join(input_class_dir, img_file)
        image = cv2.imread(img_path)

        if image is None:
            print(f" Skipped unreadable image: {img_path}")
            continue

        h, w = image.shape[:2]
        stem = os.path.splitext(img_file)[0]

        # Run YOLO
        results = yolo_model.predict(image, conf=0.25, iou=0.3)

        # Number the crops per detected airplane. Indexing by the position in
        # `results` (always 0 for a single input image) would make every
        # detection of the same image overwrite the same "_crop0" file.
        crop_index = 0
        for result in results:
            for box in result.boxes:
                if int(box.cls[0]) != AIRPLANE_CLASS_ID:
                    continue

                # clamp the box to the image bounds before slicing
                x1, y1, x2, y2 = map(int, box.xyxy[0])
                x1, y1 = max(0, x1), max(0, y1)
                x2, y2 = min(w, x2), min(h, y2)

                cropped = image[y1:y2, x1:x2]
                if cropped.size == 0:
                    continue

                out_path = os.path.join(output_class_dir, f"{stem}_crop{crop_index}.jpg")
                cv2.imwrite(out_path, cropped)
                crop_index += 1

# Result
**Result is the creation of the folder "classFolder", which contains the images used for the model training.**