In [17]:
import json
import shutil
from pathlib import Path

# --- Sampling parameters -----------------------------------------------------
# Sub-directory of "<DATA_DIR>/images" that the images are drawn from.
SPLIT = "train"
# Upper bound on the number of images copied for each class.
SAMPLES_PER_CLASS = 3

# --- Locations (relative to the notebook's working directory) -----------------
# Dataset root: holds the species id -> name JSON and the "images" tree.
DATA_DIR = Path("../data/plantnet_300K")
# Destination root: one sub-directory per sampled class is created here.
OUTPUT_DIR = Path("../data/top_100_rand_samples")

In [18]:
import sys
# Prepend ../src to the module search path so the import below can resolve;
# presumably the plant_care_ai package lives there — TODO confirm. This must
# run before the import, and the relative entry depends on the kernel's
# current working directory.
sys.path.insert(0, '../src')

from plant_care_ai.training.class_selection import get_most_popular_classes

# The helper returns two values. top_100 is indexed here and iterated later as
# a sequence of class ids; all_counts is indexed by class id to get that
# class's sample count. Presumably top_100 is ordered most-populous first —
# verify against the helper's implementation.
top_100, all_counts = get_most_popular_classes(str(DATA_DIR), top_k=100)
print(f"Top class: {top_100[0]} with {all_counts[top_100[0]]} samples")

Loaded 243916 samples, 1081 classes
Top class: 1363227 with 7208 samples


In [19]:
# Species lookup table shipped with the dataset: maps a class id (string key)
# to the species' Latin name.
json_path = DATA_DIR / "plantnet300K_species_id_2_name.json"
id_to_name = json.loads(json_path.read_text(encoding="utf-8"))

# Show the first three (id, name) pairs as a quick sanity check of the mapping.
list(id_to_name.items())[:3]

[('1355868', 'Lactuca virosa L.'),
 ('1355920', "Pelargonium capitatum (L.) L'Hér."),
 ('1355932', "Pelargonium graveolens L'Hér.")]

In [None]:
from PIL import Image
import random
from collections import Counter

# Copy up to SAMPLES_PER_CLASS randomly chosen images of every class in
# top_100 into OUTPUT_DIR/<class_id>_<latin_name>/ and tally the resolutions
# of the copied images.
#
# The sample is drawn from a private, seeded RNG over a *sorted* file list so
# that Restart & Run All (or simply re-running this cell) always selects the
# same files. With the unseeded global RNG every re-run picked different
# images and the output folders grew beyond SAMPLES_PER_CLASS files.
SAMPLE_SEED = 42
rng = random.Random(SAMPLE_SEED)

resolutions = Counter()  # "WIDTHxHEIGHT" -> number of copied images
missing_classes = []     # class ids whose source directory does not exist

for class_id in top_100:
    # Build a directory-friendly name from the Latin species name.
    latin_name = id_to_name.get(class_id, "Unknown")
    clean_name = latin_name.replace(" ", "_").replace(".", "").replace("/", "-")

    src_dir = DATA_DIR / "images" / SPLIT / class_id
    dst_dir = OUTPUT_DIR / f"{class_id}_{clean_name}"

    if not src_dir.exists():
        # Remember the class instead of skipping it silently.
        missing_classes.append(class_id)
        continue

    dst_dir.mkdir(parents=True, exist_ok=True)

    # glob() yields entries in an arbitrary, filesystem-dependent order;
    # sorting makes the seeded sample independent of that order.
    all_images = sorted(src_dir.glob("*.jpg"))

    # Never ask for more images than the class actually has.
    num_to_sample = min(len(all_images), SAMPLES_PER_CLASS)
    selected_images = rng.sample(all_images, num_to_sample)

    for img_path in selected_images:
        # Open the file only to read its dimensions; the copy below is a
        # plain file copy (copy2 also tries to preserve file metadata).
        with Image.open(img_path) as img:
            width, height = img.size
        resolutions[f"{width}x{height}"] += 1
        shutil.copy2(img_path, dst_dir / img_path.name)

if missing_classes:
    print(f"Skipped {len(missing_classes)} classes without a source directory: {missing_classes}")

# Resolution histogram, most frequent first.
for res, count in resolutions.most_common():
    print(f"{res}: {count} images")

Done! Extracted images for 100 classes :3
600x600: 232 images
506x506: 43 images
437x437: 6 images
405x405: 4 images
450x450: 4 images
337x337: 3 images
506x900: 1 images
442x442: 1 images
540x540: 1 images
412x412: 1 images
414x414: 1 images
415x415: 1 images
505x505: 1 images
593x593: 1 images
