In [2]:
import pandas as pd
import shutil
import random
from pathlib import Path

# Seed Python's global `random` generator; the random.sample / random.shuffle
# calls further down draw from this generator.
random.seed(0)
# Number of CSV rows per chunk for the pd.read_csv(..., chunksize=...) calls below.
CHUNK_SIZE = 100000
# Root folder of the source images; copy_selected_species looks for one
# sub-folder per species_id directly under it.
IMAGES_PATH = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size/images_max_side_800'
# Metadata CSV, read with sep=';'.
FILE_PATH = '../../PlantCLEF2024singleplanttrainingdata.csv'
# Column -> dtype mapping handed to every pd.read_csv call in this notebook.
DTYPE_DICT = {
    'image_name': str,
    'organ': str,
    'species_id': int,
    'obs_id': 'Int64',  # pandas nullable integer dtype
    'license': str,
    'partner': str,
    'author': str,
    'altitude': float,
    'latitude': float,
    'longitude': float,
    'gbif_species_id': float,  # presumably float because the column has missing values - TODO confirm
    'species': str,
    'genus': str,
    'family': str,
    'dataset': str,
    'publisher': str,
    'references': str,
    'url': str,
    'learn_tag': str,
    'image_backup_url': str
}

In [2]:
def get_species_counts(file_path):
    """Count the training rows per species in a ';'-separated metadata CSV.

    The file is streamed in chunks of CHUNK_SIZE rows and only the
    'species_id' and 'learn_tag' columns are parsed. Rows whose 'learn_tag'
    is not 'train' are ignored.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        pandas.Series of int64 counts indexed by species_id, ordered from the
        highest to the lowest count.
    """
    reader = pd.read_csv(
        file_path,
        usecols=['species_id', 'learn_tag'],
        sep=';',
        chunksize=CHUNK_SIZE,
        dtype=DTYPE_DICT,
    )

    totals = pd.Series(dtype='int64')
    for frame in reader:
        train_ids = frame.loc[frame['learn_tag'] == 'train', 'species_id']
        # fill_value=0 keeps species that appear in only one of the two operands;
        # the result is cast back to int64 after the aligned addition.
        totals = totals.add(train_ids.value_counts(), fill_value=0).astype('int64')

    return totals.sort_values(ascending=False)

In [3]:
# Count the training rows per species, report how many distinct counts exist,
# then keep exactly one species per distinct count (the first one in
# descending-count order).
species = get_species_counts(FILE_PATH)
print(f'Number of species with unique number of images: {species.nunique()}')
# NOTE: `species` is rebound here and again by later cells; re-run this cell to
# get back to this selection.
species = species.drop_duplicates(keep='first')
species

Number of species with unique number of images: 580


species_id
1369068    803
1360257    773
1737559    750
1741625    680
1414366    654
          ... 
1564493      5
1744452      4
1393712      3
1744611      2
1744551      1
Length: 580, dtype: int64

In [4]:
def save_species(species_ids, output_filename, original_file_path, chunk_size=None, dtype=None):
    """Write the rows of a ';'-separated CSV that belong to the given species.

    The input is streamed chunk by chunk and every row whose 'species_id' is
    in `species_ids` is written to `output_filename`. All columns are kept and
    no filtering on other columns is applied.

    The output file is always (re)created: if no row matches, it contains only
    the header line. This stops a stale file from an earlier run being picked
    up by a later step.

    Args:
        species_ids: Iterable of species ids to keep.
        output_filename: Path of the CSV file to write (overwritten).
        original_file_path: Path of the CSV file to read.
        chunk_size: Rows per chunk; defaults to the module-level CHUNK_SIZE.
        dtype: Column dtypes for pd.read_csv; defaults to the module-level
            DTYPE_DICT.

    Returns:
        None.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if dtype is None:
        dtype = DTYPE_DICT

    wanted = set(species_ids)
    chunks = pd.read_csv(original_file_path, sep=';', chunksize=chunk_size, dtype=dtype)

    header_written = False
    for chunk in chunks:
        filtered_chunk = chunk[chunk['species_id'].isin(wanted)]
        # The first chunk is written even when empty so the file exists with a
        # header; later empty chunks have nothing to append.
        if filtered_chunk.empty and header_written:
            continue
        filtered_chunk.to_csv(
            output_filename,
            mode='a' if header_written else 'w',
            header=not header_written,
            index=False,
            sep=';',
        )
        header_written = True

In [5]:
# Write every CSV row of the currently selected species to a smaller CSV.
# save_species filters on species_id only, so rows of every learn_tag are kept.
file_path_unique = '../../PlantCLEF2024singleplanttrainingdata_unique.csv'
save_species(species.index, file_path_unique, FILE_PATH)

In [6]:
# Thin the selection: keep every species with more than 620 rows and every
# third of the remaining ones (in their current order).
# NOTE: this overwrites `species` with a subset of itself, so running the cell
# twice thins the selection twice; rebuild `species` before re-running it.
species = pd.concat([species[species > 620], species[species <= 620].iloc[::3]])
species

species_id
1369068    803
1360257    773
1737559    750
1741625    680
1414366    654
          ... 
1392429     13
1744472     10
1550140      7
1744452      4
1744551      1
Length: 200, dtype: int64

In [7]:
# Filter the previously written "unique" CSV down to the thinned species selection.
file_path_60000 = '../../PlantCLEF2024singleplanttrainingdata_60000.csv'
save_species(species.index, file_path_60000, file_path_unique)

In [8]:
# Thin again: keep all species with more than 620 rows, every second species
# with 51..620 rows, and all species with at most 50 rows.
# NOTE: like the previous thinning step this rebinds `species` to a subset of
# itself, so the cell is not idempotent.
species = pd.concat([species[species > 620], species[(species <= 620) & (species > 50)].iloc[::2], species[species <= 50]])
species

species_id
1369068    803
1360257    773
1737559    750
1741625    680
1414366    654
          ... 
1392429     13
1744472     10
1550140      7
1744452      4
1744551      1
Length: 113, dtype: int64

In [9]:
# Filter the "60000" CSV down to the current species selection.
file_path_30000 = '../../PlantCLEF2024singleplanttrainingdata_30000.csv'
save_species(species.index, file_path_30000, file_path_60000)

In [3]:
def copy_selected_species(csv_path, src_folder, dst_folder, chunk_size=None, dtype=None):
    """Copy the image folder of every species listed in a CSV.

    For each distinct 'species_id' found in `csv_path`, the folder
    `src_folder/<species_id>` is copied recursively to
    `dst_folder/<species_id>`. Only the 'species_id' column is parsed, since
    it is the only one this function uses.

    Args:
        csv_path: ';'-separated CSV with a 'species_id' column.
        src_folder: Folder containing one sub-folder per species id.
        dst_folder: Folder to create and fill. It must not exist yet.
        chunk_size: Rows per chunk; defaults to the module-level CHUNK_SIZE.
        dtype: Column dtypes for pd.read_csv; defaults to the module-level
            DTYPE_DICT.

    Raises:
        FileExistsError: If `dst_folder` already exists.
        FileNotFoundError: If a listed species has no folder in `src_folder`.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    if dtype is None:
        dtype = DTYPE_DICT

    src_dir = Path(src_folder)
    dst_dir = Path(dst_folder)
    # No exist_ok on purpose: refuse to write into an existing destination.
    dst_dir.mkdir(parents=True)

    chunks = pd.read_csv(
        csv_path,
        usecols=['species_id'],
        sep=';',
        chunksize=chunk_size,
        dtype=dtype,
    )

    # A species usually spans several chunks; copy its folder only once.
    processed_species = set()
    for chunk in chunks:
        for sid in chunk['species_id'].unique().astype(str):
            if sid in processed_species:
                continue
            shutil.copytree(src_dir / sid, dst_dir / sid)
            processed_species.add(sid)

In [11]:
# Copy the image folders of the species listed in the "30000" CSV into their own tree.
images_path_30000 = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_30000'
copy_selected_species(file_path_30000, IMAGES_PATH, images_path_30000)

In [12]:
def copy_random_subset(src_folder, dst_folder, factor):
    """Copy a random 1/`factor` share of each species folder's images.

    For every sub-folder of `src_folder`, a same-named folder is created in
    `dst_folder` and `len(images) // factor` randomly chosen files (at least
    one when the folder is not empty) are copied into it. Empty source folders
    yield empty destination folders.

    Entries are sorted before sampling so that, for a given state of the
    global `random` generator, the selection does not depend on the order in
    which the filesystem lists them. Non-directory entries in `src_folder` and
    non-file entries inside a species folder are ignored.

    Args:
        src_folder: Folder containing one sub-folder per species.
        dst_folder: Folder to create and fill. It must not exist yet.
        factor: Reduction factor, >= 1 (2 keeps about half of the images).

    Raises:
        ValueError: If `factor` is smaller than 1.
        FileExistsError: If `dst_folder` already exists.
    """
    if factor < 1:
        raise ValueError(f'factor must be >= 1, got {factor!r}')

    src_dir = Path(src_folder)
    dst_dir = Path(dst_folder)
    # No exist_ok on purpose: a second random sample must not be mixed into
    # the output of a previous run.
    dst_dir.mkdir(parents=True)

    for src_sid_path in sorted(p for p in src_dir.iterdir() if p.is_dir()):
        dst_sid_path = dst_dir / src_sid_path.name
        dst_sid_path.mkdir(exist_ok=True)

        images = sorted(p for p in src_sid_path.iterdir() if p.is_file())
        if not images:
            continue

        # Always keep at least one image so that no species disappears.
        num_to_keep = max(1, int(len(images) // factor))
        for img_path in random.sample(images, num_to_keep):
            shutil.copy2(img_path, dst_sid_path / img_path.name)

In [13]:
# Keep roughly half of the images of every species (factor 2, at least one image each).
images_path_15000 = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_15000'
copy_random_subset(images_path_30000, images_path_15000, 2)

In [14]:
# Keep roughly a third of the images of every species (factor 3); sampled from
# the "30000" tree again, not from the "15000" one.
images_path_10000 = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_10000'
copy_random_subset(images_path_30000, images_path_10000, 3)

In [8]:
def train_test_split(src_folder, dst_folder, train_size, test_size):
    """Split each species folder into train / test / val sub-trees.

    Images of `src_folder/<species>` are shuffled with the global `random`
    generator and copied to `dst_folder/{train,test,val}/<species>`. The
    validation split receives whatever is left after train and test.

    Special cases, chosen so that no split folder of a species is ever empty:
      * 0 images: a message is printed and the species is skipped entirely
        (no folders are created for it).
      * 1 image: the same image is copied into all three splits.
      * 2 images: one goes to train, the other to both test and val.
      * 3 or more: train and test get at least one image each and are shrunk
        until at least one image remains for val.

    Species folders and images are sorted before shuffling so that, for a
    given state of the `random` generator, the split does not depend on the
    filesystem's listing order.

    Caveat: an existing `dst_folder` is not cleared. Re-running into the same
    destination after a different shuffle can leave one image in several
    splits.

    Args:
        src_folder: Folder containing one sub-folder per species.
        dst_folder: Folder receiving the 'train', 'test' and 'val' sub-trees.
        train_size: Fraction of each species' images used for training.
        test_size: Fraction of each species' images used for testing.
    """
    src_dir = Path(src_folder)
    dst_dir = Path(dst_folder)
    species_dirs = sorted(d for d in src_dir.iterdir() if d.is_dir())

    for species_dir in species_dirs:
        images = sorted(p for p in species_dir.iterdir() if p.is_file())
        random.shuffle(images)

        num_images = len(images)
        if num_images == 0:
            print(f'In {species_dir.name} no images!')
            continue

        if num_images == 1:
            splits = {
                'train': [images[0]],
                'test':  [images[0]],
                'val':   [images[0]]
            }
        elif num_images == 2:
            splits = {
                'train': [images[0]],
                'test':  [images[1]],
                'val':   [images[1]]
            }
        else:
            num_train = max(1, int(num_images * train_size))
            num_test = max(1, int(num_images * test_size))
            # Shrink train first (while it has more than one image), then test,
            # until val gets at least one image. With num_images >= 3 this
            # always stops with num_train >= 1 and num_test >= 1.
            while num_train + num_test >= num_images:
                if num_train > 1:
                    num_train -= 1
                else:
                    num_test -= 1
            splits = {
                'train': images[:num_train],
                'test':  images[num_train : num_train + num_test],
                'val':   images[num_train + num_test:]
            }

        for split_name, split_files in splits.items():
            target_path = dst_dir / split_name / species_dir.name
            target_path.mkdir(parents=True, exist_ok=True)

            for img_path in split_files:
                shutil.copy2(img_path, target_path / img_path.name)

In [54]:
# Split the "10000" tree per species: 0.8 train, 0.1 test, the rest validation.
images_path_10000_split = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_10000_split'
train_test_split(images_path_10000, images_path_10000_split, 0.8, 0.1)

In [11]:
def count_empty_folders(folder):
    """Return how many directories below `folder` contain no entries at all.

    The tree is walked recursively. `folder` itself is not counted, only the
    directories found underneath it.

    Args:
        folder: Root of the tree to inspect (str or Path).

    Returns:
        Number of empty directories found.
    """
    root = Path(folder)
    return sum(
        1
        for entry in root.rglob('*')
        # A directory is empty when its listing yields nothing.
        if entry.is_dir() and next(entry.iterdir(), None) is None
    )

In [55]:
# Sanity check: report how many directories of the split tree are empty.
print(f'Empty folders in {images_path_10000_split}: {count_empty_folders(images_path_10000_split)}')

Empty folders in ../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_10000_split: 0


In [21]:
def distribute_to_nodes(src_folder, dst_folder, num_workers):
    """Partition the species folders of `src_folder` across worker nodes.

    Creates `dst_folder/node1` ... `dst_folder/node<num_workers>` and copies
    each species folder into exactly one of them. Folders are taken in sorted
    order and handed out in contiguous batches whose sizes differ by at most
    one: the first `len(folders) % num_workers` nodes receive one extra
    folder. A node folder is created even if it receives nothing (fewer
    species than workers).

    Non-directory entries in `src_folder` are ignored.

    Args:
        src_folder: Folder containing one sub-folder per species.
        dst_folder: Folder to create and fill. It must not exist yet.
        num_workers: Number of node folders to create, >= 1.

    Raises:
        ValueError: If `num_workers` is smaller than 1.
        FileExistsError: If `dst_folder` already exists.
    """
    if num_workers < 1:
        raise ValueError(f'num_workers must be >= 1, got {num_workers!r}')

    src_dir = Path(src_folder)
    dst_dir = Path(dst_folder)
    dst_dir.mkdir(parents=True)

    # Sorted so the assignment does not depend on filesystem listing order.
    species_folders = sorted(f for f in src_dir.iterdir() if f.is_dir())
    base_size, extra = divmod(len(species_folders), num_workers)

    start_idx = 0
    for i in range(num_workers):
        node_dir = dst_dir / f'node{i+1}'
        node_dir.mkdir(parents=True)

        # Spread the remainder over the first `extra` nodes rather than
        # putting all of it on the last one.
        batch_size = base_size + (1 if i < extra else 0)
        current_batch = species_folders[start_idx : start_idx + batch_size]
        start_idx += batch_size

        for folder in current_batch:
            shutil.copytree(folder, node_dir / folder.name)

In [56]:
# Spread the species folders of the training split over 3 node folders under ../../data.
train_images = Path(images_path_10000_split) / 'train'
distribute_to_nodes(train_images, '../../data', 3)

In [6]:
# Copy the image folders of every species listed in the "unique" CSV.
images_path_unique = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_unique'
copy_selected_species(file_path_unique, IMAGES_PATH, images_path_unique)

In [9]:
# Split the "unique" tree per species: 0.8 train, 0.1 test, the rest validation.
images_path_unique_split = '../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_unique_split'
train_test_split(images_path_unique, images_path_unique_split, 0.8, 0.1)

In [12]:
# Sanity check: report how many directories of the split tree are empty.
print(f'Empty folders in {images_path_unique_split}: {count_empty_folders(images_path_unique_split)}')

Empty folders in ../../PlantCLEF2024singleplanttrainingdata_800_max_side_size_unique_split: 0
