In [1]:
%pip install torch torchvision matplotlib pillow


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import shutil

# Merge two class-per-folder image datasets into a single dataset.
# For every class name found in either dataset, all files of that class are
# copied into target_dir/<class>/ under an index-prefixed name so that files
# with the same basename in both datasets do not overwrite each other.
# The per-class image counts are left as they are (no balancing).

# Paths
source_dir = r"D:\Downloads\Datasets\Wildlife_dataset\images\species_train"
extra_dir = r"D:\Downloads\Datasets\Wildlife_Species_Identification\Animal_dataset\train"
target_dir = r"D:\Downloads\Datasets\Imbalanced_dataset"

os.makedirs(target_dir, exist_ok=True)

# Get all class names from both datasets
all_classes = set(os.listdir(source_dir)) | set(os.listdir(extra_dir))

# Iterate in sorted order: set iteration order for strings changes between
# interpreter runs, which would make the progress log differ on every run.
for cls in sorted(all_classes):
    images = []

    # Collect files for this class from each dataset that has the class folder.
    for dataset_root in (source_dir, extra_dir):
        class_dir = os.path.join(dataset_root, cls)
        if not os.path.isdir(class_dir):
            continue
        # os.listdir returns entries in arbitrary order. Sorting keeps the
        # index prefix assigned below stable, so re-running this cell rewrites
        # the same target files instead of adding renamed duplicates.
        for entry in sorted(os.listdir(class_dir)):
            entry_path = os.path.join(class_dir, entry)
            if os.path.isfile(entry_path):
                images.append(entry_path)

    if not images:
        print(f"⚠ Skipping class '{cls}' (no images found).")
        continue

    # Create class folder in target
    target_class_path = os.path.join(target_dir, cls)
    os.makedirs(target_class_path, exist_ok=True)

    # Copy with unique names (to avoid overwriting)
    for idx, img_path in enumerate(images):
        img_name = os.path.basename(img_path)
        new_name = f"{idx}_{img_name}"
        shutil.copy2(img_path, os.path.join(target_class_path, new_name))

    print(f"✅ {cls}: {len(images)} images copied")

print("\n🎯 Dataset merged successfully (kept imbalanced distribution)!")


✅ Horse: 400 images copied
✅ Sea turtle: 239 images copied
✅ Chicken: 388 images copied
✅ leopardus_wiedii: 180 images copied
✅ Harbor seal: 240 images copied
✅ Lion: 208 images copied
✅ Canary: 113 images copied
✅ Otter: 75 images copied
✅ felis_silvestris: 108 images copied
✅ caracal_caracal: 101 images copied
✅ Spider: 856 images copied
✅ Whale: 287 images copied
✅ catopuma_temminckii: 2 images copied
✅ leptailurus_serval: 74 images copied
✅ Jellyfish: 457 images copied
✅ leopardus_guigna: 19 images copied
✅ Parrot: 421 images copied
✅ leopardus_emiliae: 5 images copied
✅ felis_nigripes: 4 images copied
✅ Fish: 835 images copied
✅ Fox: 148 images copied
✅ leopardus_geoffroyi: 147 images copied
✅ acinonyx_jubatus: 114 images copied
✅ Tick: 74 images copied
✅ Ostrich: 136 images copied
✅ elephas_maximus: 173 images copied
✅ Polar bear: 229 images copied
✅ Hedgehog: 80 images copied
✅ caracal_aurata: 16 images copied
✅ Owl: 40

In [3]:
import os
import random
import shutil

# Split the merged class-per-folder dataset into train/val/test folders.
# For every class folder in dataset_dir the files are shuffled and copied to
# output_dir/<split>/<class>/. Train and val sizes are the truncated products
# of the ratios with the class size; test receives whatever is left.

dataset_dir = r"D:\Downloads\Datasets\Imbalanced_dataset"
# NOTE(review): the folder name contains a typo ("dataste"); it is kept as-is
# because other cells or data already on disk may depend on this exact path.
output_dir = r"D:\Downloads\Datasets\Imbalanced_dataste_split"  # moved outside source folder

train_ratio = 0.7
val_ratio = 0.2
test_ratio = 0.1

# test_ratio is not used in the slicing below (test is the remainder), so make
# sure the three ratios actually describe a full partition.
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, \
    "train_ratio + val_ratio + test_ratio must sum to 1"

# Fixed seed so that re-running this cell reproduces the same split. Without
# it every run shuffles differently and, because the output folders are never
# cleared, the same image ends up in more than one of train/val/test.
split_seed = 42

os.makedirs(output_dir, exist_ok=True)
for split in ['train', 'val', 'test']:
    os.makedirs(os.path.join(output_dir, split), exist_ok=True)

# Sorted so the progress log has a stable order (os.listdir order is arbitrary).
for cls in sorted(os.listdir(dataset_dir)):
    cls_path = os.path.join(dataset_dir, cls)

    # Skip anything that is not a class folder. The "split_dataset" name check
    # is a leftover from when the output folder lived inside dataset_dir.
    if not os.path.isdir(cls_path) or cls.lower() == "split_dataset":
        continue

    # Sort before shuffling: the shuffle result depends on the input order,
    # and os.listdir does not guarantee one.
    images = sorted(f for f in os.listdir(cls_path) if os.path.isfile(os.path.join(cls_path, f)))
    # One generator per class, seeded from the seed and the class name, so a
    # class's split does not change when other classes are added or removed.
    random.Random(f"{split_seed}:{cls}").shuffle(images)

    n_total = len(images)
    n_train = int(train_ratio * n_total)
    n_val = int(val_ratio * n_total)

    splits = {
        'train': images[:n_train],
        'val': images[n_train:n_train + n_val],
        'test': images[n_train + n_val:]
    }

    for split, split_images in splits.items():
        split_cls_dir = os.path.join(output_dir, split, cls)
        os.makedirs(split_cls_dir, exist_ok=True)
        for img in split_images:
            shutil.copy2(os.path.join(cls_path, img), os.path.join(split_cls_dir, img))

        # Entries already present that do not belong to this split come from
        # an earlier run with a different shuffle; report them rather than
        # silently leaving overlapping splits behind.
        leftovers = set(os.listdir(split_cls_dir)) - set(split_images)
        if leftovers:
            print(f"⚠ {split}/{cls}: {len(leftovers)} file(s) from an earlier run are not part of this split; "
                  "delete the output folder and re-run to avoid leakage between splits")

    print(f"✅ {cls}: {n_total} → Train:{n_train}, Val:{n_val}, Test:{len(images)-n_train-n_val}")

print("\n🎯 Dataset split into train/val/test successfully!")


✅ acinonyx_jubatus: 114 → Train:79, Val:22, Test:13
✅ Bear: 87 → Train:60, Val:17, Test:10
✅ Brown bear: 108 → Train:75, Val:21, Test:12
✅ Butterfly: 1875 → Train:1312, Val:375, Test:188
✅ Camel: 67 → Train:46, Val:13, Test:8
✅ Canary: 113 → Train:79, Val:22, Test:12
✅ caracal_aurata: 16 → Train:11, Val:3, Test:2
✅ caracal_caracal: 101 → Train:70, Val:20, Test:11
✅ Caterpillar: 494 → Train:345, Val:98, Test:51
✅ catopuma_temminckii: 2 → Train:1, Val:0, Test:1
✅ Cats: 255 → Train:178, Val:51, Test:26
✅ Cattle: 70 → Train:49, Val:14, Test:7
✅ Centipede: 194 → Train:135, Val:38, Test:21
✅ Cheetah: 132 → Train:92, Val:26, Test:14
✅ Chicken: 388 → Train:271, Val:77, Test:40
✅ Crab: 309 → Train:216, Val:61, Test:32
✅ Crocodile: 108 → Train:75, Val:21, Test:12
✅ Deer: 327 → Train:228, Val:65, Test:34
✅ Dogs: 272 → Train:190, Val:54, Test:28
✅ Duck: 542 → Train:379, Val:108, Test:55
âœ… Eagle: 719 â†’ Train:503, Va