In [1]:
# Import libraries
import os
import pandas as pd
from pathlib import Path

In [None]:
# List the class folders (one per animal) found directly under animals/
animals_path = Path("animals")
# Path.iterdir() yields entries in arbitrary, platform-dependent order, so sort
# by folder name: this keeps the preview below, and the row order of any table
# built from this list, the same on every machine and every run.
subfolders = sorted(
    (entry for entry in animals_path.iterdir() if entry.is_dir()),
    key=lambda entry: entry.name,
)
print(f"Number of subfolders: {len(subfolders)}")
print("Top 5 subfolders:", [f.name for f in subfolders[:5]])

Number of subfolders: 90
Top 5 subfolders: ['antelope', 'badger', 'bat', 'bear', 'bee']


In [3]:
# Build one record per animal folder holding the number of regular files
# directly inside it (every file is assumed to be an image; nested
# directories are not counted).
data = [
    {
        'animal': folder.name,
        'num_images': sum(1 for entry in folder.glob('*') if entry.is_file()),
    }
    for folder in subfolders
]
df = pd.DataFrame(data)
print("Complete DataFrame:")
print(df)

Complete DataFrame:
        animal  num_images
0     antelope          60
1       badger          60
2          bat          60
3         bear          60
4          bee          60
..         ...         ...
85       whale          60
86        wolf          60
87      wombat          60
88  woodpecker          60
89       zebra          60

[90 rows x 2 columns]


In [4]:
# Render the per-animal counts as three tables placed next to each other.
from IPython.display import display, HTML

# Positional slices: rows 0-29, rows 30-59, and every row from 60 onward
# (the last frame takes all remaining rows, however many there are).
df1, df2, df3 = df.iloc[:30], df.iloc[30:60], df.iloc[60:]

# A flex container lays the three tables out in one row; the first two get a
# right margin so neighbouring tables do not touch.
html = "".join([
    '<div style="display: flex;">',
    '<div style="margin-right: 20px;">', df1.to_html(), '</div>',
    '<div style="margin-right: 20px;">', df2.to_html(), '</div>',
    '<div>', df3.to_html(), '</div>',
    '</div>',
])
display(HTML(html))

Unnamed: 0,animal,num_images
0,antelope,60
1,badger,60
2,bat,60
3,bear,60
4,bee,60
5,beetle,60
6,bison,60
7,boar,60
8,butterfly,60
9,cat,60

Unnamed: 0,animal,num_images
30,goose,60
31,gorilla,60
32,grasshopper,60
33,hamster,60
34,hare,60
35,hedgehog,60
36,hippopotamus,60
37,hornbill,60
38,horse,60
39,hummingbird,60

Unnamed: 0,animal,num_images
60,parrot,60
61,pelecaniformes,60
62,penguin,60
63,pig,60
64,pigeon,60
65,porcupine,60
66,possum,60
67,raccoon,60
68,rat,60
69,reindeer,60


In [7]:
# Add up the per-folder counts to report the size of the whole dataset
image_counts = df['num_images']
total = image_counts.sum()
print(f"Total number of images: {total}")

Total number of images: 5400


In [None]:
# Rename every image to "<class>_<n><ext>" (for example "antelope_1.jpg").
# The downloaded files have random names such as "02f4b3be2d.jpg"; giving each
# file a name that carries its class makes the later classification report and
# confusion matrix easier to read.
#
# The cell is safe to re-run: a folder that is already numbered 1..n is left
# untouched, and any other folder is renamed in two passes through unique
# temporary names, so one file can never be renamed on top of another.
# (os.rename silently replaces an existing destination on POSIX and raises
# FileExistsError on Windows, so a direct single-pass rename can lose images
# or abort half-way when target names already exist.)
import os
import uuid

animals_path = "animals"

for subfolder in sorted(os.listdir(animals_path)):
    sub_path = os.path.join(animals_path, subfolder)
    if not os.path.isdir(sub_path):
        continue

    # Sorted so the numbering does not depend on the arbitrary listdir order.
    files = sorted(
        name for name in os.listdir(sub_path)
        if os.path.isfile(os.path.join(sub_path, name))
    )

    # Skip folders whose files are already named "<class>_1" .. "<class>_n".
    prefix = f"{subfolder}_"
    numbers = [
        os.path.splitext(name)[0][len(prefix):] if name.startswith(prefix) else ""
        for name in files
    ]
    if all(num.isdecimal() for num in numbers) and \
            sorted(int(num) for num in numbers) == list(range(1, len(files) + 1)):
        continue

    # Pass 1: park every file under a temporary name that cannot clash with
    # any existing file or with any final target name.
    tag = uuid.uuid4().hex
    staged = []
    for i, file in enumerate(files, start=1):
        ext = os.path.splitext(file)[1]
        tmp_path = os.path.join(sub_path, f".renaming_{tag}_{i}{ext}")
        new_path = os.path.join(sub_path, f"{subfolder}_{i}{ext}")
        os.rename(os.path.join(sub_path, file), tmp_path)
        staged.append((tmp_path, new_path))

    # Pass 2: all original names are gone now, so the final names are free.
    for tmp_path, new_path in staged:
        os.rename(tmp_path, new_path)

In [None]:
# Create Reference_Images/<class>/ and copy a random sample of images from each
# class folder into it. The originals stay in place (files are copied, not moved).
#
# The sample is reproducible: folder and file listings are sorted and a private,
# seeded random generator is used, so re-running the cell on an unchanged
# dataset selects the same files again instead of piling additional random
# picks into the reference folders.
import os
import random
import shutil

animals_path = "animals"
ref_path = "Reference_Images"

NUM_REF_IMAGES = 6   # images copied per class
SAMPLE_SEED = 42     # change to draw a different (still reproducible) sample

# Private generator: leaves the global `random` state untouched.
rng = random.Random(SAMPLE_SEED)

os.makedirs(ref_path, exist_ok=True)

# Sorted because os.listdir order is arbitrary and the generator state carries
# over from one folder to the next.
for subfolder in sorted(os.listdir(animals_path)):
    sub_path = os.path.join(animals_path, subfolder)
    if not os.path.isdir(sub_path):
        continue

    ref_sub_path = os.path.join(ref_path, subfolder)
    os.makedirs(ref_sub_path, exist_ok=True)

    files = sorted(
        name for name in os.listdir(sub_path)
        if os.path.isfile(os.path.join(sub_path, name))
    )
    if len(files) < NUM_REF_IMAGES:
        # Not enough images to sample from; report it rather than skip silently.
        print(f"Skipping {subfolder}: {len(files)} file(s) found, {NUM_REF_IMAGES} required")
        continue

    for file in rng.sample(files, NUM_REF_IMAGES):
        src = os.path.join(sub_path, file)
        dst = os.path.join(ref_sub_path, file)
        shutil.copy(src, dst)