In [3]:
import pandas as pd
import os
import shutil
from sklearn.model_selection import train_test_split


In [4]:
train_folder = 'Road_Dataset_14/train'
img_folder = os.path.join(train_folder, 'img')
ann_folder = os.path.join(train_folder, 'ann')

train_output = 'datasets/yolo_dataset/train'
val_output = 'datasets/yolo_dataset/val'

train_img_output = os.path.join(train_output, 'images')
train_ann_output = os.path.join(train_output, 'annotations')
val_img_output = os.path.join(val_output, 'images')
val_ann_output = os.path.join(val_output, 'annotations')

for folder in [train_img_output, train_ann_output, val_img_output, val_ann_output]:
    if not os.path.exists(folder):
        os.makedirs(folder, exist_ok=True)

In [5]:
# Make a list of images and annotations
images_files = [f for f in os.listdir(img_folder) if f.endswith('.jpg')]
ann_files = [f for f in os.listdir(ann_folder) if f.endswith('.json')]
images_files.sort()
ann_files.sort()

# Check if the number of images matches the number of annotations
assert len(images_files) == len(ann_files), "Number of images and annotations do not match"

In [6]:
# Check all the image and their respective annotation file are there in the list
df = pd.DataFrame([images_files, ann_files])
df.T

Unnamed: 0,0,1
0,China_Drone_000000.jpg,China_Drone_000000.jpg.json
1,China_Drone_000001.jpg,China_Drone_000001.jpg.json
2,China_Drone_000002.jpg,China_Drone_000002.jpg.json
3,China_Drone_000003.jpg,China_Drone_000003.jpg.json
4,China_Drone_000004.jpg,China_Drone_000004.jpg.json
...,...,...
38380,United_States_004800.jpg,United_States_004800.jpg.json
38381,United_States_004801.jpg,United_States_004801.jpg.json
38382,United_States_004802.jpg,United_States_004802.jpg.json
38383,United_States_004803.jpg,United_States_004803.jpg.json


In [7]:
#Split the dataset
train_images, val_images, train_ann, val_ann = train_test_split(images_files, ann_files, test_size=0.2, random_state=42, shuffle=True)

In [8]:
#Checking the train
train_check = pd.DataFrame([train_images, train_ann])
train_check.T

Unnamed: 0,0,1
0,United_States_000805.jpg,United_States_000805.jpg.json
1,Japan_004210.jpg,Japan_004210.jpg.json
2,Norway_000802.jpg,Norway_000802.jpg.json
3,Japan_007890.jpg,Japan_007890.jpg.json
4,Norway_007584.jpg,Norway_007584.jpg.json
...,...,...
30703,Czech_002411.jpg,Czech_002411.jpg.json
30704,India_005239.jpg,India_005239.jpg.json
30705,United_States_004578.jpg,United_States_004578.jpg.json
30706,China_Drone_000860.jpg,China_Drone_000860.jpg.json


In [9]:
# Checking the validation
val_check = pd.DataFrame([val_images, val_ann])
val_check.T

Unnamed: 0,0,1
0,Czech_001051.jpg,Czech_001051.jpg.json
1,Czech_003107.jpg,Czech_003107.jpg.json
2,Japan_006809.jpg,Japan_006809.jpg.json
3,Japan_001745.jpg,Japan_001745.jpg.json
4,Japan_010527.jpg,Japan_010527.jpg.json
...,...,...
7672,Japan_004392.jpg,Japan_004392.jpg.json
7673,Japan_003905.jpg,Japan_003905.jpg.json
7674,Norway_003852.jpg,Norway_003852.jpg.json
7675,Japan_011626.jpg,Japan_011626.jpg.json


In [10]:
def get_country(filename):
    return filename.split('_')[0]

In [14]:
from collections import Counter
country_counts = Counter(get_country(f) for f in train_images)
total_country_counts = Counter(get_country(f) for f in images_files)
val_country_counts = Counter(get_country(f) for f in val_images)
print("---Total Dataset Country Counts--")
for country, count in total_country_counts.items():
    print(f"{country}: {count} images")

print("---Train Dataset Country Counts---")

for country, count in country_counts.items():
    print(f"{country}: {count} images")

print("---Validation Dataset Country Counts---")
for country, count in val_country_counts.items():
    print(f"{country}: {count} images")

---Total Dataset Country Counts--
China: 4378 images
Czech: 2829 images
India: 7706 images
Japan: 10506 images
Norway: 8161 images
United: 4805 images
---Train Dataset Country Counts---
United: 3846 images
Japan: 8470 images
Norway: 6486 images
Czech: 2258 images
India: 6163 images
China: 3485 images
---Validation Dataset Country Counts---
Czech: 571 images
Japan: 2036 images
India: 1543 images
Norway: 1675 images
China: 893 images
United: 959 images


In [15]:
# Copying the new split training images and their annotations files into new train folder
for img, ann in zip(train_images, train_ann):
    shutil.copy(os.path.join(img_folder, img), os.path.join(train_img_output, img))
    shutil.copy(os.path.join(ann_folder, ann), os.path.join(train_ann_output, ann))

In [16]:
# Copying the new split validation images and their annotations files into new
for img, ann in zip(val_images, val_ann):
    shutil.copy(os.path.join(img_folder, img), os.path.join(val_img_output, img))
    shutil.copy(os.path.join(ann_folder, ann), os.path.join(val_ann_output, ann))