In [9]:
import shutil
import os
# Path to the segmented directory
segmented_root = os.path.join("..", "datasets", "segmented")

# Loop through each segment type
for segment_type in os.listdir(segmented_root):
    segment_type_path = os.path.join(segmented_root, segment_type)
    if not os.path.isdir(segment_type_path):
        continue

    # Loop through each country in the segment type
    for country in os.listdir(segment_type_path):
        country_path = os.path.join(segment_type_path, country)
        if not os.path.isdir(country_path):
            continue

        # Move files from train and val to country root
        for split in ['train', 'val']:
            split_path = os.path.join(country_path, split)
            if os.path.isdir(split_path):
                # Move all files from split directory to country directory
                for file in os.listdir(split_path):
                    src = os.path.join(split_path, file)
                    dst = os.path.join(country_path, file)
                    shutil.move(src, dst)
                
                # Remove the now empty split directory
                os.rmdir(split_path)

print("Files have been moved successfully!")



Files have been moved successfully!


In [10]:
import os
from collections import defaultdict

# Path to the segmented directory (adjust if running from a different location)
segmented_root = os.path.join("..", "datasets", "segmented")

# Dictionary to store results
segment_type_country_counts = {}
segment_type_country_image_counts = {}

# Loop through each segment type (e.g., 'bicycle', etc.)
for segment_type in os.listdir(segmented_root):
    segment_type_path = os.path.join(segmented_root, segment_type)
    if not os.path.isdir(segment_type_path):
        continue

    countries = []
    country_image_counts = defaultdict(int)

    # Loop through each country in the segment type
    for country in os.listdir(segment_type_path):
        country_path = os.path.join(segment_type_path, country)
        if not os.path.isdir(country_path):
            continue
        countries.append(country)

        # Count images directly in country folder
        num_images = len([
            f for f in os.listdir(country_path)
            if os.path.isfile(os.path.join(country_path, f))
        ])
        country_image_counts[country] = num_images

    segment_type_country_counts[segment_type] = len(countries)
    segment_type_country_image_counts[segment_type] = dict(country_image_counts)

# Print results
for segment_type in segment_type_country_counts:
    print(f"Segment type: {segment_type}")
    print(f"  Number of countries: {segment_type_country_counts[segment_type]}")
    print(f"  Images per country:")
    for country, count in segment_type_country_image_counts[segment_type].items():
        print(f"    {country}: {count}")
    print()


Segment type: terrain
  Number of countries: 79
  Images per country:
    Montenegro: 376
    Ukraine: 366
    Germany: 351
    Australia: 387
    Kenya: 392
    Belgium: 371
    Ecuador: 311
    Bulgaria: 382
    Hungary: 390
    Italy: 337
    Canada: 395
    Russia: 382
    Peru: 345
    Bolivia: 304
    Ghana: 364
    Indonesia: 357
    Jordan: 283
    Botswana: 395
    Slovakia: 381
    New Zealand: 390
    Taiwan: 355
    Luxembourg: 360
    United States: 381
    Portugal: 369
    South Korea: 293
    Sri Lanka: 355
    Bangladesh: 351
    Estonia: 394
    Bhutan: 386
    Argentina: 366
    Guatemala: 335
    Finland: 397
    Mexico: 317
    Czechia: 381
    Mongolia: 357
    Greece: 338
    Singapore: 385
    Austria: 287
    United Arab Emirates: 322
    Romania: 372
    South Africa: 383
    Poland: 386
    Albania: 334
    Netherlands: 377
    Uruguay: 372
    Iceland: 391
    Kyrgyzstan: 375
    Switzerland: 366
    Sweden: 391
    Nigeria: 369
    Colombia: 345
    France:

In [11]:
# Find segment types that don't have all 79 countries
missing_countries = {}
for segment_type, country_counts in segment_type_country_counts.items():
    if country_counts < 79:  # 79 is the total number of countries in COUNTRIES list
        missing_countries[segment_type] = country_counts

# Print results
print("Segment types with missing countries:")
for segment_type, count in missing_countries.items():
    print(f"{segment_type}: {count} countries (missing {79 - count} countries)")


Segment types with missing countries:


In [12]:
# Calculate and print bottom 5 segments with lowest sum of bottom 5 countries
print("Bottom 10 segments with lowest sum of bottom 5 countries:")
print("-" * 50)

# Calculate sum of bottom 5 countries for each segment type
segment_sums = {}
for segment_type in segment_type_country_image_counts:
    sorted_countries = sorted(
        segment_type_country_image_counts[segment_type].items(),
        key=lambda x: x[1]
    )
    bottom_5_sum = sum(count for _, count in sorted_countries[:5])
    segment_sums[segment_type] = bottom_5_sum

# Sort segments by their bottom 5 sum
sorted_segments = sorted(segment_sums.items(), key=lambda x: x[1])

# Print bottom 5 segments
for segment_type, total in sorted_segments[:10]:
    print(f"\n{segment_type}:")
    print(f"  Total images in bottom 5 countries: {total}")
    # Print the individual countries and their counts
    sorted_countries = sorted(
        segment_type_country_image_counts[segment_type].items(),
        key=lambda x: x[1]
    )
    for country, count in sorted_countries[:5]:
        print(f"    {country}: {count} images")


Bottom 10 segments with lowest sum of bottom 5 countries:
--------------------------------------------------

rider:
  Total images in bottom 5 countries: 39
    Estonia: 3 images
    Palestine: 7 images
    Iceland: 9 images
    Bulgaria: 10 images
    Luxembourg: 10 images

motorcycle:
  Total images in bottom 5 countries: 47
    Estonia: 4 images
    Latvia: 8 images
    Eswatini: 10 images
    Iceland: 12 images
    Botswana: 13 images

train:
  Total images in bottom 5 countries: 56
    Estonia: 6 images
    Bhutan: 7 images
    Lithuania: 12 images
    Slovenia: 15 images
    Andorra: 16 images

bus:
  Total images in bottom 5 countries: 60
    Bhutan: 9 images
    Greece: 11 images
    Bulgaria: 13 images
    Estonia: 13 images
    Eswatini: 14 images

truck:
  Total images in bottom 5 countries: 100
    Lithuania: 16 images
    Bhutan: 20 images
    Hungary: 21 images
    Latvia: 21 images
    Estonia: 22 images

bicycle:
  Total images in bottom 5 countries: 104
    Estonia: 1

In [8]:
segment_type_country_image_counts.keys()

dict_keys(['terrain', 'road', 'bus', 'train', 'car', 'building', 'pole', 'vegetation', 'rider', 'traffic_light', 'bicycle', 'fence', 'person', 'truck', 'sidewalk', 'motorcycle', 'traffic_sign'])

In [16]:
import os
import shutil
import random

def create_train_val_split(source_dir, train_dir, val_dir, test_dir, train_ratio=0.8, val_ratio=0.1):
    """
    Create exact 80:10:10 train/val/test split while preserving existing test images
    """
    # All images in the source
    all_images = [f for f in os.listdir(source_dir) if f.endswith(('.jpg', '.png', '.jpeg'))]
    
    # Existing test images
    existing_test_images = set(os.listdir(test_dir))
    n_test = len(existing_test_images)
    
    # Desired total number of images based on existing test count
    total_images_needed = int(n_test / (1 - train_ratio - val_ratio))
    
    # Compute how many more are needed for train and val
    n_train = int(total_images_needed * train_ratio)
    n_val = int(total_images_needed * val_ratio)
    
    # Exclude test images from all
    remaining_images = [img for img in all_images if img not in existing_test_images]
    
    # Sanity check
    if len(remaining_images) < (n_train + n_val):
        print(f"Not enough images to fulfill desired split. Need {n_train + n_val}, but only {len(remaining_images)} available.")
    
    # Shuffle and sample
    random.shuffle(remaining_images)
    train_images = remaining_images[:n_train]
    val_images = remaining_images[n_train:n_train + n_val]
    
    # Copy images
    for img in train_images:
        shutil.copy2(os.path.join(source_dir, img), os.path.join(train_dir, img))
        
    for img in val_images:
        shutil.copy2(os.path.join(source_dir, img), os.path.join(val_dir, img))

def clear_train_val_dirs(base_dir):
    for segment_type in segment_type_country_image_counts.keys():
        segment_dir = os.path.join(base_dir, segment_type)
        for country in segment_type_country_image_counts[segment_type].keys():
            country_dir = os.path.join(segment_dir, country)
            train_dir = os.path.join(country_dir, "train")
            val_dir = os.path.join(country_dir, "val")
            
            if os.path.exists(train_dir):
                shutil.rmtree(train_dir)
            if os.path.exists(val_dir):
                shutil.rmtree(val_dir)

# Base directory containing all segment types
base_dir = "/home/andreafabbricatore/rainbot/datasets/segmented"  # Replace with your actual base directory

clear_train_val_dirs(base_dir)

# Process each segment type
for segment_type in segment_type_country_image_counts.keys():
    segment_dir = os.path.join(base_dir, segment_type)
    
    # Process each country
    for country in segment_type_country_image_counts[segment_type].keys():
        country_dir = os.path.join(segment_dir, country)
        
        # Create train/val/test directories if they don't exist
        train_dir = os.path.join(country_dir, "train")
        val_dir = os.path.join(country_dir, "val")
        test_dir = os.path.join(country_dir, "test")
        
        os.makedirs(train_dir, exist_ok=True)
        os.makedirs(val_dir, exist_ok=True)
        os.makedirs(test_dir, exist_ok=True)
        
        # Create splits while preserving test images
        create_train_val_split(
            source_dir=country_dir,
            train_dir=train_dir,
            val_dir=val_dir,
            test_dir=test_dir
        )

print("Dataset splitting completed successfully!")


Not enough images to fulfill desired split. Need 378, but only 376 available.
Not enough images to fulfill desired split. Need 387, but only 366 available.
Not enough images to fulfill desired split. Need 405, but only 387 available.
Not enough images to fulfill desired split. Need 405, but only 392 available.
Not enough images to fulfill desired split. Need 378, but only 371 available.
Not enough images to fulfill desired split. Need 405, but only 390 available.
Not enough images to fulfill desired split. Need 405, but only 395 available.
Not enough images to fulfill desired split. Need 387, but only 382 available.
Not enough images to fulfill desired split. Need 351, but only 345 available.
Not enough images to fulfill desired split. Need 324, but only 283 available.
Not enough images to fulfill desired split. Need 396, but only 395 available.
Not enough images to fulfill desired split. Need 387, but only 355 available.
Not enough images to fulfill desired split. Need 378, but only 3

In [19]:
# Calculate and print train:test:val ratios for each segment type
print("Train:Test:Val ratios (80:10:10) for each segment type:")

segments_to_skip = []

for segment_type in segment_type_country_image_counts.keys():
    print(f"\n{segment_type}:")
    for country in segment_type_country_image_counts[segment_type].keys():
        country_dir = os.path.join(base_dir, segment_type, country)
        
        # Count images in each split
        train_count = len(os.listdir(os.path.join(country_dir, "train")))
        test_count = len(os.listdir(os.path.join(country_dir, "test")))
        val_count = len(os.listdir(os.path.join(country_dir, "val")))
        
        total = train_count + test_count + val_count
        
        try:
            # Calculate actual ratios
            train_ratio = train_count / total
            test_ratio = test_count / total
            val_ratio = val_count / total
            
            print(f"{country}:")
            print(f"  Train: {train_ratio:.2%}")
            print(f"  Test:  {test_ratio:.2%}")
            print(f"  Val:   {val_ratio:.2%}")
        except ZeroDivisionError:
            segments_to_skip.append(f"{segment_type}/{country}")

if segments_to_skip:
    print("\nSegments with zero images (skipped):")
    for segment in segments_to_skip:
        print(f"- {segment}")


Train:Test:Val ratios (80:10:10) for each segment type:

terrain:
Montenegro:
  Train: 80.38%
  Test:  10.05%
  Val:   9.57%
Ukraine:
  Train: 84.11%
  Test:  10.51%
  Val:   5.38%
Germany:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Australia:
  Train: 83.33%
  Test:  10.42%
  Val:   6.25%
Kenya:
  Train: 82.38%
  Test:  10.30%
  Val:   7.32%
Belgium:
  Train: 81.36%
  Test:  10.17%
  Val:   8.47%
Ecuador:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Bulgaria:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Hungary:
  Train: 82.76%
  Test:  10.34%
  Val:   6.90%
Italy:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Canada:
  Train: 81.82%
  Test:  10.23%
  Val:   7.95%
Russia:
  Train: 80.94%
  Test:  10.12%
  Val:   8.94%
Peru:
  Train: 81.25%
  Test:  10.16%
  Val:   8.59%
Bolivia:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Ghana:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Indonesia:
  Train: 80.00%
  Test:  10.00%
  Val:   10.00%
Jordan:
  Train: 88.71%
  Test:  1

In [22]:
segments_to_skip = list(set([i.split("/")[0] for i in segments_to_skip]))

In [None]:
COUNTRIES = ['Albania', 'Andorra', 'Australia', 'Austria', 'Bangladesh', 'Belgium', 'Bhutan', 'Bolivia', 'Brazil', 'Bulgaria', 'Cambodia', 'Canada', 'Chile', 'Colombia', 'Croatia', 'Czechia', 'Denmark', 'Dominican Republic', 'Ecuador', 'Estonia', 'Eswatini', 'Finland', 'France', 'Germany', 'Greece', 'Guatemala', 'Hungary', 'Iceland', 'Indonesia', 'Ireland', 'Israel', 'Italy', 'Japan', 'Jordan', 'Latvia', 'Lesotho', 'Lithuania', 'Luxembourg', 'Malaysia', 'Mexico', 'Montenegro', 'Netherlands', 'New Zealand', 'North Macedonia', 'Norway', 'Palestine', 'Peru', 'Poland', 'Portugal', 'Romania', 'Russia', 'Serbia', 'Singapore', 'Slovakia', 'Slovenia', 'South Africa', 'South Korea', 'Spain', 'Sweden', 'Switzerland', 'Taiwan', 'Thailand', 'Turkey', 'United Arab Emirates', 'United Kingdom', 'United States']
# Create final_datasets directory structure for each segment type
for segment_type in segment_type_country_image_counts.keys():
    if segment_type in segments_to_skip:
        continue
        
    # Create segment-specific final_datasets directory
    segment_final_dir = os.path.join(base_dir, segment_type, "final_datasets")
    
    # Create train/test/val splits
    for split in ["train", "test", "val"]:
        split_dir = os.path.join(segment_final_dir, split)
        os.makedirs(split_dir, exist_ok=True)
        
        # Create country subdirectories
        for country in COUNTRIES:
            country_dir = os.path.join(split_dir, country)
            os.makedirs(country_dir, exist_ok=True)
    
    # Copy images to new structure
    for country in segment_type_country_image_counts[segment_type].keys():
        # Source directories for each split
        train_src = os.path.join(base_dir, segment_type, country, "train")
        test_src = os.path.join(base_dir, segment_type, country, "test")
        val_src = os.path.join(base_dir, segment_type, country, "val")
        
        # Destination directories
        train_dst = os.path.join(segment_final_dir, "train", country)
        test_dst = os.path.join(segment_final_dir, "test", country)
        val_dst = os.path.join(segment_final_dir, "val", country)
        
        # Copy files for each split
        for src, dst in [(train_src, train_dst), (test_src, test_dst), (val_src, val_dst)]:
            if os.path.exists(src):
                for img in os.listdir(src):
                    src_path = os.path.join(src, img)
                    dst_path = os.path.join(dst, img)
                    shutil.copy2(src_path, dst_path)

print("Dataset restructuring completed!")


Dataset restructuring completed!
