In [1]:
import pandas as pd
import numpy as np
import os

# ---------------------------------------------------------------------------
# User-tunable settings: edit these before running the rest of the notebook.
# ---------------------------------------------------------------------------

# Names of the image-list files that can be produced
TRAINING_FILE_NAME = 'training_images.txt'   # combined training image list
TEST_FILE_NAME = 'test_images.txt'           # combined test image list
RANDOM_FILE_NAME = 'random_images.txt'       # random image list

# Size limits for each list
number_of_images_per_class = 4_500           # training images kept per class
number_of_images_test_per_class = 200        # test images kept per class
number_of_images_random = 20_000             # images drawn for the random list

# Switches selecting which of the lists actually get written
save_training = False
save_test = True
save_random = False

# Read the three input tables from the working directory.
# The annotation tables are later filtered on their 'LabelName' and 'ImageID'
# columns; the class table supplies 'LabelName', 'DisplayName' and 'UseInModel'.
train_annotations = pd.read_csv(
    filepath_or_buffer='oidv6-train-annotations-bbox.csv',
)
class_descriptions = pd.read_csv(
    filepath_or_buffer='oidv7-class-descriptions-boxable.csv',
)
test_annotations = pd.read_csv(
    filepath_or_buffer='test-annotations-bbox.csv',
)

In [2]:
# Keep only the classes whose UseInModel column is True
selected_classes = class_descriptions[class_descriptions['UseInModel'] == True]

# Create output directory if it doesn't exist
os.makedirs('image_lists', exist_ok=True)

# Combined ImageID lists, rebuilt from scratch on every run of this cell
all_training_ids = []
all_test_ids = []

# Process each selected class. Only two columns are needed, so iterate over
# them directly instead of building a Series per row with iterrows().
for display_name, label in zip(selected_classes['DisplayName'],
                               selected_classes['LabelName']):
    print(f"\nProcessing class: {display_name}")

    # Unique ImageIDs annotated with the current class. Selecting just the
    # 'ImageID' column avoids copying every column of the filtered frame.
    train_unique_image_ids = train_annotations.loc[
        train_annotations['LabelName'] == label, 'ImageID'
    ].unique()
    test_unique_image_ids = test_annotations.loc[
        test_annotations['LabelName'] == label, 'ImageID'
    ].unique()

    # Limit to the configured number of unique ImageIDs per class
    limited_train_ids = train_unique_image_ids[:number_of_images_per_class]
    limited_test_ids = test_unique_image_ids[:number_of_images_test_per_class]

    # Add to combined lists
    all_training_ids.extend(limited_train_ids)
    all_test_ids.extend(limited_test_ids)

    print(f"Added {len(limited_train_ids)} training and {len(limited_test_ids)} test ImageIDs for {display_name}")

# An image annotated with several selected classes is picked once per class,
# so the combined lists can hold the same ImageID more than once. Collapse
# the repeats while keeping first-seen order (dict keys preserve insertion
# order), so each image appears a single time in the lists.
train_before, test_before = len(all_training_ids), len(all_test_ids)
all_training_ids = list(dict.fromkeys(all_training_ids))
all_test_ids = list(dict.fromkeys(all_test_ids))
print(
    f"\nRemoved {train_before - len(all_training_ids)} duplicate training and "
    f"{test_before - len(all_test_ids)} duplicate test ImageIDs shared between classes"
)

# Seed for the random image draw, so re-running the notebook reproduces the
# same random list. Set to None to get a fresh draw on every run.
RANDOM_SEED = 42


def _write_image_list(output_file, split, image_ids):
    """Write one ``<split>/<ImageID>`` line per entry of ``image_ids``.

    Parameters
    ----------
    output_file : str
        Path of the text file to create; an existing file is overwritten.
    split : str
        Prefix placed before each ImageID (e.g. ``'train'`` or ``'test'``).
    image_ids : iterable
        ImageIDs to write, in the order they should appear in the file.
    """
    with open(output_file, 'w') as f:
        f.writelines(f"{split}/{image_id}\n" for image_id in image_ids)


# Write combined training ImageIDs
if save_training:
    output_file = os.path.join('image_lists', TRAINING_FILE_NAME)
    _write_image_list(output_file, 'train', all_training_ids)
    print(f"\nWrote {len(all_training_ids)} total training ImageIDs to {output_file}")

# Write combined test ImageIDs
if save_test:
    output_file = os.path.join('image_lists', TEST_FILE_NAME)
    _write_image_list(output_file, 'test', all_test_ids)
    print(f"Wrote {len(all_test_ids)} total test ImageIDs to {output_file}")

# Handle random images (if enabled)
if save_random:
    all_unique_image_ids = train_annotations['ImageID'].unique()
    # Draw without replacement from a seeded generator; the sample size is
    # capped at the number of distinct images available.
    rng = np.random.default_rng(RANDOM_SEED)
    random_image_ids = rng.choice(
        all_unique_image_ids,
        size=min(number_of_images_random, len(all_unique_image_ids)),
        replace=False
    )

    output_file = os.path.join('image_lists', RANDOM_FILE_NAME)
    _write_image_list(output_file, 'train', random_image_ids)
    print(f"Wrote {len(random_image_ids)} random ImageIDs to {output_file}")

print("\nProcessing complete!")


Processing class: Footwear
Added 4500 training and 200 test ImageIDs for Footwear

Processing class: Suit
Added 4500 training and 200 test ImageIDs for Suit

Processing class: Glasses
Added 4500 training and 200 test ImageIDs for Glasses

Processing class: Dress
Added 4500 training and 200 test ImageIDs for Dress

Processing class: Jeans
Added 4500 training and 200 test ImageIDs for Jeans

Processing class: Tire
Added 4500 training and 200 test ImageIDs for Tire

Processing class: Fashion accessory
Added 4500 training and 200 test ImageIDs for Fashion accessory

Processing class: Microphone
Added 4500 training and 200 test ImageIDs for Microphone

Processing class: Guitar
Added 4500 training and 200 test ImageIDs for Guitar

Processing class: Toy
Added 4500 training and 200 test ImageIDs for Toy

Processing class: Poster
Added 4500 training and 200 test ImageIDs for Poster

Processing class: Drink
Added 4500 training and 200 test ImageIDs for Drink

Processing class: Bicycle wheel
Add