In [1]:
import pandas as pd
import os
import shutil
import random

## Splitting the Dataset

The original dataset is too large to train a model on efficiently, so we build a smaller subset.  
Twenty unique drivers are chosen at random from `driver_imgs_list.csv`.  
For each chosen driver, one image is picked from each of the ten classes (`c0`–`c9`), giving at most 200 images in total.  
We only pull data from the original `train` subset, since those images are labeled.

In [2]:
# Seed the stdlib RNG so the random driver selection is repeatable.
random.seed(333)

# Source image folder, metadata CSV, and destination folder for the subset.
images_dir, excel_file, output_dir = (
    'Imgs/train/',
    'driver_imgs_list.csv',
    'limited_dataset',
)

# Make sure the destination exists; an already-present folder is not an error.
os.makedirs(output_dir, exist_ok=True)

In [3]:
# Metadata table; the 'subject' column identifies the driver of each row.
df = pd.read_csv(excel_file)

# Distinct driver ids, then an in-place shuffle driven by the seeded
# stdlib RNG so the pick below is random but repeatable.
unique_drivers = pd.unique(df['subject'])
random.shuffle(unique_drivers)

# Keep the first 20 drivers of the shuffled order.
drivers = unique_drivers[:20]

# Running total of copied images per class label ('c0' .. 'c9').
image_counts = dict.fromkeys((f'c{i}' for i in range(10)), 0)

In [4]:
# Number of images copied so far for each selected driver.
driver_photo_counts = dict.fromkeys(drivers, 0)

# Per-driver flags: has an image of class 'c0' .. 'c9' been copied yet?
# A fresh inner dict is built for every driver so the flags are independent.
selected_images = {
    driver_id: dict.fromkeys((f'c{i}' for i in range(10)), False)
    for driver_id in drivers
}


In [5]:
# Copy one randomly chosen image per class for every selected driver into
# `output_dir`, updating the bookkeeping dicts as we go.
#
# Upper bound on the number of images copied for any single class.
MAX_PER_CLASS = 20

for driver_id in drivers:
    # pandas' `sample` does NOT use the stdlib `random` state, so without an
    # explicit `random_state` the picks would change on every run. Drawing the
    # seed from the (already seeded) `random` module keeps the whole notebook
    # reproducible under Restart & Run All.
    shuffle_seed = random.randrange(2**32)
    driver_df = (
        df[df['subject'] == driver_id]
        .sample(frac=1, random_state=shuffle_seed)
        .reset_index(drop=True)
    )

    # Only the first shuffled row of each class can ever be copied for this
    # driver, so reduce to those rows instead of scanning every image.
    first_per_class = driver_df.drop_duplicates(subset='classname')

    for row in first_per_class.itertuples(index=False):
        class_name = row.classname

        # Skip classes that already reached the global cap.
        if image_counts[class_name] >= MAX_PER_CLASS:
            continue
        # Skip classes already covered for this driver (e.g. cell re-run).
        if selected_images[driver_id][class_name]:
            continue

        # Source images live in per-class sub-folders; the copy is written
        # flat into the output folder under its original file name.
        src_path = os.path.join(images_dir, class_name, row.img)
        dst_path = os.path.join(output_dir, row.img)
        shutil.copyfile(src_path, dst_path)

        image_counts[class_name] += 1
        driver_photo_counts[driver_id] += 1
        selected_images[driver_id][class_name] = True

    # Stop visiting further drivers once every class has hit the cap.
    if all(count >= MAX_PER_CLASS for count in image_counts.values()):
        break
