<a href="https://colab.research.google.com/github/Takosaga/ai_group_project/blob/main/notebooks/data_sorter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import random
from google.colab import auth
from google.cloud import storage

In [3]:
# Project and bucket identifiers used by the rest of the notebook.
project_id = "ai-group-project"  # Replace with your Google Cloud project ID
bucket_name = "ai-group-project-data"  # Replace with your bucket name

# Authenticate the Colab user, then build the storage client and bucket handle.
auth.authenticate_user()
client = storage.Client(project=project_id)
bucket = client.get_bucket(bucket_name)

In [4]:
# Bucket prefix that holds the raw images (note the trailing slash).
source_path = "data/raw/images/"

# Bucket prefix under which the train/val/test datasets are written
# (no trailing slash; consumers append "/<split>/images").
target_path = "datasets/v1"

In [5]:
# Names of the dataset splits produced by this notebook.
splits = ["train", "val", "test"]

# Make sure an "images" directory exists for every split.
# NOTE(review): os.makedirs operates on the local filesystem of the runtime,
# not on the bucket; the later copy step addresses blobs purely by name.
# Confirm whether these local directories are actually needed.
for split in splits:
    split_dir = f"{target_path}/{split}/images"
    os.makedirs(split_dir, exist_ok=True)

In [6]:
# Fraction of the image files assigned to each split.
split_ratios = {
    "train": 0.6,
    "val": 0.2,
    "test": 0.2,
}

In [7]:
# List every blob stored under the source prefix.
blobs = list(bucket.list_blobs(prefix=source_path))

# Keep only image files. The extension is compared case-insensitively so that
# names such as "IMG_0001.JPG" are not silently left out of the dataset.
image_extensions = (".jpg", ".jpeg", ".png")
image_files = [blob.name for blob in blobs if blob.name.lower().endswith(image_extensions)]

In [8]:
# Fixed seed so that "Restart & Run All" always produces the same split.
# This matters because the copy step only adds blobs: a different shuffle on a
# later run would place files into a different split next to the earlier copies.
split_seed = 42

# Sort first, then shuffle with a dedicated seeded generator. Together these
# make the cell idempotent (re-running it yields the same order) and leave the
# global `random` state untouched.
image_files.sort()
random.Random(split_seed).shuffle(image_files)

# Calculate number of files for each split; "test" receives the remainder so
# that rounding never drops a file.
total_files = len(image_files)
train_count = int(total_files * split_ratios["train"])
val_count = int(total_files * split_ratios["val"])
test_count = total_files - train_count - val_count

# Split the files into three consecutive, non-overlapping slices.
train_files = image_files[:train_count]
val_files = image_files[train_count:train_count + val_count]
test_files = image_files[train_count + val_count:]

# Fail early (before anything is copied) if the ratios are inconsistent,
# e.g. train + val ratios above 1 would make the slice sizes disagree.
assert (len(train_files), len(val_files), len(test_files)) == (
    train_count,
    val_count,
    test_count,
), "Split sizes do not match the computed counts; check split_ratios."

In [9]:
# Function to copy files to the target location
def copy_files(file_list, split_name):
    """Copy the given blobs into the images folder of one dataset split.

    Relies on the notebook-level names ``bucket``, ``source_path`` and
    ``target_path``. Blobs are copied within ``bucket``; this function does
    not delete the source blobs.

    Args:
        file_list: Iterable of blob names to copy (expected to start with
            ``source_path``).
        split_name: Name of the split, e.g. "train", "val" or "test".

    Returns:
        The number of blobs for which a copy was issued.
    """
    # The destination prefix is the same for every file, so build it once.
    destination_prefix = f"{target_path}/{split_name}/images/"

    copied_files = 0
    for file_path in file_list:
        # Rewrite only the first occurrence (the leading prefix); a plain
        # replace() would also rewrite the same text if it appears again
        # deeper in the blob name.
        target_blob_name = file_path.replace(source_path, destination_prefix, 1)
        bucket.copy_blob(bucket.blob(file_path), bucket, new_name=target_blob_name)
        copied_files += 1
    return copied_files

In [10]:
# Copy files to their respective folders
# Training split: copy the files, then check the count reported by copy_files.
print("Copying train files...")
train_copied = copy_files(file_list=train_files, split_name="train")
assert train_copied == train_count, (
    f"Expected {train_count} files, but only {train_copied} were copied to train."
)

Copying train files...


In [11]:
# Validation split: copy the files, then check the count reported by copy_files.
print("Copying val files...")
val_copied = copy_files(file_list=val_files, split_name="val")
assert val_copied == val_count, (
    f"Expected {val_count} files, but only {val_copied} were copied to val."
)

Copying val files...


In [12]:
# Test split: copy the files, then check the count reported by copy_files.
print("Copying test files...")
test_copied = copy_files(file_list=test_files, split_name="test")
assert test_copied == test_count, (
    f"Expected {test_count} files, but only {test_copied} were copied to test."
)

Copying test files...


In [13]:
# Report copied-versus-expected counts for every split in one go.
summary_lines = [
    "\nVerification Summary:",
    f"Train files copied: {train_copied}/{train_count}",
    f"Validation files copied: {val_copied}/{val_count}",
    f"Test files copied: {test_copied}/{test_count}",
    "File distribution completed successfully!",
]
print("\n".join(summary_lines))


Verification Summary:
Train files copied: 2521/2521
Validation files copied: 840/840
Test files copied: 842/842
File distribution completed successfully!
