In [5]:
import os
from collections import Counter

# Define the path to the labels folder
labels_dir = './Brain_Tumor_Detection_v1/labels'


def count_label_classes(directory):
    """Count how often each class ID occurs across the label files in a folder.

    Only regular files whose name ends in ``.txt`` are read, so notebook
    checkpoint folders and other stray entries in the directory are ignored.
    In each file, the first whitespace-separated token of every non-blank
    line is parsed as the integer class ID; the rest of the line is ignored.

    Args:
        directory: Path of the folder that holds the label files.

    Returns:
        A ``Counter`` mapping each class ID (int) to the number of label
        lines that start with it.

    Raises:
        FileNotFoundError: If ``directory`` does not exist.
        ValueError: If a non-blank line does not start with an integer. The
            message names the offending file and line number.
    """
    counts = Counter()

    # os.listdir returns entries in arbitrary order; sorting makes the run
    # (and the first error reported, if any) deterministic.
    for label_file in sorted(os.listdir(directory)):
        label_path = os.path.join(directory, label_file)

        # Skip sub-directories and anything that is not a .txt label file.
        if not label_file.lower().endswith('.txt') or not os.path.isfile(label_path):
            continue

        with open(label_path, 'r') as file:
            for line_number, line in enumerate(file, start=1):
                fields = line.split()
                # Blank / whitespace-only lines carry no annotation.
                if not fields:
                    continue
                try:
                    class_id = int(fields[0])
                except ValueError as exc:
                    raise ValueError(
                        f"{label_path}:{line_number}: expected an integer "
                        f"class ID, got {fields[0]!r}"
                    ) from exc
                counts[class_id] += 1

    return counts


# Count the class occurrences (empty Counter if the folder is missing)
if os.path.isdir(labels_dir):
    class_counts = count_label_classes(labels_dir)
else:
    class_counts = Counter()
    print(f"Warning: labels directory not found: {labels_dir}")

# Display the class balance
print("Class balance in the dataset:")
for class_id, count in sorted(class_counts.items()):
    print(f"Class {class_id}: {count} instances")


Class balance in the dataset:
Class 0: 6072 instances
Class 1: 9651 instances
Class 2: 5802 instances


In [16]:
import os
import shutil
import random
from pathlib import Path

# Paths
base_dir = './Brain_Tumor_Detection_v1'
images_dir = os.path.join(base_dir, 'images')
labels_dir = os.path.join(base_dir, 'labels')

# Split configuration. The seed is fixed so that re-running this cell
# reproduces exactly the same train/validation/test assignment.
SPLIT_SEED = 42
TRAIN_FRACTION = 0.8
VAL_FRACTION = 0.1  # whatever is left after train + val goes to test

# Output directories for split data
split_dirs = {
    'train_images': os.path.join(base_dir, 'train/images'),
    'train_labels': os.path.join(base_dir, 'train/labels'),
    'val_images': os.path.join(base_dir, 'valid/images'),
    'val_labels': os.path.join(base_dir, 'valid/labels'),
    'test_images': os.path.join(base_dir, 'test/images'),
    'test_labels': os.path.join(base_dir, 'test/labels')
}


def assign_splits(pairs, train_fraction=0.8, val_fraction=0.1, seed=42):
    """Deterministically partition items into train / val / test groups.

    The items are shuffled with a private ``random.Random(seed)`` instance,
    so the global random state is left untouched and the same input order
    plus the same seed always yields the same partition.

    Args:
        pairs: Sequence of items to split (here: (image_path, label_path)
            tuples). It is copied, not modified.
        train_fraction: Share of items assigned to 'train' (rounded down).
        val_fraction: Share of items assigned to 'val' (rounded down).
        seed: Seed for the shuffle.

    Returns:
        A dict with keys 'train', 'val' and 'test', each mapping to a list
        of items. 'test' receives every item not taken by the other two.
    """
    shuffled = list(pairs)
    random.Random(seed).shuffle(shuffled)

    n_train = int(len(shuffled) * train_fraction)
    n_val = int(len(shuffled) * val_fraction)

    return {
        'train': shuffled[:n_train],
        'val': shuffled[n_train:n_train + n_val],
        'test': shuffled[n_train + n_val:],
    }


# Get all image paths. glob order depends on the filesystem, so sort to give
# the seeded shuffle a stable starting point.
image_paths = sorted(Path(images_dir).glob("*.jpg"))
label_paths = [Path(labels_dir) / (img_path.stem + '.txt') for img_path in image_paths]

# Combine image and label paths into a list of tuples
image_label_pairs = list(zip(image_paths, label_paths))

# Assign every pair to a split and record the split sizes
splits = assign_splits(image_label_pairs, TRAIN_FRACTION, VAL_FRACTION, SPLIT_SEED)
total_images = len(image_label_pairs)
train_count = len(splits['train'])
val_count = len(splits['val'])
test_count = len(splits['test'])

if total_images == 0:
    # Do not create output folders or claim success when the input path is
    # wrong or empty.
    print(f"Warning: no .jpg images found in {images_dir}; nothing was split.")
else:
    # Create split directories if they don't exist
    for dir_path in split_dirs.values():
        os.makedirs(dir_path, exist_ok=True)

    missing_labels = 0
    for split, pairs in splits.items():
        other_splits = [name for name in splits if name != split]

        for image_path, label_path in pairs:
            # Remove copies left in the other splits by an earlier run with a
            # different assignment; otherwise the same image would end up in
            # more than one of train/valid/test.
            for other in other_splits:
                stale_files = (
                    Path(split_dirs[f'{other}_images']) / image_path.name,
                    Path(split_dirs[f'{other}_labels']) / label_path.name,
                )
                for stale in stale_files:
                    if stale.exists():
                        stale.unlink()

            # Copy the image to the appropriate split directory
            shutil.copy(image_path, split_dirs[f'{split}_images'])

            # Check if the corresponding label file exists before copying
            if label_path.exists():
                shutil.copy(label_path, split_dirs[f'{split}_labels'])
            else:
                missing_labels += 1
                print(f"Warning: Label file for {image_path.name} not found.")

    print(
        f"Split {total_images} images: {train_count} train, "
        f"{val_count} validation, {test_count} test "
        f"({missing_labels} without a label file)."
    )
    print("Data successfully split into train, validation, and test sets.")


Data successfully split into train, validation, and test sets.
