In [1]:
# Mount Google Drive inside the Colab VM; later cells read paths under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import os

# Switch into the datasets folder on Drive so the relative dataset paths
# used further down resolve against it.
cbis_path = '/content/drive/MyDrive/colab_scripts/repository/datasets'
os.chdir(cbis_path)

# Echo the directory we ended up in as a quick sanity check.
print(f"Current Directory: {os.getcwd()}")

Current Directory: /content/drive/MyDrive/colab_scripts/repository/datasets


In [3]:
import os
import shutil
import pandas as pd
import random
from tqdm.notebook import tqdm

In [4]:
# Input/output locations, all relative to the current working directory.
dataset_dir = 'CBIS-DDSM'

csv_path = f'{dataset_dir}/csv/train_plus_test.csv'   # CSV listing the sample names
images_dir = f'{dataset_dir}/NEW/IMAGES'              # source .jpg files
labels_dir = f'{dataset_dir}/NEW/LABELS'              # source .txt label files
output_root = f'{dataset_dir}/yolo_dataset'           # destination of the split dataset

In [5]:
# Fractions of the samples assigned to each split (must sum to 1.0).
train_split = 0.7
val_split = 0.15
test_split = 0.15

# Fail fast on a mistyped fraction instead of silently producing a skewed split.
# A small tolerance is used because sums of binary floats are rarely exactly 1.0.
split_total = train_split + val_split + test_split
if min(train_split, val_split, test_split) < 0 or abs(split_total - 1.0) > 1e-9:
    raise ValueError(
        "train_split, val_split and test_split must be non-negative and sum to 1.0 "
        f"(got {train_split} + {val_split} + {test_split} = {split_total})"
    )

In [6]:
# Create <output_root>/<split>/{images,labels} for every split.
# exist_ok=True makes the cell safe to re-run when the folders already exist.
splits = ['train', 'val', 'test']
for split in splits:
    for subfolder in ('images', 'labels'):
        os.makedirs(os.path.join(output_root, split, subfolder), exist_ok=True)

In [7]:
# Load the CSV and collect the distinct, non-null values of the 'new name'
# column; each value is later used as the file stem of an image/label pair.
df = pd.read_csv(csv_path)
unique_groups = df['new name'].dropna().unique().tolist()

# Shuffle with a fixed seed so the split is reproducible. The output folders
# are reused between runs and never cleared, so an unseeded shuffle could place
# the same sample in different splits on a re-run and leak data between them.
# A dedicated Random instance keeps the global `random` state untouched.
split_seed = 42
random.Random(split_seed).shuffle(unique_groups)

In [8]:
# Work out how many names go to each split. int() truncates, so the test
# split receives whatever remains after train and val have been sized.
total = len(unique_groups)
train_count = int(train_split * total)
val_count = int(val_split * total)
test_count = total - train_count - val_count

# Slice the shuffled list into three consecutive, non-overlapping ranges.
val_start = train_count
test_start = train_count + val_count
split_map = dict(
    train=unique_groups[:val_start],
    val=unique_groups[val_start:test_start],
    test=unique_groups[test_start:],
)

In [9]:
# Copy every image/label pair into the folder of the split it was assigned to.
#
# A sample is copied only when BOTH its image and its label file exist, so the
# images/ and labels/ folders of a split never drift out of sync (no image
# without a label, no label without an image). Missing files are still
# reported one by one, and the number of skipped samples is summarised at the end.
skipped_names = []

for split, group_names in split_map.items():
    print(f"📂 Copying {split} set with {len(group_names)} groups...")
    for name in tqdm(group_names):
        image_src = os.path.join(images_dir, name + '.jpg')
        label_src = os.path.join(labels_dir, name + '.txt')

        image_dst = os.path.join(output_root, split, 'images', name + '.jpg')
        label_dst = os.path.join(output_root, split, 'labels', name + '.txt')

        # Check both files up front so a partial pair is never copied.
        has_image = os.path.exists(image_src)
        has_label = os.path.exists(label_src)

        if not has_image:
            print(f"⚠️ Missing image: {image_src}")
        if not has_label:
            print(f"⚠️ Missing label: {label_src}")

        if not (has_image and has_label):
            skipped_names.append(name)
            continue

        # copy2 also preserves file metadata such as modification times.
        shutil.copy2(image_src, image_dst)
        shutil.copy2(label_src, label_dst)

if skipped_names:
    print(f"⚠️ Skipped {len(skipped_names)} incomplete image/label pairs.")

print("✅ Dataset split and file copy complete.")

📂 Copying train set with 1059 groups...


  0%|          | 0/1059 [00:00<?, ?it/s]

📂 Copying val set with 227 groups...


  0%|          | 0/227 [00:00<?, ?it/s]

📂 Copying test set with 228 groups...


  0%|          | 0/228 [00:00<?, ?it/s]

✅ Dataset split and file copy complete.


In [10]:
# Generate data.yaml describing the split dataset for YOLO training.
#
# The image folders are written as ABSOLUTE paths. Relative paths in a dataset
# YAML are resolved by the training tool (against its own dataset directory or
# working directory), not against this notebook's working directory, so the
# original relative entries only worked when training was launched from the
# exact same folder.
# NOTE(review): the order of `classes` must match the class ids used inside
# the label .txt files — confirm against the label generation step.
classes = ['benign', 'malignant']
abs_output_root = os.path.abspath(output_root)

yaml_content = f"""train: {abs_output_root}/train/images
val: {abs_output_root}/val/images
test: {abs_output_root}/test/images

nc: {len(classes)}
names: {classes}
"""

# Explicit encoding keeps the file identical regardless of the platform default.
with open(os.path.join(output_root, 'data.yaml'), 'w', encoding='utf-8') as f:
    f.write(yaml_content)

print("✅ data.yaml created.")

✅ data.yaml created.
