In [None]:
from google.colab import drive

# Attach Google Drive to the Colab runtime; every dataset path used further
# down in this notebook is located underneath this mount point.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
import pandas as pd
import os
import shutil
import random

# Dataset locations on the mounted Drive.
# Edit `dataset_root` (or the individual entries) to match your own layout.
dataset_root = '/content/drive/MyDrive/MIAS_third_try'

csv_path = f'{dataset_root}/filtered_info.csv'            # metadata table with 'ID' and 'Abnormality Type' columns
images_folder = f'{dataset_root}/RGB_jpg_images_CLAHE'    # source images, one file per ID
labels_folder = f'{dataset_root}/YOLO_labels2'            # label files named <ID>.txt
output_root = f'{dataset_root}/yolo_dataset_clahe'        # destination of the train/valid/test tree

# Read CSV
df = pd.read_csv(csv_path)

# Filter: keep rows where Abnormality Type is NOT 'CALC'.
# Rows whose Abnormality Type is missing (NaN) compare unequal to 'CALC'
# and are therefore kept as well.
filtered_df = df[df['Abnormality Type'] != 'CALC']

# Extract the relevant IDs (as strings).
# The filter above works row by row, so an ID that has several non-CALC rows
# would otherwise be listed several times. Duplicates are dropped (first
# occurrence wins, order preserved) so that each image is counted once and
# cannot later be assigned to more than one of train/valid/test.
filtered_ids = filtered_df['ID'].astype(str).drop_duplicates().tolist()

print(f"✅ Number of images NOT having CALC: {len(filtered_ids)}")
print("Sample IDs:", filtered_ids[:5])

✅ Number of images NOT having CALC: 299
Sample IDs: ['mdb001', 'mdb002', 'mdb003', 'mdb004', 'mdb005']


In [None]:
# Set your split percentages (the three fractions must add up to 1.0)
train_split = 0.7
valid_split = 0.2
test_split = 0.1

# Fixed seed so that re-running this cell always yields the same split.
split_seed = 42

assert abs(train_split + valid_split + test_split - 1.0) < 1e-6, \
    "train_split + valid_split + test_split must sum to 1.0"


def _split_index(n_items, fraction):
    """Return the list index that lies `fraction` of the way through `n_items`.

    The product is rounded to 6 decimals before truncation so that binary
    floating-point error does not shift the boundary by one item
    (e.g. 10 * (0.7 + 0.2) evaluates to 8.999999999999998, which a plain
    int() would truncate to 8 instead of 9).
    """
    return int(round(n_items * fraction, 6))


# Shuffle a copy with a private, seeded generator: `filtered_ids` keeps its
# original order, so running the cell again reproduces the identical split.
shuffled_ids = list(filtered_ids)
random.Random(split_seed).shuffle(shuffled_ids)

total = len(shuffled_ids)
train_end = _split_index(total, train_split)
valid_end = _split_index(total, train_split + valid_split)

train_ids = shuffled_ids[:train_end]
valid_ids = shuffled_ids[train_end:valid_end]
test_ids = shuffled_ids[valid_end:]   # everything that remains

# Sanity check: the three subsets partition the shuffled list.
assert len(train_ids) + len(valid_ids) + len(test_ids) == total


In [None]:
def copy_files(id_list, subset, images_dir=None, labels_dir=None, out_root=None):
    """Copy the images and label files for `id_list` into one dataset subset.

    Files are written to `<out_root>/<subset>/images` and
    `<out_root>/<subset>/labels`; both directories are created when absent.

    For each ID the first existing file among `<ID>.png`, `<ID>.jpg`,
    `<ID>.jpeg`, `<ID>.bmp` is copied. The label `<ID>.txt` is copied only
    when the image was found, so no label is left without its image. An image
    that has no label file is still copied.

    Args:
        id_list: Iterable of image IDs (file names without extension).
        subset: Name of the subset directory, e.g. 'train', 'valid', 'test'.
        images_dir: Folder containing the source images. Defaults to the
            notebook-level `images_folder`.
        labels_dir: Folder containing the source label files. Defaults to the
            notebook-level `labels_folder`.
        out_root: Root folder of the dataset being built. Defaults to the
            notebook-level `output_root`.

    Returns:
        dict with the keys
            'images'         - number of images copied,
            'labels'         - number of label files copied,
            'missing_images' - list of IDs for which no image file was found.
    """
    # Fall back to the notebook-level configuration at call time.
    images_dir = images_folder if images_dir is None else images_dir
    labels_dir = labels_folder if labels_dir is None else labels_dir
    out_root = output_root if out_root is None else out_root

    subset_img_dir = os.path.join(out_root, subset, 'images')
    subset_lbl_dir = os.path.join(out_root, subset, 'labels')
    os.makedirs(subset_img_dir, exist_ok=True)
    os.makedirs(subset_lbl_dir, exist_ok=True)

    # Tried in this order; the first match wins.
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp')

    copied_images = 0
    copied_labels = 0
    missing_images = []

    for img_id in id_list:
        # Locate the image under any of the supported extensions.
        img_path = None
        for ext in image_extensions:
            candidate = os.path.join(images_dir, img_id + ext)
            if os.path.exists(candidate):
                img_path = candidate
                break

        if img_path is None:
            # Record the gap instead of skipping silently, and do not copy a
            # label that would have no image next to it.
            missing_images.append(img_id)
            continue

        shutil.copy(img_path, os.path.join(subset_img_dir, os.path.basename(img_path)))
        copied_images += 1

        # Copy label if it exists
        label_path = os.path.join(labels_dir, img_id + '.txt')
        if os.path.exists(label_path):
            shutil.copy(label_path, os.path.join(subset_lbl_dir, os.path.basename(label_path)))
            copied_labels += 1

    if missing_images:
        print(f"⚠️ {subset}: {len(missing_images)} image(s) not found, e.g. {missing_images[:5]}")

    return {
        'images': copied_images,
        'labels': copied_labels,
        'missing_images': missing_images,
    }

In [None]:
# Populate each subset directory from its list of IDs, in train/valid/test order.
subset_assignments = (
    ('train', train_ids),
    ('valid', valid_ids),
    ('test', test_ids),
)
for subset_name, subset_ids in subset_assignments:
    copy_files(subset_ids, subset_name)

print("✅ Dataset structure created.")

✅ Dataset structure created.


In [None]:
# data.yaml is written at the root of the generated dataset.
yaml_path = os.path.join(output_root, 'data.yaml')

# You can also make this dynamic by extracting class names from labels if needed
class_names = ['benign', 'malignant']  # Replace with actual class names if you have more

# Assemble the file line by line, then write everything in one call.
yaml_lines = [
    f"path: {output_root}\n",
    f"train: {output_root}/train/images\n",
    f"val: {output_root}/valid/images\n",
    f"test: {output_root}/test/images\n",
    f"names: {class_names}\n",   # the Python list repr is emitted as-is
    f"nc: {len(class_names)}\n",
]
with open(yaml_path, 'w') as yaml_file:
    yaml_file.writelines(yaml_lines)

print(f"✅ YOLO data.yaml created at {yaml_path}")

✅ YOLO data.yaml created at /content/drive/MyDrive/MIAS_third_try/yolo_dataset_clahe/data.yaml
