In [30]:
"""
MURA Path-Label Collector (YAML Config Version)
-----------------------------------------------
This notebook reads its configuration from an embedded YAML string (or,
optionally, from a YAML file such as data_config.yaml) and collects image
paths and their corresponding labels into a CSV file.
"""

# ---- Import Required Libraries ----
import os
import yaml
import pandas as pd
from tqdm import tqdm
from pathlib import Path


In [31]:
# ---- Embedded Configuration ----
# YAML document parsed by load_config() when no config file path is given.
# Everything lives under the top-level `data` key:
#   base_path        - dataset root directory that contains the subset folders
#   subsets          - subset folder names looked up under base_path
#   body_parts       - body-part folder names looked up under each subset
#   metadata_output  - destination of the CSV written by save_metadata()
# NOTE(review): base_path is an absolute, machine-specific location; edit it
# (or pass a config file path to load_config) before running elsewhere.
CONFIG_YAML = """
data:
  base_path: "D:/collage project/fracture detection comp vision/MURA-v1.1"
  subsets: ["train", "valid"]
  body_parts: [
    "XR_ELBOW",
    "XR_FINGER",
    "XR_FOREARM",
    "XR_HAND",
    "XR_HUMERUS",
    "XR_SHOULDER",
    "XR_WRIST"
  ]
  metadata_output: "data/processed/metadata.csv"
"""

In [32]:
# ---- Modified Function to Load Configurations ----
def load_config(config_path=None):
    """
    Load and validate the data configuration.

    Args:
        config_path (str | os.PathLike | None): Path to a YAML file. When
            omitted (or otherwise falsy), the embedded CONFIG_YAML string
            is parsed instead.

    Returns:
        dict: The contents of the top-level 'data' section.

    Raises:
        ValueError: If the document has no top-level 'data' mapping, or if
            that mapping lacks one of the required fields.
    """
    if config_path:
        # Decode explicitly as UTF-8 so parsing does not depend on the
        # platform's default text encoding.
        with open(config_path, encoding="utf-8") as f:
            config = yaml.safe_load(f)
    else:
        # No file given: fall back to the configuration embedded in this notebook.
        config = yaml.safe_load(CONFIG_YAML)

    # safe_load yields None for an empty document and may yield a list or a
    # scalar; without this guard the lookups below would fail with an opaque
    # TypeError/KeyError instead of a clear validation error.
    data = config.get('data') if isinstance(config, dict) else None
    if not isinstance(data, dict):
        raise ValueError("Missing required config section: data")

    # Check that all required fields exist
    required_fields = ['base_path', 'subsets', 'body_parts', 'metadata_output']
    for field in required_fields:
        if field not in data:
            raise ValueError(f"Missing required config field: data.{field}")

    return data




In [33]:

# ---- Function to Collect Image Paths and Labels ----
def collect_paths_labels(config):
    """
    Traverse dataset folders to collect image paths and their corresponding labels.

    The expected layout is
    ``base_path/<subset>/<body_part>/<patient>/<study>/<image>``.
    Directory listings are sorted, so the same directory tree always yields
    the same row order.

    Args:
        config (dict): Configuration dictionary from load_config(). The keys
            'base_path', 'subsets' and 'body_parts' are read.

    Returns:
        pd.DataFrame: A DataFrame containing 'path' and 'label' columns
        (0 = negative study, 1 = positive study).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    paths = []   # Store image file paths
    labels = []  # Store corresponding labels (0 = negative, 1 = positive)

    print(f"Collecting paths from {config['base_path']}...")

    # Loop over the configured subsets
    for subset in config['subsets']:
        subset_path = os.path.join(config['base_path'], subset)

        if not os.path.isdir(subset_path):
            print(f"Warning: {subset_path} not found. Skipping...")
            continue

        # Loop over each body part
        for body_part in config['body_parts']:
            body_part_path = os.path.join(subset_path, body_part)

            # Report missing body parts the same way missing subsets are
            # reported, so a typo in the config does not pass unnoticed.
            if not os.path.isdir(body_part_path):
                print(f"Warning: {body_part_path} not found. Skipping...")
                continue

            # Loop over patients with a progress bar
            patients = _sorted_subdirs(body_part_path)
            for patient in tqdm(patients, desc=f"{subset}/{body_part}"):
                patient_path = os.path.join(body_part_path, patient)

                # Loop over this patient's study folders
                for study in _sorted_subdirs(patient_path):
                    label = _study_label(study)
                    if label is None:
                        # Do not guess: a folder that names neither outcome
                        # would otherwise be counted as positive.
                        print(f"Warning: cannot infer label from '{study}' "
                              f"in {patient_path}. Skipping...")
                        continue

                    study_path = os.path.join(patient_path, study)

                    # Loop over images inside the study folder
                    for img_file in sorted(os.listdir(study_path)):
                        # Hidden entries (e.g. macOS AppleDouble '._*' files)
                        # can carry an image extension without being images.
                        if img_file.startswith('.'):
                            continue
                        if img_file.lower().endswith(image_extensions):
                            paths.append(os.path.join(study_path, img_file))
                            labels.append(label)

    # Convert collected data into a DataFrame
    return pd.DataFrame({'path': paths, 'label': labels})


def _sorted_subdirs(parent):
    """Return the names of the directories directly under `parent`, sorted."""
    return sorted(name for name in os.listdir(parent)
                  if os.path.isdir(os.path.join(parent, name)))


def _study_label(study_name):
    """Map a study folder name to 0 ('negative'), 1 ('positive') or None."""
    lowered = study_name.lower()
    # 'negative' is checked first, matching the original precedence.
    if 'negative' in lowered:
        return 0
    if 'positive' in lowered:
        return 1
    return None

In [34]:
# ---- Function to Save Metadata to CSV ----
def save_metadata(df, output_path):
    """
    Save the collected paths and labels to a CSV file.

    The parent directory of `output_path` is created when it does not exist.
    A bare filename (no directory component) is written to the current
    working directory.

    Args:
        df (pd.DataFrame): DataFrame with 'path' and 'label' columns.
        output_path (str): Path where the CSV file will be saved.
    """
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when one is named.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Save DataFrame to CSV (without the index column)
    df.to_csv(output_path, index=False)
    print(f"✅ Saved {len(df)} records to {output_path}")

In [35]:
# ---- MAIN EXECUTION BLOCK ----
if __name__ == "__main__":
    try:
        # Parse the embedded YAML, then walk the dataset it points at.
        config = load_config()
        metadata_df = collect_paths_labels(config)

        # Write the path/label table to the location named in the config.
        save_metadata(metadata_df, config["metadata_output"])

        # Summarise the run: class balance first ...
        print("\n📊 Label Distribution:")
        print(metadata_df["label"].value_counts())

        # ... then a preview of the first three rows.
        print("\n🔎 Sample Records:")
        print(metadata_df.head(3))

    except Exception as err:
        # Any failure above is reported as a one-line message (no traceback).
        print(f"❌ Error: {err!s}")

Collecting paths from D:/collage project/fracture detection comp vision/MURA-v1.1...


train/XR_ELBOW: 100%|██████████| 1711/1711 [00:00<00:00, 7306.70it/s]
train/XR_FINGER: 100%|██████████| 1865/1865 [00:00<00:00, 7630.58it/s]
train/XR_FOREARM: 100%|██████████| 865/865 [00:00<00:00, 8884.91it/s]
train/XR_HAND: 100%|██████████| 1945/1945 [00:00<00:00, 7913.39it/s]
train/XR_HUMERUS: 100%|██████████| 587/587 [00:00<00:00, 9339.88it/s]
train/XR_SHOULDER: 100%|██████████| 2694/2694 [00:00<00:00, 7661.64it/s]
train/XR_WRIST: 100%|██████████| 3267/3267 [00:00<00:00, 8171.72it/s]
valid/XR_ELBOW: 100%|██████████| 152/152 [00:00<00:00, 5871.35it/s]
valid/XR_FINGER: 100%|██████████| 166/166 [00:00<00:00, 8064.10it/s]
valid/XR_FOREARM: 100%|██████████| 129/129 [00:00<00:00, 7170.42it/s]
valid/XR_HAND: 100%|██████████| 159/159 [00:00<00:00, 8682.84it/s]
valid/XR_HUMERUS: 100%|██████████| 132/132 [00:00<00:00, 5463.60it/s]
valid/XR_SHOULDER: 100%|██████████| 173/173 [00:00<00:00, 5646.72it/s]
valid/XR_WRIST: 100%|██████████| 207/207 [00:00<00:00, 5646.38it/s]


✅ Saved 40009 records to data/processed/metadata.csv

📊 Label Distribution:
label
0    23606
1    16403
Name: count, dtype: int64

🔎 Sample Records:
                                                path  label
0  D:/collage project/fracture detection comp vis...      0
1  D:/collage project/fracture detection comp vis...      0
2  D:/collage project/fracture detection comp vis...      0
