In [1]:
import pandas as pd

# Load the RSNA training metadata CSV
csv_path = "../CXR/datasets/train-rsna.csv"
df = pd.read_csv(csv_path)

# Print the schema summary, then leave the preview as the cell's last
# expression so it is rendered on its own. Writing `df.info(), df.head()`
# builds a tuple whose first element is always None (info() prints and
# returns None), which forces the preview into a plain-text tuple repr.
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18657 entries, 0 to 18656
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   path            18657 non-null  object
 1   Pneumonia_RSNA  18657 non-null  int64 
 2   Sex             18657 non-null  object
 3   Age             18657 non-null  int64 
 4   Age_group       18657 non-null  object
dtypes: int64(2), object(3)
memory usage: 728.9+ KB


(None,
                path  Pneumonia_RSNA Sex  Age Age_group
 0  00000003_000.png               0   F   81       80+
 1  00000003_002.png               0   F   75     60-80
 2  00000003_003.png               0   F   76     60-80
 3  00000003_005.png               0   F   78     60-80
 4  00000005_002.png               0   F   69     60-80)

In [2]:
import os
import shutil
from pathlib import Path

# Where the original RSNA images live
source_base = Path("../CXR/datasets/rsna")

# Root under which one sub-folder per group is created
destination_base = Path("../CXR/datasets/cxr_dreambooth")

# CSV "Sex" code -> destination folder name
sex_mapping = dict([
    ("M", "male"),
    ("F", "female"),
])

# CSV "Age_group" label -> destination folder name
age_group_mapping = dict([
    ("0-20", "0-20Y"),
    ("20-40", "20-40Y"),
    ("40-60", "40-60Y"),
    ("60-80", "60-80Y"),
    ("80+", "80+Y"),
])

# Source paths that could not be found on disk are collected here
missing_files = list()

# Helper function to copy images
def copy_samples(group_df, group_name, max_samples=300, random_state=42):
    """Copy a reproducible random sample of a group's images into its own folder.

    Up to ``max_samples`` rows are drawn from ``group_df`` (all rows when the
    group is smaller). Each sampled ``path`` value is resolved against the
    module-level ``source_base`` and copied to
    ``destination_base / group_name`` under the same relative path. Sources
    that do not exist are appended to the module-level ``missing_files`` list
    instead of raising.

    Args:
        group_df: DataFrame with a ``path`` column of image paths relative to
            ``source_base``.
        group_name: Name of the sub-directory of ``destination_base`` to fill.
        max_samples: Upper bound on the number of rows sampled.
        random_state: Seed forwarded to ``DataFrame.sample``; the default keeps
            the previously hard-coded value of 42.

    Returns:
        Number of files actually copied.
    """
    dest_dir = destination_base / group_name
    dest_dir.mkdir(parents=True, exist_ok=True)

    sampled_df = group_df.sample(
        n=min(max_samples, len(group_df)), random_state=random_state
    )
    copied = 0
    # Only the path column is needed, so iterate it directly instead of
    # building a full row Series per image with iterrows().
    for rel_path in sampled_df["path"]:
        src_path = source_base / rel_path
        if not src_path.exists():
            missing_files.append(str(src_path))
            continue
        dst_path = dest_dir / rel_path
        # A nested relative path (e.g. "sub/img.png") needs its parent folder
        # created first; otherwise copy2 raises FileNotFoundError.
        dst_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dst_path)
        copied += 1
    return copied

# Per-group copy counts, filled in below: first every sex category, then
# every age bracket (same order as two separate loops would run).
sex_counts, age_counts = {}, {}
for column, mapping, counts in (
    ("Sex", sex_mapping, sex_counts),
    ("Age_group", age_group_mapping, age_counts),
):
    for code, name in mapping.items():
        # Rows whose column value matches this code form one group
        group_df = df[df[column] == code]
        counts[name] = copy_samples(group_df, name)

# Summary: copies per sex, copies per age group, number of missing sources
sex_counts, age_counts, len(missing_files)

({'male': 300, 'female': 300},
 {'0-20Y': 300, '20-40Y': 300, '40-60Y': 300, '60-80Y': 300, '80+Y': 226},
 0)