# In [1]:
import pandas as pd
from pathlib import Path

# -----------------------------
# CONFIGURATION
# -----------------------------
# Both directories are relative paths, so they are resolved against the
# current working directory of the process running this script/notebook.
RAW_METADATA_DIR = Path("../raw_metadata")  # where the raw metadata export lives
OUTPUT_DIR = Path("../processed")  # where radiology_cases.csv is written

# CHANGE THIS to your actual file name
# (the file may be comma- or tab-separated; it must have a "Subject ID" column)
RADIOLOGY_METADATA_FILE = RAW_METADATA_DIR / "radiology-metadata.csv"

# -----------------------------
# MAIN LOGIC
# -----------------------------
def _load_metadata(path, required_column):
    """Read a delimited metadata table, trying comma first and then tab.

    A tab-separated file usually parses without error under the default comma
    separator (it just comes back as a single wide column), so relying on an
    exception alone never reaches the tab fallback. The tab-separated parse
    is therefore also attempted whenever ``required_column`` is missing from
    the comma-separated result.

    Errors unrelated to the delimiter (missing file, empty file, decoding
    problems) are not caught and propagate to the caller.
    """
    try:
        comma_df = pd.read_csv(path)
    except pd.errors.ParserError:
        # Ragged rows under "," -- most likely the file is tab-separated.
        return pd.read_csv(path, sep="\t")

    if required_column in comma_df.columns:
        return comma_df

    try:
        tab_df = pd.read_csv(path, sep="\t")
    except pd.errors.ParserError:
        # Not parseable as TSV either; let the caller report the missing column.
        return comma_df

    return tab_df if required_column in tab_df.columns else comma_df


def main(metadata_file=None, output_dir=None):
    """Build the list of unique cases that have radiology data.

    Loads the radiology metadata table, takes the distinct non-null values of
    its ``"Subject ID"`` column, and writes them to ``radiology_cases.csv``
    with the columns ``case_id`` and ``has_radiology`` (always ``1``).

    Args:
        metadata_file: Path to the comma- or tab-separated metadata file.
            Defaults to ``RADIOLOGY_METADATA_FILE``.
        output_dir: Directory in which ``radiology_cases.csv`` is written; it
            is created if it does not exist. Defaults to ``OUTPUT_DIR``.

    Raises:
        ValueError: If the file has no ``"Subject ID"`` column under either
            delimiter.
    """
    subject_column = "Subject ID"
    metadata_file = (
        RADIOLOGY_METADATA_FILE if metadata_file is None else Path(metadata_file)
    )
    output_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)

    print("Loading radiology metadata...")

    # Try CSV first, fallback to TSV
    df = _load_metadata(metadata_file, subject_column)

    print(f"Total rows loaded: {len(df)}")

    if subject_column not in df.columns:
        raise ValueError("Column 'Subject ID' not found in metadata file")

    # Extract unique TCGA case IDs
    cases = (
        df[[subject_column]]
        .dropna()
        .drop_duplicates()
        .rename(columns={subject_column: "case_id"})
        .assign(has_radiology=1)
    )

    output_dir.mkdir(parents=True, exist_ok=True)
    output_file = output_dir / "radiology_cases.csv"
    cases.to_csv(output_file, index=False)

    print("Radiology case mapping complete.")
    print(f"Unique cases found: {len(cases)}")
    print(f"Saved to: {output_file.resolve()}")

# Run the mapping only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()


# Output:
# Loading radiology metadata...
# Total rows loaded: 625
# Radiology case mapping complete.
# Unique cases found: 69
# Saved to: E:\OncoVisionX\ml\data_mapping\processed\radiology_cases.csv
