In [1]:
import pandas as pd
from pathlib import Path

# =============================
# Configuration: input locations and output target
# =============================

# Tab-separated sample sheet that is loaded by main().
SAMPLE_SHEET = Path(r"F:\FYP_Preparation\Data\pathalogy\pathology-sample_sheet.tsv")

# Directories whose immediate sub-folders are scanned by main().
PATHOLOGY_ROOTS = [
    Path(location)
    for location in (
        r"E:\CancerVision_Data\data\pathology\pathology_1",
        r"E:\CancerVision_Data\data\pathology\pathology_2",
        r"E:\CancerVision_Data\data\pathology\pathology_3",
        r"F:\FYP_Preparation\Data\pathalogy\pathology_4",
    )
]

# Output directory (relative to the working directory); created on load
# if it does not exist yet.
OUTPUT_DIR = Path("../processed")
OUTPUT_DIR.mkdir(exist_ok=True)

# CSV file the mapping is written to.
OUTPUT_FILE = OUTPUT_DIR.joinpath("pathology_file_to_case.csv")

# -----------------------------
def main(sample_sheet=None, pathology_roots=None, output_file=None) -> None:
    """Build the file-to-case mapping for pathology files present on disk.

    Reads the tab-separated sample sheet, keeps the rows whose ``file_id``
    equals the name of a sub-directory found directly under one of the
    pathology roots, and writes the de-duplicated mapping as CSV with an
    extra ``local_verified`` column set to 1.

    Args:
        sample_sheet: Path of the sample sheet. Defaults to ``SAMPLE_SHEET``.
        pathology_roots: Iterable of directories to scan. Defaults to
            ``PATHOLOGY_ROOTS``.
        output_file: Destination CSV path. Defaults to ``OUTPUT_FILE``.

    Raises:
        ValueError: If, after header normalization, the sheet lacks any of
            the columns written to the mapping.
    """
    # Fall back to the module-level configuration when no override is given.
    sample_sheet = SAMPLE_SHEET if sample_sheet is None else Path(sample_sheet)
    pathology_roots = PATHOLOGY_ROOTS if pathology_roots is None else pathology_roots
    output_file = OUTPUT_FILE if output_file is None else Path(output_file)

    output_columns = ["file_id", "case_id", "sample_id", "tissue_type", "specimen_type"]

    print("Loading pathology sample sheet...")
    df = pd.read_csv(sample_sheet, sep="\t")
    print(f"Rows in sample sheet: {len(df)}")

    # Normalize columns
    df.columns = [c.lower().strip().replace(" ", "_") for c in df.columns]

    # Validate every column that is selected below, so a malformed sheet
    # fails with a clear message instead of a KeyError during selection.
    missing = set(output_columns) - set(df.columns)
    if missing:
        raise ValueError(f"Missing required columns: {missing}")

    # Collect local pathology UUID folders
    local_file_ids = set()
    for root in pathology_roots:
        root = Path(root)
        if not root.is_dir():
            # Report skipped roots: an unavailable drive would otherwise
            # silently reduce the number of matched files.
            print(f"Skipping missing pathology root: {root}")
            continue
        local_file_ids.update(d.name for d in root.iterdir() if d.is_dir())

    print(f"Local pathology folders found: {len(local_file_ids)}")

    # Filter to only downloaded pathology files
    df_local = df[df["file_id"].isin(local_file_ids)]

    print(f"Matched pathology files on disk: {len(df_local)}")

    # Reduce to essential mapping
    df_out = df_local[output_columns].drop_duplicates().assign(local_verified=1)

    # Make sure the destination directory exists (needed for custom outputs).
    output_file.parent.mkdir(parents=True, exist_ok=True)
    df_out.to_csv(output_file, index=False)

    print(f"Saved mapping: {output_file.resolve()}")

# Script entry point: run the mapping only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Loading pathology sample sheet...
Rows in sample sheet: 541
Local pathology folders found: 209
Matched pathology files on disk: 209
Saved mapping: E:\OncoVisionX\ml\data_mapping\processed\pathology_file_to_case.csv
