In [None]:
# Mount Google Drive at /content/drive; the dataset paths used by the
# cells in this notebook all live under that mount point (Colab only).
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install -q lxml pandas tqdm



In [None]:
# ===== ONE-CELL: PARSE ALL CT ANNOTATION XMLs (READ-ONLY) =====

import os
from lxml import etree
from collections import defaultdict, Counter

# -------- PATH TO ANNOTATIONS ONLY --------
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"

# -------- Helper: check if XML is valid LIDC CT annotation --------
def is_valid_lidc_ct_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()

        has_patient = root.find(".//{*}PatientID") is not None
        has_malignancy = root.find(".//{*}malignancy") is not None
        has_cxr = root.find(".//{*}CXRreadingSession") is not None

        return has_patient and has_malignancy and not has_cxr
    except:
        return False

# -------- Step 1: Collect ALL XML files --------
# One pass over the annotation tree; the ".xml" test is case-insensitive.
all_xmls = [
    os.path.join(dirpath, fname)
    for dirpath, _, fnames in os.walk(ANN_ROOT)
    for fname in fnames
    if fname.lower().endswith(".xml")
]

print(f"Total XML files found: {len(all_xmls)}")

# -------- Step 2: Filter CT XMLs + extract patient IDs --------
# Accepted files are grouped by the text of their first PatientID element;
# everything the validator rejects is remembered in invalid_xmls.
patient_to_xmls = defaultdict(list)
invalid_xmls = []

for xml_path in all_xmls:
    if is_valid_lidc_ct_xml(xml_path):
        pid = etree.parse(xml_path).getroot().find(".//{*}PatientID").text
        patient_to_xmls[pid].append(xml_path)
    else:
        invalid_xmls.append(xml_path)

# -------- Step 3: REPORTS --------
all_patients = sorted(patient_to_xmls)
valid_xml_total = sum(map(len, patient_to_xmls.values()))

print("\n===== CT ANNOTATION REPORT =====")
print(f"Valid CT annotation XMLs     : {valid_xml_total}")
print(f"Unique patients with CT ann.: {len(all_patients)}")
print(f"Ignored (non-CT / CXR) XMLs : {len(invalid_xmls)}")

print("\nFirst 20 patient IDs with CT annotations:")
for shown_pid in all_patients[:20]:
    print(shown_pid)

# -------- Step 4: How many XMLs per patient --------
# Histogram: number of XMLs -> number of patients having that many.
xml_count_distribution = Counter(map(len, patient_to_xmls.values()))

print("\nXMLs per patient distribution (radiologist reads):")
for n_xmls, n_patients in sorted(xml_count_distribution.items()):
    print(f"{n_xmls} XMLs → {n_patients} patients")

# -------- Step 5: Inspect one patient (example) --------
if all_patients:
    example = all_patients[0]
    print(f"\nExample patient: {example}")
    print("Associated CT XMLs:")
    for example_xml in patient_to_xmls[example]:
        print(" ", example_xml)

print("\n✅ CT XML parsing COMPLETE (no image data touched)")


Total XML files found: 1318

===== CT ANNOTATION REPORT =====
Valid CT annotation XMLs     : 0
Unique patients with CT ann.: 0
Ignored (non-CT / CXR) XMLs : 1318

First 20 patient IDs with CT annotations:

XMLs per patient distribution (radiologist reads):

✅ CT XML parsing COMPLETE (no image data touched)


In [None]:
!pip install -q pydicom tqdm

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/2.4 MB[0m [31m7.4 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━[0m [32m2.2/2.4 MB[0m [31m31.8 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Pick one image SOP UID from XML
sop_uid = "1.3.6.1.4.1.14519.5.2.1.6279.6001.150739457477763063347777523734"

# Search your CT_scan_images DICOMs for it
import os, pydicom


def find_dicom_by_sop_uid(search_root, target_uid):
    """Locate the first DICOM file whose SOPInstanceUID equals ``target_uid``.

    Args:
        search_root: Directory that is walked recursively.
        target_uid: SOP Instance UID string to look for.

    Returns:
        tuple | None: ``(path, dataset)`` for the first match, or None when
        no ``.dcm`` file under ``search_root`` carries that UID.
    """
    # Only these elements are needed, so ask pydicom to skip everything else.
    wanted_tags = ["SOPInstanceUID", "PatientID", "StudyInstanceUID"]

    for dirpath, _, fnames in os.walk(search_root):
        for fname in fnames:
            # Case-insensitive so ".DCM" files are searched as well.
            if not fname.lower().endswith(".dcm"):
                continue
            path = os.path.join(dirpath, fname)
            ds = pydicom.dcmread(
                path, stop_before_pixels=True, specific_tags=wanted_tags
            )
            # A file without SOPInstanceUID is simply not a match.
            if getattr(ds, "SOPInstanceUID", None) == target_uid:
                return path, ds
    return None


# Returning from the helper replaces the old `raise SystemExit`, which was
# used only to break out of the nested loops.
match = find_dicom_by_sop_uid("/content/drive/MyDrive/CT_scan_images", sop_uid)

if match is None:
    print("NOT FOUND")
else:
    _, ds = match
    print("FOUND")
    print("PatientID:", ds.PatientID)
    print("StudyInstanceUID:", ds.StudyInstanceUID)


KeyboardInterrupt: 

In [None]:
# ===== FIXED: MAP CT XMLs → PATIENT IDs USING SeriesInstanceUid =====

import os
import pandas as pd
from lxml import etree
from collections import defaultdict

# -------- PATHS --------
# The annotation tree lives inside "Lung cancer Dataset/LIDC-XML-only"; the
# previous "MyDrive/LIDC-XML-Only" location made this cell report zero XMLs.
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"

# os.walk() yields nothing for a missing directory, so fail loudly instead of
# producing an all-zero report.
if not os.path.isdir(ANN_ROOT):
    raise FileNotFoundError(f"Annotation directory not found: {ANN_ROOT}")

# -------- Load metadata --------
meta = pd.read_csv(META_PATH)

# Keep only CT
meta_ct = meta[meta["Modality"] == "CT"]

# Build SeriesUID → PatientID mapping.
# Keys are stripped because the XML-side UID is stripped before the lookup.
series_to_patient = dict(
    zip(meta_ct["Series UID"].astype(str).str.strip(), meta_ct["Subject ID"])
)

print("CT series in metadata:", len(series_to_patient))

# -------- Identify valid CT annotation XML --------
def is_valid_lidc_ct_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
        return (
            root.tag.endswith("LidcReadMessage") and
            root.find(".//{*}readingSession") is not None
        )
    except:
        return False

# -------- Parse XMLs --------
# Every accepted XML is either attached to the patient owning its
# SeriesInstanceUid or recorded in unmapped_xmls.
patient_to_xmls = defaultdict(list)
unmapped_xmls = []

for root, _, files in os.walk(ANN_ROOT):
    for f in files:
        # Case-insensitive, consistent with the other XML scans in this notebook.
        if not f.lower().endswith(".xml"):
            continue

        xml_path = os.path.join(root, f)
        if not is_valid_lidc_ct_xml(xml_path):
            continue

        root_xml = etree.parse(xml_path).getroot()

        series_uid_elem = root_xml.find(".//{*}SeriesInstanceUid")
        # A missing element and an empty element are both unmappable; the
        # latter used to crash on `None.strip()`.
        if series_uid_elem is None or not series_uid_elem.text:
            unmapped_xmls.append(xml_path)
            continue

        series_uid = series_uid_elem.text.strip()
        patient_id = series_to_patient.get(series_uid)

        if patient_id:
            patient_to_xmls[patient_id].append(xml_path)
        else:
            unmapped_xmls.append(xml_path)

# -------- REPORT --------
print("\n===== CT ANNOTATION → PATIENT REPORT (FIXED) =====")
print("Patients with CT annotations:", len(patient_to_xmls))
print("Total mapped CT annotation XMLs:", sum(len(v) for v in patient_to_xmls.values()))
print("Unmapped XMLs:", len(unmapped_xmls))

print("\nFirst 15 patients with CT annotations:")
for p in list(patient_to_xmls.keys())[:15]:
    print(p, "→", len(patient_to_xmls[p]), "XMLs")

if patient_to_xmls:
    example = next(iter(patient_to_xmls))
    print(f"\nExample XMLs for {example}:")
    for x in patient_to_xmls[example][:3]:
        print(" ", x)

print("\n✅ Series UID based mapping COMPLETE (no images touched)")


CT series in metadata: 1018

===== CT ANNOTATION → PATIENT REPORT (FIXED) =====
Patients with CT annotations: 0
Total mapped CT annotation XMLs: 0
Unmapped XMLs: 0

First 15 patients with CT annotations:

✅ Series UID based mapping COMPLETE (no images touched)


In [None]:
import pandas as pd

# Load the metadata table and show its schema plus the UID-related columns.
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"
meta = pd.read_csv(META_PATH)

print("Metadata columns:\n")
for column_name in meta.columns:
    # Quote each name so stray leading/trailing whitespace is visible.
    print("'{}'".format(column_name))

uid_columns = ["Series UID", "Study UID", "Subject ID", "Modality"]
print("\nFirst 5 rows (UID columns only):")
print(meta[uid_columns].head())


Metadata columns:

'Series UID'
'Collection'
'3rd Party Analysis'
'Data Description URI'
'Subject ID'
'Study UID'
'Study Description'
'Study Date'
'Series Description'
'Manufacturer'
'Modality'
'SOP Class Name'
'SOP Class UID'
'Number of Images'
'File Size'
'File Location'
'Download Timestamp'

First 5 rows (UID columns only):
                                          Series UID  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.493562949900...   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.141365756818...   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636...   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.142026812390...   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...   

                                           Study UID      Subject ID Modality  
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.116951808801...  LIDC-IDRI-0002       DX  
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.175012972118...  LIDC-IDRI-0001       DX  
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.298806137288...  LIDC-IDRI-0001       CT  
3  1.3.6.1.4.1

In [None]:
# ===== PATIENT-ANCHOR MAPPING: LIDC-IDRI-0001 to 0100 =====

import os
import pandas as pd
from lxml import etree
from collections import defaultdict

# -------- PATHS --------
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"
# The annotation tree lives inside "Lung cancer Dataset/LIDC-XML-only"; the
# previous "MyDrive/LIDC-XML-Only" location made this cell scan zero XMLs.
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"

# os.walk() yields nothing for a missing directory, so fail loudly instead of
# reporting every patient as un-annotated.
if not os.path.isdir(ANN_ROOT):
    raise FileNotFoundError(f"Annotation directory not found: {ANN_ROOT}")

# -------- STEP 1: Define patient cohort --------
# IDs LIDC-IDRI-0001 .. LIDC-IDRI-0100 (range end is exclusive).
patients = {f"LIDC-IDRI-{i:04d}" for i in range(1, 101)}
print("Target patients:", len(patients))

# -------- STEP 2: Load metadata and collect CT SeriesUIDs per patient --------
meta = pd.read_csv(META_PATH)

meta_ct = meta[
    (meta["Modality"] == "CT") &
    (meta["Subject ID"].isin(patients))
]

# patient -> set of stripped Series UID strings
patient_to_series = defaultdict(set)

for _, row in meta_ct.iterrows():
    patient_to_series[row["Subject ID"]].add(str(row["Series UID"]).strip())

print("Patients with CT series in metadata:", len(patient_to_series))

# -------- STEP 3: Parse XMLs and match by SeriesInstanceUid --------
# Accumulators filled by the scan loop further down in this cell.
patient_to_xmls = defaultdict(list)
total_xmls_checked = 0
matched_xmls = 0

def is_ct_annotation_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
        return root.tag.endswith("LidcReadMessage") and root.find(".//{*}readingSession") is not None
    except:
        return False

# Reverse index (Series UID -> owning patient) so each XML needs a single
# dict lookup instead of a scan over every patient. setdefault keeps the
# first owner in dict order, matching the earlier first-match-then-break scan.
series_owner = {}
for patient, series_set in patient_to_series.items():
    for uid in series_set:
        series_owner.setdefault(uid, patient)

for root, _, files in os.walk(ANN_ROOT):
    for f in files:
        # Case-insensitive, consistent with the other XML scans in this notebook.
        if not f.lower().endswith(".xml"):
            continue

        xml_path = os.path.join(root, f)
        total_xmls_checked += 1

        if not is_ct_annotation_xml(xml_path):
            continue

        root_xml = etree.parse(xml_path).getroot()

        series_elem = root_xml.find(".//{*}SeriesInstanceUid")
        # Skip a missing element and an empty one (the latter used to crash
        # on `None.strip()`).
        if series_elem is None or not series_elem.text:
            continue

        xml_series_uid = series_elem.text.strip()

        # Check which patient owns this Series UID
        owner = series_owner.get(xml_series_uid)
        if owner is not None:
            patient_to_xmls[owner].append(xml_path)
            matched_xmls += 1

# -------- STEP 4: REPORT --------
print("\n===== FINAL REPORT =====")
print("Total XMLs scanned:", total_xmls_checked)
print("Matched CT annotation XMLs:", matched_xmls)
print("Patients with ≥1 CT annotation XML:", len(patient_to_xmls))

print("\nPatient-wise XML counts (first 15):")
for p in sorted(patient_to_xmls.keys())[:15]:
    print(p, "→", len(patient_to_xmls[p]), "XMLs")

print("\nPatients with CT series but NO XML annotations:")
print(sorted(set(patient_to_series.keys()) - set(patient_to_xmls.keys())))

# -------- Example inspection --------
if patient_to_xmls:
    example = sorted(patient_to_xmls.keys())[0]
    print(f"\nExample XMLs for {example}:")
    for x in patient_to_xmls[example][:3]:
        print(" ", x)

print("\n✅ Patient-anchored XML mapping COMPLETE (no images touched)")


Target patients: 100
Patients with CT series in metadata: 100

===== FINAL REPORT =====
Total XMLs scanned: 0
Matched CT annotation XMLs: 0
Patients with ≥1 CT annotation XML: 0

Patient-wise XML counts (first 15):

Patients with CT series but NO XML annotations:
['LIDC-IDRI-0001', 'LIDC-IDRI-0002', 'LIDC-IDRI-0003', 'LIDC-IDRI-0004', 'LIDC-IDRI-0005', 'LIDC-IDRI-0006', 'LIDC-IDRI-0007', 'LIDC-IDRI-0008', 'LIDC-IDRI-0009', 'LIDC-IDRI-0010', 'LIDC-IDRI-0011', 'LIDC-IDRI-0012', 'LIDC-IDRI-0013', 'LIDC-IDRI-0014', 'LIDC-IDRI-0015', 'LIDC-IDRI-0016', 'LIDC-IDRI-0017', 'LIDC-IDRI-0018', 'LIDC-IDRI-0019', 'LIDC-IDRI-0020', 'LIDC-IDRI-0021', 'LIDC-IDRI-0022', 'LIDC-IDRI-0023', 'LIDC-IDRI-0024', 'LIDC-IDRI-0025', 'LIDC-IDRI-0026', 'LIDC-IDRI-0027', 'LIDC-IDRI-0028', 'LIDC-IDRI-0029', 'LIDC-IDRI-0030', 'LIDC-IDRI-0031', 'LIDC-IDRI-0032', 'LIDC-IDRI-0033', 'LIDC-IDRI-0034', 'LIDC-IDRI-0035', 'LIDC-IDRI-0036', 'LIDC-IDRI-0037', 'LIDC-IDRI-0038', 'LIDC-IDRI-0039', 'LIDC-IDRI-0040', 'LIDC-IDRI-0041

In [None]:
import os

ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"

# Count every file under ANN_ROOT whose name ends in ".xml" (any letter case).
xml_count = sum(
    1
    for _dirpath, _dirnames, fnames in os.walk(ANN_ROOT)
    for fname in fnames
    if fname.lower().endswith(".xml")
)

print("XML files found:", xml_count)


XML files found: 1318


In [None]:
# ===== PATIENT-ANCHOR MAPPING (FIXED PATH) =====

import os
import pandas as pd
from lxml import etree
from collections import defaultdict

# -------- PATHS --------
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"

# -------- STEP 1: Define patient cohort --------
# Zero-padded IDs for subjects 1 through 100 inclusive.
patients = {"LIDC-IDRI-%04d" % subject_no for subject_no in range(1, 101)}
print("Target patients:", len(patients))

# -------- STEP 2: Load metadata and collect CT SeriesUIDs per patient --------
meta = pd.read_csv(META_PATH)

# Rows that are CT *and* belong to the cohort.
ct_rows = meta["Modality"] == "CT"
cohort_rows = meta["Subject ID"].isin(patients)
meta_ct = meta[ct_rows & cohort_rows]

# patient -> set of stripped Series UID strings
patient_to_series = defaultdict(set)
for subject_id, series_uid in zip(meta_ct["Subject ID"], meta_ct["Series UID"]):
    patient_to_series[subject_id].add(str(series_uid).strip())

print("Patients with CT series in metadata:", len(patient_to_series))

# -------- STEP 3: Parse XMLs and match by SeriesInstanceUid --------
# Accumulators filled by the scan loop further down in this cell.
patient_to_xmls = defaultdict(list)
total_xmls = 0
matched_xmls = 0

def is_ct_annotation_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
        return root.tag.endswith("LidcReadMessage") and root.find(".//{*}readingSession") is not None
    except:
        return False

# Reverse index (Series UID -> owning patient) so each XML needs a single
# dict lookup instead of a scan over every patient. setdefault keeps the
# first owner in dict order, matching the earlier first-match-then-break scan.
series_owner = {}
for patient, series_set in patient_to_series.items():
    for uid in series_set:
        series_owner.setdefault(uid, patient)

for root, _, files in os.walk(ANN_ROOT):
    for f in files:
        if not f.lower().endswith(".xml"):
            continue

        total_xmls += 1
        xml_path = os.path.join(root, f)

        if not is_ct_annotation_xml(xml_path):
            continue

        root_xml = etree.parse(xml_path).getroot()

        series_elem = root_xml.find(".//{*}SeriesInstanceUid")
        # Skip a missing element and an empty one (the latter used to crash
        # on `None.strip()`).
        if series_elem is None or not series_elem.text:
            continue

        xml_series_uid = series_elem.text.strip()

        owner = series_owner.get(xml_series_uid)
        if owner is not None:
            patient_to_xmls[owner].append(xml_path)
            matched_xmls += 1

# -------- STEP 4: REPORT --------
print("\n===== FINAL REPORT =====")
print("Total XMLs scanned:", total_xmls)
print("Matched CT annotation XMLs:", matched_xmls)
print("Patients with ≥1 CT annotation XML:", len(patient_to_xmls))

# The heading used to say "first 15" while the loop printed up to 100 rows;
# both now come from one constant so they cannot drift apart.
MAX_PATIENTS_SHOWN = 100
print(f"\nPatient-wise XML counts (first {MAX_PATIENTS_SHOWN}):")
for p in sorted(patient_to_xmls.keys())[:MAX_PATIENTS_SHOWN]:
    print(p, "→", len(patient_to_xmls[p]), "XMLs")

print("\nPatients with CT series but NO XML annotations:")
print(sorted(set(patient_to_series.keys()) - set(patient_to_xmls.keys())))

if patient_to_xmls:
    example = sorted(patient_to_xmls.keys())[0]
    print(f"\nExample XMLs for {example}:")
    for x in patient_to_xmls[example][:3]:
        print(" ", x)

print("\n✅ Patient-anchored XML mapping COMPLETE (no images touched)")


Target patients: 100
Patients with CT series in metadata: 100

===== FINAL REPORT =====
Total XMLs scanned: 1318
Matched CT annotation XMLs: 100
Patients with ≥1 CT annotation XML: 100

Patient-wise XML counts (first 15):
LIDC-IDRI-0001 → 1 XMLs
LIDC-IDRI-0002 → 1 XMLs
LIDC-IDRI-0003 → 1 XMLs
LIDC-IDRI-0004 → 1 XMLs
LIDC-IDRI-0005 → 1 XMLs
LIDC-IDRI-0006 → 1 XMLs
LIDC-IDRI-0007 → 1 XMLs
LIDC-IDRI-0008 → 1 XMLs
LIDC-IDRI-0009 → 1 XMLs
LIDC-IDRI-0010 → 1 XMLs
LIDC-IDRI-0011 → 1 XMLs
LIDC-IDRI-0012 → 1 XMLs
LIDC-IDRI-0013 → 1 XMLs
LIDC-IDRI-0014 → 1 XMLs
LIDC-IDRI-0015 → 1 XMLs
LIDC-IDRI-0016 → 1 XMLs
LIDC-IDRI-0017 → 1 XMLs
LIDC-IDRI-0018 → 1 XMLs
LIDC-IDRI-0019 → 1 XMLs
LIDC-IDRI-0020 → 1 XMLs
LIDC-IDRI-0021 → 1 XMLs
LIDC-IDRI-0022 → 1 XMLs
LIDC-IDRI-0023 → 1 XMLs
LIDC-IDRI-0024 → 1 XMLs
LIDC-IDRI-0025 → 1 XMLs
LIDC-IDRI-0026 → 1 XMLs
LIDC-IDRI-0027 → 1 XMLs
LIDC-IDRI-0028 → 1 XMLs
LIDC-IDRI-0029 → 1 XMLs
LIDC-IDRI-0030 → 1 XMLs
LIDC-IDRI-0031 → 1 XMLs
LIDC-IDRI-0032 → 1 XMLs
LIDC-IDRI-

In [None]:
# ===== FINAL SAFE MAPPING: XMLs → CT_HU_Windowed (Patients 0001–0100) =====

import os
import pandas as pd
from lxml import etree
from collections import defaultdict

# ---------------- PATHS ----------------
CT_ROOT = "/content/drive/MyDrive/CT_HU_Windowed"
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"

# ---------------- STEP 1: Patient list from CT_HU_Windowed ----------------
# Every directory entry whose name starts with "LIDC-IDRI-", sorted by name.
patients = sorted(
    entry for entry in os.listdir(CT_ROOT) if entry.startswith("LIDC-IDRI-")
)

print("Patients found in CT_HU_Windowed:", len(patients))

# ---------------- STEP 2: Load metadata & collect CT SeriesUIDs ----------------
meta = pd.read_csv(META_PATH)

# Rows that are CT *and* belong to one of the listed patients.
ct_rows = meta["Modality"] == "CT"
cohort_rows = meta["Subject ID"].isin(patients)
meta_ct = meta[ct_rows & cohort_rows]

# patient -> set of stripped Series UID strings
patient_to_series = defaultdict(set)
for subject_id, series_uid in zip(meta_ct["Subject ID"], meta_ct["Series UID"]):
    patient_to_series[subject_id].add(str(series_uid).strip())

print("Patients with CT series in metadata:", len(patient_to_series))

# ---------------- STEP 3: Parse XMLs and map to patients ----------------
# Accumulators filled by the scan loop further down in this cell.
patient_to_xmls = defaultdict(list)
total_xmls = 0
ct_xmls = 0

def is_ct_annotation_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
        return root.tag.endswith("LidcReadMessage") and root.find(".//{*}readingSession") is not None
    except:
        return False

# Reverse index (Series UID -> owning patient) so each XML needs a single
# dict lookup instead of a scan over every patient. setdefault keeps the
# first owner in dict order, matching the earlier first-match-then-break scan.
series_owner = {}
for patient, series_set in patient_to_series.items():
    for uid in series_set:
        series_owner.setdefault(uid, patient)

for root, _, files in os.walk(ANN_ROOT):
    for f in files:
        if not f.lower().endswith(".xml"):
            continue

        total_xmls += 1
        xml_path = os.path.join(root, f)

        if not is_ct_annotation_xml(xml_path):
            continue

        # Counted before the Series UID check, so this also includes CT
        # annotation XMLs that end up unmapped.
        ct_xmls += 1
        root_xml = etree.parse(xml_path).getroot()

        series_elem = root_xml.find(".//{*}SeriesInstanceUid")
        # Skip a missing element and an empty one (the latter used to crash
        # on `None.strip()`).
        if series_elem is None or not series_elem.text:
            continue

        xml_series_uid = series_elem.text.strip()

        # Assign XML to patient based on metadata
        owner = series_owner.get(xml_series_uid)
        if owner is not None:
            patient_to_xmls[owner].append(xml_path)

# ---------------- STEP 4: REPORT ----------------
print("\n===== FINAL XML → CT_HU_WINDOWED MAPPING REPORT =====")
print("Total XML files scanned:", total_xmls)
print("CT annotation XMLs detected:", ct_xmls)
print("Patients with ≥1 XML mapped:", len(patient_to_xmls))

print("\nPatient-wise XML counts (first 15):")
for p in sorted(patient_to_xmls.keys())[:15]:
    print(p, "→", len(patient_to_xmls[p]), "XMLs")

print("\nPatients in CT_HU_Windowed but NO XML annotations:")
print(sorted(set(patients) - set(patient_to_xmls.keys())))

if patient_to_xmls:
    example = sorted(patient_to_xmls.keys())[0]
    print(f"\nExample XMLs mapped to {example}:")
    for x in patient_to_xmls[example][:3]:
        print(" ", x)

print("\n✅ XMLs successfully mapped to CT_HU_Windowed (no data modified)")


Patients found in CT_HU_Windowed: 100
Patients with CT series in metadata: 100

===== FINAL XML → CT_HU_WINDOWED MAPPING REPORT =====
Total XML files scanned: 1318
CT annotation XMLs detected: 1035
Patients with ≥1 XML mapped: 100

Patient-wise XML counts (first 15):
LIDC-IDRI-0001 → 1 XMLs
LIDC-IDRI-0002 → 1 XMLs
LIDC-IDRI-0003 → 1 XMLs
LIDC-IDRI-0004 → 1 XMLs
LIDC-IDRI-0005 → 1 XMLs
LIDC-IDRI-0006 → 1 XMLs
LIDC-IDRI-0007 → 1 XMLs
LIDC-IDRI-0008 → 1 XMLs
LIDC-IDRI-0009 → 1 XMLs
LIDC-IDRI-0010 → 1 XMLs
LIDC-IDRI-0011 → 1 XMLs
LIDC-IDRI-0012 → 1 XMLs
LIDC-IDRI-0013 → 1 XMLs
LIDC-IDRI-0014 → 1 XMLs
LIDC-IDRI-0015 → 1 XMLs

Patients in CT_HU_Windowed but NO XML annotations:
[]

Example XMLs mapped to LIDC-IDRI-0001:
  /content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml/185/069.xml

✅ XMLs successfully mapped to CT_HU_Windowed (no data modified)


In [None]:
# ===== FINAL STEP: CREATE labels.csv (PATIENT LEVEL) =====

import os
import pandas as pd
from lxml import etree
from collections import defaultdict
import numpy as np

# ---------------- PATHS ----------------
CT_ROOT = "/content/drive/MyDrive/CT_HU_Windowed"
META_PATH = "/content/drive/MyDrive/Lung cancer Dataset/metadata.csv"
ANN_ROOT = "/content/drive/MyDrive/Lung cancer Dataset/LIDC-XML-only/tcia-lidc-xml"
OUT_PATH = "/content/drive/MyDrive/labels.csv"

# ---------------- STEP 1: Patients from CT_HU_Windowed ----------------
# Every directory entry whose name starts with "LIDC-IDRI-", sorted by name.
patients = sorted(
    entry for entry in os.listdir(CT_ROOT) if entry.startswith("LIDC-IDRI-")
)

# ---------------- STEP 2: Metadata → patient → CT SeriesUIDs ----------------
meta = pd.read_csv(META_PATH)

# Rows that are CT *and* belong to one of the listed patients.
ct_rows = meta["Modality"] == "CT"
cohort_rows = meta["Subject ID"].isin(patients)
meta_ct = meta[ct_rows & cohort_rows]

# patient -> set of stripped Series UID strings
patient_to_series = defaultdict(set)
for subject_id, series_uid in zip(meta_ct["Subject ID"], meta_ct["Series UID"]):
    patient_to_series[subject_id].add(str(series_uid).strip())

# ---------------- STEP 3: Helper functions ----------------
def is_ct_annotation_xml(xml_path):
    try:
        tree = etree.parse(xml_path)
        root = tree.getroot()
        return root.tag.endswith("LidcReadMessage") and root.find(".//{*}readingSession") is not None
    except:
        return False

def extract_malignancies(xml_path):
    """Return the integer malignancy scores found in one annotation XML.

    Every ``malignancy`` element (any namespace, any depth) contributes one
    entry, in document order. Elements whose text is missing or is not an
    integer are skipped.

    Args:
        xml_path: Path of the XML file to read.

    Returns:
        list[int]: The parsed scores; empty when the file has none.

    Raises:
        OSError, lxml.etree.XMLSyntaxError: If the file cannot be read or
        parsed (parsing is not guarded here).
    """
    root = etree.parse(xml_path).getroot()

    scores = []
    for m in root.findall(".//{*}malignancy"):
        try:
            scores.append(int(m.text))
        except (TypeError, ValueError):
            # TypeError: empty element (text is None); ValueError: non-numeric
            # text. Anything else is a real error and must surface.
            continue
    return scores

# ---------------- STEP 4: Parse XMLs and collect malignancy per patient ----------------
# patient -> flat list of every malignancy score found in that patient's XMLs
patient_to_malignancies = defaultdict(list)

# Reverse index (Series UID -> owning patient) so each XML needs a single
# dict lookup instead of a scan over every patient. setdefault keeps the
# first owner in dict order, matching the earlier first-match-then-break scan.
series_owner = {}
for patient, series_set in patient_to_series.items():
    for uid in series_set:
        series_owner.setdefault(uid, patient)

for root, _, files in os.walk(ANN_ROOT):
    for f in files:
        if not f.lower().endswith(".xml"):
            continue

        xml_path = os.path.join(root, f)

        if not is_ct_annotation_xml(xml_path):
            continue

        root_xml = etree.parse(xml_path).getroot()

        series_elem = root_xml.find(".//{*}SeriesInstanceUid")
        # Skip a missing element and an empty one (the latter used to crash
        # on `None.strip()`).
        if series_elem is None or not series_elem.text:
            continue

        xml_series_uid = series_elem.text.strip()

        # Match XML to patient using metadata
        owner = series_owner.get(xml_series_uid)
        if owner is not None:
            patient_to_malignancies[owner].extend(
                extract_malignancies(xml_path)
            )

# ---------------- STEP 5: Build labels.csv ----------------
# One output row per patient. A patient is labelled 1 when any collected
# score is >= 4, otherwise 0.
# NOTE(review): patients with no scores at all also get label 0 (with
# num_total_malignancy_scores == 0) -- confirm that treating "no annotation"
# as the negative class is intended.
# NOTE(review): num_malignant_nodules counts *scores* >= 4; if an XML holds
# several scores for the same nodule this is not a count of distinct
# nodules -- confirm.
rows = []

for patient in patients:
    malignancies = patient_to_malignancies.get(patient, [])

    if malignancies:
        max_m = max(malignancies)
        mean_m = float(np.mean(malignancies))
        label = int(max_m >= 4)
    else:
        max_m, mean_m, label = 0, 0.0, 0

    high_scores = [score for score in malignancies if score >= 4]

    rows.append(dict(
        patient_id=patient,
        label=label,
        max_malignancy=max_m,
        mean_malignancy=round(mean_m, 3),
        num_malignant_nodules=len(high_scores),
        num_total_malignancy_scores=len(malignancies),
    ))

labels_df = pd.DataFrame(rows)
labels_df.to_csv(OUT_PATH, index=False)

print("✅ labels.csv created at:", OUT_PATH)
print("\nPreview:")
labels_df.head()


✅ labels.csv created at: /content/drive/MyDrive/labels.csv

Preview:


Unnamed: 0,patient_id,label,max_malignancy,mean_malignancy,num_malignant_nodules,num_total_malignancy_scores
0,LIDC-IDRI-0001,1,5,4.75,4,4
1,LIDC-IDRI-0002,1,5,4.5,2,2
2,LIDC-IDRI-0003,1,5,3.538,7,13
3,LIDC-IDRI-0004,0,2,1.25,0,4
4,LIDC-IDRI-0005,0,3,2.667,0,9
