In [None]:

#Imports + NumPy fix for pylidc

import numpy as np
# Compatibility shim for pylidc (see the cell header): recreate the
# ``np.int`` / ``np.bool`` aliases from the Python builtins, but only when
# the installed NumPy does not already expose an attribute of that name.
for _alias, _builtin in (("int", int), ("bool", bool)):
    if not hasattr(np, _alias):
        setattr(np, _alias, _builtin)

import pandas as pd
import pylidc as pl
from pylidc.utils import consensus
from tqdm import tqdm


# Load the LUNA16 annotation table and collect the distinct series UIDs
# it mentions (as strings), so they can be matched against the pylidc DB.

luna = pd.read_csv(
    "https://zenodo.org/record/3723295/files/annotations.csv"
)
luna_seriesuids = set(luna["seriesuid"].astype(str).tolist())
print("Unique LUNA seriesuids:", len(luna_seriesuids))

# Helper functions

# Names of the per-annotation rating attributes; each one is read from every
# annotation with ``getattr(ann, key)`` in the extraction loop below.
attribute_keys = [
    "calcification", "internalStructure", "lobulation",
    "malignancy", "margin", "sphericity",
    "spiculation", "subtlety", "texture",
]

def consensus_attr(values):
    """Collapse several ratings into one integer score.

    Parameters
    ----------
    values : sequence of numbers
        The individual ratings to combine.

    Returns
    -------
    int
        The arithmetic mean of ``values`` rounded to the nearest integer.
        Rounding is done by ``np.rint``, so exact halves go to the nearest
        even integer (2.5 -> 2, 3.5 -> 4).
    """
    average = np.mean(values)
    nearest = np.rint(average)
    return int(nearest)

def consensus_bbox_and_centroid(anns):
    """Return the consensus bounding box of ``anns`` and its centre point.

    Parameters
    ----------
    anns : list of pylidc Annotation
        Annotations belonging to one nodule.

    Returns
    -------
    cbbox : tuple of slice
        The bounding box returned by ``pylidc.utils.consensus`` (called with
        ``clevel=0.5`` and ``pad=False``), unchanged.  It indexes the pylidc
        image volume, whose axes are ordered (row, column, slice).
    centroid_zyx : list of int
        Midpoint of the bounding box along each axis (start plus half the
        extent, truncated), ordered ``[z, y, x]`` = ``[slice, row, column]``.
        Note this is the centre of the box, not a mass-weighted centroid.
    """
    _, cbbox, _ = consensus(anns, clevel=0.5, pad=False)

    # Midpoints in the same axis order as the box itself: (row, column, slice).
    mid_row, mid_col, mid_slice = (
        int(sl.start + int(0.5 * (sl.stop - sl.start))) for sl in cbbox
    )

    # pylidc indexes volumes as (row, column, slice) == (y, x, z).  The
    # previous code returned that order untouched under the name
    # ``centroid_zyx``, so callers labelled y as z, x as y and z as x.
    centroid_zyx = [mid_slice, mid_row, mid_col]
    return cbbox, centroid_zyx


# Keep only the LUNA16 series that are also present in the local pylidc
# database; sorted so the processing order is stable between runs.

db_uids = [scan.series_instance_uid for scan in pl.query(pl.Scan).all()]
seriesuids = sorted(luna_seriesuids & set(db_uids))

print("Seriesuids present in BOTH pylidc DB and LUNA:", len(seriesuids))


# Iterate scan-by-scan (robust)
#
# For every selected series: cluster its annotations into nodules, keep the
# nodules that still have at least 3 annotations of diameter >= 3, and emit
# one row per nodule holding its location, malignancy scores and the
# consensus value of each remaining rating attribute.  A failure inside one
# scan is reported and does not stop the run.

rows = []

for seriesuid in tqdm(seriesuids):
    try:
        scan = pl.query(pl.Scan).filter(pl.Scan.series_instance_uid == seriesuid).first()
        if scan is None:
            continue

        # Only clusters holding at least 3 annotations are considered.
        clusters = [c for c in scan.cluster_annotations() if len(c) >= 3]

        for nid, cluster in enumerate(clusters):
            # Drop annotations with diameter < 3 and require 3 or more to
            # remain.  `nid` is assigned before this check, so nodule ids
            # within a scan can have gaps when a cluster is rejected here.
            cluster_use = [ann for ann in cluster if ann.diameter >= 3]
            if len(cluster_use) < 3:
                continue

            # Ratings of the retained annotations, grouped per attribute.
            attr_lists = {
                k: [getattr(ann, k) for ann in cluster_use]
                for k in attribute_keys
            }

            # Malignancy is exported as the raw scores plus their mean ...
            mal_scores = attr_lists["malignancy"]
            mal_mean = float(np.mean(mal_scores))

            # ... so a rounded consensus is computed for every attribute
            # except malignancy (it used to be computed and then discarded).
            attrs = {
                k: consensus_attr(v)
                for k, v in attr_lists.items()
                if k != "malignancy"
            }

            cbbox, centroid_zyx = consensus_bbox_and_centroid(cluster_use)

            # str() of the raw slices embeds NumPy scalar reprs such as
            # "slice(np.int64(206), np.int64(218), None)" under NumPy 2, which
            # is version-dependent and cannot be parsed back.  Converting the
            # bounds to plain ints yields "slice(206, 218, None)" everywhere.
            bbox_plain = tuple(
                slice(
                    int(sl.start),
                    int(sl.stop),
                    None if sl.step is None else int(sl.step),
                )
                for sl in cbbox
            )

            rows.append({
                "seriesuid": seriesuid,
                "nodule_id": f"nodule_{nid}",
                "centroid_z": int(centroid_zyx[0]),
                "centroid_y": int(centroid_zyx[1]),
                "centroid_x": int(centroid_zyx[2]),
                "bbox": str(bbox_plain),

                "mal_mean": mal_mean,
                "all_scores": str(mal_scores),
                "source": "LIDC-IDRI",

                # Remaining attributes, in `attribute_keys` order:
                # calcification, internalStructure, lobulation, margin,
                # sphericity, spiculation, subtlety, texture.
                **attrs,
            })

    except Exception as e:
        # Deliberate best effort: report the failing scan and move on.  Rows
        # already appended for earlier nodules of that scan are kept.
        print(f"[SKIP] {seriesuid}: {e}")


# Save output: turn the collected rows into a table, show a short summary,
# and write the table to CSV without the index column.

out_path = "luna16_lidc_consensus_attributes_with_malmean_scores.csv"

df = pd.DataFrame(data=rows)
print("Output shape:", df.shape)
print(df.head())

df.to_csv(out_path, index=False)
print("Saved:", out_path)


Unique LUNA seriesuids: 601
Seriesuids present in BOTH pylidc DB and LUNA: 601


 11%|█▏        | 68/601 [00:46<05:29,  1.62it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 15%|█▌        | 93/601 [01:04<08:13,  1.03it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 18%|█▊        | 107/601 [01:14<06:58,  1.18it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 37%|███▋      | 225/601 [02:47<06:39,  1.06s/it]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 44%|████▍     | 267/601 [03:23<03:37,  1.53it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 47%|████▋     | 281/601 [03:31<03:03,  1.75it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 61%|██████    | 368/601 [04:36<02:49,  1.38it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 72%|███████▏  | 434/601 [05:22<00:59,  2.79it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 74%|███████▍  | 446/601 [05:35<01:50,  1.40it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 75%|███████▍  | 450/601 [05:39<02:03,  1.22it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 88%|████████▊ | 527/601 [06:18<00:26,  2.78it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 96%|█████████▌| 577/601 [06:47<00:28,  1.20s/it]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


 99%|█████████▉| 597/601 [07:03<00:03,  1.14it/s]

Failed to reduce all groups to <= 4 Annotations.
Some nodules may be close and must be grouped manually.


100%|██████████| 601/601 [07:05<00:00,  1.41it/s]

Output shape: (1185, 17)
                                           seriesuid nodule_id  centroid_z  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  nodule_0         212   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  nodule_1         155   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...  nodule_0         270   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  nodule_0         339   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  nodule_1         253   

   centroid_y  centroid_x                                               bbox  \
0          45          78  (slice(np.int64(206), np.int64(218), None), sl...   
1         406         118  (slice(np.int64(151), np.int64(160), None), sl...   
2         385          89  (slice(np.int64(264), np.int64(277), None), sl...   
3         378         143  (slice(np.int64(321), np.int64(358), None), sl...   
4         379         231  (slice(np.int64(233), np.int64(273), None), sl...   

   mal_mean    all_scores




In [None]:
# Preview the first 20 rows of the result table (rich display of the frame).
df.head(n=20)

Unnamed: 0,seriesuid,nodule_id,centroid_z,centroid_y,centroid_x,bbox,mal_mean,all_scores,source,calcification,internalStructure,lobulation,margin,sphericity,spiculation,subtlety,texture
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,nodule_0,212,45,78,"(slice(np.int64(206), np.int64(218), None), sl...",3.0,"[4, 2, 4, 2]",LIDC-IDRI,6,1,1,4,4,1,4,5
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,nodule_1,155,406,118,"(slice(np.int64(151), np.int64(160), None), sl...",2.333333,"[3, 1, 3]",LIDC-IDRI,6,1,1,4,5,1,4,5
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...,nodule_0,270,385,89,"(slice(np.int64(264), np.int64(277), None), sl...",2.666667,"[4, 3, 1]",LIDC-IDRI,6,1,1,4,3,1,3,5
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,nodule_0,339,378,143,"(slice(np.int64(321), np.int64(358), None), sl...",3.75,"[5, 5, 3, 2]",LIDC-IDRI,6,1,2,4,5,2,5,4
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,nodule_1,253,379,231,"(slice(np.int64(233), np.int64(273), None), sl...",3.75,"[5, 5, 3, 2]",LIDC-IDRI,6,1,2,5,4,1,5,5
5,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,nodule_2,294,220,252,"(slice(np.int64(275), np.int64(313), None), sl...",4.25,"[5, 5, 4, 3]",LIDC-IDRI,6,1,2,5,5,2,5,5
6,1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...,nodule_3,327,176,266,"(slice(np.int64(316), np.int64(338), None), sl...",3.75,"[5, 4, 3, 3]",LIDC-IDRI,6,1,2,5,4,2,4,5
7,1.3.6.1.4.1.14519.5.2.1.6279.6001.100953483028...,nodule_0,290,355,77,"(slice(np.int64(280), np.int64(300), None), sl...",2.5,"[3, 2, 3, 2]",LIDC-IDRI,6,1,2,4,4,2,4,5
8,1.3.6.1.4.1.14519.5.2.1.6279.6001.102681962408...,nodule_0,279,354,122,"(slice(np.int64(262), np.int64(296), None), sl...",4.25,"[2, 5, 5, 5]",LIDC-IDRI,6,1,3,4,4,3,4,5
9,1.3.6.1.4.1.14519.5.2.1.6279.6001.104562737760...,nodule_0,258,121,103,"(slice(np.int64(250), np.int64(267), None), sl...",3.0,"[5, 5, 1, 1]",LIDC-IDRI,4,1,2,4,4,1,5,5


In [None]:
# Rebuild the table from `rows` and remove the 'source' column if it is
# present; errors="ignore" makes the drop a no-op when the column is absent.
df = pd.DataFrame(rows).drop(columns=["source"], errors="ignore")

# Save the cleaned table to CSV (no index column) and show the first rows.
df.to_csv("luna16_lidc_consensus_attributes_cleaned.csv", index=False)
print(df.head())

                                           seriesuid nodule_id  centroid_z  \
0  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  nodule_0         212   
1  1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...  nodule_1         155   
2  1.3.6.1.4.1.14519.5.2.1.6279.6001.100398138793...  nodule_0         270   
3  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  nodule_0         339   
4  1.3.6.1.4.1.14519.5.2.1.6279.6001.100621383016...  nodule_1         253   

   centroid_y  centroid_x                                               bbox  \
0          45          78  (slice(np.int64(206), np.int64(218), None), sl...   
1         406         118  (slice(np.int64(151), np.int64(160), None), sl...   
2         385          89  (slice(np.int64(264), np.int64(277), None), sl...   
3         378         143  (slice(np.int64(321), np.int64(358), None), sl...   
4         379         231  (slice(np.int64(233), np.int64(273), None), sl...   

   mal_mean    all_scores  calcification  internal