<a href="https://colab.research.google.com/github/Tijn808/Luna16-foundation-model/blob/preprocess/patches_with_malignancy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Attach LIDC consensus attributes (malignancy etc.) to every extracted patch.
#
# Inputs : a per-nodule label table and a patch manifest (both CSV).
# Output : patches_with_malignancy_labels.csv, one row per patch.
#
# Matching strategy: the k-th patch of a scan (ordered by patch_index) is paired
# with the label row whose nodule_id carries the number k for the same seriesuid.
# NOTE(review): this assumes label nodule numbers start at 0 and are contiguous
# within each scan, and that patches were written in that same order -- confirm
# against the patch-extraction step. Any mismatch surfaces as "Missing labels".

LABELS_CSV = "luna16_lidc_consensus_attributes_cleaned (1).csv"
PATCHES_CSV = "patch_manifest.csv"
OUTPUT_CSV = "patches_with_malignancy_labels.csv"

# load
labels = pd.read_csv(LABELS_CSV)
patches = pd.read_csv(PATCHES_CSV).sort_values("patch_index").reset_index(drop=True)

# make sure label nodules are in a stable order per seriesuid
# (if your CSV already has nodule_0, nodule_1, ... this is safe)
# expand=False yields a Series; astype(str) also tolerates a purely numeric column.
nodule_digits = labels["nodule_id"].astype(str).str.extract(r"(\d+)", expand=False)
if nodule_digits.isna().any():
    # Fail with a readable message instead of an opaque NaN -> int cast error.
    bad_ids = labels.loc[nodule_digits.isna(), "nodule_id"].unique()[:5]
    raise ValueError(f"nodule_id values without a numeric part: {list(bad_ids)}")
labels["nodule_num"] = nodule_digits.astype(int)
labels = labels.sort_values(["seriesuid", "nodule_num"]).reset_index(drop=True)

# assign nodule_num to each patch based on sequence within each seriesuid
patches["nodule_num"] = patches.groupby("seriesuid").cumcount()

# merge on (seriesuid, nodule_num)
# validate="many_to_one" raises pandas.errors.MergeError if the label table holds
# duplicate (seriesuid, nodule_num) keys, which would otherwise silently
# duplicate patch rows in the output.
merged = patches.merge(
    labels,
    on=["seriesuid", "nodule_num"],
    how="left",
    suffixes=("_patch", "_label"),
    validate="many_to_one",
)

print("Total patches:", len(merged))
print("Missing labels:", merged["mal_mean"].isna().sum())

# keep only what you want
# .copy() makes `final` an independent frame so later cells can modify it safely.
final = merged[[
    "patch_index",
    "seriesuid",
    "nodule_num",
    "patch_filename",
    "mal_mean",
    "all_scores",
    "calcification",
    "internalStructure",
    "lobulation",
    "margin",
    "sphericity",
    "spiculation",
    "subtlety",
    "texture",
]].copy()

final.to_csv(OUTPUT_CSV, index=False)
print("Saved patches_with_malignancy_labels.csv")


Total patches: 601
Missing labels: 0
Saved patches_with_malignancy_labels.csv


In [None]:
# Preview: up to three patches from every scan, limited to the key columns.
preview_columns = ["patch_index","seriesuid","nodule_num","mal_mean","patch_filename"]
first_per_scan = final.groupby("seriesuid").head(3)
print(first_per_scan[preview_columns])


     patch_index                                          seriesuid  \
0              0  1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905...   
1              1  1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636...   
2              2  1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...   
3              3  1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...   
4              4  1.3.6.1.4.1.14519.5.2.1.6279.6001.323541312620...   
..           ...                                                ...   
596          596  1.3.6.1.4.1.14519.5.2.1.6279.6001.323535944958...   
597          597  1.3.6.1.4.1.14519.5.2.1.6279.6001.631047517458...   
598          598  1.3.6.1.4.1.14519.5.2.1.6279.6001.801945620899...   
599          599  1.3.6.1.4.1.14519.5.2.1.6279.6001.323302986710...   
600          600  1.3.6.1.4.1.14519.5.2.1.6279.6001.780558315515...   

     nodule_num  mal_mean                                     patch_filename  
0             0      3.25  0_1.3.6.1.4.1.14519.5.2.1.6279.6001.13193

Unnamed: 0,patch_index,seriesuid,nodule_num,patch_filename,mal_mean,all_scores,calcification,internalStructure,lobulation,margin,sphericity,spiculation,subtlety,texture
0,0,1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905...,0,0_1.3.6.1.4.1.14519.5.2.1.6279.6001.1319393249...,3.25,"[5, 2, 3, 3]",6,1,4,4,4,4,2,5
1,1,1.3.6.1.4.1.14519.5.2.1.6279.6001.179049373636...,0,1_1.3.6.1.4.1.14519.5.2.1.6279.6001.1790493736...,4.75,"[5, 5, 5, 4]",6,1,3,3,4,4,5,5
2,2,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...,0,2_1.3.6.1.4.1.14519.5.2.1.6279.6001.1707067576...,4.25,"[5, 5, 3, 4]",6,1,2,3,4,3,5,4
3,3,1.3.6.1.4.1.14519.5.2.1.6279.6001.170706757615...,1,3_1.3.6.1.4.1.14519.5.2.1.6279.6001.1707067576...,3.25,"[4, 4, 3, 2]",6,1,2,4,4,2,4,5
4,4,1.3.6.1.4.1.14519.5.2.1.6279.6001.323541312620...,0,4_1.3.6.1.4.1.14519.5.2.1.6279.6001.3235413126...,1.25,"[2, 1, 1, 1]",4,1,1,5,4,1,3,5
5,5,1.3.6.1.4.1.14519.5.2.1.6279.6001.272348349298...,0,5_1.3.6.1.4.1.14519.5.2.1.6279.6001.2723483492...,4.75,"[5, 5, 5, 4]",6,1,3,3,4,5,5,5
6,6,1.3.6.1.4.1.14519.5.2.1.6279.6001.140253591510...,0,6_1.3.6.1.4.1.14519.5.2.1.6279.6001.1402535915...,3.0,"[3, 3, 4, 2]",6,1,2,4,4,1,4,5
7,7,1.3.6.1.4.1.14519.5.2.1.6279.6001.140253591510...,1,7_1.3.6.1.4.1.14519.5.2.1.6279.6001.1402535915...,1.25,"[2, 1, 1, 1]",4,1,1,5,5,1,4,5
8,8,1.3.6.1.4.1.14519.5.2.1.6279.6001.328789598898...,0,8_1.3.6.1.4.1.14519.5.2.1.6279.6001.3287895988...,2.666667,"[3, 3, 2]",6,1,1,5,5,1,3,5
9,9,1.3.6.1.4.1.14519.5.2.1.6279.6001.328789598898...,1,9_1.3.6.1.4.1.14519.5.2.1.6279.6001.3287895988...,2.5,"[2, 3, 3, 2]",6,1,1,4,4,1,3,4
