In [7]:
!wget https://zenodo.org/records/3723295/files/annotations.csv

--2024-11-03 10:40:37--  https://zenodo.org/records/3723295/files/annotations.csv
Resolving zenodo.org (zenodo.org)... 188.185.79.172, 188.184.98.238, 188.184.103.159, ...
Connecting to zenodo.org (zenodo.org)|188.185.79.172|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 136986 (134K) [text/plain]
Saving to: 'annotations.csv.1'


2024-11-03 10:40:37 (2.44 MB/s) - 'annotations.csv.1' saved [136986/136986]



In [8]:
!wget https://zenodo.org/records/4121926/files/subset7.zip

--2024-11-03 10:40:38--  https://zenodo.org/records/4121926/files/subset7.zip
Resolving zenodo.org (zenodo.org)... 188.184.98.238, 188.184.103.159, 188.185.79.172, ...
Connecting to zenodo.org (zenodo.org)|188.184.98.238|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6313598213 (5.9G) [application/octet-stream]
Saving to: 'subset7.zip'


2024-11-03 10:41:15 (161 MB/s) - 'subset7.zip' saved [6313598213/6313598213]



In [9]:
!unzip -q /kaggle/working/subset7.zip -d /kaggle/working/

In [10]:
!rm subset7.zip

In [11]:
import SimpleITK as sitk
import numpy as np
import csv
from glob import glob
import pandas as pd
import os
from tqdm import tqdm
import matplotlib.pyplot as plt
import cv2

In [12]:
# LUNA_mask_extraction.py
# Collect the full path of every entry in the extracted subset directory.
subset = "subset7"
subset_dir = f"/kaggle/working/{subset}"
# os.listdir returns entries in arbitrary order; sorting keeps the processing
# order (and therefore the row order of the generated CSV) reproducible.
file_list = sorted(f"{subset_dir}/{entry}" for entry in os.listdir(subset_dir))

def myFunc(file):
  """Return True when `file` names a MetaImage header (ends with ".mhd")."""
  return file.endswith(".mhd")

# Keep only the entries that myFunc accepts (the .mhd header files).
file_list = [candidate for candidate in file_list if myFunc(candidate)]

In [13]:
!mkdir ct_images

In [14]:
def make_mask(center, diam, z, width, height, spacing, origin):
    '''
    Build a binary nodule mask for one axial slice and the nodule's bounding box.

    Parameters
    ----------
    center : np.ndarray
        Nodule center (x, y, z) in world coordinates (mm).
    diam : float
        Nodule diameter in mm.
    z : float
        World z position (mm) of the slice being masked.
    width, height : int
        Slice size in pixels; the mask has shape (height, width).
    spacing : np.ndarray
        Voxel spacing (x, y, z) in mm per voxel.
    origin : np.ndarray
        World coordinates (x, y, z) in mm of voxel (0, 0, 0).

    Returns
    -------
    (mask, [v_xmin, v_xmax, v_ymin, v_ymax])
        mask is a float array of zeros with 1.0 at every pixel inside the
        bounding box whose world position lies within `diam` mm of `center`.
        The box holds inclusive voxel indices: the nodule extent padded by
        2 voxels on each side and clamped to the image.
    '''
    mask = np.zeros([height, width])  # indexed [y, x] to match the image slice

    # Nodule center and diameter expressed in voxels. Each in-plane axis uses
    # its own spacing so the box is right for anisotropic pixels as well.
    v_center = (center - origin) / spacing
    v_diam_x = int(diam / spacing[0] + 1)
    v_diam_y = int(diam / spacing[1] + 1)
    v_xmin = max(0, int(v_center[0] - v_diam_x / 2) - 2)
    v_xmax = min(width - 1, int(v_center[0] + v_diam_x / 2) + 2)
    v_ymin = max(0, int(v_center[1] - v_diam_y / 2) - 2)
    v_ymax = min(height - 1, int(v_center[1] + v_diam_y / 2) + 2)

    v_xs = np.arange(v_xmin, v_xmax + 1)
    v_ys = np.arange(v_ymin, v_ymax + 1)

    # A nodule lying outside the image yields an empty range (max < min);
    # the mask then stays all zeros.
    if v_xs.size and v_ys.size:
        # World-space offset of every candidate pixel from the nodule center.
        dx = spacing[0] * v_xs + origin[0] - center[0]
        dy = spacing[1] * v_ys + origin[1] - center[1]
        dz = z - center[2]
        dist = np.sqrt(dx[np.newaxis, :] ** 2 + dy[:, np.newaxis] ** 2 + dz ** 2)
        # NOTE(review): the threshold is `diam`, not `diam / 2`, so the marked
        # region has twice the nodule's radius. Kept as-is - confirm intended.
        inside = dist <= diam
        # Index with the voxel coordinates directly; converting world
        # coordinates back to indices can truncate to the neighbouring pixel.
        mask[v_ymin:v_ymax + 1, v_xmin:v_xmax + 1] = inside.astype(mask.dtype)

    return (mask, [v_xmin, v_xmax, v_ymin, v_ymax])

def get_filename(case, files=None):
    '''
    Return the first path that contains `case` as a substring.

    case  : substring to look for (the callers pass a series UID)
    files : optional iterable of paths to search; when omitted, the
            module-level `file_list` is searched, as before

    Returns the first matching path, or None when nothing matches.
    '''
    candidates = file_list if files is None else files
    return next((path for path in candidates if case in path), None)
        
def normalize(image, min_bound=-1000.0, max_bound=400.0):
    '''
    Linearly rescale `image` so that [min_bound, max_bound] maps to [0, 1].

    image     : numpy array of intensities (presumably Hounsfield units - confirm)
    min_bound : intensity mapped to 0; anything lower is clipped to 0
    max_bound : intensity mapped to 1; anything higher is clipped to 1

    Returns a new floating-point array; the input is not modified.
    '''
    scaled = (image - min_bound) / (max_bound - min_bound)
    return np.clip(scaled, 0.0, 1.0)

if __name__ == "__main__":
    # Attach to every annotated nodule the path of its scan's .mhd file.
    # get_filename yields None for scans that are not in file_list, and
    # dropna() then discards those annotations.
    df_node = pd.read_csv("/kaggle/working/annotations.csv")
    df_node["file"] = df_node["seriesuid"].apply(get_filename)
    df_node = df_node.dropna()

    # Output CSV: one row per exported slice, bounding box plus image ID.
    file_name = 'output.csv'
    columns = ['x_min', 'x_max', 'y_min', 'y_max', 'ID']

    # cv2.imwrite returns False rather than raising when the target directory
    # is missing, so make sure it exists before writing anything.
    image_dir = "/kaggle/working/ct_images"
    os.makedirs(image_dir, exist_ok=True)

    # Maps "<series>_<slice index>_<annotation index>" -> [x_min, x_max, y_min, y_max]
    df_roi_cur = dict()

    print(f"TRÍCH XUẤT ẢNH VÀ TỌA ĐỘ TỪ FILE .raw VÀ FILE .mhd TỪ {subset}")
    for img_file in tqdm(file_list):
        mini_df = df_node[df_node["file"] == img_file]  # all nodules annotated in this scan
        if len(mini_df) == 0:
            continue  # some scans have no annotated nodule

        itk_img = sitk.ReadImage(img_file)           # read the .mhd header and its volume
        img_array = sitk.GetArrayFromImage(itk_img)  # indexes are z, y, x (notice the ordering)
        num_z, height, width = img_array.shape
        origin = np.array(itk_img.GetOrigin())       # x, y, z origin in world coordinates (mm)
        spacing = np.array(itk_img.GetSpacing())     # x, y, z voxel spacing (mm)
        series_name = img_file.split('/')[-1].rsplit('.', 1)[0]

        for node_idx, cur_row in mini_df.iterrows():
            center = np.array([cur_row["coordX"], cur_row["coordY"], cur_row["coordZ"]])
            diam = cur_row["diameter_mm"]

            # Only the slice through the nodule center is exported, so compute
            # its index directly instead of scanning every slice of the volume.
            i_z = int(np.rint((center - origin) / spacing)[2])
            if not 0 <= i_z < num_z:
                continue  # center slice lies outside the volume; nothing to export

            _, roi = make_mask(center, diam, i_z*spacing[2]+origin[2], width, height, spacing, origin)

            roi_key = f"{series_name}_{i_z}_{node_idx}"
            img_name = f"{roi_key}.png"
            img = normalize(img_array[i_z]) * 255
            img_rgb = np.stack((img,)*3, -1)  # replicate the gray slice into 3 channels
            cv2.imwrite(f"{image_dir}/{img_name}", img_rgb)
            df_roi_cur[roi_key] = roi

    # Build the table once; with no exported slice this writes the header only.
    df_roi = pd.DataFrame(
        [[*roi, roi_key] for roi_key, roi in df_roi_cur.items()],
        columns=columns,
    )
    df_roi.to_csv(file_name, index=False)

TRÍCH XUẤT ẢNH VÀ TỌA ĐỘ TỪ FILE .raw VÀ FILE .mhd TỪ subset7


100%|██████████| 89/89 [01:02<00:00,  1.42it/s]


In [15]:
!zip -r subset7.zip /kaggle/working/ct_images /kaggle/working/output.csv

  adding: kaggle/working/ct_images/ (stored 0%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.900182736599353600185270496549_88_1139.png (deflated 17%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.185154482385982570363528682299_196_327.png (deflated 18%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.130036599816889919308975074972_119_116.png (deflated 20%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.129982010889624423230394257528_94_112.png (deflated 19%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.129982010889624423230394257528_142_114.png (deflated 19%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.328789598898469177563438457842_67_918.png (deflated 19%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.6001.111496024928645603833332252962_133_33.png (deflated 16%)
  adding: kaggle/working/ct_images/1.3.6.1.4.1.14519.5.2.1.6279.60