In [133]:
import os
import numpy as np
import pandas as pd
import os

from PIL import Image
from imageio import imread
import cv2 as cv
import SimpleITK as sitk
%matplotlib inline

In [134]:
# Map each .mhd header file name found under the raw LUNA16 folder to the
# directory that contains it, so a scan can later be located by name alone.
file_dict = {
    mhd_name: folder
    for folder, _, names in os.walk('../../data/luna16-raw')
    for mhd_name in names
    if mhd_name.endswith('.mhd')
}


In [135]:
# Candidate table: one row per candidate location with its class label
candidates_df = pd.read_csv(
    '../../data/luna16-raw/candidates_V2/candidates_V2.csv',
)

# Filename prefixes that have no data; the file has no header row, so the
# values end up in column 0
missing_df = pd.read_csv(
    '../../data/luna16-raw/missing.txt',
    header=None,
)

In [136]:
# Preview the first rows of the candidates table to check the columns loaded as expected
candidates_df.head()

Unnamed: 0,seriesuid,coordX,coordY,coordZ,class
0,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,68.42,-74.48,-288.7,0
1,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-95.209361,-91.809406,-377.42635,0
2,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-24.766755,-120.379294,-273.361539,0
3,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,-63.08,-65.74,-344.24,0
4,1.3.6.1.4.1.14519.5.2.1.6279.6001.100225287222...,52.946688,-92.688873,-241.067872,0


In [137]:
# (rows, columns) of the full candidates table, before any sampling or filtering
candidates_df.shape

(754975, 5)

In [138]:
# Row labels of the candidates in each class
positives_index = candidates_df[candidates_df['class'] == 1].index
negatives_index = candidates_df[candidates_df['class'] == 0].index

# Select a random sample of 5 times the positives list since it is quite imbalanced.
# The global seed is fixed so the same negatives are drawn on every run.
np.random.seed(42)
negatives_index = np.random.choice(negatives_index, len(positives_index) * 5, replace=False)

# The index objects above hold row *labels*, so select with .loc.
# (.iloc is positional and would only pick the right rows while the frame
# still carries its default 0..n-1 index.)
positive_df = candidates_df.loc[positives_index]
negative_df = candidates_df.loc[negatives_index]

# Filter rows to only those where we have data.
# NOTE: this happens after sampling, so the final negative:positive ratio
# can differ slightly from 5:1.
positive_df = positive_df[~positive_df['seriesuid'].isin(missing_df[0])]
negative_df = negative_df[~negative_df['seriesuid'].isin(missing_df[0])]

In [139]:
# (rows, columns) of the sampled negatives left after dropping series listed in missing_df
negative_df.shape

(3921, 5)

In [140]:
# (rows, columns) of the positives left after dropping series listed in missing_df
positive_df.shape

(817, 5)

In [141]:

def _world_to_voxel(center_xyz, origin_xyz, spacing_xyz, direction_matrix):
    """Convert a patient-space (x, y, z) point to integer (column, row, index) voxel indices."""
    offset = np.asarray(center_xyz, dtype=np.float64) - np.asarray(origin_xyz, dtype=np.float64)
    # ITK maps index -> physical as  P = O + D @ (spacing * I), hence
    # I = D^-1 @ (P - O) / spacing.  Solving the linear system applies D^-1
    # itself (a row-vector product `offset @ inv(D)` would apply its transpose,
    # which only agrees for symmetric direction matrices).
    continuous_cri = np.linalg.solve(direction_matrix, offset) / np.asarray(spacing_xyz, dtype=np.float64)
    # The values are used to index arrays, so round to the nearest voxel.
    return np.round(continuous_cri).astype(int)


def _slice_to_uint8(slice_2d):
    """Min-max scale a 2-D slice to the 0..255 range and return it as uint8."""
    lowest = np.min(slice_2d)
    highest = np.max(slice_2d)
    if highest == lowest:
        # A constant slice has no contrast; avoid the 0/0 division and emit black.
        return np.zeros(slice_2d.shape, dtype=np.uint8)
    scaled = ((slice_2d - lowest) / (highest - lowest)) * 255
    return scaled.astype(np.uint8)


def generate_jpg_images(df, output_dir):
    """Write one grayscale JPEG per candidate row: the axial CT slice through the candidate.

    For every row the scan named ``<seriesuid>.mhd`` is located through the
    notebook-level ``file_dict`` mapping, the candidate's patient-space
    coordinates are converted to voxel indices, and the slice selected by the
    index (z) component is saved as ``image_<row label>.jpg``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``seriesuid``, ``coordX``, ``coordY`` and
        ``coordZ``. The row label of each entry is used in the file name.
    output_dir : str
        Directory that receives the JPEG files; created if it does not exist.

    Raises
    ------
    KeyError
        If no ``.mhd`` file was indexed in ``file_dict`` for a series.
    IndexError
        If a candidate's slice index lies outside the scan volume.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Many candidates share a scan, so load and pre-process each scan only once
    # instead of once per row. sort=False keeps first-appearance order.
    for seriesuid, series_rows in df.groupby('seriesuid', sort=False):
        mhd_filename = seriesuid + '.mhd'
        mhd_path = os.path.join(file_dict[mhd_filename], mhd_filename)

        # Load .mhd file using SimpleITK
        ct_scan = sitk.ReadImage(mhd_path)
        origin_xyz = ct_scan.GetOrigin()
        voxel_size_xyz = ct_scan.GetSpacing()
        direction_matrix = np.array(ct_scan.GetDirection()).reshape(3, 3)

        # Convert CT scan to NumPy array and clamp intensities in place
        ct_array = np.array(sitk.GetArrayFromImage(ct_scan), dtype=np.float32)
        ct_array.clip(-1000, 1000, ct_array)

        for index, row in series_rows.iterrows():
            # Keep the coordinates as floats: truncating them to int before the
            # mm -> voxel conversion can shift the selected slice.
            center_xyz = (float(row['coordX']), float(row['coordY']), float(row['coordZ']))
            cri = _world_to_voxel(center_xyz, origin_xyz, voxel_size_xyz, direction_matrix)

            # Extract the specific slice corresponding to the given coordinates.
            # Reject out-of-range indices explicitly: a negative value would
            # otherwise wrap around and silently pick a slice from the far end.
            slice_index = int(cri[2])
            if not 0 <= slice_index < ct_array.shape[0]:
                raise IndexError(
                    "slice index {} for candidate {} is outside scan {} ({} slices)".format(
                        slice_index, index, mhd_filename, ct_array.shape[0]))
            selected_slice = ct_array[slice_index, :, :]

            # Normalize pixel values to [0, 255] for JPEG conversion
            selected_slice_normalized = _slice_to_uint8(selected_slice)

            # Save the selected slice as JPEG
            output_path = os.path.join(output_dir, "image_{}.jpg".format(index))
            image = Image.fromarray(selected_slice_normalized, 'L')  # 'L' mode for grayscale
            image.save(output_path)

In [143]:
# Output a random sample of normal images
generate_jpg_images(negative_df, '../../data/cancer-detection-model/luna16-jpg for testing/Normal')

# Output the malignant images
generate_jpg_images(positive_df, '../../data/cancer-detection-model/luna16-jpg for testing/Malignant')

In [144]:
# Persist the positive candidates; the row labels are written as the first
# column (index=True is the pandas default, spelled out here on purpose).
positive_df.to_csv(
    '../../data/cancer-detection-model/luna16-jpg for testing/positives.csv',
    index=True,
)

In [145]:
# Persist the sampled negative candidates; the row labels are written as the
# first column (index=True is the pandas default, spelled out here on purpose).
negative_df.to_csv(
    '../../data/cancer-detection-model/luna16-jpg for testing/negatives.csv',
    index=True,
)