First is necessary to know which files are going to be used in the data set, in our case we tried to use all the information avaiable (so we should have 1012 patient id's inside our local copy of the LIDC-IDRI) so that we could get more accurate results and since we would be using "one" image from each pacient "nodule"

In [1]:
import os

#specify the path to your dataset where you have the folders with the name of the pacients
directory="data/dataset/LIDC-IDRI/"
#list all the folders inside our local copy of LIDC-IDRI (that correspond to the patient id's that are going to be used to get the scans and annotations)
folders=os.listdir(directory)

print(f"Number of patients in the dataset: {len(folders)}")
#just so that it is easier to know which patients we are don't have in our copy of the LIDC-IDRI
folders.sort()
print()
#printing the patients id's that we couldn't download
print("List of the id's that are missing from our dataset:")
count=1
for folder in folders:
    if(str(count) not in folder):
        print(folder)
        count=int(folder.split('-')[-1])
    count+=1


Number of patients in the dataset: 1010

List of the id's that are missing from our dataset:
LIDC-IDRI-0239
LIDC-IDRI-0586


As it is possible to see the number is diferent from the expected (1012 number of people that are avaiable in the original LODC-IRDI), this is due to the fact that during the download (which was made using the offical tool nbia-data-retriver) we did not have authority to download this instances

In [None]:
import pylidc as pl

n_lesions=0
scans_0_nods=[]

for folder in folders:
    pid=folder

    #get the scann corresponding to the patient_id that we have 
    scan=pl.query(pl.Scan).filter(pl.Scan.patient_id==pid).first()
    #get all the nodules annotations, each list corresponds to the same nodules
    nods=scan.cluster_annotations()    
    n_lesions+=len(nods)
    if(len(nods)==0):
        scans_0_nods.append([scan.patient_id,scan.annotations])
        


print(f"Number of lesions: {n_lesions}" )
print(f"Number of scanns with 0 nodules: {len(scans_0_nods)}")
#print(scans_0_nods)
#13 of them gave an error so in total we have 123 scanns with 0 annotations


As it is possible to see above some of the scans don't have nodules/annotations 

In the following code we use VG16 to extract features from the images of the lesions

In [3]:
def get_malignancy(anns):
    #Get the Malignancy of the lesion
    #
    #   1-‘Highly Unlikely’
    #   2-‘Moderately Unlikely’
    #   3-‘Indeterminate’
    #   4-‘Moderately Suspicious’
    #   5-‘Highly Suspicious’
    #this feature will be used as a label to the dataset
    malignancy=0
    for i in range(len(anns)):
        malignancy+=anns[i].malignancy
    malignancy/=len(anns)
    malignancy=round(malignancy)

    return malignancy

In [None]:
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import pandas as pd
from pylidc.utils import consensus
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.applications import VGG16
from tensorflow.keras.layers import GlobalAveragePooling2D
from tensorflow.keras.models import Model

#load pre-trained model withput the top layers
base_model=VGG16(weights='imagenet',include_top=False, input_shape=(512,512,3))

#add global pooling to the method
x=base_model.output
x=GlobalAveragePooling2D()(x)
cnn_model=Model(inputs=base_model.input,outputs=x)


#create the DataFrame where we ccn features are going to be stored

#create the columns names
columns_name=[]
columns_name.append("PatientID")
for i in range(512):
    columns_name.append("cnn_feature "+str(i))

columns_name.append("Label")

df=pd.DataFrame(columns=columns_name)


for folder in folders:
    
    pid=folder
    print(f"Currently in scan: {pid}")

    #get the scann corresponding to the patient_id that we have 
    scan=pl.query(pl.Scan).filter(pl.Scan.patient_id==pid).first()
    
    #get all the nodules annotations, each list corresponds to the same lesions 
    nods=scan.cluster_annotations()
    #nods is a list of list of annotations agrouped by nodule

    #create an volume with all of the dicom images of that scan (basicly an 3D image of the CT)
    vol=scan.to_volume()

    padding=[(30,20),(10,25),(0,0)]
    #anns represents all the annotations for one lesion in this scann
    #nods represents a list of all the annotations for each lesion in this scan 
    for anns in nods:
        #get the concensus for the contours of this nodule
        cmask,cbbox,masks=consensus(anns,clevel=0.5,pad=padding)

        #get the central slice of the computed bounding box
        k=int(0.5*(cbbox[2].stop-cbbox[2].start))

        img=vol[cbbox][:,:,k]


        #applying possible usefull filters since we don't have that many lesions (2625)
        from skimage.filters.rank import entropy
        from skimage.morphology import disk
        from skimage.util import img_as_ubyte
        entropy_img=entropy(img_as_ubyte(img),disk(1))


        from skimage.filters import sobel
        sobel_img=sobel(img)

        #getting the images ready for the cnn
        imgs=[img,entropy_img,sobel_img]

        for i in range(1):#we can change this after if we want to use the filters
            #making each image have 3 dimensions
            imgs[i] = np.expand_dims(imgs[i], axis=-1)

            #resizing the images so that they match the input of the cnn
            imgs[i]=tf.image.resize(imgs[i],(512,512))

            #converting the images to rgb
            if(imgs[i].shape[-1]==1):
                imgs[i]=tf.image.grayscale_to_rgb(imgs[i])
            
            #preprocess to the VGG16
            imgs[i]=preprocess_input(imgs[i].numpy())

            #get the features learned by the cnn
            features=cnn_model(np.expand_dims(imgs[i], axis=0))
            features=features.numpy().flatten()

            #create the row with everything that is going in the dataset 
            row=[pid]
            for x in features:
                row.append(x)
            row.append(get_malignancy(anns))
            df.loc[len(df.index)]=row
        
    #     break #this breaks are for when debugging not having to run all the scanns
    # break #this breaks are for when debugging not having to run all the scanns


In [5]:
print(df.shape)
df.head()

(2625, 514)


Unnamed: 0,PatientID,cnn_feature 0,cnn_feature 1,cnn_feature 2,cnn_feature 3,cnn_feature 4,cnn_feature 5,cnn_feature 6,cnn_feature 7,cnn_feature 8,...,cnn_feature 503,cnn_feature 504,cnn_feature 505,cnn_feature 506,cnn_feature 507,cnn_feature 508,cnn_feature 509,cnn_feature 510,cnn_feature 511,Label
0,LIDC-IDRI-0001,1.408519,6.613491,0.206812,12.522177,2.842293,2.566988,0.225895,3.202182,14.950947,...,0.080818,1.799512,0.474278,6.975934,1.273983,4.503269,0.129298,9.106273,0.563464,5
1,LIDC-IDRI-0002,1.461552,9.027784,1.347735,26.601097,2.025649,2.777943,1.059285,2.9877,6.970528,...,3.473532,0.753911,0.949201,1.776874,2.387758,4.194271,1.47639,5.335127,1.748041,4
2,LIDC-IDRI-0003,5.875987,11.540094,0.045949,27.112854,4.678982,3.787902,1.021508,1.924538,7.500788,...,0.80121,0.239504,9.201927,1.946184,5.459285,0.32032,0.616315,3.036831,0.429381,2
3,LIDC-IDRI-0003,4.641324,6.771929,0.130819,20.271763,1.972514,5.147611,0.792397,6.882763,15.692515,...,0.951502,1.994685,0.370287,5.199282,1.392501,1.78975,0.671943,5.783095,0.068551,4
4,LIDC-IDRI-0003,5.158184,4.188747,0.030707,4.44469,3.11626,0.293485,0.0,3.416024,8.580443,...,0.095775,1.139045,0.818399,4.912726,11.126127,1.109776,1.508004,11.945731,0.48469,3


Store the current cnn features values to make it possible to make use them without having to calculate them again as it can take a while (93 min first time we tried)


In [6]:
df.to_csv('cnn_features.csv', index=False)

In order to make the randomforest algorithm to work better we decided to make the number of features smaller (200)

In [7]:
from sklearn.decomposition import PCA
pca=PCA(n_components=200)
features_columns=df.loc[:,~df.columns.isin(['PatientID','Label'])]
reduced_features=pca.fit_transform(features_columns)
reduced_features.shape

(2625, 200)

Formating the reduced features into an DataFrame

In [8]:
columns_name=[]
for i in range(200):
    columns_name.append("cnn_feature "+str(i))
    
reduced_df=pd.DataFrame(columns=columns_name, data=reduced_features)
reduced_df=pd.concat([df['PatientID'],reduced_df,df['Label']], axis=1)
reduced_df.head()




Unnamed: 0,PatientID,cnn_feature 0,cnn_feature 1,cnn_feature 2,cnn_feature 3,cnn_feature 4,cnn_feature 5,cnn_feature 6,cnn_feature 7,cnn_feature 8,...,cnn_feature 191,cnn_feature 192,cnn_feature 193,cnn_feature 194,cnn_feature 195,cnn_feature 196,cnn_feature 197,cnn_feature 198,cnn_feature 199,Label
0,LIDC-IDRI-0001,-59.270191,-4.731754,-19.260628,-49.457321,-11.942023,-16.354303,-12.154401,6.667793,1.845408,...,-0.035443,0.564248,0.842348,-2.176432,-0.014757,2.239526,-0.390019,0.31965,0.738894,5
1,LIDC-IDRI-0002,-7.475523,-11.984719,-72.629936,-13.296576,21.873442,-7.505165,-39.059998,-15.960101,15.330404,...,-1.795778,-0.523711,1.677022,0.258018,-3.003587,0.102814,-1.25175,0.74965,-0.240229,4
2,LIDC-IDRI-0003,16.350088,50.582478,-79.797974,4.395186,23.136997,16.631542,-31.811354,-15.073377,0.979202,...,1.142602,-0.964356,2.940799,-1.391039,-0.301345,-1.437591,2.171166,0.947804,1.161555,2
3,LIDC-IDRI-0003,-20.658722,27.842659,-50.602596,-30.740786,6.100678,5.994651,-17.880272,-5.460018,2.200853,...,2.244281,-0.816611,0.542074,-1.072884,-3.382216,-2.390389,0.189504,-2.236187,-0.011233,4
4,LIDC-IDRI-0003,-74.38163,-9.598942,-13.327378,-6.291062,3.200797,10.638253,-9.848012,12.354282,3.165051,...,-0.023434,0.619883,1.100488,-0.642573,0.905817,0.332873,0.526588,-1.331275,-0.017004,3


In [9]:
#storing the reduced cnn_features
reduced_df.to_csv('cnn_reduced_features.csv',index=False)