# DS 6050 Lung Cancer Classification
## Thomas Butler, vra2cf
## Drew Haynes, rbc6wr
## Christian Schroeder, dbn5eu

# Load Packages

In [None]:
import sys
import sklearn
import os
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from functools import partial
import PIL

%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
!pip install SimpleITK
import SimpleITK as sitk
import csv
from PIL import Image
%matplotlib inline

np.random.seed(42) # note that you must use the same seed to ensure consistency in your training/validation/testing
tf.random.set_seed(42)

Colab only includes TensorFlow 2.x; %tensorflow_version has no effect.
2.8.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SimpleITK
  Downloading SimpleITK-2.1.1.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (48.4 MB)
[K     |████████████████████████████████| 48.4 MB 49 kB/s 
[?25hInstalling collected packages: SimpleITK
Successfully installed SimpleITK-2.1.1.2


# Import Dataset


In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the `root` path used below lives under it.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Base folder of the LUNA data on the mounted Drive. Every dataset path in this
# notebook is built by string concatenation onto it, so keep the trailing slash.
root = "/content/drive/MyDrive/LUNA/"

Ref: https://luna16.grand-challenge.org/Tutorial/

Used this code to help preprocess data into a format we can use.

We now define a function to:
    - Open the image 
    - Store it into a numpy array
    - Extract the following info: Pixel Spacing, Origin
This function takes as input the name of the image and returns:
    - The array corresponding to the image (numpyImage)
    - Origin (numpyOrigin)
    - PixelSpacing (numpySpacing)

In [None]:
def load_itk_image(filename):
    """Read an image with SimpleITK and return its voxels plus geometry.

    Args:
        filename: Path passed straight to ``sitk.ReadImage``.

    Returns:
        A tuple ``(numpyImage, numpyOrigin, numpySpacing)``: the array produced
        by ``sitk.GetArrayFromImage`` and the image origin and spacing as
        NumPy arrays, each in the reverse of the order SimpleITK reports.
    """
    image = sitk.ReadImage(filename)
    voxels = sitk.GetArrayFromImage(image)

    # Flip the origin/spacing tuples so their axis order is the reverse of
    # what GetOrigin()/GetSpacing() return.
    origin = np.array(image.GetOrigin()[::-1])
    spacing = np.array(image.GetSpacing()[::-1])

    return voxels, origin, spacing

Since the coordinates of the candidates are given in world coordinates, we now need to transform them from world coordinates to voxel coordinates. 
We now define a function to do that. Please note that the transformation below is only valid if there is no rotation component in the transformation matrix. For all CT images in our dataset there is no rotation component, so this formula can be used. 
This function takes as inputs:
    - The world coordinates
    - The origin
    - The pixel Spacing
This function returns:
    - Voxel coordinates (voxelCoord)

In [None]:
def worldToVoxelCoord(worldCoord, origin, spacing):
    """Convert world coordinates to (fractional) voxel coordinates.

    Args:
        worldCoord: World coordinates of the point.
        origin: Image origin, in the same axis order as ``worldCoord``.
        spacing: Voxel spacing, in the same axis order as ``worldCoord``.

    Returns:
        ``|worldCoord - origin| / spacing``, computed element-wise. The result
        is not rounded or cast to integers.
    """
    return np.absolute(worldCoord - origin) / spacing

We now want to extract some features from the candidates. To do so, we define a function that normalizes the image planes from which the candidate views are extracted.

In [None]:
def normalizePlanes(npzarray, minHU=-1000., maxHU=400.):
    """Rescale intensities from the window [minHU, maxHU] to [0, 1].

    Values at or below ``minHU`` map to 0, values at or above ``maxHU`` map to
    1, and values in between are scaled linearly. The input is never modified;
    a new array is returned.

    Args:
        npzarray: Array (or array-like / scalar) of intensity values.
        minHU: Lower bound of the window; defaults to -1000.
        maxHU: Upper bound of the window; defaults to 400.

    Returns:
        The rescaled values, clipped to [0, 1].

    Raises:
        ValueError: If ``maxHU`` is not strictly greater than ``minHU``.
    """
    if maxHU <= minHU:
        raise ValueError("maxHU must be strictly greater than minHU")

    # asarray lets lists and scalars through without copying real arrays;
    # the subtraction below allocates the result, so the input stays intact.
    scaled = (np.asarray(npzarray) - minHU) / (maxHU - minHU)
    return np.clip(scaled, 0., 1.)

    - Specify the path where the image (img_path) is 
    - Specify the path where the file with the list of candidates is (cand_path)

In [None]:
# CSV listing the candidate locations; read with pandas further down.
cand_path = root+'candidates_V2/candidates_V2.csv'

In [None]:
# Collect every .mhd file from the chosen subset folders. The folders are
# visited explicitly (rather than walking `root` recursively) so that
# 1) the segs_lungs_LUNA16 files are never picked up and
# 2) `subsets` can be narrowed to any selection of subsets.
subsets = range(10)
img_paths = []
for subset in subsets:
    subset_root = root + 'subset' + str(subset) + '/'
    img_paths.extend(
        subset_root + file_name
        for file_name in os.listdir(subset_root)
        if file_name.endswith('.mhd')
    )


    - Load the image
    - Extract the Origin
    - Extract the Pixel Spacing 

In [None]:
# Sanity check: read the first collected scan and show its array shape,
# origin and spacing.
numpyImage, numpyOrigin, numpySpacing = load_itk_image(img_paths[0])
for summary in (numpyImage.shape, numpyOrigin, numpySpacing):
    print(summary)

(121, 512, 512)
[-335.209991 -195.       -198.100006]
[2.5        0.76171899 0.76171899]


    - Load the csv file
    - Get the candidates 
    - Transform from world to voxel coordinates

In [None]:
# Load the full candidate table; the next cells filter it on the `class`
# column and group it by `seriesuid`.
cands_master_df = pd.read_csv(cand_path)

In [None]:
# Build a smaller, more balanced working set: keep every class-1 candidate and
# draw at most 2000 class-0 candidates, then shuffle the combined rows.
# random_state is fixed so the same candidates are selected no matter which
# cells ran before this one (the sampled rows decide which patches get written
# to disk, so the draw has to be repeatable).
df_1 = cands_master_df[cands_master_df['class'] == 1]
negatives = cands_master_df[cands_master_df['class'] == 0]
df_0 = negatives.sample(n=min(2000, len(negatives)), random_state=42)
cands_master_df = pd.concat([df_1, df_0]).sample(frac=1, random_state=42)

In [None]:
# Unique series UIDs, in order of first appearance in the candidate table.
cands = cands_master_df.seriesuid.unique()

# Map each series UID to the rows belonging to it. The `seriesuid` column is
# dropped from every per-series frame because the dictionary key carries it.
cands_df_dict = {
    uid: cands_master_df[cands_master_df.seriesuid == uid].drop(['seriesuid'], axis=1)
    for uid in cands
}

In [None]:
# Tallies of the three failure modes; the following cell prints them.
Fail_save = 0
Fail_convert = 0
Could_not_find = 0

# Output folders: data_subset_balanced/<class>/ receives the JPEG patches,
# patches/<seriesuid>/ is created per scan (nothing is written into it here).
for out_dir in ('data_subset_balanced/', 'data_subset_balanced/0/',
                'data_subset_balanced/1/', 'patches/'):
    os.makedirs(root + out_dir, exist_ok=True)

voxelWidth = 64  # edge length, in voxels, of the square in-plane patch

for cand_id in cands_df_dict:
    # Find the .mhd file whose path contains this series UID.
    cand_image_path = next((p for p in img_paths if cand_id in p), None)
    if cand_image_path is None:
        print(f'Could not find {cand_id}')
        Could_not_find += 1
        continue

    try:
        # Load the scan that belongs to THIS candidate. (Previously the first
        # scan in img_paths was loaded for every candidate, so all patches
        # were cut from the wrong volume.)
        numpyImage, numpyOrigin, numpySpacing = load_itk_image(cand_image_path)
        cand_root = root + 'patches/' + cand_id + '/'
        os.makedirs(cand_root, exist_ok=True)
    except Exception as err:
        # Best effort: an unreadable scan is skipped and counted, not fatal.
        print(f'Could not load {cand_id}: {err}')
        Could_not_find += 1
        continue

    points_for_cand = cands_df_dict[cand_id]
    for index, point in points_for_cand.iterrows():
        try:
            # Columns are read by position: 0..2 are the world coordinates
            # (presumably x, y, z -- confirm against the CSV header) and 3 is
            # the class label. The coordinates are reversed so they line up
            # with the reversed origin/spacing from load_itk_image.
            label = str(int(point.iloc[3]))
            worldCoord = np.asarray([float(point.iloc[2]),
                                     float(point.iloc[1]),
                                     float(point.iloc[0])])
            voxelCoord = worldToVoxelCoord(worldCoord, numpyOrigin, numpySpacing)
            patch = numpyImage[
                int(voxelCoord[0]),
                int(voxelCoord[1]-voxelWidth/2):int(voxelCoord[1]+voxelWidth/2),
                int(voxelCoord[2]-voxelWidth/2):int(voxelCoord[2]+voxelWidth/2),
            ]
            # Slicing never raises: a window that crosses the image border
            # silently comes back truncated or empty, so reject it explicitly.
            if patch.shape != (voxelWidth, voxelWidth):
                raise ValueError(f'patch has shape {patch.shape}')
            patch = normalizePlanes(patch)
        except (IndexError, ValueError, TypeError):
            print(f'Failed to convert point: ID: {cand_id}, Index: {index}')
            Fail_convert += 1
            continue

        # Save as JPEG under data_subset_balanced/<class>/.
        outputPath = (root + 'data_subset_balanced/' + label + '/patch_'
                      + str(worldCoord[0]) + '_' + str(worldCoord[1]) + '_'
                      + str(worldCoord[2]) + label + '.jpg')
        try:
            # Scale [0, 1] to [0, 255] and store as 8-bit grayscale.
            Image.fromarray(patch*255).convert('L').save(outputPath)
        except Exception:
            print(f'Failed to save {outputPath}')
            Fail_save += 1
            # Do not leave a partial file behind for the dataset loader.
            if os.path.exists(outputPath):
                os.remove(outputPath)

      

Could not find 1.3.6.1.4.1.14519.5.2.1.6279.6001.276556509002726404418399209377
Could not find 1.3.6.1.4.1.14519.5.2.1.6279.6001.211956804948320236390242845468
Could not find 1.3.6.1.4.1.14519.5.2.1.6279.6001.277662902666135640561346462196
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.450501966058662668272378865145, Index: 610645
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.450501966058662668272378865145, Index: 609721
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.450501966058662668272378865145, Index: 609614
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.450501966058662668272378865145, Index: 610351
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905446238286154504249, Index: 79308
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905446238286154504249, Index: 79493
Failed to convert point: ID: 1.3.6.1.4.1.14519.5.2.1.6279.6001.131939324905446238286154504249, Index: 78810


In [None]:
# Report the failure tallies, one per line: save, convert, not-found.
for failure_count in (Fail_save, Fail_convert, Could_not_find):
    print(failure_count)

87
212
126


In [None]:
import pandas as pd

# Re-read the complete candidate table (cands_master_df has been subsampled
# by now) and show how many rows each class has.
cand_csv = pd.read_csv(cand_path)
cand_csv['class'].value_counts()

0    753418
1      1557
Name: class, dtype: int64

In [None]:
from os import listdir
# Number of patch files written for class 0.
file_names_0 = list(os.listdir(root+'data_subset_balanced/0'))
len(file_names_0)

1517

In [None]:
# Number of patch files written for class 1.
file_names_1 = list(os.listdir(root+'data_subset_balanced/1'))
len(file_names_1)

1184

In [None]:
from sklearn.datasets import load_files 
from keras.utils import np_utils

from keras.preprocessing import image
from tqdm import tqdm # progress bar

data_dir = root + "data_subset_balanced/"
batch_size = 32
# IMPORTANT: Depending on which pre-trained model you choose, you will need to
# change these dimensions accordingly.
img_height = 64
img_width = 64

# Options shared by both splits. The same seed and validation_split are passed
# to both calls, and only `subset` differs between them.
split_options = dict(
    color_mode='grayscale',
    validation_split=0.2,
    seed=42,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Training Dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Validation Dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)


Found 2701 files belonging to 2 classes.
Using 2161 files for training.
Found 2701 files belonging to 2 classes.
Using 540 files for validation.


In [None]:
tf.random.set_seed(42)
# Carve a held-out test set out of the training batches: 87.5% of the batches
# stay in train (70% of all data), the remainder becomes test (10% of all data).
#ref: https://stackoverflow.com/questions/50737192/tf-data-dataset-how-to-get-the-dataset-size-number-of-elements-in-an-epoch
dataset_size = train_ds.cardinality().numpy()
train_size = int(0.875 * dataset_size)
test_size = dataset_size - train_size  # batches actually left for skip()

# reshuffle_each_iteration=False pins the batch order. With the default (True)
# the order changes on every pass, so take()/skip() would hand out different
# batches each epoch and the test batches would leak into training.
# NOTE(review): the loader that produced train_ds may itself reshuffle samples
# on every iteration (its `shuffle` option) -- confirm that train and test stay
# disjoint, or split at file level instead.
temp_ds = train_ds.shuffle(1000, seed=42, reshuffle_each_iteration=False)
train_ds = temp_ds.take(train_size)
test_ds = temp_ds.skip(train_size)

## Metrics Functions

In [None]:
#ref: https://aakashgoel12.medium.com/how-to-add-user-defined-function-get-f1-score-in-keras-metrics-3013f979ce0d

import keras.backend as K

def _align_positive_class(y_true, y_pred):
    """Bring labels and predictions to a comparable positive-class form.

    When ``y_pred`` holds one column per class (e.g. a 2-unit softmax), it is
    reduced to a hard 0/1 vector that is 1 where class 1 is the arg-max, and
    ``y_true`` is flattened to match (sparse integer labels are expected in
    that case). Single-column predictions are returned untouched so the
    original binary behaviour is preserved.
    """
    y_true = K.cast(y_true, K.floatx())
    pred_shape = K.int_shape(y_pred)
    per_class_output = (
        pred_shape is not None
        and len(pred_shape) > 1
        and pred_shape[-1] is not None
        and pred_shape[-1] > 1
    )
    if per_class_output:
        # Without this step `y_true * y_pred` broadcasts the label column over
        # BOTH class columns, which makes every count below meaningless.
        y_pred = K.cast(K.equal(K.argmax(y_pred, axis=-1), 1), K.floatx())
        y_true = K.flatten(y_true)
    return y_true, y_pred


def Precision(y_true, y_pred):
    """Precision of the positive class (label 1) for one batch.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Either a single positive-class score per sample or one
            probability column per class.

    Returns:
        true positives / predicted positives, with ``K.epsilon()`` added to
        the denominator to avoid division by zero.
    """
    y_true, y_pred = _align_positive_class(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision


def Recall(y_true, y_pred):
    """Recall of the positive class (label 1) for one batch.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Either a single positive-class score per sample or one
            probability column per class.

    Returns:
        true positives / actual positives, with ``K.epsilon()`` added to the
        denominator to avoid division by zero.
    """
    y_true, y_pred = _align_positive_class(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall


def F1_score(y_true, y_pred):
    """F1 score (harmonic mean of precision and recall) for one batch.

    Args:
        y_true: Ground-truth labels (0/1).
        y_pred: Either a single positive-class score per sample or one
            probability column per class.

    Returns:
        ``2 * precision * recall / (precision + recall + K.epsilon())``.
    """
    y_true, y_pred = _align_positive_class(y_true, y_pred)
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [None]:
#keras.metrics.AUC(name='prc', curve='PR'), keras.metrics.AUC(curve='PR')

#ref:https://stackoverflow.com/questions/41032551/how-to-compute-receiving-operating-characteristic-roc-and-auc-in-keras
from sklearn.metrics import roc_auc_score

def auroc(y_true, y_pred):
    """ROC AUC metric that delegates to scikit-learn's ``roc_auc_score``.

    Column 1 of ``y_pred`` is used as the score, and the call is wrapped in
    ``tf.py_function`` so it can be used as a Keras metric.

    NOTE(review): ``roc_auc_score`` is expected to raise when ``y_true``
    contains a single class -- confirm that no batch can be single-class.

    Returns:
        A ``tf.double`` tensor holding the AUC.
    """
    positive_scores = y_pred[:, 1]
    return tf.py_function(
        func=roc_auc_score,
        inp=(y_true, positive_scores),
        Tout=tf.double,
    )

#below function doesn't work with current code.
#def auc(y_true, y_pred):
#  auc = tf.keras.metrics.AUC(y_true, y_pred[:,1])[1]
#  K.get_session().run(tf.local_variables_initializer())
#  return auc

# Best Model (in terms of loss)

In [None]:
from functools import partial

tf.keras.backend.clear_session()

# Conv2D factory carrying the defaults shared by the convolutions below.
DefaultConv2D = partial(
    keras.layers.Conv2D,
    kernel_size=4,
    activation='relu',
    padding="SAME",
)

# The network is assembled layer by layer. Commented-out lines are the
# alternative layers that were tried during experimentation.
model = keras.models.Sequential()
model.add(keras.Input(shape=[64, 64, 1]))
model.add(keras.layers.BatchNormalization())
#model.add(DefaultConv2D(filters=32, kernel_size=8))
#model.add(DefaultConv2D(filters=32, kernel_size=8))
#model.add(keras.layers.BatchNormalization())
#model.add(keras.layers.MaxPooling2D(pool_size=2))
#model.add(DefaultConv2D(filters=32))
#model.add(DefaultConv2D(filters=32))
#model.add(keras.layers.BatchNormalization())
#model.add(keras.layers.MaxPooling2D(pool_size=2))
model.add(DefaultConv2D(filters=16))
#model.add(DefaultConv2D(filters=16))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.MaxPooling2D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(0.5))
#model.add(keras.layers.Dense(units=64, activation='relu'))
#model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(units=2, activation='softmax'))

In [None]:
# Train for 10 epochs with RMSprop, validating each epoch on validation_ds,
# then score the trained model on the held-out test batches.
optimizer = keras.optimizers.RMSprop(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy", F1_score, Precision, Recall, auroc],
)
history = model.fit(x=train_ds, validation_data=validation_ds, epochs=10)
score = model.evaluate(x=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# Test Model

Note: I ran many iterations based on the commented-out code — essentially every combination of the commented-out layers, including increasing the number of filters or dense units by a factor of 2.

In [None]:
from functools import partial

tf.keras.backend.clear_session()

# Conv2D factory carrying the defaults shared by the convolutions below.
DefaultConv2D = partial(
    keras.layers.Conv2D,
    kernel_size=4,
    activation='relu',
    padding="SAME",
)

# Experimental variant: compared with the previous model, one 32-filter
# convolution with kernel size 8 is enabled. Commented-out lines are the
# other alternatives that were tried.
model = keras.models.Sequential()
model.add(keras.Input(shape=[64, 64, 1]))
model.add(keras.layers.BatchNormalization())
model.add(DefaultConv2D(filters=32, kernel_size=8))
#model.add(DefaultConv2D(filters=32, kernel_size=8))
#model.add(keras.layers.BatchNormalization())
#model.add(keras.layers.MaxPooling2D(pool_size=2))
#model.add(DefaultConv2D(filters=32))
#model.add(DefaultConv2D(filters=32))
#model.add(keras.layers.BatchNormalization())
#model.add(keras.layers.MaxPooling2D(pool_size=2))
model.add(DefaultConv2D(filters=16))
#model.add(DefaultConv2D(filters=16))
model.add(keras.layers.BatchNormalization())
model.add(keras.layers.MaxPooling2D(pool_size=2))
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(units=128, activation='relu'))
model.add(keras.layers.Dropout(0.5))
#model.add(keras.layers.Dense(units=64, activation='relu'))
#model.add(keras.layers.Dropout(0.5))
model.add(keras.layers.Dense(units=2, activation='softmax'))

In [None]:
# Same training recipe as the previous model: RMSprop, 10 epochs, validation
# on validation_ds, final score on the held-out test batches.
optimizer = keras.optimizers.RMSprop(learning_rate=0.0001)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy", F1_score, Precision, Recall, auroc],
)
history = model.fit(x=train_ds, validation_data=validation_ds, epochs=10)
score = model.evaluate(x=test_ds)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Xception Model

In [None]:
data_dir = root + "data_subset_balanced/"
batch_size = 32
# IMPORTANT: Depending on which pre-trained model you choose, you will need to
# change these dimensions accordingly.
img_height = 299
img_width = 299

# Options shared by both splits. The same seed and validation_split are passed
# to both calls, and only `subset` differs between them. Images are loaded as
# RGB here.
split_options = dict(
    color_mode='rgb',
    validation_split=0.2,
    seed=42,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Training Dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Validation Dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)


Found 2701 files belonging to 2 classes.
Using 2161 files for training.
Found 2701 files belonging to 2 classes.
Using 540 files for validation.


In [None]:
tf.random.set_seed(42)
# Carve a held-out test set out of the training batches: 87.5% of the batches
# stay in train (70% of all data), the remainder becomes test (10% of all data).
#ref: https://stackoverflow.com/questions/50737192/tf-data-dataset-how-to-get-the-dataset-size-number-of-elements-in-an-epoch
dataset_size = train_ds.cardinality().numpy()
train_size = int(0.875 * dataset_size)
test_size = dataset_size - train_size  # batches actually left for skip()

# reshuffle_each_iteration=False pins the batch order. With the default (True)
# the order changes on every pass, so take()/skip() would hand out different
# batches each epoch and the test batches would leak into training.
# NOTE(review): the loader that produced train_ds may itself reshuffle samples
# on every iteration (its `shuffle` option) -- confirm that train and test stay
# disjoint, or split at file level instead.
temp_ds = train_ds.shuffle(1000, seed=42, reshuffle_each_iteration=False)
train_ds = temp_ds.take(train_size)
test_ds = temp_ds.skip(train_size)

In [None]:
n_classes = 2


def xception_preprocess_x(x, y):
    """Apply Xception's ``preprocess_input`` to the images; labels pass through."""
    return tf.keras.applications.xception.preprocess_input(x), y


# Preprocessed views of the three splits for the Xception model.
xception_ds_train = train_ds.map(xception_preprocess_x)
xception_ds_val = validation_ds.map(xception_preprocess_x)
xception_ds_test = test_ds.map(xception_preprocess_x)

In [None]:
#ref: https://stackoverflow.com/questions/50737192/tf-data-dataset-how-to-get-the-dataset-size-number-of-elements-in-an-epoch
# Cardinality of the mapped training dataset. The dataset was created with a
# batch_size, so this counts batches rather than individual images.
dataset_size = xception_ds_train.cardinality().numpy()
dataset_size

59

In [None]:
tf.keras.backend.clear_session()

# ImageNet-pretrained Xception without its classification head.
base_model = keras.applications.xception.Xception(
    include_top=False,
    weights="imagenet",
)

# New head: global average pooling followed by an n_classes-way softmax.
pooling = keras.layers.GlobalAveragePooling2D()
avg = pooling(base_model.output)
classifier = keras.layers.Dense(n_classes, activation="softmax")
output = classifier(avg)

model = keras.models.Model(inputs=base_model.input, outputs=output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/xception/xception_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Stage 1: freeze every layer of the pretrained base so only the new head
# is trained.
for layer in base_model.layers:
    layer.trainable = False

optimizer = keras.optimizers.SGD(learning_rate=0.2, momentum=0.9, decay=0.01)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=xception_ds_train,
    epochs=20,
    validation_data=xception_ds_val,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Stage 2: unfreeze the whole base and fine-tune with a smaller learning rate.
# The model is recompiled after the `trainable` flags are changed.
for layer in base_model.layers:
    layer.trainable = True

optimizer = keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
    nesterov=True,
    decay=0.001,
)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=xception_ds_train,
    epochs=40,
    validation_data=xception_ds_val,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Final score of the fine-tuned Xception model on the held-out test batches.
score = model.evaluate(x=xception_ds_test)



## InceptionResNetV2 Model

In [None]:
def InceptionResNetV2_preprocess_x(x, y):
    """Apply InceptionResNetV2's ``preprocess_input`` to the images; labels pass through."""
    return tf.keras.applications.inception_resnet_v2.preprocess_input(x), y


# Preprocessed views of the three splits for the InceptionResNetV2 model.
InceptionResNetV2_ds_train = train_ds.map(InceptionResNetV2_preprocess_x)
InceptionResNetV2_ds_val = validation_ds.map(InceptionResNetV2_preprocess_x)
InceptionResNetV2_ds_test = test_ds.map(InceptionResNetV2_preprocess_x)

In [None]:
tf.keras.backend.clear_session()

# ImageNet-pretrained InceptionResNetV2 without its classification head.
base_model = keras.applications.InceptionResNetV2(
    include_top=False,
    weights="imagenet",
)

# New head: global average pooling followed by an n_classes-way softmax.
pooling = keras.layers.GlobalAveragePooling2D()
avg = pooling(base_model.output)
classifier = keras.layers.Dense(n_classes, activation="softmax")
output = classifier(avg)

model = keras.models.Model(inputs=base_model.input, outputs=output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/inception_resnet_v2/inception_resnet_v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Stage 1: freeze every layer of the pretrained base so only the new head
# is trained.
for layer in base_model.layers:
    layer.trainable = False

optimizer = keras.optimizers.SGD(learning_rate=0.2, momentum=0.9, decay=0.01)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=InceptionResNetV2_ds_train,
    epochs=20,
    validation_data=InceptionResNetV2_ds_val,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Stage 2: unfreeze the whole base and fine-tune with a smaller learning rate.
# The model is recompiled after the `trainable` flags are changed.
for layer in base_model.layers:
    layer.trainable = True

optimizer = keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
    nesterov=True,
    decay=0.001,
)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=InceptionResNetV2_ds_train,
    epochs=40,
    validation_data=InceptionResNetV2_ds_val,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Final score of the fine-tuned InceptionResNetV2 model on the held-out test batches.
score = model.evaluate(x=InceptionResNetV2_ds_test)



## ResNet152V2 Model



In [None]:
data_dir = root + "data_subset_balanced/"
batch_size = 32
# IMPORTANT: Depending on which pre-trained model you choose, you will need to
# change these dimensions accordingly.
img_height = 224
img_width = 224

# Options shared by both splits. The same seed and validation_split are passed
# to both calls, and only `subset` differs between them. Images are loaded as
# RGB here.
split_options = dict(
    color_mode='rgb',
    validation_split=0.2,
    seed=42,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# Training Dataset
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Validation Dataset
validation_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)

Found 2701 files belonging to 2 classes.
Using 2161 files for training.
Found 2701 files belonging to 2 classes.
Using 540 files for validation.


In [None]:
tf.random.set_seed(42)
# Carve a held-out test set out of the training batches: 87.5% of the batches
# stay in train (70% of all data), the remainder becomes test (10% of all data).
#ref: https://stackoverflow.com/questions/50737192/tf-data-dataset-how-to-get-the-dataset-size-number-of-elements-in-an-epoch
dataset_size = train_ds.cardinality().numpy()
train_size = int(0.875 * dataset_size)
test_size = dataset_size - train_size  # batches actually left for skip()

# reshuffle_each_iteration=False pins the batch order. With the default (True)
# the order changes on every pass, so take()/skip() would hand out different
# batches each epoch and the test batches would leak into training.
# NOTE(review): the loader that produced train_ds may itself reshuffle samples
# on every iteration (its `shuffle` option) -- confirm that train and test stay
# disjoint, or split at file level instead.
temp_ds = train_ds.shuffle(1000, seed=42, reshuffle_each_iteration=False)
train_ds = temp_ds.take(train_size)
test_ds = temp_ds.skip(train_size)

In [None]:
def resnet_v2_preprocess_x(x, y):
    """Apply ResNetV2's ``preprocess_input`` to the images; labels pass through."""
    return tf.keras.applications.resnet_v2.preprocess_input(x), y


# Preprocessed views of the three splits for the ResNet152V2 model.
resnet_v2_ds_train = train_ds.map(resnet_v2_preprocess_x)
resnet_v2_ds_val = validation_ds.map(resnet_v2_preprocess_x)
resnet_v2_ds_test = test_ds.map(resnet_v2_preprocess_x)

In [None]:
# Cardinality of the mapped training dataset. The dataset was created with a
# batch_size, so this counts batches rather than individual images.
dataset_size = resnet_v2_ds_train.cardinality().numpy()
dataset_size

59

In [None]:
tf.keras.backend.clear_session()

# ImageNet-pretrained ResNet152V2 without its classification head.
base_model = keras.applications.ResNet152V2(
    include_top=False,
    weights="imagenet",
)

# New head: global average pooling followed by an n_classes-way softmax.
pooling = keras.layers.GlobalAveragePooling2D()
avg = pooling(base_model.output)
classifier = keras.layers.Dense(n_classes, activation="softmax")
output = classifier(avg)

model = keras.models.Model(inputs=base_model.input, outputs=output)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet152v2_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Stage 1: freeze every layer of the pretrained base so only the new head
# is trained.
for layer in base_model.layers:
    layer.trainable = False

optimizer = keras.optimizers.SGD(learning_rate=0.2, momentum=0.9, decay=0.01)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=resnet_v2_ds_train,
    epochs=20,
    validation_data=resnet_v2_ds_val,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Stage 2: unfreeze the whole base and fine-tune with a smaller learning rate.
# The model is recompiled after the `trainable` flags are changed.
for layer in base_model.layers:
    layer.trainable = True

optimizer = keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
    decay=0.001,
)
model.compile(
    optimizer=optimizer,
    loss="sparse_categorical_crossentropy",
    metrics=['accuracy', F1_score, Precision, Recall, auroc],
)
history = model.fit(
    x=resnet_v2_ds_train,
    epochs=40,
    validation_data=resnet_v2_ds_val,
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [None]:
# Final score of the fine-tuned ResNet152V2 model on the held-out test batches.
score = model.evaluate(x=resnet_v2_ds_test)

