In [1]:
import pandas as pd

train_labels = pd.read_csv("train_labels.csv")

train_labels.sort_values(by='id', axis=0, inplace=True)

train_labels = train_labels[0:10000]

train_labels.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 151577 to 121931
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      10000 non-null  object
 1   label   10000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 234.4+ KB


The training dataset consists of $220,025$ pathology images, each of which is denoted by an `id` and has been assigned a `label`, either $0$ or $1$, indicating the presence of cancer. Due to memory limitations, I will only be considering the first $10,000$ samples. The goal here will be to train a convolutional neural network to successfully detect cancer given a pathology image by classifying it into one of these two categories.

In [2]:
import numpy as np

print(np.sum(train_labels['label'] == 0))
print(np.sum(train_labels['label'] == 1))
print(5936+4064)

5936
4064
10000


Of the $10,000$ training images, $5,936$ have a `label` of $0$, indicating the absence of cancer, and the remaining $4,064$ have a `label` of $1$, indicating the presence of cancer. This distribution is well-balanced, meaning we should not have any problems using accuracy as an evaluation metric.

As noted on Kaggle, the dataset contains no duplicate images, and every image has a valid `label`. Therefore the data is clean and ready for analysis.

For this project, I am going to make use of the following architecture:

Since the image size is $96 \times 96$ pixels, we will start with a $96 \times 96 \times 3$ matrix. There will be three convolutional layers: the first layer will have $32$ $3 \times 3$ filters, the second layer will have $64$ $3 \times 3$ filters and the third layer will have $128$ $3 \times 3$ filters.

In [3]:
import cv2
import numpy as np
import os

directory = 'train'

n=0

X_train_whole = []

for i, filename in enumerate(os.listdir(directory)):
    if n == 10000:
        break
        print("hit 10000")
    f = os.path.join(directory, filename)

    if os.path.isfile(f):
        img = cv2.imread(f)
        pix = np.array(img)
        pix = pix.reshape(96,96,3)
        #print(pix_arr)
        #print(pix_arr.shape)
        X_train_whole.append(pix)
        n+=1
    else:
        print('no file')

The above will load the images and turn them into numpy arrays.

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
import tensorflow as tf

In [6]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 8505248476124214785
xla_global_id: -1
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 5912018944
locality {
  bus_id: 1
  links {
  }
}
incarnation: 5942763675066565448
physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 2070, pci bus id: 0000:07:00.0, compute capability: 7.5"
xla_global_id: 416903419
]


In [7]:
X_train,valid_X,train_label,valid_label = train_test_split(X_train_whole, train_labels['label'], test_size=0.2, random_state=4)

In [8]:
X_train_t = tf.constant(X_train)
valid_X_t = tf.constant(valid_X)

In the cell above, I split the data into training and validation sets.

In [9]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input,Dense, Dropout, Flatten, Conv2D, MaxPooling2D, BatchNormalization, LeakyReLU

batch_size = 64
epochs = 20
num_classes = 1

I am using Keras to implement this architecture. I am making use of the leaky ReLU activation function, and the final output layer will use a sigmoid function, as is appropriate for a binary classification problem. The batch size is $64$, and I will train the network for $20$ epochs.

In [10]:
model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),activation='linear',input_shape=(96,96,3),padding='same'))
model.add(LeakyReLU(alpha=0.1))
model.add(MaxPooling2D((2, 2),padding='same'))
model.add(Conv2D(64, (3, 3), activation='linear',padding='same'))
model.add(LeakyReLU(alpha=0.1))
model.add(MaxPooling2D(pool_size=(2, 2),padding='same'))
model.add(Conv2D(128, (3, 3), activation='linear',padding='same'))
model.add(LeakyReLU(alpha=0.1))                  
model.add(MaxPooling2D(pool_size=(2, 2),padding='same'))
model.add(Flatten())
model.add(Dense(128, activation='linear'))
model.add(LeakyReLU(alpha=0.1))                  
model.add(Dense(num_classes, activation='sigmoid'))

In [11]:
model.compile(loss=keras.losses.binary_crossentropy, optimizer=keras.optimizers.Adam(),metrics=['accuracy'])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d (Conv2D)             (None, 96, 96, 32)        896       
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 96, 96, 32)        0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 48, 48, 32)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 48, 48, 64)        18496     
                                                                 
 leaky_re_lu_1 (LeakyReLU)   (None, 48, 48, 64)        0         
                                                                 
 max_pooling2d_1 (MaxPooling  (None, 24, 24, 64)       0         
 2D)                                                    

In [13]:
train_label_t = train_label
print(train_label_t)

136447    1
161920    0
126742    1
20905     0
161193    0
         ..
216756    0
202679    0
184402    1
152958    1
56545     0
Name: label, Length: 8000, dtype: int64


In [14]:
print(X_train_t)

tf.Tensor(
[[[[157  77 124]
   [161  79 128]
   [162  77 129]
   ...
   [106  30  88]
   [134  48 110]
   [134  37 103]]

  [[158  73 123]
   [146  61 113]
   [147  59 113]
   ...
   [170  90 157]
   [176  82 153]
   [191  91 163]]

  [[188  97 152]
   [171  80 135]
   [163  74 130]
   ...
   [189 103 181]
   [226 131 212]
   [230 126 209]]

  ...

  [[223 213 219]
   [246 229 242]
   [206 181 201]
   ...
   [231 215 232]
   [215 199 216]
   [221 205 223]]

  [[212 201 209]
   [213 196 209]
   [255 246 255]
   ...
   [238 219 246]
   [227 207 236]
   [244 224 253]]

  [[222 211 219]
   [230 213 226]
   [252 225 245]
   ...
   [219 198 231]
   [255 242 255]
   [150 128 163]]]


 [[[235 231 243]
   [228 223 238]
   [220 212 229]
   ...
   [229 230 228]
   [229 230 228]
   [229 230 228]]

  [[213 210 219]
   [219 215 226]
   [231 224 237]
   ...
   [229 230 228]
   [229 230 228]
   [229 230 228]]

  [[234 231 233]
   [230 226 231]
   [225 221 227]
   ...
   [229 230 228]
   [229 230 228]


In [15]:
train = model.fit(X_train_t, train_label, batch_size=batch_size,epochs=epochs,verbose=1,validation_data=(valid_X_t, valid_label))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [16]:
print(len(X_train))
print(len(train_label))
print(len(valid_X))
print(len(valid_label))

8000
8000
2000
2000


In [21]:
directory = 'test'

#n=0

X_test_whole = []

for i, filename in enumerate(os.listdir(directory)):
    #if n == 10000:
    #    break
    #    print("hit 10000")
    f = os.path.join(directory, filename)

    if os.path.isfile(f):
        img = cv2.imread(f)
        pix = np.array(img)
        pix = pix.reshape(96,96,3)
        #print(pix_arr)
        #print(pix_arr.shape)
        X_test_whole.append(pix)
        #n+=1
    else:
        print('no file')

In [None]:
X_test_t = tf.constant(X_test_whole)

pred = model.predict(X_test_t)

In [None]:
id_list = []

for filename in os.listdir(directory):

    img_id = filename[0:-4]
    id_list.append(img_id)
    print(img_id)
    break
    

In [None]:
out = pd.DataFrame({'id' : id_list, 'label' : pred})

output = out.to_csv('output.csv')