<img src="../images/topcover.jpg" width="1000" height="50">

## Colorectal cancer dataset

In [2]:
# Import libraries and modules
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from tensorflow.keras.preprocessing.image import ImageDataGenerator
# Seed NumPy's global RNG so NumPy-driven randomness is repeatable.
# NOTE(review): no TensorFlow seed is set in the visible cells, so weight
# initialisation may still differ between runs — confirm.
np.random.seed(42)

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist

In [3]:
# Read the CSV once; previously the same file was parsed twice just to
# separate the features from the label column.
colorectal_raw = pd.read_csv('../data/hmnist_64_64_L.csv')

# Feature frame: every column except 'label'. A non-inplace drop returns a new
# frame, so re-running this cell never fails on an already-dropped column.
colorectalx = colorectal_raw.drop(columns='label')

# Kept as the full frame (label column included) because the next cell
# indexes it with colorectaly['label'].
colorectaly = colorectal_raw

In [4]:

# Feature matrix and label vector used by the rest of the notebook.
X, y = colorectalx, colorectaly['label']

In [5]:
# Class-balance check: relative frequency of each label value.
y.value_counts(normalize = True)

7    0.125
3    0.125
6    0.125
2    0.125
5    0.125
1    0.125
8    0.125
4    0.125
Name: label, dtype: float64

In [6]:
# Scale features by 1/255 (assumes the columns hold 8-bit pixel intensities in
# 0..255 — TODO confirm). Computed from the untouched feature frame rather than
# from X itself, so re-running this cell does not divide by 255 a second time.
X = colorectalx / 255.0

In [7]:
# Turn each flat row into a (height, width, channels) array for Keras and
# convert the labels to a plain NumPy array.
image_shape = (64, 64, 1)
X = X.values.reshape((-1, *image_shape))
y = y.values

In [8]:
# One-hot encode the integer labels.
# NOTE(review): the value_counts output earlier in this notebook lists labels
# 1..8, yet the encoded targets have 9 columns (see the printed y_train shape),
# so column 0 appears never to be set. Shifting the labels to 0..7 would require
# changing the width of the softmax output layer at the same time.
y = utils.to_categorical(y)

In [9]:
# Hold out 20% of the samples for validation; the fixed random_state keeps the
# split identical across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [10]:
# Report the shape of each array produced by the train/validation split.
for caption, split in (
    ("x_train.shape: ", X_train),
    ("x_val.shape: ", X_val),
    ("y_train.shape: ", y_train),
    ("y_val.shape: ", y_val),
):
    print(caption, split.shape)

x_train.shape:  (4000, 64, 64, 1)
x_val.shape:  (1000, 64, 64, 1)
y_train.shape:  (4000, 9)
y_val.shape:  (1000, 9)


## Convolutional Neural Network Model

In [11]:
# Check the shape of the whole training set: (samples, height, width, channels).
X_train.shape

(4000, 64, 64, 1)

In [12]:
# Check the shape of a single training image: (height, width, channels).
X_train[0].shape

(64, 64, 1)

In [13]:
# Instantiate an empty Sequential model; the following cells append its layers
# one at a time, so each of those cells must run exactly once after this one.
cnn_model_2 = Sequential()

In [14]:
# First convolutional layer: 128 filters of size 5x5 with ReLU activation.
# 'same' padding keeps the 64x64 spatial size of the input image.
first_conv = Conv2D(
    filters=128,
    kernel_size=(5, 5),
    padding='same',
    activation='relu',
    input_shape=(64, 64, 1),  # (height, width, channels) of one image
)
cnn_model_2.add(first_conv)

In [15]:
# Add a pooling layer: keep the maximum of every 2x2 window, which halves the
# height and width of each feature map.
cnn_model_2.add(MaxPooling2D(pool_size=(2,2))) # dimensions of region of pooling

In [16]:
# Dropout with rate 0.25 after the first pooling layer, as regularisation.
cnn_model_2.add(Dropout(0.25))

In [17]:
# Second convolutional layer: 64 filters of size 3x3 with ReLU activation.
# Padding is left at the Keras default here, unlike the first layer.
second_conv = Conv2D(filters=64, kernel_size=(3, 3), activation='relu')
cnn_model_2.add(second_conv)

In [18]:
# Add another 2x2 max-pooling layer after the second convolution.
cnn_model_2.add(MaxPooling2D(pool_size=(2,2)))

In [19]:
# Dropout with rate 0.25 after the second pooling layer.
cnn_model_2.add(Dropout(0.25))

In [20]:
# Third stage: convolution -> max pooling -> dropout, with the same settings as
# the second stage. Layers are created and appended in that order.
for stage_layer in (
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.25),
):
    cnn_model_2.add(stage_layer)

In [21]:
# Flatten the feature maps into one vector per sample for the dense layers.
cnn_model_2.add(Flatten())

In [22]:
# Fully connected head: three ReLU layers of decreasing width.
for units in (256, 64, 32):
    cnn_model_2.add(Dense(units, activation='relu'))

In [23]:
# Output layer: one softmax unit per one-hot target column. The width is read
# from y_train instead of being hard-coded, so it always matches the label
# encoding (9 columns with the current targets).
cnn_model_2.add(Dense(y_train.shape[1], activation='softmax'))

In [24]:
# Print the layer-by-layer output shapes and parameter counts.
cnn_model_2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 64, 64, 128)       3328      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 32, 128)       0         
_________________________________________________________________
dropout (Dropout)            (None, 32, 32, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 30, 30, 64)        73792     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 15, 15, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 15, 15, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 13, 64)        3

In [25]:
# Configure training: Adam optimiser, categorical cross-entropy loss (targets
# are one-hot encoded) and accuracy as the reported metric.
cnn_model_2.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [27]:
# Augmentation pipeline applied on the fly to the training batches.
# NOTE(review): rotation_range is expressed in degrees, so 0.5 is close to no
# rotation at all, whereas shift ranges of 0.5 (half of the image size) and a
# zoom range of 0.5 are very aggressive — confirm these magnitudes are intended.
datagen = ImageDataGenerator(
        rotation_range=0.5,
        zoom_range=0.5,
        width_shift_range=0.5,
        height_shift_range=0.5,
        horizontal_flip=True,
        vertical_flip=True)

# The former datagen.fit(X_train) call was removed: fit() only computes the
# dataset statistics needed by featurewise_center,
# featurewise_std_normalization or zca_whitening, none of which is enabled.

In [30]:
# Train for 20 epochs on augmented batches of 200 images, validating on the
# un-augmented hold-out set after each epoch. Model.fit accepts generators
# directly; fit_generator is deprecated. The History object is kept so the
# per-epoch metrics can be inspected or plotted afterwards.
history = cnn_model_2.fit(
    datagen.flow(X_train, y_train, batch_size=200),
    epochs=20,
    validation_data=(X_val, y_val),
)
history



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0xa1284ae760>

##### Based on the accuracy scores in the table below, Random Forests is still the best-performing model for classifying the pixel image data.

**Summary table of the colorectal cancer classification models for the 8 tissue-type classes:**

| Model| Test Accuracy|Baseline score|
|:---------:|:---:|:--------:|
|  Random Forests |    0.684 |  0.125  |
|KNN| 0.432| 0.125|
|CNN|  0.603 |0.125|