In [2]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt


from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from tensorflow.keras import utils
from tensorflow.keras.datasets import mnist


In [3]:
# Read the CSV a single time and derive both views from the same frame,
# instead of parsing the file twice and dropping the column in place.
colorectaly = pd.read_csv('../data/colorectal12.csv')   # full table, including 'label'
colorectalx = colorectaly.drop(columns='label')         # feature columns only (new frame)

In [4]:
# Feature matrix and target vector for the random forest model.

X, y = colorectalx, colorectaly['label']

In [5]:
# Class balance check: relative frequency of each label value.

y.value_counts(normalize=True)

2    0.5
1    0.5
Name: label, dtype: float64

In [6]:
# Hold out 33% of the rows for validation. Stratifying on y keeps the label
# proportions equal in both parts, and the fixed seed makes the split repeatable.

split_options = dict(test_size=0.33, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [7]:
# Standardise the features using statistics learned from the training split only,
# then apply that same fitted transformation to the validation split.

ss = StandardScaler().fit(X_train)
X_train, X_val = ss.transform(X_train), ss.transform(X_val)

In [8]:
# random forests

In [9]:
# Instantiate the random forest classifier.
# A fixed random_state makes the fitted forest repeatable between runs,
# consistent with the seeded train/validation split (random_state=42).

rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [9]:
# Preliminary estimate: 10-fold cross-validated accuracy on the training split.

pre_score = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=10, verbose=0)

print('Random Forest mean score: %5.4f' %pre_score.mean())

Random Forest mean score: 0.9366


In [10]:
# Grid search over forest size and tree depth using 5-fold cross-validation.
# The search is fitted on the training split; the best mean CV score is printed
# and the winning parameter combination is shown as the cell output.

rf_params = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 1, 2, 3, 4, 5],
)
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5).fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.9390504704875962


{'max_depth': 5, 'n_estimators': 100}

In [11]:
# Predicted labels for the validation split from the fitted grid search.

predictions = gs.predict(X=X_val)

In [12]:
# Score of the tuned random forest on the training split
# (no `scoring` was passed to the search, so the estimator's own score is used).

gs.score(X=X_train, y=y_train)

0.9904420549581839

In [13]:
# Score of the tuned random forest on the held-out validation split
# (no `scoring` was passed to the search, so the estimator's own score is used).

gs.score(X=X_val, y=y_val)

0.9467312348668281

In [21]:
# cnn

In [10]:
# Start the CNN inputs from the same feature frame and label column
# that were used for the random forest.
X1, y1 = colorectalx, colorectaly['label']

In [11]:
# Class balance check for the CNN labels: relative frequency of each value.
y1.value_counts(normalize=True)

2    0.5
1    0.5
Name: label, dtype: float64

In [12]:
# Scale the feature values by 1/255 (assumes 8-bit pixel intensities in 0-255 — TODO confirm).
# Derived from the source frame rather than from X1 itself, so re-running this
# cell does not divide the data a second time.
X1 = colorectalx / 255

In [13]:
# Reshape each flat row into a 64x64 single-channel image, giving an array of
# shape (samples, 64, 64, 1) for Keras, and pull the labels out as a plain array.
X1 = np.reshape(X1.values, (-1, 64, 64, 1))
y1 = y1.values

In [14]:
# One-hot encode the integer labels for the network.
# NOTE(review): the label values are 1 and 2 (see the value counts printed earlier),
# and the y shapes printed after the split have 3 columns, so column 0 appears to be
# an always-zero placeholder class — confirm this is intended.
y1 = utils.to_categorical(y1)

In [15]:
# Split the image data into training and validation sets (80% / 20%).
# Stratify on the class index (argmax of each one-hot row) so both parts keep the
# same label balance, matching the stratified split used for the random forest.

X1_train, X1_val, y1_train, y1_val = train_test_split(X1,
                                                      y1,
                                                      test_size=0.2,
                                                      stratify=y1.argmax(axis=1),
                                                      random_state=42)

In [16]:
# Report the shape of each of the four split arrays.
for caption, array in (("x_train.shape: ", X1_train),
                       ("x_val.shape: ", X1_val),
                       ("y_train.shape: ", y1_train),
                       ("y_val.shape: ", y1_val)):
    print(caption, array.shape)

x_train.shape:  (1000, 64, 64, 1)
x_val.shape:  (250, 64, 64, 1)
y_train.shape:  (1000, 3)
y_val.shape:  (250, 3)


In [17]:
# Shape of the whole training array: (samples, height, width, channels).
np.shape(X1_train)

(1000, 64, 64, 1)

In [18]:
# Shape of a single training image: (height, width, channels).
np.shape(X1_train[0])

(64, 64, 1)

In [19]:
# Instantiate an empty Sequential model; the layers are appended to it
# one at a time in the cells that follow.
cnn_model_2 = Sequential()

In [20]:
# First convolutional layer: 128 filters of size 5x5 with ReLU activation.
# 'same' padding keeps the 64x64 spatial size; input_shape describes one image.
cnn_model_2.add(
    Conv2D(
        filters=128,
        kernel_size=(5, 5),
        padding='same',
        activation='relu',
        input_shape=(64, 64, 1),
    )
)

In [21]:
# Downsample with 2x2 max pooling.
cnn_model_2.add(
    MaxPooling2D(pool_size=(2, 2))
)

In [22]:
# Dropout with rate 0.25 after the first pooling layer.
cnn_model_2.add(Dropout(rate=0.25))

In [23]:
# Second convolutional layer: 64 filters of size 3x3 with ReLU activation.
cnn_model_2.add(Conv2D(filters=64, kernel_size=(3, 3), activation='relu'))

In [24]:
# Second 2x2 max-pooling layer.
cnn_model_2.add(
    MaxPooling2D(pool_size=(2, 2))
)

In [25]:
# Dropout with rate 0.25 after the second pooling layer.
cnn_model_2.add(Dropout(rate=0.25))

In [26]:
# Third stage: 3x3 convolution (64 filters, ReLU), 2x2 max pooling, then
# dropout with rate 0.25, appended to the model in that order.
for stage_layer in (
    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(rate=0.25),
):
    cnn_model_2.add(stage_layer)

In [27]:
# Flatten the feature maps into a vector so they can feed the Dense layers.
cnn_model_2.add(Flatten())

In [28]:
# Fully connected head: three ReLU layers of decreasing width (256, 64, 32).
for width in (256, 64, 32):
    cnn_model_2.add(Dense(width, activation='relu'))

In [29]:
# Output layer: one unit per column of the one-hot targets.
# Softmax is used because each image belongs to exactly one class, so the
# outputs should form a single probability distribution rather than
# independent per-unit sigmoid scores.
cnn_model_2.add(Dense(3, activation='softmax'))

In [30]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
cnn_model_2.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 64, 64, 128)       3328      
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 32, 32, 128)       0         
_________________________________________________________________
dropout (Dropout)            (None, 32, 32, 128)       0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 30, 30, 64)        73792     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 15, 15, 64)        0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 15, 15, 64)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 13, 13, 64)        3

In [31]:
# Compile model.
# The targets are one-hot rows with a single true class per image, so use
# categorical cross-entropy; binary cross-entropy would treat every output
# column as its own independent yes/no problem.
cnn_model_2.compile(loss='categorical_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

In [32]:
# Random augmentation applied on the fly to the training images.
# NOTE(review): per the Keras docs rotation_range is given in degrees, so 0.5 is
# almost no rotation, whereas 0.5 for the zoom and shift ranges is a large
# fraction of the image — confirm these magnitudes are intended.
datagen = ImageDataGenerator(
        rotation_range=0.5,
        zoom_range=0.5,
        width_shift_range=0.5,
        height_shift_range=0.5,
        horizontal_flip=True,
        vertical_flip=True)

# No datagen.fit(...) call is needed: that step only computes the statistics used by
# featurewise_center / featurewise_std_normalization / zca_whitening,
# and none of those options are enabled above.

In [33]:
# Train for 20 epochs on augmented batches of 200 images, evaluating on the
# un-augmented validation split after every epoch.
# Model.fit accepts the generator directly; fit_generator is deprecated in
# TensorFlow 2.x.
cnn_model_2.fit(datagen.flow(X1_train, y1_train, batch_size=200),
                epochs=20,
                validation_data=(X1_val, y1_val))



Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x992e8c6e80>

In [None]:
# Persist the trained CNN. The network built in this notebook is bound to
# `cnn_model_2`; no variable named `model` is defined in the cells shown, so
# calling `model.save` would raise a NameError.
# NOTE(review): the file name mentions cifar10 although the data is read from
# colorectal12.csv — consider renaming the output file.
cnn_model_2.save('my_cifar10_model.h5')