<img src="../images/topcover.jpg" width="1000" height="50">

## Colorectal cancer dataset

In [1]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler



##### The images will be classified into 8 classes according to tissue type: Tumour, Stroma, Complex, Lympho, Debris, Mucosa, Adipose and Empty. A pathologist may use texture to differentiate between the tissue images. Several models will therefore be compared to find the one that assigns the images to the correct tissue class with the highest accuracy score.

In [2]:
# importing the dataset
# The CSV is parsed once: `colorectaly` keeps the full frame (pixel columns
# plus 'label'), while `colorectalx` is a new frame without the 'label' column.
# `drop` without inplace leaves `colorectaly` untouched, so re-running this
# cell always yields the same two frames.

colorectaly = pd.read_csv('../data/hmnist_64_64_L.csv')
colorectalx = colorectaly.drop(columns='label')

In [3]:
# modelling inputs: features come from the label-free frame,
# the target is the 'label' column of the full frame

X, y = colorectalx, colorectaly['label']

In [4]:
# preview the feature frame (pixel columns only; 'label' was dropped when loading)
colorectalx

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel4086,pixel4087,pixel4088,pixel4089,pixel4090,pixel4091,pixel4092,pixel4093,pixel4094,pixel4095
0,134,99,119,130,142,169,152,139,117,87,...,146,112,89,73,100,120,120,126,140,195
1,55,64,74,63,74,75,71,73,70,77,...,76,79,85,86,77,68,66,65,68,69
2,114,116,136,152,132,100,151,150,127,205,...,144,128,157,159,205,182,143,129,89,122
3,86,82,88,85,103,93,98,109,104,115,...,75,79,80,109,128,89,85,80,63,48
4,168,143,140,139,129,123,123,141,137,101,...,201,231,199,183,195,179,134,142,158,149
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,154,159,165,162,155,155,144,141,154,167,...,140,139,104,126,134,119,123,135,140,144
4996,148,119,154,147,132,137,147,170,177,163,...,158,165,150,176,189,157,168,163,143,168
4997,30,29,37,60,66,93,91,90,78,81,...,227,230,231,229,226,175,172,195,83,45
4998,105,115,108,105,111,125,123,109,109,118,...,106,152,141,116,116,125,91,77,81,85


In [5]:
# first rows of the full frame, which still carries the 'label' column used as the target
colorectaly.head()


Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel4087,pixel4088,pixel4089,pixel4090,pixel4091,pixel4092,pixel4093,pixel4094,pixel4095,label
0,134,99,119,130,142,169,152,139,117,87,...,112,89,73,100,120,120,126,140,195,2
1,55,64,74,63,74,75,71,73,70,77,...,79,85,86,77,68,66,65,68,69,2
2,114,116,136,152,132,100,151,150,127,205,...,128,157,159,205,182,143,129,89,122,2
3,86,82,88,85,103,93,98,109,104,115,...,79,80,109,128,89,85,80,63,48,2
4,168,143,140,139,129,123,123,141,137,101,...,231,199,183,195,179,134,142,158,149,2


In [6]:
# Check the class distribution since this is a classification problem:
# the relative frequency of each label shows whether the classes are balanced
# and gives the baseline score that a model has to beat

y.value_counts(normalize = True)

7    0.125
3    0.125
6    0.125
2    0.125
5    0.125
1    0.125
8    0.125
4    0.125
Name: label, dtype: float64

In [7]:
# hold out 33% of the rows for validation; stratifying on y keeps the class
# proportions equal in both splits, and the fixed seed makes the split repeatable

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [8]:
# standardise the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to the validation split

ss = StandardScaler()
X_train = ss.fit_transform(X_train)
X_val = ss.transform(X_val)


## Random Forests Model

In [9]:
# instantiate Random Forests
# random_state is pinned (same seed as the train/validation split) so that the
# cross-validation, grid-search and validation scores below are reproducible
# when the notebook is re-run on a fresh kernel.

rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [None]:
# preliminary modelling: 10-fold cross-validated accuracy of the untuned
# forest on the training split

pre_score = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=10, verbose=0)

print('Random Forest mean score: %5.4f' % pre_score.mean())


In [None]:
# grid search over forest size and tree depth with 5-fold cross-validation;
# prints the best cross-validated score and displays the winning parameters

rf_params = dict(
    n_estimators=[100, 150, 200],
    max_depth=[None, 1, 2, 3, 4, 5],
)
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

In [None]:
# predictions using Random Forests: predict labels for the validation split
# with the grid search fitted above

predictions = gs.predict(X_val)

In [None]:
# Random Forests using GridSearchCV: score on the training split
# (a value far above the validation score would indicate overfitting)

gs.score(X_train, y_train)

In [None]:
# Random Forests using GridSearchCV: score on the held-out validation split

gs.score(X_val, y_val)

##### The Random Forests model has an accuracy score of 0.684.

**Summary table for Colorectal Cancer Classification models for 8 Tissue type classes:**

| Model| Test Accuracy|Baseline score|
|:---------:|:---:|:--------:|
 |  Random Forests |    0.684 |  0.125  |
