## Colorectal cancer dataset

In [24]:
# imports relevant modules

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

In [17]:
# Parse the CSV a single time and derive both frames from it. The original
# read the same file twice and then mutated one copy with `inplace=True`.
colorectaly = pd.read_csv('../data/hmnist_64_64_L.csv')  # full table, 'label' included
colorectalx = colorectaly.drop(columns='label')          # new frame without 'label'; colorectaly is untouched

In [18]:
# Feature matrix (X) and target vector (y) used by the random-forest cells

X, y = colorectalx, colorectaly['label']

In [19]:
# Preview the first five rows of the feature frame
colorectalx.head(n=5)

Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel4086,pixel4087,pixel4088,pixel4089,pixel4090,pixel4091,pixel4092,pixel4093,pixel4094,pixel4095
0,134,99,119,130,142,169,152,139,117,87,...,146,112,89,73,100,120,120,126,140,195
1,55,64,74,63,74,75,71,73,70,77,...,76,79,85,86,77,68,66,65,68,69
2,114,116,136,152,132,100,151,150,127,205,...,144,128,157,159,205,182,143,129,89,122
3,86,82,88,85,103,93,98,109,104,115,...,75,79,80,109,128,89,85,80,63,48
4,168,143,140,139,129,123,123,141,137,101,...,201,231,199,183,195,179,134,142,158,149


In [20]:
# Preview the first five rows of the full frame (the one that still holds 'label')
colorectaly.head(n=5)


Unnamed: 0,pixel0000,pixel0001,pixel0002,pixel0003,pixel0004,pixel0005,pixel0006,pixel0007,pixel0008,pixel0009,...,pixel4087,pixel4088,pixel4089,pixel4090,pixel4091,pixel4092,pixel4093,pixel4094,pixel4095,label
0,134,99,119,130,142,169,152,139,117,87,...,112,89,73,100,120,120,126,140,195,2
1,55,64,74,63,74,75,71,73,70,77,...,79,85,86,77,68,66,65,68,69,2
2,114,116,136,152,132,100,151,150,127,205,...,128,157,159,205,182,143,129,89,122,2
3,86,82,88,85,103,93,98,109,104,115,...,79,80,109,128,89,85,80,63,48,2
4,168,143,140,139,129,123,123,141,137,101,...,231,199,183,195,179,134,142,158,149,2


In [21]:
# Class balance of the target, shown as fractions of all rows; worth checking
# up front because this is a classification problem

y.value_counts(normalize=True)

7    0.125
3    0.125
6    0.125
2    0.125
5    0.125
1    0.125
8    0.125
4    0.125
Name: label, dtype: float64

In [22]:
# Hold out 33% of the rows as a validation set; stratifying on y keeps the
# class proportions the same in the training and validation parts

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

In [25]:
# Standardise the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits

ss = StandardScaler().fit(X_train)
X_train, X_val = ss.transform(X_train), ss.transform(X_val)


In [26]:
# Instantiate the random forest.
# A fixed random_state makes the bootstrap sampling and per-split feature
# sub-sampling repeatable, so scores obtained from this estimator can be
# reproduced after a kernel restart (the original left it unseeded).

rf = RandomForestClassifier(n_estimators=100, random_state=42)

In [27]:
# Preliminary baseline: 10-fold cross-validated accuracy of the untuned
# forest on the training split

pre_score = cross_val_score(rf, X_train, y_train, scoring='accuracy', cv=10, verbose=0)

print('Random Forest mean score: %5.4f' % pre_score.mean())


Random Forest mean score: 0.6854


In [28]:
# Grid search with 5-fold CV over the number of trees and the maximum tree
# depth (None lets the trees grow without a depth limit)

rf_params = {'n_estimators': [100, 150, 200], 'max_depth': [None, 1, 2, 3, 4, 5]}
gs = GridSearchCV(estimator=rf, param_grid=rf_params, cv=5).fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.699402985074627


{'max_depth': None, 'n_estimators': 200}

In [30]:
# Predict validation labels with the fitted grid search

predictions = gs.predict(X=X_val)

In [31]:
# Score of the fitted grid search on the training split

gs.score(X=X_train, y=y_train)

1.0

In [32]:
# Score of the fitted grid search on the held-out validation split

gs.score(X=X_val, y=y_val)

0.686060606060606

In [33]:
# Confusion matrix of the grid-search predictions on the validation split
# (rows: true class, columns: predicted class)

confusion_matrix(y_true=y_val, y_pred=predictions)

array([[144,   2,  26,  21,   0,  13,   0,   0],
       [  1, 103,  30,   0,  62,  10,   0,   0],
       [ 34,  25, 115,   3,  17,  12,   0,   0],
       [ 49,   1,  25, 117,   0,  15,   0,   0],
       [  5,  57,  19,   0, 106,   5,  14,   0],
       [ 24,   2,  13,   9,  13, 145,   0,   0],
       [  0,   0,   0,   0,   1,   0, 201,   5],
       [  0,   0,   0,   0,   0,   0,   5, 201]], dtype=int64)

In [35]:
# Random Forests using GridSearchCV
# Disabled: this four-way unpacking only works for a 2x2 (binary) confusion
# matrix. The matrix for this target is 8x8, so .ravel() yields 64 values.

#tn, fp, fn, tp = confusion_matrix(y_val, predictions).ravel()

In [36]:
# Random Forests using GridSearchCV
# Disabled: prints the binary-classification counts tn / fp / fn / tp, which
# no active cell in this notebook defines (their unpacking is commented out).

#print("True Negatives: %s" % tn)
#print("False Positives: %s" % fp)
#print("False Negatives: %s" % fn)
#print("True Positives: %s" % tp)

In [37]:
# Random Forests using GridSearchCV
# Disabled confusion-matrix plot.
# NOTE(review): X_test / y_test are not defined in the visible cells (the
# hold-out split here is X_val / y_val), and display_labels lists two labels
# while the target has eight classes.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is the replacement -- confirm the
# installed version before re-enabling.

#plot_confusion_matrix(gs, X_test, y_test, cmap='viridis', 
                      #values_format='d', display_labels=['1', '']);

In [40]:
# Macro-averaged F1 of the grid-search predictions on the validation split.
# The target has 8 classes, so f1_score's default average='binary' raises a
# ValueError; macro averaging weights every class equally, which suits the
# evenly balanced labels.

f1_score(y_val, predictions, average='macro')

In [None]:
# Random Forests using GridSearchCV
# test specificity of our model
# Disabled: binary specificity, TN / (TN + FP).
# NOTE(review): tn1 / fp1 are not defined in any visible cell; for the
# 8-class target they would have to be computed per class.

#spec1 = tn1 / (tn1 + fp1)
#spec1

In [None]:
# Random Forests using GridSearchCV
# test sensitivity of our model
# Disabled: binary sensitivity (recall), TP / (TP + FN).
# NOTE(review): tp1 / fn1 are not defined in any visible cell; for the
# 8-class target they would have to be computed per class.

#sens1 = tp1 / (tp1 + fn1)
#sens1