# Random Forest
Train on 6-class cancer classification.  
Input is the nucleus+RBC rollup, training set only.  
Train on one train/test split.  
Print the confusion matrix.  

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
import tensorflow as tf
tf.config.list_physical_devices('GPU')

2022-06-09 11:11:29.035383
Python 3.8.10
sklearn 1.0.2


[]

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.utils import shuffle
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [3]:
BASE_DIR='/home/jrm/Adjeroh/Naved/CP_80K/'  # append Output0/ etc
CLASSES=range(0,6)  # use all 6 classes
#CLASSES=[5]  # JUST FOR TESTING
MODELS_DIR='/home/jrm/Adjeroh/Naved/CP_80K/models/RandomForest.18'

In [4]:
print(datetime.datetime.now())
PREFIX='abc_'
def set_prefix(prefix):
    global PREFIX
    PREFIX=prefix
def get_prefix(col_name):
    global PREFIX
    return PREFIX+col_name  # hard coded for now
def load_all_classes():
    X = None
    y = None
    for i in CLASSES:
        print(datetime.datetime.now())
        print("Process class",i)
        FULL_PATH=BASE_DIR+'Output'+str(i)+'/'
        cp = CP_Util(FULL_PATH)
        cp.train_test_split()
        Xnuc = cp.get_nucleus_rollup()
        set_prefix('Nuc_')
        Xnuc.rename(get_prefix,axis='columns',inplace=True)
        Xrbc = cp.get_RBC_rollup()
        set_prefix('Rbc_')
        Xrbc.rename(get_prefix,axis='columns',inplace=True)
        Xi = Xnuc.join(Xrbc,how='outer')
        size = len(Xi)
        yi = np.ones(size) * i   # e.g. class 3
        if X is None:
            X = Xi
            y = yi
        else:
            X = pd.concat( (X,Xi) )
            y = np.concatenate( (y,yi) )
    X.fillna(0,inplace=True)  
    return X,y
Xtrain,ytrain=load_all_classes()
Xtrain

2022-06-09 11:11:31.285508
2022-06-09 11:11:31.285724
Process class 0
Num tumors in test/train sets: 23 94
Num patches in test/train sets: 8803 36163
2022-06-09 11:12:10.491577
Process class 1
Num tumors in test/train sets: 7 26
Num patches in test/train sets: 2734 10168
2022-06-09 11:12:24.112251
Process class 2
Num tumors in test/train sets: 7 30
Num patches in test/train sets: 2769 11181
2022-06-09 11:12:40.017765
Process class 3
Num tumors in test/train sets: 3 14
Num patches in test/train sets: 1091 5273
2022-06-09 11:12:52.642467
Process class 4
Num tumors in test/train sets: 2 6
Num patches in test/train sets: 813 2384
2022-06-09 11:12:59.958056
Process class 5
Num tumors in test/train sets: 1 3
Num patches in test/train sets: 396 1195


Unnamed: 0_level_0,Nuc_ObjectNumber_count,Nuc_ObjectNumber_mean,Nuc_ObjectNumber_std,Nuc_ObjectNumber_min,Nuc_ObjectNumber_25%,Nuc_ObjectNumber_50%,Nuc_ObjectNumber_75%,Nuc_ObjectNumber_max,Nuc_AreaShape_Area_count,Nuc_AreaShape_Area_mean,...,Rbc_Number_Object_Number_75%,Rbc_Number_Object_Number_max,Rbc_Parent_ShrinkRBC_count,Rbc_Parent_ShrinkRBC_mean,Rbc_Parent_ShrinkRBC_std,Rbc_Parent_ShrinkRBC_min,Rbc_Parent_ShrinkRBC_25%,Rbc_Parent_ShrinkRBC_50%,Rbc_Parent_ShrinkRBC_75%,Rbc_Parent_ShrinkRBC_max
PatchNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
397,7.0,4.0,2.160247,1.0,2.50,4.0,5.50,7.0,7.0,216.000000,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
398,1.0,1.0,0.000000,1.0,1.00,1.0,1.00,1.0,1.0,200.000000,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
399,1.0,1.0,0.000000,1.0,1.00,1.0,1.00,1.0,1.0,181.000000,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
400,4.0,2.5,1.290994,1.0,1.75,2.5,3.25,4.0,4.0,239.250000,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
401,4.0,2.5,1.290994,1.0,1.75,2.5,3.25,4.0,4.0,237.500000,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1587,7.0,4.0,2.160247,1.0,2.50,4.0,5.50,7.0,7.0,368.285714,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
1588,17.0,9.0,5.049752,1.0,5.00,9.0,13.00,17.0,17.0,519.352941,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0
1589,19.0,10.0,5.627314,1.0,5.50,10.0,14.50,19.0,19.0,388.315789,...,1.75,2.0,2.0,1.5,0.707107,1.0,1.25,1.5,1.75,2.0
1590,33.0,17.0,9.669540,1.0,9.00,17.0,25.00,33.0,33.0,444.454545,...,0.00,0.0,0.0,0.0,0.000000,0.0,0.00,0.0,0.00,0.0


In [5]:
print(datetime.datetime.now())
print("Shuffle...")
Xtrain,ytrain=shuffle(Xtrain,ytrain)  #set random_state for reproducibility

2022-06-09 11:13:20.088449
Shuffle...


In [6]:
print(datetime.datetime.now())
print('Do a one-time train/valid split...')
Xt,Xv,yt,yv = train_test_split(Xtrain, ytrain.ravel()) 
        # ,random_state=42) # add this for reproducibility
Xtrain=None
Ytrain=None
print(datetime.datetime.now())
print('Do a one-time fit and evaluate...')
rf1 = RF_Util()
rf1.set_train(Xt,yt)
rf1.set_validation(Xv,yv)
rf1.fit()
print(datetime.datetime.now())
print('Accuracy:',rf1.validation_accuracy())
print('Confusion:')
print(rf1.validation_confusion())

2022-06-09 11:13:21.980248
Do a one-time train/valid split...
2022-06-09 11:13:57.667147
Do a one-time fit and evaluate...
2022-06-09 11:19:44.477881
Accuracy: 80.82315967605463
Confusion:
[[8672  130  147   35    1    2]
 [ 304 1732  408   64    9    2]
 [ 281  400 2008   60   19   21]
 [ 162  274  301  584    4    0]
 [  78  117  179   26  225    2]
 [  51   41   49    6    0  152]]


In [9]:
print('Rank the features by importance.')
top = rf1.important_features()
pd.set_option('display.max_rows', None)
top.loc[:200]

Rank the features by importance.


Unnamed: 0,0,1
0,0.010052,Nuc_AreaShape_MaximumRadius_75%
1,0.006728,Nuc_AreaShape_MinorAxisLength_mean
2,0.006388,Nuc_Texture_InfoMeas1_Hematoxylin_7_00_256_25%
3,0.006382,Nuc_Texture_InfoMeas1_Hematoxylin_3_02_256_mean
4,0.0062,Nuc_AreaShape_MeanRadius_50%
5,0.006186,Nuc_Texture_InfoMeas1_Hematoxylin_4_02_256_25%
6,0.005721,Nuc_AreaShape_MedianRadius_75%
7,0.005498,Nuc_AreaShape_MeanRadius_mean
8,0.004916,Nuc_Texture_InfoMeas1_Hematoxylin_7_00_256_50%
9,0.004796,Nuc_Texture_InfoMeas1_Hematoxylin_4_03_256_50%


In [10]:
print(datetime.datetime.now())
joblib.dump(rf1.get_model(),MODELS_DIR)

2022-06-09 11:25:06.276193


['/home/jrm/Adjeroh/Naved/CP_80K/models/RandomForest.18']