# Random Forest
Notebook 8 achieved 89% accuracy on Nuc textures + RBC profiles.
Notebook 9 achieved 82% accuracy on Nuc + RBC means of most features.

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn   # pip install --upgrade scikit-learn
print('sklearn',sklearn.__version__)
import tensorflow as tf
tf.config.list_physical_devices('GPU')

2022-05-26 11:40:12.878852
Python 3.8.3
sklearn 1.1.1


[]

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models

In [3]:
# Data straight from CellProfiler (rows for every patch image file)
BASE_DIR='//Users/jasonmiller/Downloads/Process100/'
CLASS_DIRS=['0','1','2','3','4','5']
NUM_CLASSES=len(CLASS_DIRS)
IMG_FN="Process100_Image.csv"
RBC_FN="Process100_MergeRBC.csv"
NUC_FN="Process100_Nucleus.csv"
CEL_FN="Process100_ExpandCells.csv"
TIS_FN="Process100_Tissue.csv"
# Saving our models
MODELS_DIR='/home/jrm/Adjeroh/Naved/models/RandomForest.01/'
DESCRIBE=True   # compute stats for every column
DESCRIBE=False  # just retain the mean over objects in patch (not even the count!)
SHOW_ALL_COLUMNS=True   # show all the csv columns in this notebook  
SHOW_ALL_COLUMNS=False  

In [4]:
if SHOW_ALL_COLUMNS:
    nuc_df = pd.read_csv(BASE_DIR+CLASS_DIRS[0]+'/'+NUC_FN)
    initial_cols = nuc_df.columns
    for col in initial_cols:
        print(col)
    nuc_df = None

In [5]:
if SHOW_ALL_COLUMNS:
    rbc_df = pd.read_csv(BASE_DIR+CLASS_DIRS[0]+'/'+RBC_FN)
    initial_cols = rbc_df.columns
    for col in initial_cols:
        print(col)
    rbc_df = None

In [6]:
def get_nuc_columns():
    df = pd.read_csv(BASE_DIR+CLASS_DIRS[0]+'/'+NUC_FN)
    initial_cols = df.columns
    GOOD_COLS=['ImageNumber']
    GOOD_COLS.extend(['AreaShape_Area','AreaShape_Compactness','AreaShape_ConvexArea'])
    GOOD_COLS.extend(['AreaShape_Eccentricity','AreaShape_EquivalentDiameter','AreaShape_EulerNumber'])
    GOOD_COLS.extend(['AreaShape_Extent','AreaShape_FormFactor','AreaShape_MajorAxisLength'])
    GOOD_COLS.extend(['AreaShape_MaxFeretDiameter','AreaShape_MaximumRadius','AreaShape_MeanRadius'])
    GOOD_COLS.extend(['AreaShape_MinFeretDiameter','AreaShape_MinorAxisLength','AreaShape_Orientation'])
    GOOD_COLS.extend(['AreaShape_Perimeter','AreaShape_Solidity'])
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_CentralMoment') ) ] )
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_HuMoment') ) ] )
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_NormalizedMoment') ) ] )
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_SpacialMoment') ) ] )  # yes
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_Zernike') ) ] )   # yes
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Granularity') ) ] )   # yes!!!
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Intensity') ) ] )   # yes!!!
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Neighbors') ) ] )   
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('RadialDistribution') ) ] )   
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Texture_') ) ] )  # yes
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Texture_Contrast_Hematoxylin') ) ] )   
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Texture_DifferenceEntropy_Hematoxylin') ) ] ) 
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Texture_Entropy_Hematoxylin') ) ] )  # yes
    return GOOD_COLS
def get_rbc_columns():
    df = pd.read_csv(BASE_DIR+CLASS_DIRS[0]+'/'+RBC_FN)
    initial_cols = df.columns
    GOOD_COLS=['ImageNumber']
    GOOD_COLS.extend(['AreaShape_Area','AreaShape_Compactness','AreaShape_ConvexArea'])
    GOOD_COLS.extend(['AreaShape_Eccentricity','AreaShape_EquivalentDiameter','AreaShape_EulerNumber'])
    GOOD_COLS.extend(['AreaShape_Extent','AreaShape_FormFactor','AreaShape_MajorAxisLength'])
    GOOD_COLS.extend(['AreaShape_MaxFeretDiameter','AreaShape_MaximumRadius','AreaShape_MeanRadius'])
    GOOD_COLS.extend(['AreaShape_MinFeretDiameter','AreaShape_MinorAxisLength','AreaShape_Orientation'])
    GOOD_COLS.extend(['AreaShape_Perimeter','AreaShape_Solidity'])
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_CentralMoment') ) ] )
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_HuMoment') ) ] )
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_NormalizedMoment') ) ] )
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_SpacialMoment') ) ] )  
    #GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('AreaShape_Zernike') ) ] )  
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Granularity') ) ] )  
    GOOD_COLS.extend([x for x in initial_cols if x.startswith( ('Intensity') ) ] )   
    return GOOD_COLS

In [7]:
PREFIX='abc_'
def set_prefix(prefix):
    global PREFIX
    PREFIX=prefix
def get_prefix(col_name):
    global PREFIX
    return PREFIX+col_name  # hard coded for now
def load_one_file(fn,cols,prefix):
    set_prefix(prefix)
    object_df = pd.read_csv(fn)
    object_df = object_df[cols]
    object_df.rename(get_prefix,axis='columns',inplace=True)
    object_df.rename(columns={get_prefix('ImageNumber'):'ImageNumber'},inplace=True)
    if DESCRIBE:
        patch_df = object_df.groupby(['ImageNumber']).describe() ## this is slow
        # The random forest classifier cannot handle two-part column names, so convert them to string.    
        #patch_df.columns=patch_df.columns.map(str) # works but leaves in parenthesis
        patch_df.columns=patch_df.columns.map('_'.join)
    else:
        # Problem: using this approach, we don't get a cell count.
        #patch_df = object_df.groupby(['ImageNumber']).mean()
        patch_df = object_df.groupby(['ImageNumber']).max()
    # By virtue of groupby, ImageNumber is now the dataframe index.
    return patch_df


In [8]:
SHOW_DEMO = True
total_df = None
if SHOW_DEMO:
    nuc_cols = get_nuc_columns()
    rbc_cols = get_rbc_columns()
    nuc_df = load_one_file(BASE_DIR+CLASS_DIRS[0]+'/'+NUC_FN,nuc_cols,'Nuc_')
    rbc_df = load_one_file(BASE_DIR+CLASS_DIRS[0]+'/'+RBC_FN,rbc_cols,'Rbc_')
    total_df = nuc_df.join(rbc_df,how='outer')
    # Sources of NaN:
    # CellProfiler reports NaN for stdev of 1 value.
    # CellProfiler omits patches with no RBC; pandas join sets those to NaN.
    total_df.fillna(0,inplace=True)  
total_df

Unnamed: 0_level_0,Nuc_AreaShape_Area,Nuc_AreaShape_Compactness,Nuc_AreaShape_ConvexArea,Nuc_AreaShape_Eccentricity,Nuc_AreaShape_EquivalentDiameter,Nuc_AreaShape_EulerNumber,Nuc_AreaShape_Extent,Nuc_AreaShape_FormFactor,Nuc_AreaShape_MajorAxisLength,Nuc_AreaShape_MaxFeretDiameter,...,Rbc_Intensity_MaxIntensityEdge_Eosin,Rbc_Intensity_MaxIntensity_Eosin,Rbc_Intensity_MeanIntensityEdge_Eosin,Rbc_Intensity_MeanIntensity_Eosin,Rbc_Intensity_MedianIntensity_Eosin,Rbc_Intensity_MinIntensityEdge_Eosin,Rbc_Intensity_MinIntensity_Eosin,Rbc_Intensity_StdIntensityEdge_Eosin,Rbc_Intensity_StdIntensity_Eosin,Rbc_Intensity_UpperQuartileIntensity_Eosin
ImageNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,313,1.273323,345,0.564175,19.963065,1,0.626000,0.785346,22.632028,24.515301,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,336,2.208444,365,0.953539,20.683532,1,0.678977,0.762246,33.514234,32.572995,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,585,1.701673,603,0.945255,27.291851,1,0.785185,0.983658,30.773980,30.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,635,2.727914,818,0.919852,28.434259,1,0.821256,0.990936,44.852725,43.416587,...,0.865196,0.922232,0.730481,0.769773,0.780492,0.653640,0.653640,0.069792,0.065724,0.817634
5,832,2.067581,896,0.952981,32.547432,1,0.814103,0.979789,49.273720,51.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,340,1.604151,356,0.937486,20.806284,1,0.702479,0.903434,32.719426,29.546573,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
97,812,2.097841,940,0.946143,32.153857,1,0.857143,0.916053,52.041988,49.929951,...,0.822311,0.822311,0.721245,0.735250,0.736400,0.688624,0.688624,0.026196,0.020112,0.746882
98,633,2.326529,702,0.968304,28.389446,1,0.810606,0.976125,47.357716,46.097722,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
99,387,3.989876,647,0.873110,22.197831,1,0.406593,0.328034,38.109267,42.755117,...,0.909205,0.918255,0.771856,0.791015,0.799960,0.526595,0.526595,0.062599,0.060932,0.835346


In [9]:
print(datetime.datetime.now())
df = None
def load_all_classes():
    nuc_cols = get_nuc_columns()
    rbc_cols = get_rbc_columns()    
    nuc_df = load_one_file(BASE_DIR+CLASS_DIRS[0]+'/'+NUC_FN,nuc_cols,'Nuc_')
    rbc_df = load_one_file(BASE_DIR+CLASS_DIRS[0]+'/'+RBC_FN,rbc_cols,'Rbc_')
    X = nuc_df.join(rbc_df,how='outer')
    size = len(X)
    y = np.zeros(size)
    for i in range(1,NUM_CLASSES):
        nuc_df = load_one_file(BASE_DIR+CLASS_DIRS[i]+'/'+NUC_FN,nuc_cols,'Nuc_')
        rbc_df = load_one_file(BASE_DIR+CLASS_DIRS[i]+'/'+RBC_FN,rbc_cols,'Rbc_')
        Xi = nuc_df.join(rbc_df,how='outer')
        size = len(Xi)
        yi = np.ones(size) * i
        X = pd.concat( (X,Xi) )
        y = np.concatenate( (y,yi) )
    X.fillna(0,inplace=True)  
    return X,y
X,y=load_all_classes()
X

2022-05-26 11:40:26.232864


Unnamed: 0_level_0,Nuc_AreaShape_Area,Nuc_AreaShape_Compactness,Nuc_AreaShape_ConvexArea,Nuc_AreaShape_Eccentricity,Nuc_AreaShape_EquivalentDiameter,Nuc_AreaShape_EulerNumber,Nuc_AreaShape_Extent,Nuc_AreaShape_FormFactor,Nuc_AreaShape_MajorAxisLength,Nuc_AreaShape_MaxFeretDiameter,...,Rbc_Intensity_MaxIntensityEdge_Eosin,Rbc_Intensity_MaxIntensity_Eosin,Rbc_Intensity_MeanIntensityEdge_Eosin,Rbc_Intensity_MeanIntensity_Eosin,Rbc_Intensity_MedianIntensity_Eosin,Rbc_Intensity_MinIntensityEdge_Eosin,Rbc_Intensity_MinIntensity_Eosin,Rbc_Intensity_StdIntensityEdge_Eosin,Rbc_Intensity_StdIntensity_Eosin,Rbc_Intensity_UpperQuartileIntensity_Eosin
ImageNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,313,1.273323,345,0.564175,19.963065,1,0.626000,0.785346,22.632028,24.515301,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
2,336,2.208444,365,0.953539,20.683532,1,0.678977,0.762246,33.514234,32.572995,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,585,1.701673,603,0.945255,27.291851,1,0.785185,0.983658,30.773980,30.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,635,2.727914,818,0.919852,28.434259,1,0.821256,0.990936,44.852725,43.416587,...,0.865196,0.922232,0.730481,0.769773,0.780492,0.653640,0.653640,0.069792,0.065724,0.817634
5,832,2.067581,896,0.952981,32.547432,1,0.814103,0.979789,49.273720,51.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,1323,3.283374,1474,0.925216,41.042611,1,0.837093,0.950693,64.861527,65.764732,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
97,923,2.000305,1052,0.943625,34.281192,1,0.610022,0.662628,58.363145,58.008620,...,0.778873,0.829217,0.735511,0.742334,0.743000,0.655277,0.655277,0.024384,0.025481,0.760939
98,1565,2.337659,1833,0.956790,44.638771,1,0.824017,0.971563,68.549994,70.213959,...,0.839652,0.862333,0.737755,0.743925,0.744059,0.662400,0.659784,0.031272,0.027539,0.758598
99,1000,3.968473,1149,0.989404,35.682482,1,0.647826,0.886006,89.070327,91.301698,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [10]:
print(datetime.datetime.now())
Xtrain,Xvalid,ytrain,yvalid = train_test_split(X, y.ravel()) 
        # ,random_state=42) # add this for reproducibility
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape,'non-zero:',np.count_nonzero(ytrain))
print('Xvalid',Xvalid.shape,'yvalid',yvalid.shape,'non-zero:',np.count_nonzero(yvalid))

2022-05-26 11:40:29.484518
Xtrain (450, 584) ytrain (450,) non-zero: 372
Xvalid (150, 584) yvalid (150,) non-zero: 128


In [11]:
# RandomForestClassifier can only track feature names of type string.
num_problems=0
for name in Xtrain.columns:
    if not isinstance(name,str):
        num_problems += 1
        print(type(name),name)
if num_problems==0:
    print("Ok")

Ok


In [12]:
print(datetime.datetime.now())
class RF_Util:
    def __init__(self):
        self.model=RandomForestClassifier()
    def get_model(self):
        return self.model
    def set_train(self,X,y):
        self.Xtr = X
        self.ytr = y
    def set_validation(self,X,y):
        self.Xval = X
        self.yval = y
    def fit(self):
        self.model.fit(self.Xtr,self.ytr)
        #print(dir(self.model))  # see whether feature_names_in_ got created
    def validation_accuracy(self):
        ypred = self.model.predict(self.Xval)
        matches = np.count_nonzero(self.yval==ypred)
        accuracy = 100.0 * matches / len(ypred)  # bug fix
        return accuracy
    def validation_confusion(self):
        ypred = self.model.predict(self.Xval)
        cm = confusion_matrix(self.yval, ypred)
        return cm
    def important_features(self):
        names = self.model.feature_names_in_
        importances = self.model.feature_importances_
        pairs = np.column_stack( (names,importances) )
        top_array = sorted(pairs, key = lambda e:e[1], reverse=True)
        # There must be a way to do this witout a loop!
        top_list = []
        for i in top_array:
             top_list.append((i[1],i[0]))  # 0=feature_name, 1=importance
        top_df = pd.DataFrame(top_list)
        return top_df

2022-05-26 11:40:29.587624


In [13]:
print('Train on all Features')
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
rf1.set_validation(Xvalid,yvalid)
print(datetime.datetime.now())
rf1.fit()
print(datetime.datetime.now())
print('Accuracy:',rf1.validation_accuracy())
print('Confusion:')
print(rf1.validation_confusion())
print('The impurity-based feature importances.')
top = rf1.important_features()
top.head()

Train on all Features
2022-05-26 11:40:29.625600
2022-05-26 11:40:30.199384
Accuracy: 82.66666666666667
Confusion:
[[19  0  1  2  0  0]
 [ 0 17  2  1  0  1]
 [ 2  2 24  3  0  0]
 [ 1  0  2 14  2  0]
 [ 1  0  0  0 22  0]
 [ 1  0  3  2  0 28]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.020698,Nuc_Texture_SumAverage_Eosin_5_00_256
1,0.020012,Nuc_Texture_SumAverage_Eosin_7_00_256
2,0.018639,Nuc_Texture_SumAverage_Eosin_3_02_256
3,0.017722,Nuc_Texture_SumAverage_Eosin_4_01_256
4,0.016138,Nuc_Texture_SumAverage_Eosin_7_02_256
