# Random Forest
Here we learned to load features and feed them to a model.

We loaded all 6 classes.
We have only 100 patch vectors per class.

We loaded only one dataset: nuclei.
Loading more will require vertical concatenation of tables of unequal size.

We retained just a few features.
Retaining more will require cleverness with column names.

Despite these shortcomings, the RF classifier achieved 62% accuracy in cross-validation doing 6-way classification.

In [1]:
# Log the run time and the interpreter/library versions that produced the outputs below.
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
# The top-level sklearn module is used in this cell to report the library version.
import sklearn
print('sklearn',sklearn.__version__)

2022-05-25 14:45:40.523806
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models

In [3]:
# ---- Input: data straight from CellProfiler (rows for every patch image file) ----
BASE_DIR = '/home/jrm/Adjeroh/Naved/Process100.class/'

# One sub-directory per class, each named by its class index.
CLASS_DIRS = [str(class_index) for class_index in range(6)]
NUM_CLASSES = len(CLASS_DIRS)

# CSV file names found inside every class directory.
IMG_FN = "Process100_Image.csv"
RBC_FN = "Process100_MergeRBC.csv"
NUC_FN = "Process100_Nucleus.csv"
CEL_FN = "Process100_ExpandCells.csv"
TIS_FN = "Process100_Tissue.csv"

# ---- Output: saving our models ----
MODELS_DIR = '/home/jrm/Adjeroh/Naved/models/RandomForest.01/'

In [4]:
# show columns
# Read the nucleus table of the first class and list its column names, one per line.
df = pd.read_csv(f"{BASE_DIR}{CLASS_DIRS[0]}/{NUC_FN}")
for col in df.columns.tolist():
    print(col)

ImageNumber
ObjectNumber
FileName_Tumor
PathName_Tumor
AreaShape_Area
AreaShape_BoundingBoxArea
AreaShape_BoundingBoxMaximum_X
AreaShape_BoundingBoxMaximum_Y
AreaShape_BoundingBoxMinimum_X
AreaShape_BoundingBoxMinimum_Y
AreaShape_Center_X
AreaShape_Center_Y
AreaShape_CentralMoment_0_0
AreaShape_CentralMoment_0_1
AreaShape_CentralMoment_0_2
AreaShape_CentralMoment_0_3
AreaShape_CentralMoment_1_0
AreaShape_CentralMoment_1_1
AreaShape_CentralMoment_1_2
AreaShape_CentralMoment_1_3
AreaShape_CentralMoment_2_0
AreaShape_CentralMoment_2_1
AreaShape_CentralMoment_2_2
AreaShape_CentralMoment_2_3
AreaShape_Compactness
AreaShape_ConvexArea
AreaShape_Eccentricity
AreaShape_EquivalentDiameter
AreaShape_EulerNumber
AreaShape_Extent
AreaShape_FormFactor
AreaShape_HuMoment_0
AreaShape_HuMoment_1
AreaShape_HuMoment_2
AreaShape_HuMoment_3
AreaShape_HuMoment_4
AreaShape_HuMoment_5
AreaShape_HuMoment_6
AreaShape_InertiaTensorEigenvalues_0
AreaShape_InertiaTensorEigenvalues_1
AreaShape_InertiaTensor_0_0
Ar

In [14]:
# Clear whatever frame was previously bound to df.
df = None
# Columns kept from the nucleus table; 'ImageNumber' becomes the per-patch grouping key.
GOOD_COLS = [
    'ImageNumber',
    'AreaShape_Area',
    'AreaShape_MeanRadius',
    'Neighbors_NumberOfNeighbors_Expanded',
]
def nuc_prefix(col_name):
    """Return col_name with the 'Nuc_' prefix prepended."""
    prefix = 'Nuc_'
    return prefix + col_name
def load_one_class(class_index):
    """Load the nucleus table of one class and summarise it per patch.

    Reads the nucleus CSV from the directory CLASS_DIRS[class_index], keeps
    only the GOOD_COLS columns, prefixes their names with 'Nuc_', groups the
    rows by 'Nuc_ImageNumber' and computes describe() statistics per group.

    Returns a DataFrame indexed by Nuc_ImageNumber whose columns are flat
    strings such as 'Nuc_AreaShape_Area_mean'; missing values are set to 0.
    Prints the number of nuclei read and the number of patches produced.
    """
    csv_path = BASE_DIR + CLASS_DIRS[class_index] + '/' + NUC_FN
    nuclei = pd.read_csv(csv_path)
    print("Load %d nuclei." % len(nuclei))
    selected = nuclei[GOOD_COLS].rename(columns=nuc_prefix)
    summary = selected.groupby(['Nuc_ImageNumber']).describe()
    # describe() yields two-part column labels. The random forest classifier
    # cannot handle two-part column names, so join each pair into one string.
    summary.columns = ['_'.join(label_parts) for label_parts in summary.columns]
    summary = summary.fillna(0)
    print("Load %d patches having nuclei." % len(summary))
    return summary
# Try the loader on class 0 and keep the result in df for inspection.
df = load_one_class(0)

Load 1826 nuclei.
Load 100 patches having nuclei.


In [15]:
# Display the per-patch summary for class 0 (rows indexed by Nuc_ImageNumber).
df

Unnamed: 0_level_0,Nuc_AreaShape_Area_count,Nuc_AreaShape_Area_mean,Nuc_AreaShape_Area_std,Nuc_AreaShape_Area_min,Nuc_AreaShape_Area_25%,Nuc_AreaShape_Area_50%,Nuc_AreaShape_Area_75%,Nuc_AreaShape_Area_max,Nuc_AreaShape_MeanRadius_count,Nuc_AreaShape_MeanRadius_mean,...,Nuc_AreaShape_MeanRadius_75%,Nuc_AreaShape_MeanRadius_max,Nuc_Neighbors_NumberOfNeighbors_Expanded_count,Nuc_Neighbors_NumberOfNeighbors_Expanded_mean,Nuc_Neighbors_NumberOfNeighbors_Expanded_std,Nuc_Neighbors_NumberOfNeighbors_Expanded_min,Nuc_Neighbors_NumberOfNeighbors_Expanded_25%,Nuc_Neighbors_NumberOfNeighbors_Expanded_50%,Nuc_Neighbors_NumberOfNeighbors_Expanded_75%,Nuc_Neighbors_NumberOfNeighbors_Expanded_max
Nuc_ImageNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,313.000000,0.000000,313.0,313.00,313.0,313.00,313.0,1.0,3.387379,...,3.387379,3.387379,1.0,0.000000,0.000000,0.0,0.0,0.0,0.00,0.0
2,3.0,251.000000,79.680612,178.0,208.50,239.0,287.50,336.0,3.0,2.472544,...,2.754673,3.177413,3.0,2.000000,0.000000,2.0,2.0,2.0,2.00,2.0
3,5.0,302.000000,162.260593,182.0,212.00,260.0,271.00,585.0,5.0,3.278895,...,3.339997,4.695143,5.0,2.400000,0.894427,1.0,2.0,3.0,3.00,3.0
4,29.0,314.137931,125.210030,177.0,233.00,262.0,340.00,635.0,29.0,3.072118,...,3.247382,3.940398,29.0,4.689655,1.649750,2.0,4.0,4.0,5.00,8.0
5,32.0,355.937500,160.779339,180.0,243.00,331.0,375.25,832.0,32.0,3.188709,...,3.514243,4.589244,32.0,4.687500,1.635050,2.0,3.0,4.5,5.25,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,2.0,306.500000,47.376154,273.0,289.75,306.5,323.25,340.0,2.0,3.069114,...,3.317004,3.564894,2.0,1.000000,0.000000,1.0,1.0,1.0,1.00,1.0
97,32.0,394.750000,171.982745,189.0,270.50,352.5,480.75,812.0,32.0,3.372149,...,3.808400,4.650069,32.0,4.812500,1.554131,2.0,4.0,5.0,6.00,9.0
98,23.0,318.565217,120.095729,183.0,244.50,285.0,373.00,633.0,23.0,3.056807,...,3.390141,4.031012,23.0,4.782609,1.312753,2.0,4.0,5.0,5.00,7.0
99,2.0,304.500000,116.672619,222.0,263.25,304.5,345.75,387.0,2.0,1.997744,...,2.052328,2.106912,2.0,1.000000,0.000000,1.0,1.0,1.0,1.00,1.0


In [16]:
# Clear the single-class frame bound to df.
df = None
def load_all_classes():
    """Load every class and stack the per-patch features into one dataset.

    Calls load_one_class(i) for each i in range(NUM_CLASSES) and concatenates
    the resulting frames vertically, in class order.

    Returns
    -------
    X : pandas.DataFrame
        All per-patch rows. Each class frame keeps its own index, so index
        values repeat across classes.
    y : numpy.ndarray
        1-D float array with one label per row of X; the label is the class
        index of the frame the row came from.
    """
    frames = []
    labels = []
    for class_index in range(NUM_CLASSES):
        class_frame = load_one_class(class_index)
        frames.append(class_frame)
        # Labels are kept as floats (0., 1., ...), matching the previous output.
        labels.append(np.full(len(class_frame), class_index, dtype=float))
    # Concatenate once at the end rather than re-copying the growing
    # frame/array on every iteration.
    X = pd.concat(frames)
    y = np.concatenate(labels)
    return X,y
# Build the full dataset (features X, labels y), then display the feature table.
X,y=load_all_classes()
X

Load 1826 nuclei.
Load 100 patches having nuclei.
Load 2435 nuclei.
Load 100 patches having nuclei.
Load 886 nuclei.
Load 100 patches having nuclei.
Load 1459 nuclei.
Load 100 patches having nuclei.
Load 1713 nuclei.
Load 100 patches having nuclei.
Load 3659 nuclei.
Load 100 patches having nuclei.


Unnamed: 0_level_0,Nuc_AreaShape_Area_count,Nuc_AreaShape_Area_mean,Nuc_AreaShape_Area_std,Nuc_AreaShape_Area_min,Nuc_AreaShape_Area_25%,Nuc_AreaShape_Area_50%,Nuc_AreaShape_Area_75%,Nuc_AreaShape_Area_max,Nuc_AreaShape_MeanRadius_count,Nuc_AreaShape_MeanRadius_mean,...,Nuc_AreaShape_MeanRadius_75%,Nuc_AreaShape_MeanRadius_max,Nuc_Neighbors_NumberOfNeighbors_Expanded_count,Nuc_Neighbors_NumberOfNeighbors_Expanded_mean,Nuc_Neighbors_NumberOfNeighbors_Expanded_std,Nuc_Neighbors_NumberOfNeighbors_Expanded_min,Nuc_Neighbors_NumberOfNeighbors_Expanded_25%,Nuc_Neighbors_NumberOfNeighbors_Expanded_50%,Nuc_Neighbors_NumberOfNeighbors_Expanded_75%,Nuc_Neighbors_NumberOfNeighbors_Expanded_max
Nuc_ImageNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,313.000000,0.000000,313.0,313.00,313.0,313.00,313.0,1.0,3.387379,...,3.387379,3.387379,1.0,0.000000,0.000000,0.0,0.0,0.0,0.00,0.0
2,3.0,251.000000,79.680612,178.0,208.50,239.0,287.50,336.0,3.0,2.472544,...,2.754673,3.177413,3.0,2.000000,0.000000,2.0,2.0,2.0,2.00,2.0
3,5.0,302.000000,162.260593,182.0,212.00,260.0,271.00,585.0,5.0,3.278895,...,3.339997,4.695143,5.0,2.400000,0.894427,1.0,2.0,3.0,3.00,3.0
4,29.0,314.137931,125.210030,177.0,233.00,262.0,340.00,635.0,29.0,3.072118,...,3.247382,3.940398,29.0,4.689655,1.649750,2.0,4.0,4.0,5.00,8.0
5,32.0,355.937500,160.779339,180.0,243.00,331.0,375.25,832.0,32.0,3.188709,...,3.514243,4.589244,32.0,4.687500,1.635050,2.0,3.0,4.5,5.25,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,47.0,574.255319,290.847448,177.0,305.50,534.0,746.00,1323.0,47.0,3.914974,...,4.580066,5.991597,47.0,4.978723,1.700248,2.0,4.0,5.0,6.00,9.0
97,5.0,667.600000,270.346629,280.0,513.00,734.0,888.00,923.0,5.0,3.732732,...,4.018302,4.458481,5.0,2.400000,1.140175,1.0,2.0,2.0,3.00,4.0
98,50.0,549.580000,293.608798,181.0,347.00,440.0,728.75,1565.0,50.0,3.927443,...,4.432256,6.146791,50.0,4.920000,1.549720,2.0,4.0,5.0,6.00,8.0
99,12.0,549.333333,237.054782,244.0,399.25,516.5,670.50,1000.0,12.0,3.460067,...,3.950081,4.063406,12.0,3.666667,1.302678,2.0,3.0,3.5,4.00,6.0


In [17]:
# Inspect the label vector: one float class index per row of X, in class order.
y

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 2., 2., 2., 2.,
       2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
       2., 2., 2., 2., 2.

In [18]:
# Hold out part of the data for validation. The seed is fixed so the split,
# and every number computed from it, is the same on every run.
Xtrain,Xvalid,ytrain,yvalid = train_test_split(X, y.ravel(), random_state=42)
# 'non-zero' counts the labels that are not class 0, as a quick check on the split.
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape,'non-zero:',np.count_nonzero(ytrain))
print('Xvalid',Xvalid.shape,'yvalid',yvalid.shape,'non-zero:',np.count_nonzero(yvalid))

Xtrain (450, 24) ytrain (450,) non-zero: 371
Xvalid (150, 24) yvalid (150,) non-zero: 129


In [19]:
# Inspect the training labels produced by the split.
ytrain

array([4., 5., 2., 4., 5., 1., 3., 1., 0., 3., 1., 1., 0., 4., 2., 3., 3.,
       3., 1., 5., 0., 2., 5., 0., 0., 3., 3., 0., 3., 0., 4., 2., 3., 1.,
       1., 1., 0., 3., 1., 4., 3., 0., 0., 4., 0., 2., 0., 1., 3., 3., 4.,
       5., 2., 4., 3., 0., 2., 2., 4., 5., 2., 0., 0., 4., 2., 2., 2., 3.,
       3., 4., 0., 4., 1., 4., 2., 3., 2., 2., 0., 1., 1., 4., 5., 3., 3.,
       0., 0., 2., 0., 2., 0., 3., 0., 5., 3., 2., 4., 5., 5., 1., 4., 1.,
       3., 0., 3., 4., 1., 3., 4., 3., 4., 4., 2., 5., 1., 1., 5., 5., 3.,
       0., 3., 5., 5., 1., 1., 5., 2., 3., 0., 4., 0., 3., 3., 4., 0., 3.,
       0., 5., 2., 0., 1., 3., 1., 1., 3., 3., 0., 0., 2., 0., 0., 0., 3.,
       2., 4., 2., 2., 2., 2., 2., 0., 0., 5., 4., 2., 4., 2., 4., 0., 1.,
       2., 2., 3., 1., 1., 3., 4., 3., 2., 0., 5., 2., 5., 2., 3., 1., 2.,
       0., 2., 2., 2., 5., 4., 3., 0., 2., 0., 4., 1., 2., 3., 4., 2., 4.,
       2., 0., 4., 5., 3., 3., 0., 5., 4., 0., 0., 0., 3., 2., 0., 5., 0.,
       1., 3., 5., 2., 1.

In [20]:
class RF_Util:
    """Convenience wrapper around a scikit-learn RandomForestClassifier.

    Holds one classifier together with a training set and a validation set,
    and offers helpers to fit the model and summarise validation performance.
    """
    def __init__(self, **model_params):
        """Create the wrapped classifier.

        Keyword arguments (for example random_state or n_estimators) are
        forwarded to RandomForestClassifier; with no arguments the library
        defaults are used.
        """
        self.model=RandomForestClassifier(**model_params)
    def get_model(self):
        """Return the wrapped classifier instance."""
        return self.model
    def set_train(self,X,y):
        """Store the training features X and labels y used by fit()."""
        self.Xtr = X
        self.ytr = y
    def set_validation(self,X,y):
        """Store the validation features X and labels y used by the validation_* methods."""
        self.Xval = X
        self.yval = y
    def fit(self):
        """Fit the classifier on the data given to set_train()."""
        self.model.fit(self.Xtr,self.ytr)
    def validation_accuracy(self):
        """Return the percentage (0-100) of validation rows predicted correctly."""
        ypred = self.model.predict(self.Xval)
        matches = np.count_nonzero(self.yval==ypred)
        # Divide by the number of predictions, not by the number of classes.
        accuracy = 100.0 * matches / len(ypred)
        return accuracy
    def validation_confusion(self):
        """Return confusion_matrix(true labels, predictions) for the validation set."""
        ypred = self.model.predict(self.Xval)
        cm = confusion_matrix(self.yval, ypred)
        return cm
    def important_features(self):
        """Rank the features by the fitted model's feature importances.

        Returns a DataFrame ordered from most to least important, in which
        column 0 holds the importance and column 1 the feature name. Features
        with equal importance keep their original relative order.

        Requires a fitted model that exposes feature_names_in_ (i.e. one that
        was fitted on data with column names).
        """
        names = np.asarray(self.model.feature_names_in_)
        importances = np.asarray(self.model.feature_importances_, dtype=float)
        # Sort on the numeric importances directly. Stacking names and numbers
        # into one array can coerce the numbers to strings, which would then
        # sort lexicographically rather than numerically.
        order = np.argsort(-importances, kind='stable')
        ranked = list(zip(importances[order].tolist(), names[order].tolist()))
        return pd.DataFrame(ranked)

In [21]:
# Train a forest on every feature column, then report validation accuracy,
# the confusion matrix, and the top-ranked features.
print(datetime.datetime.now())
print('Train on all Features')
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
rf1.set_validation(Xvalid,yvalid)
rf1.fit()
print('Accuracy:',rf1.validation_accuracy())  # percent of validation rows predicted correctly
print('Confusion:')
print(rf1.validation_confusion())
print('The impurity-based feature importances.')
top = rf1.important_features()
# In the displayed frame, column 0 is the importance and column 1 the feature name.
top.head()

2022-05-25 14:51:35.846950
Train on all Features
Accuracy: 62.0
Confusion:
[[16  1  0  0  4  0]
 [ 1 21  2  1  1  3]
 [ 0  0 16  2  2  0]
 [ 3  3  4  8  3  0]
 [ 3  4  4  5 11  3]
 [ 1  3  3  0  1 21]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.069836,Nuc_AreaShape_Area_count
1,0.065514,Nuc_AreaShape_Area_mean
2,0.065511,Nuc_AreaShape_MeanRadius_count
3,0.060962,Nuc_Neighbors_NumberOfNeighbors_Expanded_mean
4,0.056749,Nuc_AreaShape_MeanRadius_mean
