# Random Forest Classifier with Feature Elimination

Redo notebook 005 to answer this question:  
Did Yneg images have more patches classified as Yneg than Ypos images? 

Link: [Using](https://machinelearningmastery.com/save-load-machine-learning-models-python-scikit-learn/) joblib to dump and load an sklearn model.

Fix bugs:
* The way we dumped with joblib could not be loaded.
* Don't wait till the end to save.
* Rename 'test' to 'validate' because we haven't touched the test set yet.
* Make a CSV file of predictions on the validation set.
* My RF_Util was using len(ytest) for accuracy. Wrong variable but same result.

In [1]:
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn
print('sklearn',sklearn.__version__)

2022-05-06 14:24:58.356458
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE
import joblib # used to dump/load sklearn models

In [3]:
# Data straight from CellProfiler (rows for every patch image file)
YPOS_DIR='/home/jrm/Martinez/CellProfilerRuns/CP_20220429f/'
YNEG_DIR='/home/jrm/Martinez/CellProfilerRuns/CP_20220429g/'
IMAGES="Image.csv"
RAW_YPOS=YPOS_DIR+IMAGES
RAW_YNEG=YNEG_DIR+IMAGES
# Our pre-processed data (zero to one row for every patch image file)
FILEPATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220429.combined.002/'
PROCESSED_YPOS=FILEPATH+'positives.csv'
PROCESSED_YNEG=FILEPATH+'negatives.csv'
# Saving our models
MODELS_DIR='/home/jrm/Martinez/models/HE_RandomForest.006/'

## Load Data
Load patch filenames from the original CellProfiler csv files called f and g.  
Load patch data from the csv generated by our CP_PreProcess.002 notebook.

In [4]:
def read_raw (filename):
    df = pd.read_csv(filename) 
    df = df[ ['ImageNumber','FileName_HE'] ]
    df.set_index('ImageNumber',inplace=True)
    return df
Ypos_filenames = read_raw(RAW_YPOS)
Yneg_filenames = read_raw(RAW_YNEG)
print('Ypos filenames',Ypos_filenames.shape)
print('Yneg filenames',Yneg_filenames.shape)

Ypos filenames (12979, 1)
Yneg filenames (17913, 1)


In [5]:
Ypos_filenames

Unnamed: 0_level_0,FileName_HE
ImageNumber,Unnamed: 1_level_1
1,B15.1.jpg
2,B15.10.jpg
3,B15.100.jpg
4,B15.1000.jpg
5,B15.1001.jpg
...,...
12975,I5_.994.jpg
12976,I5_.995.jpg
12977,I5_.996.jpg
12978,I5_.997.jpg


In [6]:
def make_dataframe(filename):
    df = pd.read_csv(filename,dtype=np.float32)
    # This field is critical for our post-processing
    # but allowing model to see it could leak information about labels.
    # Need to retain this temporarily to match rows to filenames.
    # Drop this after sampling and before training.
    # df = df.drop(columns=['ImageNumber']) 
    # df.set_index('ImageNumber',inplace=True)
    count2 = df.isnull().sum().sum()
    print('How many NaN?:', count2)
    print('Largest value:', df.max().max())
    print('Smallest:', df.min().min())
    return df

In [7]:
feature_vec_Ypos = make_dataframe(PROCESSED_YPOS)
print('Ypos features',feature_vec_Ypos.shape)

How many NaN?: 0
Largest value: 50176.0
Smallest: -90.0
Ypos features (9040, 731)


In [8]:
feature_vec_Yneg = make_dataframe(PROCESSED_YNEG)
print('Yneg features',feature_vec_Yneg.shape)

How many NaN?: 0
Largest value: 50176.0
Smallest: -90.0
Yneg features (12433, 731)


In [9]:
# Note missing ImageNumbers because this is a filtered set.
feature_vec_Ypos

Unnamed: 0,ImageNumber,Count_Cells,Count_Nuclei,Count_RBC,"('RBC_ObjectNumber', 'max')","('RBC_AreaShape_Area', 'min')","('RBC_AreaShape_Area', 'max')","('RBC_AreaShape_Area', 'mean')","('RBC_AreaShape_Area', 'std')","('RBC_AreaShape_BoundingBoxArea', 'min')",...,"('Cell_Neighbors_SecondClosestDistance_Expanded', 'mean')","('Cell_Neighbors_SecondClosestDistance_Expanded', 'std')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'min')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'max')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'mean')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'std')","('Cell_Parent_Nuclei', 'min')","('Cell_Parent_Nuclei', 'max')","('Cell_Parent_Nuclei', 'mean')","('Cell_Parent_Nuclei', 'std')"
0,1.0,2.0,2.0,5.0,5.0,335.0,587.0,405.799988,102.726334,533.0,...,58.006420,6.755942,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
1,2.0,2.0,2.0,7.0,7.0,368.0,609.0,465.857147,103.246353,616.0,...,71.603241,6.743078,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
2,4.0,3.0,3.0,2.0,2.0,432.0,508.0,470.000000,53.740116,696.0,...,58.321133,26.255213,1.0,2.0,1.333333,0.577350,1.0,3.0,2.0,1.000000
3,5.0,3.0,3.0,6.0,6.0,389.0,1438.0,908.166687,459.363647,672.0,...,66.434067,9.318718,1.0,3.0,2.333333,1.154701,1.0,3.0,2.0,1.000000
4,6.0,2.0,2.0,2.0,2.0,894.0,985.0,939.500000,64.346718,1518.0,...,53.738045,0.458414,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9035,12975.0,6.0,6.0,5.0,5.0,328.0,1235.0,748.799988,391.957520,600.0,...,66.865280,3.929714,2.0,5.0,3.500000,1.224745,1.0,6.0,3.5,1.870829
9036,12976.0,6.0,6.0,2.0,2.0,784.0,1243.0,1013.500000,324.562012,1368.0,...,68.272209,14.495635,2.0,5.0,4.000000,1.264911,1.0,6.0,3.5,1.870829
9037,12977.0,3.0,3.0,3.0,3.0,369.0,1836.0,993.000000,757.623230,522.0,...,59.660042,22.988840,1.0,3.0,2.333333,1.154701,1.0,3.0,2.0,1.000000
9038,12978.0,3.0,3.0,4.0,4.0,441.0,914.0,676.000000,214.549133,840.0,...,60.303802,2.921415,1.0,3.0,1.666667,1.154701,1.0,3.0,2.0,1.000000


In [10]:
# Demonstrate row# not equal ImageNumber
feature_vec_Ypos.loc[4]

ImageNumber                                                     6.000000
Count_Cells                                                     2.000000
Count_Nuclei                                                    2.000000
Count_RBC                                                       2.000000
('RBC_ObjectNumber', 'max')                                     2.000000
                                                                  ...   
('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'std')    0.000000
('Cell_Parent_Nuclei', 'min')                                   1.000000
('Cell_Parent_Nuclei', 'max')                                   2.000000
('Cell_Parent_Nuclei', 'mean')                                  1.500000
('Cell_Parent_Nuclei', 'std')                                   0.707107
Name: 4, Length: 731, dtype: float32

In [11]:
# Note missing ImageNumbers because this is a filtered set.
feature_vec_Yneg
# Note both sets use numbering starting at one though they reference DIFFERENT images.

Unnamed: 0,ImageNumber,Count_Cells,Count_Nuclei,Count_RBC,"('RBC_ObjectNumber', 'max')","('RBC_AreaShape_Area', 'min')","('RBC_AreaShape_Area', 'max')","('RBC_AreaShape_Area', 'mean')","('RBC_AreaShape_Area', 'std')","('RBC_AreaShape_BoundingBoxArea', 'min')",...,"('Cell_Neighbors_SecondClosestDistance_Expanded', 'mean')","('Cell_Neighbors_SecondClosestDistance_Expanded', 'std')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'min')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'max')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'mean')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'std')","('Cell_Parent_Nuclei', 'min')","('Cell_Parent_Nuclei', 'max')","('Cell_Parent_Nuclei', 'mean')","('Cell_Parent_Nuclei', 'std')"
0,1.0,2.0,2.0,1.0,1.0,326.0,326.0,326.000000,0.000000,444.0,...,62.908554,20.415495,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
1,2.0,2.0,2.0,1.0,1.0,656.0,656.0,656.000000,0.000000,1170.0,...,58.823872,1.073175,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
2,4.0,6.0,6.0,3.0,3.0,384.0,957.0,622.000000,298.561554,720.0,...,64.954605,18.807285,2.0,6.0,3.500000,1.378405,1.0,6.0,3.5,1.870829
3,5.0,2.0,2.0,4.0,4.0,442.0,921.0,643.000000,210.287109,756.0,...,61.695148,14.772483,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
4,8.0,2.0,2.0,2.0,2.0,529.0,545.0,537.000000,11.313708,779.0,...,99.179436,25.065863,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12428,17905.0,3.0,3.0,2.0,2.0,597.0,1056.0,826.500000,324.562012,1326.0,...,61.903854,22.622475,1.0,2.0,1.333333,0.577350,1.0,3.0,2.0,1.000000
12429,17907.0,3.0,3.0,1.0,1.0,515.0,515.0,515.000000,0.000000,638.0,...,80.616760,12.660180,2.0,3.0,2.333333,0.577350,1.0,3.0,2.0,1.000000
12430,17909.0,3.0,3.0,6.0,6.0,537.0,1192.0,673.666687,257.302673,924.0,...,75.165962,9.026843,1.0,2.0,1.666667,0.577350,1.0,3.0,2.0,1.000000
12431,17910.0,3.0,3.0,7.0,7.0,326.0,864.0,614.000000,223.952377,567.0,...,86.498497,44.148884,2.0,3.0,2.666667,0.577350,1.0,3.0,2.0,1.000000


In [12]:
# Sample down both sets to their minimum length.
# This achieves a balanced training set.
# Side effect: this shuffles the rows of both.
PosCount = len(feature_vec_Ypos)
NegCount = len(feature_vec_Yneg)
SameCount = min(PosCount,NegCount)
feature_vec_Ypos=feature_vec_Ypos.sample(n=SameCount)
feature_vec_Yneg=feature_vec_Yneg.sample(n=SameCount)

Ypos_rows,Ypos_cols = feature_vec_Ypos.shape
Yneg_rows,Yneg_cols = feature_vec_Yneg.shape
if Ypos_cols == Yneg_cols:
    print('The dataframes are compatible.')
else:
    print('ERROR! Column counts do not match.')
    
feature_vec_all = pd.concat ( [feature_vec_Ypos, feature_vec_Yneg], ignore_index=True )
label_vec_Ypos = np.ones(Ypos_rows,dtype=int)
label_vec_Yneg = np.zeros(Yneg_rows,dtype=int)
label_vec_all = np.concatenate ( [label_vec_Ypos, label_vec_Yneg] )
# At this point, all the Ypos=1 come before all the Yneg=0.

The dataframes are compatible.


In [15]:
# Randomly partion data into 75% training, 25% validation (called test incorrectly).
# Side effect: this shuffles the rows (but X and y remain row-matched). 
Xtrain,Xvalid,ytrain,yvalid = train_test_split(feature_vec_all, label_vec_all.ravel()) 
        # ,random_state=42) # add this for reproducibility
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape,'ones:',np.count_nonzero(ytrain))
print('Xvalid',Xvalid.shape,'yvalid',yvalid.shape,'ones:',np.count_nonzero(yvalid))

Xtrain (13560, 731) ytrain (13560,) ones: 6822
Xvalid (4520, 731) yvalid (4520,) ones: 2218


In [16]:
ytrain

array([1, 1, 1, ..., 0, 0, 1])

## Associate each row with a class and patch filename
Then remove image numbers from the training data.  
Repeat for the validation data.

In [17]:
def make_association(data,labels,pos_fn,neg_fn):
    associations = data [ ['ImageNumber'] ]
    last = len(associations.columns)
    associations.insert(last,'Label',labels)
    filenames=[]
    rec=None
    for ndx,row in associations.iterrows():   # how to do this without iteration?
        inum = row['ImageNumber']
        if row['Label']==1:
            rec = pos_fn.loc[inum]
        else:
            rec = neg_fn.loc[inum]
        fn = rec['FileName_HE']
        filenames.append(fn)
    last = len(associations.columns)
    associations.insert(last,'Filename',filenames)
    return associations
associations_train = make_association(Xtrain,ytrain,Ypos_filenames,Yneg_filenames)
associations_valid =  make_association(Xvalid,yvalid,Ypos_filenames,Yneg_filenames)
associations_valid

Unnamed: 0,ImageNumber,Label,Filename
14638,4350.0,0,C11.694.jpg
16306,1679.0,0,A5_.504.jpg
2717,4382.0,1,D5_.969.jpg
17195,73.0,0,A3_.1083.jpg
1383,373.0,1,B15.196.jpg
...,...,...,...
12397,3008.0,0,B13.634.jpg
12744,1684.0,0,A5_.51.jpg
5732,9968.0,1,I13.1316.jpg
4477,2051.0,1,B7_.854.jpg


In [18]:
Xtrain.drop(columns=['ImageNumber'],inplace=True) 
Xvalid.drop(columns=['ImageNumber'],inplace=True) 

## Random Forest Utility Class

In [19]:
class RF_Util:
    def __init__(self):
        self.model=RandomForestClassifier()
    def get_model(self):
        return self.model
    def set_train(self,X,y):
        self.Xtr = X
        self.ytr = y
    def set_validation(self,X,y):
        self.Xval = X
        self.yval = y
    def fit(self):
        self.model.fit(self.Xtr,self.ytr)
    def validation_accuracy(self):
        ypred = self.model.predict(self.Xval)
        matches = np.count_nonzero(self.yval==ypred)
        accuracy = 100.0 * matches / len(ypred)  # bug fix
        return accuracy
    def validation_confusion(self):
        ypred = self.model.predict(self.Xval)
        cm = confusion_matrix(self.yval, ypred)
        return cm
    def important_features(self):
        names = self.model.feature_names_in_
        importances = self.model.feature_importances_
        pairs = np.column_stack( (names,importances) )
        top_array = sorted(pairs, key = lambda e:e[1], reverse=True)
        # There must be a way to do this witout a loop!
        top_list = []
        for i in top_array:
             top_list.append((i[1],i[0]))  # 0=feature_name, 1=importance
        top_df = pd.DataFrame(top_list)
        return top_df

## Random Forest 1 - All Features

In [20]:
print(datetime.datetime.now())
print('Train on all Features')
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
rf1.set_validation(Xvalid,yvalid)
rf1.fit()
print('Accuracy:',rf1.validation_accuracy())
print('Confusion:')
print(rf1.validation_confusion())
print('The impurity-based feature importances.')
top = rf1.important_features()
top.head()

2022-05-06 14:25:43.057082
Train on all Features
Accuracy: 58.4070796460177
Confusion:
[[1447  855]
 [1025 1193]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.005171,"('RBC_AreaShape_Solidity', 'mean')"
1,0.004776,"('RBC_AreaShape_Compactness', 'max')"
2,0.004194,"('RBC_AreaShape_FormFactor', 'mean')"
3,0.004149,"('RBC_AreaShape_Compactness', 'mean')"
4,0.00349,"('RBC_AreaShape_FormFactor', 'min')"


In [21]:
print(datetime.datetime.now())
model1=rf1.get_model()
joblib.dump(model1,MODELS_DIR+'rf1.joblib')

2022-05-06 14:26:02.185916


['/home/jrm/Martinez/models/HE_RandomForest.006/rf1.joblib']

## Update association

In [22]:
ypred = rf1.get_model().predict(Xtrain)
last = len(associations_train.columns)
associations_train.insert(last,'Ypred',ypred)
ypred = rf1.get_model().predict(Xvalid)
last = len(associations_valid.columns)
associations_valid.insert(last,'Ypred',ypred)
ypred = None
associations_valid

Unnamed: 0,ImageNumber,Label,Filename,Ypred
14638,4350.0,0,C11.694.jpg,0
16306,1679.0,0,A5_.504.jpg,1
2717,4382.0,1,D5_.969.jpg,1
17195,73.0,0,A3_.1083.jpg,0
1383,373.0,1,B15.196.jpg,0
...,...,...,...,...
12397,3008.0,0,B13.634.jpg,0
12744,1684.0,0,A5_.51.jpg,0
5732,9968.0,1,I13.1316.jpg,1
4477,2051.0,1,B7_.854.jpg,0


# STOP HERE

## Random Forest 2 - Reduced Features

In [15]:
print(datetime.datetime.now())
model = rf1.get_model()
rfe = RFE(model)  # Random Forest feature Elimination model
rfe.fit(Xtrain,ytrain) # This is slow! Uses 100% cpu but 0% gpu.
print('Ranking',rfe.ranking_) # Selected features get rank=1. Large numbers mean not selected.
support = rfe.support_
no_support = np.invert(rfe.support_)
selected = rfe.feature_names_in_[rfe.support_]
not_selected = rfe.feature_names_in_[no_support]
Xtest_reduced = Xtest.drop(not_selected,axis=1)
Xtrain_reduced = Xtrain.drop(not_selected,axis=1)

2022-05-05 15:08:07.219675
Ranking [341 342 248 284   1   1 223 269   8   1 178 293 117 310  20 133  84 306
   1 149 303 199  89 232 308 103   1 219   1   1   1 166   1   1  82   1
   1   1   1   1   1   1 252 189 101   1   1 290 181  34 249 305 356 359
 353 358   1   1   1 123   1   1   1 192   1   1   1 300 141   1  88 295
   1 247   1 292   1   1   1 258 309 319   1 302 151   1  19 167   1   1
 169 228   1   1   1  40   1   1   1 147   1   1   1 229   1 115   1 261
   1   1  12 225   1   4   1 270   1   1  39 266  80   1   1 106   1   1
   1 259   1  95   1 285 156   1  83 263   1   1   1 279   1   1   1 243
   1   1   1 224   1  10   1 274   1   1  31 226   1   1   1 230  13   1
   1 277   1   1   1 280   1   1   1 198   1   1   1 281   1   1   1 267
   1   1   1 273   1   1   1 265  42   1   1 257   1   1   1 298   1   1
   1 289   1   1   1 184   1   1   1 297   1   1   1 237   1  17   1 214
   1   1   1 154   1   1  16 239 348 150   1  37   1   1  44 291 233 244
 327   1 245 207

In [16]:
print(datetime.datetime.now())
print('Train on all Features')
rf2 = RF_Util()
rf2.set_train(Xtrain_reduced,ytrain) # X has fewer columns but y is unchanged
rf2.set_test(Xtest_reduced,ytest)
rf2.fit()
print('Accuracy:',rf2.test_accuracy())
print('Confusion:')
print(rf2.test_confusion())
print('The impurity-based feature importances.')
top = rf2.important_features()
top.head()

2022-05-05 16:51:13.017948
Train on all Features
Accuracy: 59.336283185840706
Confusion:
[[1470  824]
 [1014 1212]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.006089,"('RBC_AreaShape_Solidity', 'mean')"
1,0.005929,"('RBC_AreaShape_Compactness', 'max')"
2,0.005853,"('RBC_AreaShape_Compactness', 'mean')"
3,0.005718,"('RBC_AreaShape_FormFactor', 'min')"
4,0.005633,"('RBC_AreaShape_Solidity', 'min')"


In [17]:
print(datetime.datetime.now())


2022-05-05 16:51:27.257079


In [19]:
# Unlike keras models, the sklearn classifier has no save method
import joblib
joblib.dump(rfe,MODELS_DIR+BASENAME+'.rfe')
joblib.dump(rf2,MODELS_DIR+BASENAME+'.rf2')

['/home/jrm/Martinez/models/HE_RandomForest.005/joblib.rf2']

In [20]:
print('RF1 - The impurity-based feature importances.')
model=rf1.get_model()
names = model.feature_names_in_
importances = model.feature_importances_
pairs = np.column_stack( (names,importances) )
sorted(pairs, key = lambda e:e[1], reverse=True)

RF1 - The impurity-based feature importances.


[array(["('RBC_AreaShape_Solidity', 'mean')", 0.005356381266235923],
       dtype=object),
 array(["('RBC_AreaShape_Solidity', 'min')", 0.004412825960293408],
       dtype=object),
 array(["('RBC_AreaShape_Compactness', 'mean')", 0.004290267833844383],
       dtype=object),
 array(["('RBC_AreaShape_FormFactor', 'min')", 0.004171596052896913],
       dtype=object),
 array(["('RBC_AreaShape_FormFactor', 'mean')", 0.0041199356390349854],
       dtype=object),
 array(["('RBC_AreaShape_Compactness', 'max')", 0.0037622661773533406],
       dtype=object),
 array(["('RBC_AreaShape_MeanRadius', 'min')", 0.0026830086049296348],
       dtype=object),
 array(["('Nuc_AreaShape_Solidity', 'mean')", 0.0026101494026034475],
       dtype=object),
 array(["('RBC_AreaShape_Extent', 'mean')", 0.002486102610225226],
       dtype=object),
 array(["('RBC_AreaShape_Perimeter', 'max')", 0.002344304959670546],
       dtype=object),
 array(["('RBC_AreaShape_Zernike_0_0', 'mean')", 0.0023140712323614625],
       

In [21]:
print('RF2 - The impurity-based feature importances.')
model=rf2.get_model()
names = model.feature_names_in_
importances = model.feature_importances_
pairs = np.column_stack( (names,importances) )
sorted(pairs, key = lambda e:e[1], reverse=True)

RF2 - The impurity-based feature importances.


[array(["('RBC_AreaShape_Solidity', 'mean')", 0.006089196578103686],
       dtype=object),
 array(["('RBC_AreaShape_Compactness', 'max')", 0.0059292486423253524],
       dtype=object),
 array(["('RBC_AreaShape_Compactness', 'mean')", 0.005853341262705861],
       dtype=object),
 array(["('RBC_AreaShape_FormFactor', 'min')", 0.005717544292948096],
       dtype=object),
 array(["('RBC_AreaShape_Solidity', 'min')", 0.005632524571360771],
       dtype=object),
 array(["('RBC_AreaShape_FormFactor', 'mean')", 0.0048784717974028],
       dtype=object),
 array(["('RBC_AreaShape_MeanRadius', 'min')", 0.004153831576070222],
       dtype=object),
 array(["('RBC_AreaShape_Zernike_2_0', 'min')", 0.003857713247514345],
       dtype=object),
 array(["('RBC_AreaShape_Perimeter', 'max')", 0.003704369834047687],
       dtype=object),
 array(["('Nuc_AreaShape_Solidity', 'mean')", 0.003685750090219058],
       dtype=object),
 array(["('RBC_AreaShape_MinFeretDiameter', 'max')", 0.0035498178039254785],
    