# Random Forest Classifier with Feature Elimination

Use data from 20220429 run. Use data prepared by CP_PreProcess.001 notebook. Take input from CP_20220429.combined.001 directory.

In [1]:
# Record when this run started and which Python / scikit-learn versions
# produced the outputs saved below.
import datetime
print(datetime.datetime.now())
from platform import python_version
print('Python',python_version())
import numpy as np
import pandas as pd
import sklearn
print('sklearn',sklearn.__version__)

2022-05-03 15:23:54.816984
Python 3.8.10
sklearn 1.0.2


In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.feature_selection import RFE

In [3]:
# How to interpret the confusion matrix.
# sklearn puts the TRUE label on the rows and the PREDICTED label on the
# columns, with the labels in sorted order (0 first, then 1):
#
#                  predicted 0=Y-neg     predicted 1=Y-pos
#   true 0=Y-neg   correct Y-neg         Y-neg called Y-pos
#   true 1=Y-pos   Y-pos called Y-neg    correct Y-pos
#
# Reading 1=Y-pos as the "positive" class, the layout is   TN FP / FN TP.
# Reading 0=Y-neg as the "positive" class, the layout is   TP FN / FP TN.
# Neither matches the textbook TP FP / FN TN picture, so it appears backwards.
#
# Demo: both samples are truly 1; one is predicted 1 and one is predicted 0,
# so both counts land on the bottom (true 1) row.
ytest=[1,1]
ypred=[1,0]
confusion_matrix(ytest,ypred)  

array([[0, 0],
       [1, 1]])

## Load Train and Test Sets

In [4]:
def make_dataframe(filename):
    """Read a feature CSV into a float32 DataFrame and print sanity stats.

    Every column is parsed as np.float32 and the 'ImageNumber' column is
    removed. The total NaN count and the overall largest and smallest
    values are printed before the frame is returned.
    """
    features = pd.read_csv(filename, dtype=np.float32).drop(columns=['ImageNumber'])
    # Collect the three summary numbers first, then report them in order.
    report = (
        ('How many NaN?:', features.isnull().sum().sum()),
        ('Largest value:', features.max().max()),
        ('Smallest:', features.min().min()),
    )
    for caption, value in report:
        print(caption, value)
    return features

In [5]:
# Input directory for this run (the CP_20220429.combined.001 directory named
# in the notebook header). NOTE(review): absolute, machine-specific path.
FILEPATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220429.combined.001/'
FILENAME_YPOS = FILEPATH+'positives.csv'
# Load the Y-pos feature rows; make_dataframe also prints NaN/min/max checks.
feature_vec_Ypos = make_dataframe(FILENAME_YPOS)

How many NaN?: 0
Largest value: 50176.0
Smallest: -90.0


In [6]:
# Load the Y-neg feature rows from the same directory as the positives.
FILENAME_YNEG = FILEPATH+'negatives.csv'
feature_vec_Yneg = make_dataframe(FILENAME_YNEG)

How many NaN?: 0
Largest value: 50176.0
Smallest: -90.0


In [7]:
# Balance the two classes by down-sampling both frames to the size of the
# smaller one.
PosCount = len(feature_vec_Ypos)
NegCount = len(feature_vec_Yneg)
SameCount = min(PosCount,NegCount)
# Seed the sampling (same seed as the train/test split) so that the balanced
# subset, and therefore every accuracy reported below, is repeatable from a
# fresh kernel. Without random_state a different subset is drawn on each run.
feature_vec_Ypos=feature_vec_Ypos.sample(n=SameCount, random_state=42)
feature_vec_Yneg=feature_vec_Yneg.sample(n=SameCount, random_state=42)

In [8]:
# Display the Y-pos feature rows kept after balancing.
feature_vec_Ypos

Unnamed: 0,Count_Cells,Count_Nuclei,Count_RBC,Threshold_FinalThreshold_Cells,Threshold_FinalThreshold_Nuclei,Threshold_FinalThreshold_RBC,Threshold_GuideThreshold_Cells,Threshold_GuideThreshold_Nuclei,Threshold_OrigThreshold_Cells,Threshold_OrigThreshold_Nuclei,...,"('Cell_Neighbors_SecondClosestDistance_Expanded', 'mean')","('Cell_Neighbors_SecondClosestDistance_Expanded', 'std')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'min')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'max')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'mean')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'std')","('Cell_Parent_Nuclei', 'min')","('Cell_Parent_Nuclei', 'max')","('Cell_Parent_Nuclei', 'mean')","('Cell_Parent_Nuclei', 'std')"
138,2.0,2.0,1.0,0.196280,0.212159,0.368399,0.189803,0.208355,0.193373,0.211950,...,73.226830,19.629774,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
3273,4.0,4.0,9.0,0.168296,0.324656,0.528253,0.165950,0.327605,0.168227,0.324656,...,61.752125,7.330101,1.0,4.0,2.500000,1.290994,1.0,4.0,2.5,1.290994
1508,5.0,5.0,7.0,0.328183,0.191636,0.560899,0.292464,0.187123,0.330713,0.187216,...,70.821365,9.196554,3.0,4.0,3.600000,0.547723,1.0,5.0,3.0,1.581139
6938,5.0,5.0,2.0,0.247516,0.263083,0.442189,0.245883,0.265423,0.243201,0.263083,...,66.920120,15.026970,4.0,5.0,4.200000,0.447214,1.0,5.0,3.0,1.581139
6042,3.0,3.0,6.0,0.121442,0.185165,0.384088,0.109209,0.184625,0.122749,0.185283,...,72.523926,31.368704,1.0,3.0,2.333333,1.154701,1.0,3.0,2.0,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6964,6.0,6.0,3.0,0.218772,0.266547,0.450801,0.222278,0.270031,0.218730,0.266547,...,53.669922,6.507346,2.0,6.0,4.000000,1.549193,1.0,6.0,3.5,1.870829
1585,3.0,3.0,6.0,0.201406,0.233984,0.489283,0.201115,0.238075,0.201406,0.233796,...,62.190262,8.759800,1.0,2.0,1.666667,0.577350,1.0,3.0,2.0,1.000000
4835,3.0,3.0,5.0,0.224840,0.199821,0.473068,0.221425,0.202929,0.224840,0.199753,...,66.483109,4.741963,1.0,2.0,1.333333,0.577350,1.0,3.0,2.0,1.000000
193,5.0,5.0,7.0,0.133091,0.203110,0.268403,0.134017,0.203506,0.132152,0.203110,...,57.591282,15.011292,3.0,5.0,4.600000,0.894427,1.0,5.0,3.0,1.581139


In [9]:
# Display the Y-neg feature rows kept after balancing.
feature_vec_Yneg

Unnamed: 0,Count_Cells,Count_Nuclei,Count_RBC,Threshold_FinalThreshold_Cells,Threshold_FinalThreshold_Nuclei,Threshold_FinalThreshold_RBC,Threshold_GuideThreshold_Cells,Threshold_GuideThreshold_Nuclei,Threshold_OrigThreshold_Cells,Threshold_OrigThreshold_Nuclei,...,"('Cell_Neighbors_SecondClosestDistance_Expanded', 'mean')","('Cell_Neighbors_SecondClosestDistance_Expanded', 'std')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'min')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'max')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'mean')","('Cell_Neighbors_SecondClosestObjectNumber_Expanded', 'std')","('Cell_Parent_Nuclei', 'min')","('Cell_Parent_Nuclei', 'max')","('Cell_Parent_Nuclei', 'mean')","('Cell_Parent_Nuclei', 'std')"
7000,4.0,4.0,3.0,0.196086,0.235138,0.418692,0.164394,0.245222,0.219590,0.230863,...,51.240356,16.383982,1.0,3.0,1.750000,0.957427,1.0,4.0,2.5,1.290994
6154,3.0,3.0,5.0,0.179045,0.222127,0.348752,0.187012,0.217705,0.179045,0.220695,...,61.635883,17.979856,1.0,3.0,2.333333,1.154701,1.0,3.0,2.0,1.000000
7030,2.0,2.0,3.0,0.172887,0.204725,0.353455,0.177659,0.201447,0.172887,0.202413,...,67.585121,4.567595,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
12296,3.0,3.0,3.0,0.083041,0.215219,0.352401,0.082221,0.267594,0.070118,0.210241,...,71.532082,15.809692,1.0,3.0,2.333333,1.154701,1.0,3.0,2.0,1.000000
1675,6.0,6.0,2.0,0.206489,0.213751,0.398161,0.191325,0.217610,0.213454,0.211931,...,66.116463,13.761604,1.0,6.0,3.666667,1.861899,1.0,6.0,3.5,1.870829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9121,2.0,2.0,4.0,0.189592,0.231244,0.396875,0.185894,0.213165,0.189511,0.232109,...,68.661003,11.395853,0.0,0.0,0.000000,0.000000,1.0,2.0,1.5,0.707107
11243,5.0,5.0,3.0,0.179665,0.255355,0.360231,0.177085,0.258325,0.178148,0.255355,...,61.005756,12.091174,1.0,5.0,2.200000,1.643168,1.0,5.0,3.0,1.581139
3909,4.0,4.0,3.0,0.253511,0.283958,0.560861,0.252053,0.287685,0.253507,0.283958,...,61.251923,10.780305,1.0,4.0,2.500000,1.732051,1.0,4.0,2.5,1.290994
5086,7.0,7.0,6.0,0.159237,0.188807,0.435172,0.151851,0.173888,0.159212,0.196231,...,59.914162,17.990458,1.0,5.0,3.428571,1.272418,1.0,7.0,4.0,2.160247


In [10]:
# Verify that the two frames can be stacked row-wise before concatenating.
Ypos_rows,Ypos_cols = feature_vec_Ypos.shape
Yneg_rows,Yneg_cols = feature_vec_Yneg.shape
if Ypos_cols != Yneg_cols:
    # Stop here: continuing would let pd.concat align on the union of the
    # columns and silently fill the gaps with NaN.
    raise ValueError('ERROR! Column counts do not match.')
if not feature_vec_Ypos.columns.equals(feature_vec_Yneg.columns):
    # Equal counts are not enough; the labels and their order must agree too.
    raise ValueError('ERROR! Column names or order do not match.')
print('The dataframes are compatible.')

The dataframes are compatible.


In [11]:
# Labels: 1 for every Y-pos row and 0 for every Y-neg row, concatenated in
# the same positives-then-negatives order as the stacked feature rows.
label_vec_Ypos = np.ones(Ypos_rows, dtype=int)
label_vec_Yneg = np.zeros(Yneg_rows, dtype=int)
label_vec_all = np.concatenate((label_vec_Ypos, label_vec_Yneg))
# Stack the feature rows and renumber the index 0..N-1.
feature_vec_all = pd.concat(
    [feature_vec_Ypos, feature_vec_Yneg],
    ignore_index=True,
)

In [12]:
# Default test size is 25%
# random_state fixes the shuffle so the same rows land in train/test each run.
# label_vec_all is already 1-D here, so .ravel() leaves its shape unchanged.
Xtrain,Xtest,ytrain,ytest = train_test_split(feature_vec_all, label_vec_all.ravel(), random_state=42)
# Report the split sizes and how many label-1 (Y-pos) rows each part received.
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape,'ones:',np.count_nonzero(ytrain))
print('Xtest',Xtest.shape,'ytest',ytest.shape,'ones:',np.count_nonzero(ytest))

Xtrain (13560, 744) ytrain (13560,) ones: 6814
Xtest (4520, 744) ytest (4520,) ones: 2226


## Random Forest Utility Class

In [13]:
class RF_Util:
    """Convenience wrapper around one RandomForestClassifier.

    Holds the classifier together with the train and test sets assigned to
    it, and offers helpers to fit, score, and rank features by importance.
    """
    def __init__(self, **model_kwargs):
        """Create an unfitted classifier.

        model_kwargs are forwarded unchanged to RandomForestClassifier
        (for example random_state=42 for a repeatable forest). With no
        arguments the classifier is built with its defaults, as before.
        """
        self.model=RandomForestClassifier(**model_kwargs)
    def get_model(self):
        """Return the wrapped classifier object."""
        return self.model
    def set_train(self,X,y):
        """Store the training features X and training labels y."""
        self.Xtr = X
        self.ytr = y
    def set_test(self,X,y):
        """Store the test features X and test labels y."""
        self.Xte = X
        self.yte = y
    def fit(self):
        """Fit the classifier on the stored training set."""
        self.model.fit(self.Xtr,self.ytr)
    def test_accuracy(self):
        """Return the percentage (0-100) of stored test labels predicted correctly."""
        ypred = self.model.predict(self.Xte)
        matches = np.count_nonzero(self.yte==ypred)
        # Divide by this instance's own test labels. The notebook-global
        # `ytest` must not be used here: it may be a different size (wrong
        # percentage) or not exist at all (NameError).
        accuracy = 100.0 * matches / len(self.yte)
        return accuracy
    def test_confusion(self):
        """Return confusion_matrix(stored test labels, predictions)."""
        ypred = self.model.predict(self.Xte)
        cm = confusion_matrix(self.yte, ypred)
        return cm
    def important_features(self):
        """Rank the features by the model's impurity-based importance.

        Returns a DataFrame sorted by descending importance, where
        column 0 is the importance and column 1 is the feature name.
        Reads feature_names_in_ from the fitted model, so the model must
        have been fitted on data that carries feature names.
        """
        names = self.model.feature_names_in_
        importances = self.model.feature_importances_
        # sorted() is stable even with reverse=True, so features with equal
        # importance keep their original column order.
        ranked = sorted(zip(importances, names), key=lambda pair: pair[0], reverse=True)
        top_df = pd.DataFrame(ranked)
        return top_df

## Random Forest 1 - All Features

In [14]:
# Baseline: fit a random forest on every feature column and score it on the
# held-out test set.
print(datetime.datetime.now())
print('Train on all Features')
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
rf1.set_test(Xtest,ytest)
rf1.fit()
# test_accuracy() returns a percentage (0-100), not a fraction.
print('Accuracy:',rf1.test_accuracy())
print('Confusion:')
print(rf1.test_confusion())
print('The impurity-based feature importances.')
# In the returned frame, column 0 is the importance and column 1 the feature name.
top = rf1.important_features()
top.head()

2022-05-03 15:23:56.683634
Train on all Features
Accuracy: 65.02212389380531
Confusion:
[[1578  716]
 [ 865 1361]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.013702,Threshold_OrigThreshold_RBC
1,0.013205,Threshold_FinalThreshold_RBC
2,0.008759,Threshold_OrigThreshold_Nuclei
3,0.007887,Threshold_FinalThreshold_Nuclei
4,0.007534,Threshold_WeightedVariance_Nuclei


## Random Forest 2 - Reduced Features

In [15]:
print(datetime.datetime.now())
# RFE = Recursive Feature Elimination, wrapped around the RF1 classifier.
model = rf1.get_model()
rfe = RFE(model)
# This is slow! Uses 100% cpu but 0% gpu.
rfe.fit(Xtrain, ytrain)
# Selected features get rank=1. Large numbers mean not selected.
print('Ranking', rfe.ranking_)
# Boolean masks over the input columns: kept vs. eliminated.
support = rfe.support_
no_support = ~rfe.support_
selected = rfe.feature_names_in_[rfe.support_]
not_selected = rfe.feature_names_in_[no_support]
# Remove the eliminated columns from both splits; the row order is untouched.
Xtest_reduced = Xtest.drop(columns=not_selected)
Xtrain_reduced = Xtrain.drop(columns=not_selected)

2022-05-03 15:24:15.562968
Ranking [343 336 235   1   1   1   1   1   1   1   1   1   1   1   1   1   1 298
   1   1 227 278 221  59 226 293 198 311 203 290 204 307   1  40 319 105
 256 251 308 166 192 281   1   1   1  78   1   1   1 153   1   1   1  30
  95   1 167 288  70  88   1 327 218   1 234 229 365 372 362 360   1   1
   1 182   1   1   1 237   1   1   1 295 136   1  34 304  98 157   1 282
   1   1   1 280 317 323   1 315   1   1   1 310   1   1   1 292   1   1
   1 175   1   1   1 301   1   1   1 180   1   1   1 271   1   1   1 259
   1   1   1 318   1   1  39 277 159   1   1 262   1  12 169 309   1   1
 123 294   1   1   1 252   1  67   1 297  26   1   1  93  23   1   1 236
  32   1 116 212   1  33   1 269   1   1   1 267   1   1   1 265   1   1
   1 286   1   1 137 285   1   1   1 275   1   1   1 303   1   1   1 261
   1   1  41 179   1   1   1 263   1   1   1 225   1   1  14 279   1   1
   1 260  31   1   1 266   1   1 142 291   1   1  11 207   1   1   1 264
   1   1 150 243

In [16]:
# Second model: fit a fresh random forest on only the columns kept by RFE
# and score it on the matching reduced test set.
print(datetime.datetime.now())
# Message corrected: this run uses the reduced feature set, not all features.
print('Train on reduced Features')
rf2 = RF_Util()
rf2.set_train(Xtrain_reduced,ytrain) # X has fewer columns but y is unchanged
rf2.set_test(Xtest_reduced,ytest)
rf2.fit()
# test_accuracy() returns a percentage (0-100), not a fraction.
print('Accuracy:',rf2.test_accuracy())
print('Confusion:')
print(rf2.test_confusion())
print('The impurity-based feature importances.')
# In the returned frame, column 0 is the importance and column 1 the feature name.
top = rf2.important_features()
top.head()

2022-05-03 17:09:46.054719
Train on all Features
Accuracy: 66.26106194690266
Confusion:
[[1621  673]
 [ 852 1374]]
The impurity-based feature importances.


Unnamed: 0,0,1
0,0.018572,Threshold_OrigThreshold_RBC
1,0.018342,Threshold_FinalThreshold_RBC
2,0.011499,Threshold_FinalThreshold_Nuclei
3,0.011079,Threshold_WeightedVariance_Nuclei
4,0.01018,Threshold_GuideThreshold_Nuclei


In [17]:
# Timestamp printed after the RF2 cell, marking when that run finished.
print(datetime.datetime.now())


2022-05-03 17:10:00.454861


In [27]:
# Unlike keras models, the sklearn classifier has no save method
# NOTE(review): this import sits far from the notebook's other imports;
# consider moving it to the imports cell at the top.
import joblib
# NOTE(review): absolute, machine-specific path. BASENAME contains a
# sub-directory that presumably must already exist before dumping.
MODELS_DIR='/home/jrm/Martinez/models/'
BASENAME='HE_RandomForest.004/joblib'
# rf1 and rf2 are RF_Util wrapper objects, not bare sklearn estimators, so the
# RF_Util class presumably has to be defined again before joblib.load is used
# on these files. TODO confirm when reloading.
joblib.dump(rf1,MODELS_DIR+BASENAME+'.rf1')
joblib.dump(rfe,MODELS_DIR+BASENAME+'.rfe')
joblib.dump(rf2,MODELS_DIR+BASENAME+'.rf2')

['/home/jrm/Martinez/models/HE_RandomForest.004/joblib.rf2']

In [28]:
# Full importance ranking for RF1. This repeats the ranking logic of
# RF_Util.important_features, but displays every [name, importance] pair
# instead of a DataFrame.
print('RF1 - The impurity-based feature importances.')
# Note: this rebinds the notebook-global name `model`.
model=rf1.get_model()
names = model.feature_names_in_
importances = model.feature_importances_
# Each row of `pairs` is [feature_name, importance].
pairs = np.column_stack( (names,importances) )
# Sort by importance (element 1), largest first.
sorted(pairs, key = lambda e:e[1], reverse=True)

RF1 - The impurity-based feature importances.


[array(['Threshold_OrigThreshold_RBC', 0.013702475835935257], dtype=object),
 array(['Threshold_FinalThreshold_RBC', 0.013204561499975398], dtype=object),
 array(['Threshold_OrigThreshold_Nuclei', 0.008759263544604134],
       dtype=object),
 array(['Threshold_FinalThreshold_Nuclei', 0.00788703154914402],
       dtype=object),
 array(['Threshold_WeightedVariance_Nuclei', 0.007534096024089816],
       dtype=object),
 array(['Threshold_GuideThreshold_Nuclei', 0.007034317132848879],
       dtype=object),
 array(['Threshold_WeightedVariance_RBC', 0.006051967534523718],
       dtype=object),
 array(['Threshold_WeightedVariance_Cells', 0.006022749845947893],
       dtype=object),
 array(['Threshold_GuideThreshold_Cells', 0.005644190169708992],
       dtype=object),
 array(['Threshold_SumOfEntropies_Cells', 0.00529436231509058],
       dtype=object),
 array(['Threshold_SumOfEntropies_RBC', 0.005045723344239545], dtype=object),
 array(['Threshold_OrigThreshold_Cells', 0.004785642201759306],
  

In [29]:
# Full importance ranking for RF2. This repeats the ranking logic of
# RF_Util.important_features, but displays every [name, importance] pair
# instead of a DataFrame.
print('RF2 - The impurity-based feature importances.')
# Note: this rebinds the notebook-global name `model`.
model=rf2.get_model()
names = model.feature_names_in_
importances = model.feature_importances_
# Each row of `pairs` is [feature_name, importance].
pairs = np.column_stack( (names,importances) )
# Sort by importance (element 1), largest first.
sorted(pairs, key = lambda e:e[1], reverse=True)

RF2 - The impurity-based feature importances.


[array(['Threshold_OrigThreshold_RBC', 0.018572193710818034], dtype=object),
 array(['Threshold_FinalThreshold_RBC', 0.018342462145268038], dtype=object),
 array(['Threshold_FinalThreshold_Nuclei', 0.011499232716074883],
       dtype=object),
 array(['Threshold_WeightedVariance_Nuclei', 0.011079308570955022],
       dtype=object),
 array(['Threshold_GuideThreshold_Nuclei', 0.010179812056472564],
       dtype=object),
 array(['Threshold_OrigThreshold_Nuclei', 0.009762397722681138],
       dtype=object),
 array(['Threshold_WeightedVariance_RBC', 0.007790008029450008],
       dtype=object),
 array(['Threshold_SumOfEntropies_Cells', 0.0075229433120426155],
       dtype=object),
 array(['Threshold_WeightedVariance_Cells', 0.0074609464505089795],
       dtype=object),
 array(['Threshold_SumOfEntropies_RBC', 0.006832896729102696], dtype=object),
 array(['Threshold_GuideThreshold_Cells', 0.006568687235234096],
       dtype=object),
 array(['Threshold_OrigThreshold_Cells', 0.006513123899843944]