# Random Forest Classifier with Feature Elimination

Try correlated feature elimination to improve the explainability of results.

Darst, B.F., Malecki, K.C. & Engelman, C.D. Using recursive feature elimination in random forest to account for correlated variables in high dimensional data. BMC Genet 19, 65 (2018). https://doi.org/10.1186/s12863-018-0633-8

Gregorutti, B., Michel, B. & Saint-Pierre, P. Correlation and variable importance in random forests. Stat Comput 27, 659–678 (2017). https://doi.org/10.1007/s11222-016-9646-1

The sklearn class RFE recursively removes least important feature till default=half are left.    
The sklearn class RFECV decides when to stop by doing cross-validation after each round.  


In [1]:
import numpy as np
import pandas as pd
import sklearn
print(sklearn.__version__)

1.0.2


In [2]:
from platform import python_version
print(python_version())

3.8.10


In [3]:
def make_dataframe(filename):
    df = pd.read_csv(filename,dtype=np.float32)  # remove dtype?
    count1 = df.isnull().sum().sum()
    print('Zero out this many NaN:', count1)
    df = df.fillna(0)
    count2 = df.isnull().sum().sum()
    print('Now how many NaN?:', count2)
    print('Largest value:', df.max().max())
    print('Smallest:', df.min().min())
    return df

In [4]:
FILENAME_YPOS = '/home/jrm/Martinez/CellProfilerRuns/CP_20220417_Ypos/Nuclei.CP_20220417_Ypos.csv'
feature_vec_Ypos = make_dataframe(FILENAME_YPOS)
#feature_vec_Ypos

Zero out this many NaN: 35
Now how many NaN?: 0
Largest value: 19328.0
Smallest: -89.999825


In [5]:
FILENAME_YNEG = '/home/jrm/Martinez/CellProfilerRuns/CP_20220417_Yneg/Nuclei.CP_20220417_Yneg.csv'
feature_vec_Yneg = make_dataframe(FILENAME_YNEG)
#feature_vec_Yneg

Zero out this many NaN: 20
Now how many NaN?: 0
Largest value: 25110.0
Smallest: -89.99928


In [6]:
Ypos_rows,Ypos_cols = feature_vec_Ypos.shape
Yneg_rows,Yneg_cols = feature_vec_Yneg.shape
if Ypos_cols == Yneg_cols:
    print('The dataframes are compatible.')
else:
    print('ERROR! Column counts do not match.')

The dataframes are compatible.


In [7]:
feature_vec_all = pd.concat ( [feature_vec_Ypos, feature_vec_Yneg], ignore_index=True )
#feature_vec_all

In [8]:
# Was silly to convert numpy to pandas when we need numpy eventually
#label_vec_Ypos = pd.DataFrame (np.ones(Ypos_rows,dtype=int))
#label_vec_Yneg = pd.DataFrame (np.zeros(Yneg_rows,dtype=int))
#label_vec_all = pd.concat ( [label_vec_Ypos, label_vec_Yneg], ignore_index=True )
label_vec_Ypos = np.ones(Ypos_rows,dtype=int)
label_vec_Yneg = np.zeros(Yneg_rows,dtype=int)
label_vec_all = np.concatenate ( [label_vec_Ypos, label_vec_Yneg] )
label_vec_all

array([1, 1, 1, ..., 0, 0, 0])

In [9]:
# Was looking for which data rows caused NaN errors during fit().
#feature_vec_all = feature_vec_Ypos[:1226]
#label_vec_all =     label_vec_Ypos[:1226]
#print(feature_vec_all.shape)
#pd.set_option('display.max_rows', None)
#print(feature_vec_all.iloc[-1])
#label_vec_all.shape

In [10]:
from sklearn.model_selection import train_test_split
#Xtrain,Xtest,ytrain,ytest = train_test_split(feature_vec_all, label_vec_all.ravel(), test_size=100, random_state=41)
# Default test size is 25%
Xtrain,Xtest,ytrain,ytest = train_test_split(feature_vec_all, label_vec_all.ravel(), random_state=42)
print('Xtrain',Xtrain.shape,'ytrain',ytrain.shape)
print('Xtest',Xtest.shape,'ytest',ytest.shape)

Xtrain (28364, 68) ytrain (28364,)
Xtest (9455, 68) ytest (9455,)


In [11]:
print(np.count_nonzero(ytrain))
print(np.count_nonzero(ytest))

13621
4517


In [12]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

In [13]:
do_cross_validation = False
if do_cross_validation:
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=42)
    n_scores = cross_val_score(model, Xtrain, ytrain, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
    print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

In [14]:
model.fit(Xtrain,ytrain)
ypred = model.predict(Xtest)
#for i in range(len(ypred)):
#    print('Actual',ytest[i],'Predict',ypred[i])


In [15]:
matches = np.count_nonzero(ytest==ypred)
accuracy = 100.0 * matches / len(ytest)
print('Number correct:',matches)
print('Percent correct:',accuracy)

Number correct: 5796
Percent correct: 61.300898995240615


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix([1,1],[1,0])  
# The CM considers 0=Yneg to be positive (top left corner)

array([[0, 0],
       [1, 1]])

In [17]:
cm = confusion_matrix(ytest, ypred)
cm

array([[3305, 1633],
       [2026, 2491]])

In [18]:
print('The impurity-based feature importances.')
names = model.feature_names_in_
importances = model.feature_importances_
pairs = np.column_stack( (names,importances) )
sorted(pairs, key = lambda e:e[1], reverse=True)

The impurity-based feature importances.


[array(['AreaShape_Orientation', 0.030522275749826928], dtype=object),
 array(['Neighbors_SecondClosestDistance_Expanded', 0.01925301765531776],
       dtype=object),
 array(['AreaShape_MeanRadius', 0.018595438310598823], dtype=object),
 array(['Neighbors_FirstClosestDistance_Expanded', 0.01837670505445424],
       dtype=object),
 array(['ImageNumber', 0.018329364099046133], dtype=object),
 array(['AreaShape_Extent', 0.018088025559222985], dtype=object),
 array(['AreaShape_Solidity', 0.017881513671791437], dtype=object),
 array(['AreaShape_Zernike_8_8', 0.0169684645954576], dtype=object),
 array(['Neighbors_AngleBetweenNeighbors_Expanded', 0.016958461249753955],
       dtype=object),
 array(['AreaShape_Zernike_9_9', 0.016870501671517463], dtype=object),
 array(['AreaShape_Zernike_9_1', 0.016839884591541288], dtype=object),
 array(['AreaShape_FormFactor', 0.01674681048698256], dtype=object),
 array(['AreaShape_Zernike_4_0', 0.01665613587612913], dtype=object),
 array(['AreaShape_Zernike

In [19]:
from sklearn.feature_selection import RFE
rfe = RFE(model)
rfe.fit(Xtrain,ytrain)
# This is slow! Uses 100% cpu but 0% gpu.

RFE(estimator=RandomForestClassifier())

In [None]:
rfe.n_features_ 
# Number selected

In [20]:
rfe.ranking_
# Selected features get rank=1. Large numbers mean not selected.

array([ 1, 30,  1, 17, 28, 26, 24, 29,  1, 14,  1, 22, 18, 21, 34,  1,  1,
       12, 20,  7,  1, 31,  1,  1,  1, 19,  1,  1,  8,  1,  1, 16,  1,  1,
        1,  1,  9,  6, 15,  1,  3, 11,  5, 10,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  4,  2,  1, 13,  1, 35,  1,  1, 23, 33, 25,  1, 27, 32])

In [29]:
support = rfe.support_
no_support = np.invert(rfe.support_)
selected = rfe.feature_names_in_[rfe.support_]
not_selected = rfe.feature_names_in_[no_support]
not_selected

array(['ObjectNumber', 'AreaShape_BoundingBoxArea',
       'AreaShape_BoundingBoxMaximum_X', 'AreaShape_BoundingBoxMaximum_Y',
       'AreaShape_BoundingBoxMinimum_X', 'AreaShape_BoundingBoxMinimum_Y',
       'AreaShape_Center_Y', 'AreaShape_ConvexArea',
       'AreaShape_Eccentricity', 'AreaShape_EquivalentDiameter',
       'AreaShape_EulerNumber', 'AreaShape_MajorAxisLength',
       'AreaShape_MaxFeretDiameter', 'AreaShape_MaximumRadius',
       'AreaShape_MedianRadius', 'AreaShape_Perimeter',
       'AreaShape_Zernike_1_1', 'AreaShape_Zernike_3_1',
       'AreaShape_Zernike_5_1', 'AreaShape_Zernike_5_3',
       'AreaShape_Zernike_5_5', 'AreaShape_Zernike_6_2',
       'AreaShape_Zernike_6_4', 'AreaShape_Zernike_6_6',
       'AreaShape_Zernike_7_1', 'AreaShape_Zernike_9_5',
       'AreaShape_Zernike_9_7', 'Location_Center_X', 'Location_Center_Z',
       'Neighbors_FirstClosestObjectNumber_Expanded',
       'Neighbors_NumberOfNeighbors_Expanded',
       'Neighbors_PercentTouching_Expan

In [32]:
Xtrain

Unnamed: 0,ImageNumber,ObjectNumber,AreaShape_Area,AreaShape_BoundingBoxArea,AreaShape_BoundingBoxMaximum_X,AreaShape_BoundingBoxMaximum_Y,AreaShape_BoundingBoxMinimum_X,AreaShape_BoundingBoxMinimum_Y,AreaShape_Center_X,AreaShape_Center_Y,...,Location_Center_Y,Location_Center_Z,Neighbors_AngleBetweenNeighbors_Expanded,Neighbors_FirstClosestDistance_Expanded,Neighbors_FirstClosestObjectNumber_Expanded,Neighbors_NumberOfNeighbors_Expanded,Neighbors_PercentTouching_Expanded,Neighbors_SecondClosestDistance_Expanded,Neighbors_SecondClosestObjectNumber_Expanded,Number_Object_Number
30336,873.0,7.0,1283.0,2100.0,103.0,129.0,61.0,79.0,83.225250,104.532349,...,104.532349,0.0,162.981873,40.102287,5.0,5.0,100.000000,41.663486,6.0,7.0
34867,1212.0,4.0,1418.0,2760.0,87.0,71.0,18.0,31.0,57.375881,50.607899,...,50.607899,0.0,112.744736,38.891041,5.0,4.0,81.424149,43.533188,1.0,4.0
29555,821.0,14.0,508.0,1035.0,23.0,224.0,0.0,179.0,7.500000,202.984253,...,202.984253,0.0,40.651573,49.847626,12.0,2.0,50.793652,74.679573,13.0,14.0
6489,506.0,5.0,678.0,1080.0,224.0,64.0,197.0,24.0,213.781708,44.219765,...,44.219765,0.0,30.164587,59.570450,1.0,3.0,64.666664,80.881195,3.0,5.0
34847,1210.0,14.0,1856.0,2688.0,123.0,210.0,67.0,162.0,97.101295,185.121765,...,185.121765,0.0,71.157509,66.695084,7.0,4.0,75.297623,70.782280,10.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,1337.0,2.0,599.0,1312.0,32.0,41.0,0.0,0.0,15.220367,16.938231,...,16.938231,0.0,50.536243,53.106033,4.0,2.0,61.951218,56.100555,5.0,2.0
6265,491.0,15.0,577.0,1023.0,108.0,209.0,77.0,176.0,90.069321,190.372620,...,190.372620,0.0,72.208969,35.202892,16.0,5.0,82.211540,55.230072,10.0,15.0
11284,897.0,1.0,1130.0,1720.0,147.0,43.0,107.0,0.0,128.080536,18.029203,...,18.029203,0.0,21.027176,19.506340,5.0,3.0,54.887218,32.800522,4.0,1.0
860,65.0,12.0,969.0,1802.0,77.0,217.0,24.0,183.0,45.619194,202.654282,...,202.654282,0.0,83.643341,24.866198,7.0,4.0,69.834709,32.237518,13.0,12.0


In [35]:
Xte_reduced = Xtest.drop(not_selected,axis=1)
Xtr_reduced = Xtrain.drop(not_selected,axis=1)
Xtr_reduced

Unnamed: 0,ImageNumber,AreaShape_Area,AreaShape_Center_X,AreaShape_Compactness,AreaShape_Extent,AreaShape_FormFactor,AreaShape_MeanRadius,AreaShape_MinFeretDiameter,AreaShape_MinorAxisLength,AreaShape_Orientation,...,AreaShape_Zernike_8_4,AreaShape_Zernike_8_6,AreaShape_Zernike_8_8,AreaShape_Zernike_9_1,AreaShape_Zernike_9_3,AreaShape_Zernike_9_9,Location_Center_Y,Neighbors_AngleBetweenNeighbors_Expanded,Neighbors_FirstClosestDistance_Expanded,Neighbors_SecondClosestDistance_Expanded
30336,873.0,1283.0,83.225250,1.842784,0.610952,0.542657,5.089225,33.767673,35.475548,-28.610592,...,0.005352,0.008827,0.008033,0.008786,0.006700,0.011881,104.532349,162.981873,40.102287,41.663486
34867,1212.0,1418.0,57.375881,1.820201,0.513768,0.549390,5.910746,30.857738,29.641033,-62.969391,...,0.004555,0.004262,0.005185,0.006872,0.006052,0.004419,50.607899,112.744736,38.891041,43.533188
29555,821.0,508.0,7.500000,2.556119,0.490821,0.391218,2.780056,22.000000,19.357618,3.000451,...,0.006000,0.014792,0.008595,0.003672,0.001020,0.006409,202.984253,40.651573,49.847626,74.679573
6489,506.0,678.0,213.781708,1.766162,0.627778,0.566200,3.970750,26.000000,23.988937,-14.038938,...,0.002303,0.011415,0.000710,0.004879,0.005121,0.010836,44.219765,30.164587,59.570450,80.881195
34847,1210.0,1856.0,97.101295,1.401914,0.690476,0.713311,7.299724,46.055061,45.345875,69.150230,...,0.011674,0.002293,0.003497,0.008204,0.006050,0.003155,185.121765,71.157509,66.695084,70.782280
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16850,1337.0,599.0,15.220367,1.976788,0.456555,0.505871,3.320682,20.523907,16.857292,39.044510,...,0.008651,0.008383,0.009332,0.008610,0.004767,0.001454,16.938231,50.536243,53.106033,56.100555
6265,491.0,577.0,90.069321,1.715620,0.564027,0.582880,3.807770,25.447689,24.310482,-21.665394,...,0.003472,0.008048,0.009217,0.004549,0.011440,0.005949,190.372620,72.208969,35.202892,55.230072
11284,897.0,1130.0,128.080536,1.884132,0.656977,0.530748,4.763591,35.895271,32.422749,24.637474,...,0.006024,0.002560,0.012076,0.001628,0.010764,0.006120,18.029203,21.027176,19.506340,32.800522
860,65.0,969.0,45.619194,2.090262,0.537736,0.478409,4.202213,31.313454,27.324745,64.879204,...,0.009300,0.009044,0.004920,0.007151,0.007136,0.004877,202.654282,83.643341,24.866198,32.237518


In [36]:
model2 = RandomForestClassifier()
model2.fit(Xtr_reduced,ytrain)
ypred2 = model2.predict(Xte_reduced)
matches2 = np.count_nonzero(ytest==ypred2)
accuracy2 = 100.0 * matches / len(ytest)
print('Number correct:',matches2)
print('Percent correct:',accuracy2)
cm2 = confusion_matrix(ytest, ypred2)
cm2

Number correct: 5803
Percent correct: 61.300898995240615


array([[3283, 1655],
       [1997, 2520]])

In [38]:
print('The impurity-based feature importances.')
names = model2.feature_names_in_
importances = model2.feature_importances_
pairs = np.column_stack( (names,importances) )
sorted(pairs, key = lambda e:e[1], reverse=True)

The impurity-based feature importances.


[array(['AreaShape_Orientation', 0.045321794699802555], dtype=object),
 array(['AreaShape_MeanRadius', 0.036223254867903926], dtype=object),
 array(['ImageNumber', 0.031797397829462004], dtype=object),
 array(['Neighbors_SecondClosestDistance_Expanded', 0.03159771360806355],
       dtype=object),
 array(['AreaShape_Extent', 0.0307862658044216], dtype=object),
 array(['Neighbors_FirstClosestDistance_Expanded', 0.030417866971286324],
       dtype=object),
 array(['AreaShape_MinFeretDiameter', 0.029823449229982018], dtype=object),
 array(['AreaShape_Area', 0.029696272965752876], dtype=object),
 array(['AreaShape_Zernike_2_0', 0.029308160402708935], dtype=object),
 array(['AreaShape_Zernike_0_0', 0.029159707752352174], dtype=object),
 array(['AreaShape_FormFactor', 0.02912466848677968], dtype=object),
 array(['AreaShape_Center_X', 0.029105243503395484], dtype=object),
 array(['AreaShape_Zernike_4_0', 0.029003684670446003], dtype=object),
 array(['AreaShape_Compactness', 0.02896891288544702