# Attention HalfRoll
Class = 4.

How well can a random forest (RF) recapitulate CNN attention
given only CellProfiler (CP) nucleus statistics?
Train an RF on the average attention "temperature" per
nucleus bounding box as selected by CP.

In [1]:
import datetime
import numpy as np
import pandas as pd
from RandomForestUtil import RF_Util
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Input locations, grouped as (directory, file name) pairs.
# Original note on the attention path: "Alien" (presumably the host where
# these absolute paths exist -- confirm before running elsewhere).
# Attention file: average attention temperature per nucleus.
ATTFILEPATH, ATTFILENAME = '/home/jrm/Adjeroh/Naved/CP_80K/attention_nucleus/', 'class4_avg_temp.csv'
# Nucleus file: per-nucleus features from the "Process100" pipeline.
NUCFILEPATH, NUCFILENAME = '/home/jrm/Adjeroh/Naved/CP_80K/Output4/', 'Process100_Nucleus.csv'

## Load labels
The labels are the temperatures of the CNN attention heatmap,
specifically an average across the bounding box of each nucleus.

In [3]:
# Column renames for the attention table. Two of the source names carry a
# leading space (presumably matching the CSV header -- confirm against the file).
cols={'ImageNumber':'PatchNumber',' ObjectNumber':'ObjectNumber',' AvgTemp':'AvgTemp'}
# Read, rename, order by patch then nucleus, and renumber the rows 0..n-1.
att_df=(
    pd.read_csv(ATTFILEPATH+ATTFILENAME)
    .rename(columns=cols)
    .sort_values(by=['PatchNumber','ObjectNumber'],axis=0)
    .reset_index(drop=True)
)

In [4]:
# Later, try the RandomForestRegressor. For now use bins and classification.
# Bin edges on the average temperature and the class label for each interval.
bins = [0, 0.3, 0.45, 0.6, 0.75, 1.0]
labels = ['to30%', 'to45%', 'to60%', 'to75%', 'to100%']
# pd.cut intervals are right-closed and, by default, left-open, so a value of
# exactly 0 would fall outside the first bin and receive a NaN label.
# include_lowest=True closes the first interval on the left to keep such rows.
att_df['bin'] = pd.cut(att_df['AvgTemp'], bins=bins, labels=labels, include_lowest=True)
att_df

Unnamed: 0,PatchNumber,ObjectNumber,AvgTemp,bin
0,1,1,0.024352,to30%
1,1,2,0.167493,to30%
2,1,3,0.301966,to45%
3,1,4,0.568698,to60%
4,1,5,0.341957,to45%
...,...,...,...,...
46520,3196,6,0.542161,to60%
46521,3196,7,0.419783,to45%
46522,3196,8,0.519468,to60%
46523,3196,9,0.564878,to60%


In [5]:
# Classification target: the temperature bin assigned to each nucleus.
ytrain = att_df.loc[:, 'bin']
# Show how many nuclei fall in each bin.
ytrain.value_counts()

to30%     10295
to100%    10230
to60%      9689
to45%      8818
to75%      7493
Name: bin, dtype: int64

## Load features
These are features of nuclei as determined by our "Process100" pipeline for CellProfiler.
These are the nuclei used to set the bounding boxes for the temperature labels.
Other than that, the features and temperatures were each derived independently.

In [6]:
# Use the same key name as the attention table for the image/patch column.
cols={'ImageNumber':'PatchNumber'}
# Read, rename, order by patch then nucleus, and renumber the rows 0..n-1.
nuc_df=(
    pd.read_csv(NUCFILEPATH+NUCFILENAME)
    .rename(columns=cols)
    .sort_values(by=['PatchNumber','ObjectNumber'],axis=0)
    .reset_index(drop=True)
)

In [7]:
# Features (nuc_df) and labels (att_df) are paired purely by row position
# after each table was sorted independently. Verify that both tables list the
# same (PatchNumber, ObjectNumber) keys in the same order; otherwise every
# label after the first mismatch would be attached to the wrong nucleus.
key_cols = ['PatchNumber','ObjectNumber']
if len(nuc_df) != len(att_df) or not (nuc_df[key_cols].to_numpy() == att_df[key_cols].to_numpy()).all():
    raise ValueError('Feature rows and label rows do not match on PatchNumber/ObjectNumber')
# The key columns identify a nucleus; they are not predictive features.
Xtrain = nuc_df.drop(columns=key_cols)
Xtrain

Unnamed: 0,AreaShape_Area,AreaShape_BoundingBoxArea,AreaShape_BoundingBoxMaximum_X,AreaShape_BoundingBoxMaximum_Y,AreaShape_BoundingBoxMinimum_X,AreaShape_BoundingBoxMinimum_Y,AreaShape_Center_X,AreaShape_Center_Y,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,...,Texture_Variance_Hematoxylin_4_02_256,Texture_Variance_Hematoxylin_4_03_256,Texture_Variance_Hematoxylin_5_00_256,Texture_Variance_Hematoxylin_5_01_256,Texture_Variance_Hematoxylin_5_02_256,Texture_Variance_Hematoxylin_5_03_256,Texture_Variance_Hematoxylin_7_00_256,Texture_Variance_Hematoxylin_7_01_256,Texture_Variance_Hematoxylin_7_02_256,Texture_Variance_Hematoxylin_7_03_256
0,196,294,80,26,59,12,68.193878,18.607143,196.0,-5.329071e-15,...,602.743285,642.679273,642.319285,642.400310,640.764599,699.330009,644.967400,687.076735,834.618344,850.561523
1,469,609,126,37,97,16,111.159915,25.989339,469.0,-9.166001e-13,...,539.175397,565.853063,548.903754,562.875134,556.688253,597.419939,567.698283,508.662662,579.599297,624.531535
2,223,304,193,41,174,25,182.551570,32.852018,223.0,4.796163e-14,...,888.080613,808.414113,845.188775,847.659739,899.641109,733.203961,877.066057,587.996061,814.744709,819.948881
3,508,798,21,61,0,23,7.982283,43.293307,508.0,2.131628e-14,...,1241.667208,1159.372486,1238.719919,1169.194520,1257.374985,1114.919318,1175.562484,1175.745806,1287.354528,1171.277847
4,521,768,143,74,111,50,126.280230,60.255278,521.0,-1.652012e-13,...,951.986476,964.896780,907.957406,909.009379,956.391792,966.036051,933.923255,943.625640,974.996622,928.684668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46520,491,684,234,216,216,178,224.256619,197.356415,491.0,7.105427e-14,...,1422.754541,1428.622175,1361.131323,1573.083123,1431.157207,1464.698052,1423.252150,1665.083694,1452.185763,1378.132307
46521,298,364,149,208,136,180,142.043624,193.359060,298.0,-2.842171e-14,...,1695.628540,1787.881929,1789.764515,1910.202155,1681.800641,1907.651012,2040.167169,2072.963190,1667.108287,2022.937578
46522,235,345,210,219,195,196,200.523404,206.736170,235.0,-6.039613e-14,...,1399.381486,1530.439781,1401.851562,1347.494422,1361.041939,1664.847698,1485.990710,1525.224615,1400.377914,1819.843827
46523,269,414,214,233,196,210,205.635688,222.234201,269.0,-2.113865e-13,...,1255.160768,1348.841787,1194.540566,1004.038428,1251.985997,1453.395007,1280.273024,901.265840,1300.529218,1757.844843


In [8]:
# Drop the references to the two source tables; only Xtrain/ytrain are used below.
att_df = nuc_df = None
# Report the number of missing feature values, replace them with 0, and confirm.
nan_before = Xtrain.isna().sum().sum()
print('Count Nan before:',nan_before)
Xtrain = Xtrain.fillna(0)
nan_after = Xtrain.isna().sum().sum()
print('Count Nan after:',nan_after)

Count Nan before: 139801
Count Nan after: 0


## Measure agreement (with a random forest classifier)
Can the RF predict the temperature of each nucleus given only the CellProfiler features?

In [9]:
print(datetime.datetime.now())
print("Shuffle...")
# Shuffle features and labels together so rows stay paired. A fixed seed makes
# the resulting row order repeatable from one run of the notebook to the next.
Xtrain,ytrain=shuffle(Xtrain,ytrain,random_state=42)

2022-06-09 12:24:00.751357
Shuffle...


In [10]:
# Timestamp the start so the cost of cross validation is visible in the log.
print(datetime.datetime.now())
print('Cross validation...')
# RF_Util is a project helper; hand it the shuffled data and collect its scores.
rf1 = RF_Util()
rf1.set_train(Xtrain,ytrain)
cv_scores = rf1.cross_validation()
print(cv_scores)
# Summarise the scores as mean and spread.
accuracy_summary = (cv_scores.mean(),cv_scores.std())
print('Accuracy mean %.4f +/- %.4f' % accuracy_summary)

2022-06-09 12:24:00.852199
Cross validation...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END .................................................... total time= 1.1min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  1.1min remaining:    0.0s


[CV] END .................................................... total time= 1.1min
[CV] END .................................................... total time= 1.1min
[CV] END .................................................... total time= 1.0min
[CV] END .................................................... total time= 1.1min
[0.91155293 0.9130575  0.90725416 0.91112305 0.91133799]
Accuracy mean 0.9109 +/- 0.0019


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  5.3min finished


## Measure feature importance

In [11]:
# Fit one model on all of the training data and list its feature ranking.
# The two timestamps bracket the fit so its duration can be read from the log.
print(datetime.datetime.now())
print('Re-train on full training set...')
# Drop the reference to the cross-validation helper before building a new one.
rf1 = None
rf2 = RF_Util()
rf2.set_train(Xtrain,ytrain)
rf2.fit()
print(datetime.datetime.now())
print('...and rank the features by importance.')
# important_features() is a project helper; presumably it returns a DataFrame
# of (importance, feature name) rows ordered by importance -- confirm in RF_Util.
top = rf2.important_features()
# NOTE: this changes the pandas row-display limit globally, so it also affects
# every cell executed after this one.
pd.set_option('display.max_rows', None)
# .loc slicing is label-based and includes the end label.
top.loc[:200]

2022-06-09 12:29:21.857263
Re-train on full training set...
2022-06-09 12:30:43.491611
...and rank the features by importance.


Unnamed: 0,0,1
0,0.066514,Location_CenterMassIntensity_X_Hematoxylin
1,0.065837,AreaShape_Center_X
2,0.064648,Location_Center_X
3,0.061513,AreaShape_BoundingBoxMaximum_X
4,0.052292,AreaShape_BoundingBoxMinimum_X
5,0.048541,Location_MaxIntensity_X_Hematoxylin
6,0.042566,Location_CenterMassIntensity_Y_Hematoxylin
7,0.035003,AreaShape_Center_Y
8,0.033924,AreaShape_BoundingBoxMinimum_Y
9,0.032761,Location_Center_Y
