# Attention
Class = 1.

Integrate (1) nucleus features as recorded by the CellProfiler Process100 pipeline
and (2) attention level recorded by the CNN.
Use Random Forest to say what CP features are predictive of attention.
Here, we do the analysis per class, just because that is easiest.

In [1]:
import datetime
import numpy as np
import pandas as pd
from RandomForestUtil import RF_Util
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Input locations. Each directory string ends with '/' because the loaders
# build the full path by plain string concatenation (directory + file name).

# Attention labels: per-nucleus average heatmap temperature for class 1.
ATTFILENAME = 'class1_avg_temp.csv'
ATTFILEPATH = '/home/jrm/Adjeroh/Naved/CP_80K/attention_nucleus/'  # Alien

# Nucleus features: CellProfiler "Process100" output.
NUCFILENAME = 'Process100_Nucleus.csv'
NUCFILEPATH = '/home/jrm/Adjeroh/Naved/CP_80K/Output1/'

## Load labels
The labels are the temperatures of the CNN attention heatmap,
specifically an average across the bounding box of each nucleus.

In [3]:
# Header names as they appear in the CSV (two of them carry a leading space)
# mapped to the names used throughout this notebook.
cols = {
    'ImageNumber': 'PatchNumber',
    ' ObjectNumber': 'ObjectNumber',
    ' AvgTemp': 'AvgTemp',
}
# Read, rename, order by (patch, object) and renumber the rows 0..n-1.
att_df = (
    pd.read_csv(ATTFILEPATH + ATTFILENAME)
    .rename(columns=cols)
    .sort_values(by=['PatchNumber', 'ObjectNumber'], axis=0)
    .reset_index(drop=True)
)

In [4]:
# Later, try the RandomForestRegressor. For now use bins and classification.
# Discretize the average temperature into five ordered classes.
bins = [0, 0.3, 0.45, 0.6, 0.75, 1.0]
labels = ['to30%', 'to45%', 'to60%', 'to75%', 'to100%']
# pd.cut builds right-closed intervals by default, i.e. the first bin is
# (0, 0.3]. A temperature of exactly 0 would then fall outside every bin and
# receive a NaN label. include_lowest=True closes the first interval on the
# left so that 0 is labelled 'to30%'.
att_df['bin'] = pd.cut(
    att_df['AvgTemp'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)
att_df

Unnamed: 0,PatchNumber,ObjectNumber,AvgTemp,bin
0,1,1,0.282182,to30%
1,1,2,0.296874,to30%
2,1,3,0.320143,to45%
3,1,4,0.171113,to30%
4,1,5,0.357959,to45%
...,...,...,...,...
248485,12900,9,0.614003,to75%
248486,12900,10,0.773111,to100%
248487,12901,1,0.449821,to45%
248488,12902,1,0.805303,to100%


In [5]:
# Classification target: the temperature bin assigned to each nucleus.
ytrain = att_df.loc[:, 'bin']
# Show how many nuclei landed in each bin.
ytrain.value_counts()

to30%     54393
to100%    52680
to60%     52542
to45%     48277
to75%     40598
Name: bin, dtype: int64

## Load features
These are features of nuclei as determined by our "Process100" pipeline for CellProfiler.
These are the nuclei used to set the bounding boxes for the temperature labels.
Other than that, the features and temperatures were each derived independently.

In [6]:
# Use the same key name as the label table: ImageNumber -> PatchNumber.
cols = {'ImageNumber': 'PatchNumber'}
# Read, rename, order by (patch, object) and renumber the rows 0..n-1,
# mirroring the ordering applied to the label table.
nuc_df = (
    pd.read_csv(NUCFILEPATH + NUCFILENAME)
    .rename(columns=cols)
    .sort_values(by=['PatchNumber', 'ObjectNumber'], axis=0)
    .reset_index(drop=True)
)

In [7]:
# Labels (att_df) and features (nuc_df) are paired purely by row position,
# relying on both frames having been sorted on (PatchNumber, ObjectNumber).
# Confirm the two key sequences are identical before the keys are dropped;
# a missing or extra nucleus in either file would otherwise shift the rows
# and silently pair features with another nucleus's label.
key_cols = ['PatchNumber', 'ObjectNumber']
if not np.array_equal(nuc_df[key_cols].to_numpy(), att_df[key_cols].to_numpy()):
    raise ValueError(
        'Nucleus features and attention labels do not list the same '
        '(PatchNumber, ObjectNumber) rows in the same order.'
    )
# The keys identify a nucleus; they are not predictive features.
Xtrain = nuc_df.drop(columns=key_cols)
Xtrain

Unnamed: 0,AreaShape_Area,AreaShape_BoundingBoxArea,AreaShape_BoundingBoxMaximum_X,AreaShape_BoundingBoxMaximum_Y,AreaShape_BoundingBoxMinimum_X,AreaShape_BoundingBoxMinimum_Y,AreaShape_Center_X,AreaShape_Center_Y,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,...,Texture_Variance_Hematoxylin_4_02_256,Texture_Variance_Hematoxylin_4_03_256,Texture_Variance_Hematoxylin_5_00_256,Texture_Variance_Hematoxylin_5_01_256,Texture_Variance_Hematoxylin_5_02_256,Texture_Variance_Hematoxylin_5_03_256,Texture_Variance_Hematoxylin_7_00_256,Texture_Variance_Hematoxylin_7_01_256,Texture_Variance_Hematoxylin_7_02_256,Texture_Variance_Hematoxylin_7_03_256
0,828,1000,243,25,203,0,221.247585,10.449275,828.0,-1.506351e-12,...,660.780936,735.169789,683.696726,710.438215,660.816968,744.997807,688.634959,707.175777,672.827808,731.769250
1,559,960,196,48,166,16,178.508050,27.883721,559.0,-2.486900e-14,...,968.722014,966.869445,890.918845,997.055721,957.845085,972.602567,896.622657,1091.533923,930.761389,918.591865
2,460,660,211,48,189,18,199.397826,31.791304,460.0,-2.433609e-13,...,1466.518307,1486.244337,1431.984322,1479.225124,1452.123712,1518.000593,1585.057170,1436.823849,1420.359612,1586.742416
3,469,680,116,49,82,29,98.315565,38.153518,469.0,2.273737e-13,...,956.563266,830.863645,984.361014,1130.531320,938.975235,793.983588,957.319994,1104.739661,900.719530,847.637257
4,265,448,196,51,168,35,182.384906,43.886792,265.0,-8.881784e-14,...,1145.633232,1252.419379,899.905323,795.795898,1261.046374,1371.643344,957.799648,730.771488,1491.493975,1604.096659
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
248485,312,630,119,300,89,279,101.141026,288.506410,312.0,-4.973799e-14,...,3464.480569,3908.449427,3760.882618,3530.157049,3616.401475,3886.624114,3755.405100,3757.063020,3979.526033,3774.403946
248486,591,812,235,300,206,272,219.456853,286.974619,591.0,-6.750156e-13,...,1500.882734,1474.259521,1575.219312,1681.106641,1502.956763,1468.299830,1547.554339,1742.616703,1542.471534,1668.649398
248487,390,506,193,71,170,49,179.912821,59.484615,390.0,-5.968559e-13,...,2362.205989,2456.127629,2316.716582,2360.786953,2387.990461,2256.902238,2375.598925,2525.959704,2475.763970,2385.981756
248488,323,441,21,126,0,105,9.786378,115.789474,323.0,-2.131628e-13,...,1393.836400,1383.672418,1250.161006,1418.162355,1359.291995,1338.906038,1279.663899,1904.470085,1319.920812,1271.221273


In [8]:
# Drop the notebook's references to the two full tables; only Xtrain and
# ytrain are used from here on.
att_df = nuc_df = None

# Report the number of missing feature values, replace them with 0 in place,
# and report again to confirm none remain.
nan_before = Xtrain.isna().sum().sum()
print('Count Nan before:', nan_before)
Xtrain.fillna(0, inplace=True)
nan_after = Xtrain.isna().sum().sum()
print('Count Nan after:', nan_after)

Count Nan before: 745864
Count Nan after: 0


## Measure agreement (with a random forest classifier)
Can the RF predict the temperature of each nucleus given only the CellProfiler features?

In [9]:
print(datetime.datetime.now())
print("Shuffle...")
# Shuffle features and labels together (same permutation for both).
# A fixed seed makes the resulting row order identical on every run, so the
# notebook's results can be reproduced after a kernel restart.
SHUFFLE_SEED = 42
Xtrain, ytrain = shuffle(Xtrain, ytrain, random_state=SHUFFLE_SEED)

2022-06-09 09:12:40.944065
Shuffle...


In [None]:
# Timestamp the start so the run time can be read off the output.
print(datetime.datetime.now())
print('Cross validation...')

# RF_Util is the project's helper class; hand it the training data and let it
# run its cross-validation.
rf1 = RF_Util()
rf1.set_train(Xtrain, ytrain)
cv_scores = rf1.cross_validation()
print(cv_scores)

# Summarize the per-fold scores (assumes cv_scores offers .mean()/.std(),
# e.g. a NumPy array -- TODO confirm against RF_Util).
cv_mean, cv_std = cv_scores.mean(), cv_scores.std()
print('Accuracy mean %.4f +/- %.4f' % (cv_mean, cv_std))

2022-06-09 09:12:41.444242
Cross validation...


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


## Measure feature importance

In [None]:
# Re-train through RF_Util on the full (shuffled) training set, then display
# the features ranked by importance. Timestamps bracket the fit so its
# duration can be read off the output.
print(datetime.datetime.now())
print('Re-train on full training set...')
# Drop the reference to the cross-validation helper before building a new one.
rf1 = None
rf2 = RF_Util()
rf2.set_train(Xtrain,ytrain)
rf2.fit()
print(datetime.datetime.now())
print('...and rank the features by importance.')
top = rf2.important_features()
# Lift pandas' display row limit so the slice below is shown in full.
# This is a global option and stays in effect for later cells.
pd.set_option('display.max_rows', None)
# NOTE(review): .loc slices by index *label* and includes the end label, so
# this shows "rows up to label 200". That equals the 201 top-ranked features
# only if important_features() returns a frame with a default 0..n-1 index in
# rank order -- TODO confirm against RF_Util.
top.loc[:200]