# Attention

Class 3

Divide nuclei into low or high attention.  
On each half, roll up the nucleus features to per-patch summary statistics (including quartiles).  
Test whether RF does better on high vs low attention data.  

In [1]:
import datetime
import numpy as np
import pandas as pd
from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [2]:
# Root directory of the CellProfiler output, with one sub-directory per class.
BASEPATH = '/home/jrm/Adjeroh/Naved/CP_80K/'
CLASS_DIR = [
    'Output0/',
    'Output1/',
    'Output2/',
    'Output3/',
    'Output4/',
    'Output5/',
]

# Classes to process. Alternative, to use all 6 classes: CLASSES = range(0, 6)
CLASSES = range(1, 6)  # use smaller 5 classes

# Per-class input files: the nucleus feature table and the attention table.
NUCFILENAME = 'Process100_Nucleus.csv'
ATTFILENAME = 'class_avg_temp.csv'

## Load data
### Attention  
A CNN was trained to assign tissue class to patches.  
The attention was scored as a probability per pixel.  
We call the score a temperature because it is used for heatmaps.  
We used bounding boxes around nuclei as generated by CellProfiler.  
We saved the average temperature per bounding box.  
Thus, we know which nuclei were attention getters.  
In the file, ImageNumber identifies the patch and ObjectNumber identifies the nucleus; the code below keys nuclei by that pair.  
### Nuclei
Our CellProfiler pipeline called Process100 produced nucleus segmentation.  
For each detected nucleus, it generated about 600 features.  
Here, we use nuclei from 80K patches, from train-set WSIs only.   

In [3]:
cls = 3  # class processed by this notebook; passed to get_nuc and used in output file names

In [4]:
def get_nuc(cls,hot,threshold=.5):
    """Return the nuclei of one class that fall on one side of an attention cut-off.

    Reads the class's attention file, keeps the nuclei whose ``AvgTemp`` is
    ``>= threshold`` (``hot`` truthy) or ``< threshold`` (``hot`` falsy), and
    returns the matching rows of the nucleus table obtained from ``CP_Util``.
    Progress and counts are printed along the way.

    Parameters
    ----------
    cls : int
        Index into ``CLASS_DIR`` selecting the class directory to read.
    hot : bool
        True selects high-attention nuclei, False selects low-attention nuclei.
    threshold : float, default .5
        Cut-off applied to ``AvgTemp``. Rows whose ``AvgTemp`` is NaN satisfy
        neither comparison and so end up in neither group.

    Returns
    -------
    pandas.DataFrame
        Rows of the nucleus table, indexed by (PatchNumber, ObjectNumber),
        whose index also appears in the selected attention rows.
    """
    attention = pd.read_csv(BASEPATH+CLASS_DIR[cls]+ATTFILENAME)
    # The mapping expects two of the header names to carry a leading space.
    cols={'ImageNumber':'PatchNumber',' ObjectNumber':'ObjectNumber',' AvgTemp':'AvgTemp'}
    attention = attention.rename(columns=cols).set_index(['PatchNumber','ObjectNumber'])
    num_nuc = len(attention)
    if hot:
        print('Class',cls,'Hot = high attention')
        chosen = attention[attention['AvgTemp']>=threshold]
    else:
        print('Class',cls,'Cold = low attention')
        chosen = attention[attention['AvgTemp']<threshold]
    num_chosen = len(chosen)
    print('From the attention file, chose %d out of %d'%(num_chosen,num_nuc))
    # Guard against an attention file with no rows (would divide by zero).
    percent = 100*num_chosen/num_nuc if num_nuc else 0.0
    print('These nuclei represent %.2f percent'%percent)
    print('This includes nuclei from the train and test sets.')
    cp = CP_Util(BASEPATH+CLASS_DIR[cls])
    cp.train_test_split()
    # NOTE(review): get_nuclei() presumably returns training-set nuclei only
    # (the message below says so) -- confirm against CP_Util.
    nuc_df = cp.get_nuclei().reset_index().set_index(['PatchNumber','ObjectNumber'])
    # Keep only the nuclei whose (patch, object) key was selected above.
    selected = nuc_df[nuc_df.index.isin(chosen.index)]
    num_patches = len(selected.index.unique(level='PatchNumber'))
    num_selected = len(selected)
    print('From the training set, we selected %d nuclei from %d distinct patches.'%
          (num_selected,num_patches))
    return selected

In [5]:
# Roll up the high-attention nuclei of class `cls` to one row per patch and save it.
print(datetime.datetime.now())
hot_nuc = get_nuc(cls, True)
print('Hot groupby... (this is slow)')
# Slow step: describe() computes summary statistics per patch for each column.
rollup = (
    hot_nuc
    .groupby(['PatchNumber'])
    .describe()
)
# Join each two-level column label into one string; helps random forest code.
flat_names = rollup.columns.map('_'.join)
rollup.columns = flat_names
print('Writing file...')
hot_csv = 'Hot_nucleus_rollup_class' + str(cls) + '.csv'
rollup.to_csv(hot_csv)
print(datetime.datetime.now())

2022-06-10 12:23:26.721447
Class 3 Hot = high attention
From the attention file, chose 51319 out of 101611
These nuclei represent 50.51 percent
This includes nuclei from the train and test sets.
Num WSI in test/train sets: 3 14
Num patches in test/train sets: 1091 5273
From the training set, we selected 41640 nuclei from 5182 distinct patches.
Hot groupby... (this is slow)
Writing file...
2022-06-10 13:04:56.053332


In [6]:
# Roll up the low-attention nuclei of class `cls` to one row per patch and save it.
print(datetime.datetime.now())
# Drop the reference to the hot frame first (presumably to release memory).
hot_nuc = None
cold_nuc = get_nuc(cls, False)
print('Cold groupby... (this is slow)')
# Slow step: describe() computes summary statistics per patch for each column.
rollup = (
    cold_nuc
    .groupby(['PatchNumber'])
    .describe()
)
# Join each two-level column label into one string; helps random forest code.
flat_names = rollup.columns.map('_'.join)
rollup.columns = flat_names
print('Writing file...')
cold_csv = 'Cold_nucleus_rollup_class' + str(cls) + '.csv'
rollup.to_csv(cold_csv)
print(datetime.datetime.now())

2022-06-10 13:04:56.061336
Class 3 Cold = low attention
From the attention file, chose 50292 out of 101611
These nuclei represent 49.49 percent
This includes nuclei from the train and test sets.
Num WSI in test/train sets: 3 14
Num patches in test/train sets: 1091 5273
From the training set, we selected 40957 nuclei from 5125 distinct patches.
Cold groupby... (this is slow)
Writing file...
2022-06-10 13:44:41.334185
