# Attention Quintile Roll

Class 2

Like the Attention HalfRoll notebook,
divide nuclei into low- or high-attention groups,
but exclude the middle range.
Test whether the random forest (RF) does better on high- vs low-attention data.

In [1]:
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

from CellProfiler_Util import CP_Util
from RandomForestUtil import RF_Util

In [2]:
# Root directory of the CP_80K data set, selected by operating system so the
# notebook runs on either machine without editing this cell. (Both paths used to
# be assigned one after the other, which made the Windows path win everywhere.)
if os.name=='nt':
    BASEPATH='D:\\Adjeroh\\Naved\\CP_80K\\'  # Windows
else:
    BASEPATH='/home/jrm/Adjeroh/Naved/CP_80K/'  # Alien
# One sub-directory per class, indexed by class number.
CLASS_DIR=['Output0/','Output1/','Output2/','Output3/','Output4/','Output5/',]
# Use the smaller 5 classes; switch to range(0,6) to use all 6 classes.
CLASSES=range(1,6)
# File names read from each class directory.
NUCFILENAME='Process100_Nucleus.csv'
ATTFILENAME='class_avg_temp.csv'
# Prefixes of the per-class rollup CSV files written by this notebook.
HOT_NAME='Hot_quintile_rollup_'
COLD_NAME='Cold_quintile_rollup_'

## Load data 

In [3]:
# Class processed by this notebook: passed to get_nuc (where it indexes CLASS_DIR)
# and embedded in the names of the rollup CSV files that are written.
cls=2

In [4]:
# Bin edges and their labels. The bins are (approximately) equal-probability.
# See the Attention.? notebook for counts.
bins = [0, 0.3, 0.45, 0.6, 0.75, 1.0]
labels = ['to30%', 'to45%', 'to60%', 'to75%', 'to100%']
# Selection thresholds taken from the bin edges: "cold" is everything below the
# first interior edge, "hot" is everything at or above the last interior edge.
MAX_COLD, MIN_HOT = bins[1], bins[-2]

In [5]:
def get_nuc(cls,hot):
    """Return the nuclei of one class that fall in the hot or cold attention tail.

    Reads the attention CSV of class `cls`, keeps the nuclei whose AvgTemp is
    >= MIN_HOT (hot) or < MAX_COLD (cold), and returns the matching rows of the
    nuclei table supplied by CP_Util. Progress and counts are printed.

    Parameters
    ----------
    cls : int
        Index into CLASS_DIR selecting which class directory to read.
    hot : bool
        Truthy selects high-attention nuclei; falsy selects low-attention nuclei.

    Returns
    -------
    pandas.DataFrame
        Rows of cp.get_nuclei() re-indexed by (PatchNumber, ObjectNumber) whose
        index also appears in the attention selection.
    """
    key = ['PatchNumber','ObjectNumber']
    # Two of the rename keys carry a leading space, presumably matching the raw
    # CSV header exactly -- TODO confirm against class_avg_temp.csv.
    cols={'ImageNumber':'PatchNumber',' ObjectNumber':'ObjectNumber',' AvgTemp':'AvgTemp'}
    attention = (pd.read_csv(BASEPATH+CLASS_DIR[cls]+ATTFILENAME)
                 .rename(columns=cols)
                 .set_index(key))
    num_nuc = len(attention)
    if hot:
        print('Class',cls,'Hot = high attention')
        selected = attention[attention['AvgTemp']>=MIN_HOT]
    else:
        print('Class',cls,'Cold = low attention')
        selected = attention[attention['AvgTemp']<MAX_COLD]
    num_selected = len(selected)
    # Guard the percentage: an attention file with no rows used to raise
    # ZeroDivisionError here.
    percent = 100*num_selected/num_nuc if num_nuc else 0.0
    print('From the attention file, chose %d out of %d'%(num_selected,num_nuc))
    print('These nuclei represent %.2f percent'%(percent,))
    print('This includes nuclei from the train and test sets.')
    cp = CP_Util(BASEPATH+CLASS_DIR[cls])
    cp.train_test_split()
    # Re-index without inplace=True so the frame handed back by CP_Util is not
    # mutated. The message below states these are training-set nuclei; that
    # depends on what get_nuclei() returns after train_test_split() -- TODO confirm.
    nuc_df = cp.get_nuclei().reset_index().set_index(key)
    chosen_nuc = nuc_df[nuc_df.index.isin(selected.index)]
    num_patches = len(chosen_nuc.index.unique(level='PatchNumber'))
    print('From the training set, we selected %d nuclei from %d distinct patches.'%
          (len(chosen_nuc),num_patches))
    return chosen_nuc

In [6]:
# Roll up the high-attention nuclei of class `cls` into per-patch summary
# statistics and save them as a CSV. Timestamps bracket the (slow) run.
print(datetime.datetime.now())
hot_nuc = get_nuc(cls,True)
print('Hot groupby... (this is slow)')
# describe() produces two-level (feature, statistic) column labels per patch.
rollup = hot_nuc.groupby(['PatchNumber']).describe()
# Flatten each label pair into one 'feature_statistic' string, which the random
# forest code handles more easily.
rollup.columns = ['_'.join(pair) for pair in rollup.columns]
print('Writing file...')
rollup.to_csv('%sclass%s.csv'%(HOT_NAME,cls))
print(datetime.datetime.now())

2022-06-16 12:23:55.321823
Class 2 Hot = high attention
From the attention file, chose 55749 out of 253554
These nuclei represent 21.99 percent
This includes nuclei from the train and test sets.
Train: 14 participants, 30 WSI, 11138 patches.
Test: 3 participants, 7 WSI, 2812 patches.
From the training set, we selected 44849 nuclei from 10072 distinct patches.
Hot groupby... (this is slow)
Writing file...
2022-06-16 14:27:56.611935


In [7]:
# Roll up the low-attention nuclei of class `cls` into per-patch summary
# statistics and save them as a CSV. Timestamps bracket the (slow) run.
print(datetime.datetime.now())
# Drop the reference to the hot selection before loading the cold one.
hot_nuc = None
cold_nuc = get_nuc(cls,False)
print('Cold groupby... (this is slow)')
# describe() produces two-level (feature, statistic) column labels per patch.
rollup = cold_nuc.groupby(['PatchNumber']).describe()
# Flatten each label pair into one 'feature_statistic' string, which the random
# forest code handles more easily.
rollup.columns = ['_'.join(pair) for pair in rollup.columns]
print('Writing file...')
rollup.to_csv('%sclass%s.csv'%(COLD_NAME,cls))
print(datetime.datetime.now())

2022-06-16 14:27:56.632124
Class 2 Cold = low attention
From the attention file, chose 55799 out of 253554
These nuclei represent 22.01 percent
This includes nuclei from the train and test sets.
Train: 14 participants, 30 WSI, 11138 patches.
Test: 3 participants, 7 WSI, 2812 patches.
From the training set, we selected 44876 nuclei from 9852 distinct patches.
Cold groupby... (this is slow)
Writing file...
2022-06-16 16:22:45.079850
