# Attention
Second attempt at processing the attention files: split them up into one file per patient.

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
# Print a timestamp so the saved output records when this cell was executed.
print(datetime.now())

2022-08-10 14:49:20.121565


In [2]:
# Directory that holds the per-patient TCGA-* folders and the tracking CSV.
# Code in this notebook appends names to it by plain string concatenation,
# so the trailing slash is required.
BASE_PATH = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'
# Directory that holds the *_bbox_stats.csv attention files (trailing slash required).
ATTENTION_DIR = '/home/jrm/Adjeroh/Glioma/August_Run/Attention/'
# Name of the tracking CSV, located directly under BASE_PATH.
TRACKING_FILE = 'PatchTracking.csv'
# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
#   WSI ID             = first 23 characters (TCGA-06-0129-01Z-00-DX1)
#   patient / case ID  = first 19 characters (TCGA-06-0129-01Z-00)
LEN_CASE_ID = 19
LEN_WSI_ID = 23

In [3]:
# Original imagenum: patch number, starting at 1 on each CellProfiler run e.g. Output5.1
#    Since CP ran 8 different times, 8 patients have patch numbers starting at 1, but most start higher.
#    The original imagenum is present in the attention files.
# New imagenum: patch number, starting at 1 for each patient, regardless of when CellProfiler ran
#    This number is present in the Image.csv and Nucleus.csv files organized per patient.
# Converter: a dict that maps (patient,orig_imagenum) to new_imagenum
def load_imagenum_converter():
    """Build the patch-renumbering lookup from the tracking CSV.

    Reads BASE_PATH + TRACKING_FILE and returns a dict whose keys are
    (patient_directory, orig_imagenum) tuples and whose values are the
    matching new_imagenum. Both image numbers are coerced to int.
    """
    tracking = pd.read_csv(BASE_PATH+TRACKING_FILE)
    lookup = {}
    for _, record in tracking.iterrows():
        lookup_key = (record['patient_directory'], int(record['orig_imagenum']))
        lookup[lookup_key] = int(record['new_imagenum'])
    return lookup
# Build the lookup once at module level; process_one_file reads this dict.
converter = load_imagenum_converter()
# Spot check a single (patient, original patch number) pair.
converter[('TCGA-02-0004-01Z-00', 298)]

298

In [4]:
def get_attention_df(fold,multiclass,cancer,train):
    """Load one attention bounding-box statistics CSV into a DataFrame.

    The file is read from ATTENTION_DIR and is named like
    cp_datasets_bcls_fold4_test_class0_bbox_stats.csv, where:
      fold       -- number placed after 'fold'
      multiclass -- truthy selects 'mcls', otherwise 'bcls' (binary)
      cancer     -- number placed after 'class'
      train      -- truthy selects 'train', otherwise 'test' (validation)
    """
    mb = 'mcls' if multiclass else 'bcls'
    usage = 'train' if train else 'test'
    filename=f"cp_datasets_{mb}_fold{fold}_{usage}_class{cancer}_bbox_stats.csv"
    # Every column but the first carries leading blanks in these files,
    # so ask the parser to skip whitespace after each delimiter.
    return pd.read_csv(ATTENTION_DIR + filename, skipinitialspace=True)
# Example load: multiclass, fold 0, training split, class 0.
df = get_attention_df(fold=0, multiclass=True, cancer=0, train=True)
df

Unnamed: 0,ImageName,ImageNumber,ObjectNumber,MeanTemp,MedianTemp,MinTemp,MaxTemp,StdTemp
0,TCGA-02-0004-01Z-00-DX1_4800_3600.png,298,1,0.461643,0.458882,0.399096,0.540614,0.033000
1,TCGA-02-0004-01Z-00-DX1_4800_3600.png,298,2,0.529523,0.530288,0.448266,0.609818,0.035756
2,TCGA-02-0004-01Z-00-DX1_4800_3600.png,298,3,0.494011,0.495230,0.396912,0.598675,0.044906
3,TCGA-02-0004-01Z-00-DX1_4800_3600.png,298,4,0.750842,0.750409,0.710556,0.798023,0.022095
4,TCGA-02-0004-01Z-00-DX1_4800_3600.png,298,5,0.616764,0.621533,0.474238,0.735776,0.055920
...,...,...,...,...,...,...,...,...
786956,TCGA-28-1749-01Z-00-DX1_10200_9000.png,44261,23,0.870074,0.870077,0.826139,0.913993,0.023133
786957,TCGA-28-1749-01Z-00-DX1_10200_9000.png,44261,24,0.733821,0.730985,0.718110,0.764831,0.011261
786958,TCGA-28-1749-01Z-00-DX1_10200_9000.png,44261,25,0.704033,0.701771,0.692021,0.731890,0.009334
786959,TCGA-28-1749-01Z-00-DX1_10200_9000.png,44261,26,0.716644,0.716222,0.716221,0.719934,0.000981


In [5]:
def cleanup(base_path=None):
    """Delete every Attention_* file found under the TCGA-* directories.

    Each entry of `base_path` whose name starts with 'TCGA-' is walked
    recursively, and every file whose name starts with 'Attention_' is
    removed from the directory it was actually found in.

    base_path -- directory to scan; defaults to the module-level BASE_PATH.

    Raises OSError (e.g. PermissionError) if a matching file cannot be removed.
    """
    if base_path is None:
        base_path = BASE_PATH
    for directory in os.listdir(base_path):
        if not directory.startswith('TCGA-'):
            continue
        for rootDir, subdirs, filenames in os.walk(os.path.join(base_path, directory)):
            for filename in filenames:
                if filename.startswith('Attention_'):
                    # Join on rootDir, not the top-level patient directory:
                    # os.walk descends into subdirectories, so the file may
                    # live deeper than base_path/directory.
                    os.remove(os.path.join(rootDir, filename))

In [18]:
def process_one_file(fold,multiclass,cancer,train):
    """Split one attention CSV into per-patient Attention_*.csv files.

    Loads the attention file selected by (fold, multiclass, cancer, train)
    via get_attention_df, then appends each row to
    BASE_PATH/<patient>/Attention_<mb>_fold<fold>.csv, where <patient> is the
    first LEN_CASE_ID characters of the row's ImageName. The original
    ImageNumber is replaced by the per-patient number looked up in the
    module-level `converter`. A header line is written only when the output
    file does not exist yet, so repeated calls accumulate rows in one file.

    Output columns: ImageName, ImageNumber (new), ObjectNumber, MedTemp,
    AvgTemp, MaxTemp, Usage ('train' or 'valid').

    Raises KeyError if (patient, ImageNumber) is not present in `converter`.
    """
    attention = get_attention_df(fold,multiclass,cancer,train)
    mb = 'bcls'  # binary
    if multiclass:
        # NOTE(review): 'mcsl' looks like a transposition of 'mcls' (the tag
        # get_attention_df uses). Left unchanged because it determines the
        # output file name; confirm with downstream readers before renaming.
        mb = 'mcsl'
    train_or_valid = 'train' if train else 'valid'
    filename = f"Attention_{mb}_fold{fold}.csv"
    outhandle = None
    prev_patient = None
    # try/finally guarantees the currently open output file is closed even if
    # a row fails part-way through (e.g. a missing converter key).
    try:
        for _, row in attention.iterrows():
            patient = row['ImageName'][:LEN_CASE_ID]
            if patient != prev_patient:
                # Patient changed: switch to that patient's output file.
                if outhandle is not None:
                    outhandle.close()
                    outhandle = None
                prev_patient = patient
                outfile = BASE_PATH + patient + '/' + filename
                # Decide before opening: append mode creates the file.
                write_header = not os.path.exists(outfile)
                outhandle = open(outfile,'a+')
                if write_header:
                    outhandle.write('ImageName,ImageNumber,ObjectNumber,MedTemp,AvgTemp,MaxTemp,Usage\n')
            orig_imagenum=row['ImageNumber']
            new_imagenum =str(converter[(patient,orig_imagenum)])
            imagename    =row['ImageName']
            objectnum    =str(row['ObjectNumber'])
            medtemp      =str(row['MedianTemp'])
            avgtemp      =str(row['MeanTemp'])
            maxtemp      =str(row['MaxTemp'])
            out_cols=[imagename,new_imagenum,objectnum,medtemp,avgtemp,maxtemp,train_or_valid]
            outhandle.write(','.join(out_cols)+'\n')
    finally:
        if outhandle is not None:
            outhandle.close()
        

In [19]:
print(datetime.now())
print('Delete all Attention files...')
# Remove earlier results first: process_one_file opens its outputs in append mode.
cleanup()
print(datetime.now())

# Parameter grid; each combination names one attention CSV to process.
folds = [0, 1, 2, 3, 4]
multi_or_binary = [True]  # later, add False for bcls = binary
cancers = [0, 1, 2, 3, 4, 5]
train_or_valid = [True, False]

for is_multi in multi_or_binary:
    print('is_multi', is_multi)
    for is_train in train_or_valid:
        print('is_train:', is_train)
        for fold in folds:
            # One progress line per fold, listing each class as it is processed.
            print('fold', fold, end=' ')
            for cancer in cancers:
                print('cancer', cancer, end=' ')
                process_one_file(fold, is_multi, cancer, is_train)
            print()
print(datetime.now())

2022-08-10 15:08:25.827246
Delete all Attention files...
2022-08-10 15:08:25.908002
is_multi True
is_train: True
fold 0 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 1 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 2 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 3 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 4 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
is_train: False
fold 0 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 1 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 2 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 3 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
fold 4 cancer 0 cancer 1 cancer 2 cancer 3 cancer 4 cancer 5 
2022-08-10 15:13:28.372073


In [17]:
# NOTE(review): leftover timing cell. Its execution count (17) and recorded
# timestamp are earlier than the run above, so it was executed out of order.
print(datetime.now())

2022-08-10 15:01:32.453654
