# Cell Profiler - Per Patient
Create the patch tracking file.

The previous notebook, 01, put a patch tracking file in each patient directory.  
This notebook creates one file for all.  
We can delete those and use the one instead.  

Problem: 
Need to align rows of Nucleus.csv to rows of Image.csv files.   
The original image file contains the patch filename, but the nucleus file does not.   
The nucleus file contains an ImageNum that corresponds to a row of the image file.   
The original image numbers started at 1 for each run, so they weren't unique.   
We renumbered image file rows starting at 1 for each patient.  
Now we need a way to align nucleus rows to their image numbers.   

Implementation detail:
The patch tracking file stores original and new image num as well as original patch filename.   
For validation purposes, it stores one measurement.  
We check that we get the same measurement between these three files:
* Original image.csv (each file contains patch measurements for many patients of one cancer class)
* Patch tracking.csv (relates original to new image num)
* New image.csv (each file contains patch measurements for one patient)

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
print(datetime.now())

2022-08-08 14:23:04.426136


In [2]:
# --- Locations ---------------------------------------------------------------
BASE_PATH_IN  = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/'
BASE_PATH_OUT = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'

# Input sub-directories (each keeps its trailing slash so paths can be
# built by plain string concatenation).
INPUT_DIRS = [
    'Output5/', 'Output5.1/',
    'Output4/', 'Output4.1/',
    'Output3/', 'Output3.1/',
    'Output2/',
    'Output1/',
    'Output0/',
]

# CSV file names. 'Image' must stay first because IMAGE_FILE is taken
# from position 0.
FILENAMES = ['Process100_'+stem+'.csv' for stem in (
    'Image', 'Cells', 'ExpandCells', 'Experiment', 'MergeRBC',
    'Nucleus', 'RBC', 'ShrinkRBC', 'Tissue')]
IMAGE_FILE = FILENAMES[0]

# Single tracking file shared by all patients, stored in the output root.
TRACKING_FILE = BASE_PATH_OUT+'PatchTracking.csv'

# --- Column names ------------------------------------------------------------
EXTRA_COL   = 'Median_Tissue_Granularity_15_Eosin'  # measurement kept for validation
IMAGE_COL   = 'ImageNumber'
TUMOR_COL   = 'FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL = 'Patient'          # add this column to emphasize patient ID

# --- ID prefixes of the patch filename ---------------------------------------
# Patient (case) ID = first 19 letters; WSI ID = first 23 letters.
LEN_CASE_ID = 19
LEN_WSI_ID  = 23

In [3]:
# For convenience, add a column containing patient ID.
def add_patient_column(df):
    """Attach the patient ID to every row of an Image dataframe.

    The ID is the first LEN_CASE_ID characters of the TUMOR_COL value
    (converted to str first) and is stored in PATIENT_COL.
    The dataframe is modified in place and also returned.
    """
    df[PATIENT_COL] = df[TUMOR_COL].astype(str).str.slice(stop=LEN_CASE_ID)
    return df

In [4]:
def make_row(patch_filename,orig_directory,orig_imagenum,patient_directory,new_imagenum,extra):
    """Build one tracking-file record as a dict.

    `extra` is stored under EXTRA_COL; it is an arbitrary measurement kept
    only so a later check can confirm the right row was matched.
    """
    fields = ('patch_filename','orig_directory','orig_imagenum',
              'patient_directory','new_imagenum',EXTRA_COL)
    values = (patch_filename,orig_directory,orig_imagenum,
              patient_directory,new_imagenum,extra)
    return dict(zip(fields,values))

In [5]:
# Process all patients within a given dataframe.
# Expect a multi-patient dataframe.
def extract_patients(df,rows,orig_directory):
    """Append one tracking record per patch to `rows`, renumbering images per patient.

    Parameters
    ----------
    df : DataFrame
        Image rows for one or more patients. Must contain the columns named by
        TUMOR_COL, IMAGE_COL, EXTRA_COL and PATIENT_COL.
    rows : list
        Receives the dicts built by make_row; mutated in place.
    orig_directory : str
        Directory the dataframe was read from; stored unchanged in each record.

    Returns
    -------
    None

    Within each patient, rows keep their dataframe order and are numbered
    1, 2, 3, ... as the new image number.
    """
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for one_patient in patients:
        patient_data = df.loc[df[PATIENT_COL]==one_patient]
        # Only three columns are needed, so iterate over those directly rather
        # than materialising every (wide) row with iterrows().
        patches = zip(patient_data[TUMOR_COL],
                      patient_data[IMAGE_COL],
                      patient_data[EXTRA_COL])
        for new_imagenum,(patch_filename,orig_imagenum,extra) in enumerate(patches,start=1):
            # The patient ID is the value this group was selected by; slicing
            # the filename again would fail on a non-string filename.
            row = make_row(patch_filename,orig_directory,orig_imagenum,
                           one_patient,new_imagenum,extra)
            rows.append(row)

In [None]:
# Main loop.
# Read the Image.csv of every input directory and collect tracking rows.
print(datetime.now())
rows = []
for indir in INPUT_DIRS:
    infile = ''.join([BASE_PATH_IN,indir,IMAGE_FILE])
    print('Reading from',infile)
    # Missing measurements are replaced by 0 before the patient column is added.
    df = add_patient_column(pd.read_csv(infile).fillna(0))
    extract_patients(df,rows,indir)
    print(datetime.now())

2022-08-08 14:23:04.441003
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5/Process100_Image.csv
Patients to process: ['TCGA-DB-A4XG-01Z-00' 'TCGA-HT-A5R9-01Z-00' 'TCGA-QH-A6CU-01Z-00']
2022-08-08 14:23:06.106073
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5.1/Process100_Image.csv
Patients to process: ['TCGA-DB-A64P-01Z-00' 'TCGA-DH-A669-01Z-00' 'TCGA-DH-A66G-01Z-00'
 'TCGA-HW-A5KJ-01Z-00' 'TCGA-QH-A65R-01Z-00' 'TCGA-QH-A6X8-01Z-00'
 'TCGA-S9-A6TW-01Z-00' 'TCGA-S9-A6TX-01Z-00' 'TCGA-S9-A7J2-01Z-00'
 'TCGA-S9-A7J3-01Z-00']
2022-08-08 14:23:10.092234
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output4/Process100_Image.csv
Patients to process: ['TCGA-DB-A4XF-01Z-00' 'TCGA-DH-A66B-01Z-00' 'TCGA-HT-A5RA-01Z-00'
 'TCGA-HT-A618-01Z-00' 'TCGA-S9-A6U6-01Z-00' 'TCGA-S9-A6U9-01Z-00'
 'TCGA-S9-A6WL-01Z-00']
2022-08-08 14:23:13.025737
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output4.1/Pro

In [None]:
# Write every collected tracking row to the single shared tracking file.
print(datetime.now())
print('Saving',len(rows),'rows')
pd.DataFrame(rows).to_csv(TRACKING_FILE,index=False,na_rep='nan')
df = None   # drop the reference to the last dataframe held under this name

In [None]:
# Spot-check the tracking file.
# Every CHECK_EVERY-th tracking row is compared against
#   (1) the original multi-patient Image.csv it came from, and
#   (2) the per-patient Image.csv it was renumbered into.
# The patch filename and the EXTRA_COL measurement must agree.
print(datetime.now())
print('Validating outputs')
tracking_df = pd.read_csv(TRACKING_FILE)
errors = 0
nans = 0
nonzero = 0
checked = 0
CHECK_EVERY = 300
# The tracking rows are produced directory by directory and, within each,
# patient by patient. Remembering only the most recently loaded CSV of each
# kind therefore avoids re-reading the same file for consecutive samples
# while keeping at most two Image dataframes in memory.
orig_file_loaded = None
new_file_loaded = None
# read_csv assigns a default 0..n-1 index, so a positional stride selects
# exactly the rows whose index is a multiple of CHECK_EVERY.
for ndx,data in tracking_df.iloc[::CHECK_EVERY].iterrows():
    print('.',end='')
    extra=data[EXTRA_COL]
    patient=data['patient_directory']
    patch_filename=data['patch_filename']
    orig_directory=data['orig_directory']
    orig_imagenum=data['orig_imagenum']
    new_imagenum=data['new_imagenum']
    if pd.isna(extra):
        nans += 1
    if extra != 0:
        nonzero += 1
    #
    # Check against the original file.
    orig_file = BASE_PATH_IN+orig_directory+IMAGE_FILE
    if orig_file != orig_file_loaded:
        orig_df = pd.read_csv(orig_file)
        orig_df = orig_df.fillna(0)   # only the original file should contain nan
        orig_file_loaded = orig_file
    row = orig_df[orig_df[IMAGE_COL]==orig_imagenum]
    if len(row)!=1:
        print('Error! Expected one of orig imagenum',orig_imagenum)
        print(patch_filename)
        errors += 1
    # With zero matches there is nothing to compare; the error is already counted.
    if len(row)>0:
        orig_filename = row.iloc[0][TUMOR_COL]
        if orig_filename != patch_filename:
            print('Error! expected',patch_filename,'got',orig_filename)
            print(patch_filename)
            errors += 1
        orig_extra = row.iloc[0][EXTRA_COL]
        if orig_extra != extra:
            print('Error! Expected orig',extra,orig_extra)
            print(patch_filename)
            errors += 1
    #
    # Check against the per-patient file.
    new_file = BASE_PATH_OUT+patient+'/'+IMAGE_FILE
    if new_file != new_file_loaded:
        new_df = pd.read_csv(new_file)
        new_file_loaded = new_file
    row = new_df[new_df[IMAGE_COL]==new_imagenum]
    if len(row)!=1:
        print('Error! Expected one of new imagenum',new_imagenum)
        print(patch_filename)
        errors += 1
    if len(row)>0:
        new_extra = row.iloc[0][EXTRA_COL]
        if new_extra != extra:
            print('Error! Expected new',extra,new_extra)
            print(patch_filename)
            errors += 1
    checked += 1
print('\nTotal checked',checked)
print('Total nans',nans)
print('Total nonzero',nonzero)
if errors == 0:
    print('Validated!')
else:
    print('There were',errors,'errors')
print(datetime.now())