# Cell Profiler - Per Patient
The raw CellProfiler outputs are organized per run.  

Here, redistribute outputs to one directory per patient.  
Based on TumorIII/Image.04 notebook which worked out the Image.csv mechanics.   

At the same time, drop the columns we don't want.  
Based on TumorIII/Image.05 notebook which generated the Filtered directory.

To do: redistribute the other CSV files too; so far only `Process100_Image.csv` is processed.  

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
# Print the current timestamp so the notebook output records when this run began.
print(datetime.now())

2022-07-31 09:23:06.165772


In [2]:
# Input root (one subdirectory per CellProfiler run) and output root
# (one subdirectory per patient).
BASE_PATH_IN = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/'
BASE_PATH_OUT = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'

# Run subdirectories under BASE_PATH_IN, listed in processing order.
INPUT_DIRS = [
    'Output5/', 'Output5.1/',
    'Output4/', 'Output4.1/',
    'Output3/', 'Output3.1/',
    'Output2/',
    'Output1/',
    'Output0/',
]

# CellProfiler CSV file names; index 0 is the Image table.
FILENAMES = [
    'Process100_' + table + '.csv'
    for table in (
        'Image', 'Cells', 'ExpandCells', 'Experiment', 'MergeRBC',
        'Nucleus', 'RBC', 'ShrinkRBC', 'Tissue',
    )
]

IMAGE_COL = 'ImageNumber'
TUMOR_COL = 'FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL = 'Patient'        # add this column to emphasize patient ID

# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
# For WSI ID, use the first 23 characters.
# For patient or case ID, use the first 19 characters.
LEN_CASE_ID = 19
LEN_WSI_ID = 23

In [3]:
# Cell Profiler assigns an ImageNumber to each patch, starting with 1,
# so the second patient's patches start at some higher number.
# Here, renumber so each patient's patches start again at 1.
def renumber_images(df):
    """Shift the IMAGE_COL column of df in place so its smallest value is 1.

    Every value is reduced by the same offset (old minimum minus one), so
    any gaps between image numbers are preserved. The shift is applied to
    the whole column at once and therefore does not depend on the row
    index of df.

    Parameters:
        df: DataFrame holding one patient's rows; must contain IMAGE_COL.
            It is modified in place. An empty frame is left untouched.

    Returns:
        None. The min/max before and after the shift are printed.
    """
    if len(df) == 0:
        # Nothing to renumber, and min/max of an empty column are undefined.
        return
    image_nums = df[IMAGE_COL]
    min_image_num = image_nums.min()
    max_image_num = image_nums.max()
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    # Vectorized shift: one column operation instead of a per-row loop.
    df[IMAGE_COL] = image_nums - min_image_num + 1
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [4]:
# Save csv to the output directory which is named after the patient/case ID.
# Create the output directory if it does not already exist from a prior run.
def _save(df,dirname,filename):
    """Write df as a CSV file (without the index) to BASE_PATH_OUT/dirname/filename.

    The target directory is created when missing, together with any
    missing parent directories, so a first run does not require
    BASE_PATH_OUT to exist already. If the directory already exists a
    notice is printed and an existing file of the same name is overwritten.

    Parameters:
        df: DataFrame to write.
        dirname: subdirectory of BASE_PATH_OUT (the patient/case ID).
        filename: name of the CSV file inside that subdirectory.
    """
    outdir = os.path.join(BASE_PATH_OUT,dirname)
    try:
        os.makedirs(outdir)
    except FileExistsError:
        print('Directory',dirname,'already exists. Will overwrite files in there.')
    outfile = os.path.join(outdir,filename)
    df.to_csv(outfile,index=False)

In [5]:
# For convenience, add a column containing patient ID.
def add_patient_column(df):
    """Add PATIENT_COL to df, derived from the patch file name in TUMOR_COL.

    The patient ID is the first LEN_CASE_ID characters of the file name
    (after conversion to str). df is modified in place and also returned.
    """
    filenames = df[TUMOR_COL].astype(str)
    df[PATIENT_COL] = filenames.str.slice(stop=LEN_CASE_ID)
    return df

In [6]:
def _drop_cols(df,cols):
    if len(cols)>0:
        df = df.drop(columns=cols) 
    return df

In [7]:
# Some tracking columns must be removed before training.
def drop_bad_cols(df):
    """Return df without bookkeeping columns that must not be used for training.

    Removed columns:
      * names starting with 'ExecutionTime_' or 'AreaOccupied_TotalArea_'
      * names ending with '_Tumor'
      * Group_Index, Group_Number, ImageSet_ImageSet, Metadata_FileLocation
        and the PATIENT_COL column
      * ProcessingStatus

    Fixed-name columns that are absent from df are skipped rather than
    raising KeyError, so frames with a different column set can be passed.
    """
    # Timing stats can give away classes if classes were processed differently.
    bad_cols = [c for c in df.columns if c.startswith('ExecutionTime_')]
    df = _drop_cols(df,bad_cols)
    # The total patch area is constant so no use for training.
    bad_cols = [c for c in df.columns if c.startswith('AreaOccupied_TotalArea_')]
    df = _drop_cols(df,bad_cols)
    # These columns are mostly empty except for FileName_Tumor.
    # FileName_Tumor can give away the class or at least the patient.
    bad_cols = [c for c in df.columns if c.endswith('_Tumor')]
    df = _drop_cols(df,bad_cols)
    # These other string columns could give away the class or at least the patient.
    # Note we added the Patient column to help split the dataframe by patient.
    named_cols = [
        'Group_Index','Group_Number','ImageSet_ImageSet',
        PATIENT_COL,'Metadata_FileLocation']
    # Only drop the ones actually present; not every frame carries all of them.
    bad_cols = [c for c in named_cols if c in df.columns]
    df = _drop_cols(df,bad_cols)
    # Windows runs have this, Linux runs don't: ProcessingStatus
    ps = 'ProcessingStatus'
    if ps in df.columns:
        df = df.drop(columns=ps)
        print('Removed column ProcessingStatus')
    return df

In [8]:
# Convert NaN to zero
# Usual cause of NaN is mean_RBC_diameter where Count_RBC=0
def drop_bad_vals(df):
    """Replace NaN with 0 and verify that df holds only finite numeric data.

    Column dtypes are checked first, so a non-numeric column is reported
    with the intended error instead of a TypeError from np.isinf.

    Parameters:
        df: DataFrame to clean. NaN values are filled in place.

    Returns:
        The same DataFrame, with NaN replaced by 0.

    Raises:
        ValueError: if df has a column that is neither numeric nor bool,
            or if any NaN or infinite value remains after filling.
    """
    non_numeric = df.select_dtypes(exclude=['number','bool'])
    if len(non_numeric.columns)>0:
        raise ValueError('Non-numeric values in df')
    df.fillna(0,inplace=True)
    nan = df.isna().sum().sum()
    inf = np.isinf(df).values.sum()
    if nan>0 or inf>0:
        raise ValueError('Non-numeric values in df')
    return df

In [9]:
# Process all patients within a given dataframe.
# Expect a multi-patient dataframe.
def extract_patients(df,filename):
    """Split df by PATIENT_COL, clean each patient's rows and save them.

    For every distinct patient the rows are selected, re-indexed,
    renumbered, stripped of unwanted columns, checked for bad values and
    written under the patient's directory with the given file name.
    Progress and the shape before and after cleaning are printed.
    """
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for case_id in patients:
        print('Start patient',case_id)
        belongs_to_case = df[PATIENT_COL]==case_id
        rows = df.loc[belongs_to_case]
        shape_before = rows.shape
        # Start the row labels at zero for the per-patient frame.
        rows = rows.reset_index(drop=True)
        renumber_images(rows)
        cleaned = drop_bad_cols(rows)
        cleaned = drop_bad_vals(cleaned)
        _save(cleaned,case_id,filename)
        print('Patient',case_id,'saved to filename',filename)
        shape_after = cleaned.shape
        print('Patient',case_id,'shape change from',shape_before,'to',shape_after)

In [10]:
# Main loop.
# Process Image.csv
# For each run directory: read the Image table, tag every row with its
# patient ID, then split, clean and save the rows per patient.
# A timestamp is printed at the start and after each run directory.
# datetime and os come from the import cell at the top of the notebook.
print(datetime.now())
filename = FILENAMES[0]
for indir in INPUT_DIRS:
    infile = os.path.join(BASE_PATH_IN,indir,filename)
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df = add_patient_column(df)
    extract_patients(df,filename)
    print(datetime.now())
print('Done')

2022-07-31 09:23:06.233432
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5/Process100_Image.csv
Original dataframe shape: (1591, 5346)
Patients to process: ['TCGA-DB-A4XG-01Z-00' 'TCGA-HT-A5R9-01Z-00' 'TCGA-QH-A6CU-01Z-00']
Start patient TCGA-DB-A4XG-01Z-00
Original ImageNumber min,max: 1 790
Revised ImageNumber min,max: 1 790
Patient TCGA-DB-A4XG-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-DB-A4XG-01Z-00 shape change from (790, 5347) to (790, 5302)
Start patient TCGA-HT-A5R9-01Z-00
Original ImageNumber min,max: 791 1186
Revised ImageNumber min,max: 1 396
Patient TCGA-HT-A5R9-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-HT-A5R9-01Z-00 shape change from (396, 5347) to (396, 5302)
Start patient TCGA-QH-A6CU-01Z-00
Original ImageNumber min,max: 1187 1591
Revised ImageNumber min,max: 1 405
Patient TCGA-QH-A6CU-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-QH-A6CU-01Z-00 shape change from (405, 5347) to (405, 5302)
2022-

Patient TCGA-S9-A7IZ-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-S9-A7IZ-01Z-00 shape change from (400, 5348) to (400, 5302)
Start patient TCGA-S9-A7R2-01Z-00
Original ImageNumber min,max: 2420 2813
Revised ImageNumber min,max: 1 394
Removed column ProcessingStatus
Patient TCGA-S9-A7R2-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-S9-A7R2-01Z-00 shape change from (394, 5348) to (394, 5302)
Start patient TCGA-TM-A7C3-01Z-00
Original ImageNumber min,max: 2814 3213
Revised ImageNumber min,max: 1 400
Removed column ProcessingStatus
Patient TCGA-TM-A7C3-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-TM-A7C3-01Z-00 shape change from (400, 5348) to (400, 5302)
2022-07-31 09:23:55.007992
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output3/Process100_Image.csv
Original dataframe shape: (6364, 5346)
Patients to process: ['TCGA-CS-4943-01Z-00' 'TCGA-CS-5397-01Z-00' 'TCGA-DU-5854-01Z-00'
 'TCGA-DU-6402-01Z-00' 'TCGA-DU-7299-01Z-00' '

Patient TCGA-HT-7873-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-HT-7873-01Z-00 shape change from (861, 5347) to (861, 5302)
Start patient TCGA-HT-7902-01Z-00
Original ImageNumber min,max: 11176 11953
Revised ImageNumber min,max: 1 778
Patient TCGA-HT-7902-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-HT-7902-01Z-00 shape change from (778, 5347) to (778, 5302)
Start patient TCGA-QH-A6CZ-01Z-00
Original ImageNumber min,max: 11954 12347
Revised ImageNumber min,max: 1 394
Patient TCGA-QH-A6CZ-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-QH-A6CZ-01Z-00 shape change from (394, 5347) to (394, 5302)
Start patient TCGA-QH-A6XA-01Z-00
Original ImageNumber min,max: 12348 12744
Revised ImageNumber min,max: 1 397
Patient TCGA-QH-A6XA-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-QH-A6XA-01Z-00 shape change from (397, 5347) to (397, 5302)
Start patient TCGA-S9-A6WI-01Z-00
Original ImageNumber min,max: 12745 13140
Revised ImageNumber min,max: 1 

Patient TCGA-02-0004-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-02-0004-01Z-00 shape change from (396, 5347) to (396, 5302)
Start patient TCGA-02-0010-01Z-00
Original ImageNumber min,max: 397 1984
Revised ImageNumber min,max: 1 1588
Patient TCGA-02-0010-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-02-0010-01Z-00 shape change from (1588, 5347) to (1588, 5302)
Start patient TCGA-02-0025-01Z-00
Original ImageNumber min,max: 1985 2781
Revised ImageNumber min,max: 1 797
Patient TCGA-02-0025-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-02-0025-01Z-00 shape change from (797, 5347) to (797, 5302)
Start patient TCGA-02-0033-01Z-00
Original ImageNumber min,max: 2782 3177
Revised ImageNumber min,max: 1 396
Patient TCGA-02-0033-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-02-0033-01Z-00 shape change from (396, 5347) to (396, 5302)
Start patient TCGA-02-0285-01Z-00
Original ImageNumber min,max: 3178 3581
Revised ImageNumber min,max: 1 404
Pa

Patient TCGA-14-1452-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-14-1452-01Z-00 shape change from (1592, 5347) to (1592, 5302)
Start patient TCGA-14-1453-01Z-00
Original ImageNumber min,max: 32015 33496
Revised ImageNumber min,max: 1 1482
Patient TCGA-14-1453-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-14-1453-01Z-00 shape change from (1482, 5347) to (1482, 5302)
Start patient TCGA-14-1795-01Z-00
Original ImageNumber min,max: 33497 36314
Revised ImageNumber min,max: 1 2818
Patient TCGA-14-1795-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-14-1795-01Z-00 shape change from (2818, 5347) to (2818, 5302)
Start patient TCGA-14-1829-01Z-00
Original ImageNumber min,max: 36315 39478
Revised ImageNumber min,max: 1 3164
Patient TCGA-14-1829-01Z-00 saved to filename Process100_Image.csv
Patient TCGA-14-1829-01Z-00 shape change from (3164, 5347) to (3164, 5302)
Start patient TCGA-15-1446-01Z-00
Original ImageNumber min,max: 39479 40284
Revised ImageNumber 