# Cell Profiler - Per Patient
The raw CellProfiler outputs are organized per run.  
Here, redistribute outputs to one directory per patient.  

Based on the TumorIII/Image.04 notebook, which worked out the Image.csv mechanics.  
To do: redistribute the other csv files.  

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
print(datetime.now())

2022-07-31 08:06:02.348297


In [2]:
# Root folders: per-run CellProfiler results (input) and per-patient copies (output).
BASE_PATH_IN='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/'
BASE_PATH_OUT='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'
# Run sub-directories, listed in processing order; each keeps its trailing slash
# because paths are built by plain string concatenation.
INPUT_DIRS=['Output'+run_tag+'/' for run_tag in
            ('3.1','4.1','5.1','5','4','3','2','1','0')]
# CSV tables expected in every run directory. Image.csv is first.
FILENAMES=['Process100_'+table+'.csv' for table in
           ('Image','Cells','ExpandCells','Experiment','MergeRBC',
            'Nucleus','RBC','ShrinkRBC','Tissue')]
IMAGE_COL='ImageNumber'
TUMOR_COL='FileName_Tumor'   # patch filename column; its prefix identifies the patient
PATIENT_COL='Patient'        # column added by this notebook to hold the patient ID
# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
# The WSI ID is the first 23 characters of the patch filename.
# The patient (case) ID is the first 19 characters.
LEN_CASE_ID=19
LEN_WSI_ID=23

In [3]:
# Cell Profiler assigns an ImageNumber to each patch, starting with 1,
# so the second patient's patches start at some higher number.
# Here, renumber so all patients have patches 1 to n.
def renumber_images(df):
    """Shift the IMAGE_COL column in place so that its smallest value becomes 1.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to one patient. Must contain the IMAGE_COL column.
        The frame is modified in place; nothing is returned.

    Notes
    -----
    Prints the column's min and max before and after the shift.
    The shift is a single vectorized operation, so it works for any index
    labels (the frame does not need a 0..n-1 index).
    An empty frame is reported and left untouched.
    """
    if len(df) == 0:
        # min/max are undefined for an empty column; there is nothing to shift.
        print('No rows to renumber.')
        return
    image_nums = df[IMAGE_COL]
    min_image_num = image_nums.min()
    max_image_num = image_nums.max()
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    # Read and write the same configured column (IMAGE_COL).
    df[IMAGE_COL] = image_nums - min_image_num + 1
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [4]:
# Save csv to the output directory, which is named after the patient/case ID.
# Create the output directory if it does not already exist from a prior run.
def _save(df,dirname,filename):
    """Write a dataframe as CSV to BASE_PATH_OUT/<dirname>/<filename>.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write; the index is not written.
    dirname : str
        Name of the sub-directory under BASE_PATH_OUT (the patient/case ID).
    filename : str
        Name of the CSV file to create inside that sub-directory.

    Raises
    ------
    FileExistsError
        If the target path exists but is not a directory.

    Notes
    -----
    An existing directory is reused and a same-named file in it is overwritten.
    Missing parent directories, including BASE_PATH_OUT itself, are created.
    """
    outdir = os.path.join(BASE_PATH_OUT,dirname)
    if os.path.isdir(outdir):
        print('Directory',dirname,'already exists. Will overwrite files in there.')
    else:
        # makedirs (unlike mkdir) also creates BASE_PATH_OUT on a first run.
        os.makedirs(outdir,exist_ok=True)
    outfile = os.path.join(outdir,filename)
    df.to_csv(outfile,index=False)

In [5]:
# Process all patients within a given dataframe.
def extract_patients(df,filename):
    """Split df by PATIENT_COL and save one renumbered CSV per patient.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that already contains the PATIENT_COL column.
    filename : str
        Name of the CSV file to write inside each patient's directory.
    """
    patient_ids = df[PATIENT_COL].unique()
    print('Patients to process:',patient_ids)
    for patient_id in patient_ids:
        print('Start patient',patient_id)
        belongs_to_patient = df[PATIENT_COL].eq(patient_id)
        # Fresh 0-based index: renumber_images addresses rows by label 0..n-1.
        subset = df[belongs_to_patient].reset_index(drop=True)
        renumber_images(subset)
        _save(subset,patient_id,filename)
        print('Saved patient',patient_id,'filename',filename,'shape:',subset.shape)

In [6]:
# For convenience, add a column containing patient ID.
def add_patient_column(df):
    """Add PATIENT_COL to df, holding the first LEN_CASE_ID characters of TUMOR_COL.

    The frame is modified in place and also returned.
    """
    tumor_names = df[TUMOR_COL].astype(str)
    df[PATIENT_COL] = tumor_names.str.slice(stop=LEN_CASE_ID)
    return df

In [7]:
# Main loop.
# Process Image.csv only (FILENAMES[0]); the other csv files are not handled here.
# For each run directory: read the table, tag rows with the patient ID,
# then write one renumbered copy per patient.
# datetime comes from the import cell at the top of the notebook.
print(datetime.now())
filename = FILENAMES[0]
for indir in INPUT_DIRS:
    infile = BASE_PATH_IN+indir+filename
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df = add_patient_column(df)
    extract_patients(df,filename)
    print(datetime.now())   # timestamp after finishing each run directory
print('Done')

2022-07-31 08:06:02.370704
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output3.1/Process100_Image.csv
Original dataframe shape: (363, 5347)
Patients to process: ['TCGA-CS-4942-01Z-00']
Start patient TCGA-CS-4942-01Z-00
Original ImageNumber min,max: 1 363
Revised ImageNumber min,max: 1 363
Saved patient TCGA-CS-4942-01Z-00 filename Process100_Image.csv shape: (363, 5348)
2022-07-31 08:06:03.853177
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output4.1/Process100_Image.csv
Original dataframe shape: (3213, 5347)
Patients to process: ['TCGA-HT-A5RC-01Z-00' 'TCGA-QH-A6CS-01Z-00' 'TCGA-QH-A6XC-01Z-00'
 'TCGA-S9-A6U1-01Z-00' 'TCGA-S9-A7IS-01Z-00' 'TCGA-S9-A7IZ-01Z-00'
 'TCGA-S9-A7R2-01Z-00' 'TCGA-TM-A7C3-01Z-00']
Start patient TCGA-HT-A5RC-01Z-00
Original ImageNumber min,max: 1 406
Revised ImageNumber min,max: 1 406
Saved patient TCGA-HT-A5RC-01Z-00 filename Process100_Image.csv shape: (406, 5348)
Start patient TCGA-QH-A6CS-01Z-00
Original Imag

Saved patient TCGA-DU-6402-01Z-00 filename Process100_Image.csv shape: (393, 5347)
Start patient TCGA-DU-7299-01Z-00
Original ImageNumber min,max: 1519 1914
Revised ImageNumber min,max: 1 396
Saved patient TCGA-DU-7299-01Z-00 filename Process100_Image.csv shape: (396, 5347)
Start patient TCGA-FG-5963-01Z-00
Original ImageNumber min,max: 1915 2310
Revised ImageNumber min,max: 1 396
Saved patient TCGA-FG-5963-01Z-00 filename Process100_Image.csv shape: (396, 5347)
Start patient TCGA-FG-7636-01Z-00
Original ImageNumber min,max: 2311 2711
Revised ImageNumber min,max: 1 401
Saved patient TCGA-FG-7636-01Z-00 filename Process100_Image.csv shape: (401, 5347)
Start patient TCGA-FG-A60L-01Z-00
Original ImageNumber min,max: 2712 3117
Revised ImageNumber min,max: 1 406
Saved patient TCGA-FG-A60L-01Z-00 filename Process100_Image.csv shape: (406, 5347)
Start patient TCGA-HT-7606-01Z-00
Original ImageNumber min,max: 3118 3913
Revised ImageNumber min,max: 1 796
Saved patient TCGA-HT-7606-01Z-00 filena

Saved patient TCGA-HT-7616-01Z-00 filename Process100_Image.csv shape: (790, 5347)
Start patient TCGA-HT-7676-01Z-00
Original ImageNumber min,max: 6275 6647
Revised ImageNumber min,max: 1 373
Saved patient TCGA-HT-7676-01Z-00 filename Process100_Image.csv shape: (373, 5347)
Start patient TCGA-HT-7693-01Z-00
Original ImageNumber min,max: 6648 7783
Revised ImageNumber min,max: 1 1136
Saved patient TCGA-HT-7693-01Z-00 filename Process100_Image.csv shape: (1136, 5347)
Start patient TCGA-HT-7881-01Z-00
Original ImageNumber min,max: 7784 9701
Revised ImageNumber min,max: 1 1918
Saved patient TCGA-HT-7881-01Z-00 filename Process100_Image.csv shape: (1918, 5347)
Start patient TCGA-HT-A617-01Z-00
Original ImageNumber min,max: 9702 10101
Revised ImageNumber min,max: 1 400
Saved patient TCGA-HT-A617-01Z-00 filename Process100_Image.csv shape: (400, 5347)
Start patient TCGA-HW-7487-01Z-00
Original ImageNumber min,max: 10102 10502
Revised ImageNumber min,max: 1 401
Saved patient TCGA-HW-7487-01Z-00

Saved patient TCGA-06-5412-01Z-00 filename Process100_Image.csv shape: (348, 5347)
Start patient TCGA-08-0517-01Z-00
Original ImageNumber min,max: 21231 21623
Revised ImageNumber min,max: 1 393
Saved patient TCGA-08-0517-01Z-00 filename Process100_Image.csv shape: (393, 5347)
Start patient TCGA-08-0518-01Z-00
Original ImageNumber min,max: 21624 24819
Revised ImageNumber min,max: 1 3196
Saved patient TCGA-08-0518-01Z-00 filename Process100_Image.csv shape: (3196, 5347)
Start patient TCGA-08-0520-01Z-00
Original ImageNumber min,max: 24820 25215
Revised ImageNumber min,max: 1 396
Saved patient TCGA-08-0520-01Z-00 filename Process100_Image.csv shape: (396, 5347)
Start patient TCGA-14-0786-01Z-00
Original ImageNumber min,max: 25216 26413
Revised ImageNumber min,max: 1 1198
Saved patient TCGA-14-0786-01Z-00 filename Process100_Image.csv shape: (1198, 5347)
Start patient TCGA-14-0787-01Z-00
Original ImageNumber min,max: 26414 27036
Revised ImageNumber min,max: 1 623
Saved patient TCGA-14-0787