# CSV Prep
Process the 30 H&E WSI from Martinez.
Previously, the CellProfiler Process100 pipeline ran on WSI patches in batches.
The batches were designed to meet processing time requirements,
with several patients per batch as time allowed.
Here, we reorganize the data by patient.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
BASE_PATH='/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'
INPUT_DIRS=[
'HEcenterpatch.train.Yneg.A3-A5/',
'HEcenterpatch.train.Yneg.B13/',
'HEcenterpatch.train.Yneg.C11-D3/',
'HEcenterpatch.train.Yneg.E5/',
'HEcenterpatch.train.Yneg.F3-F15/',
'HEcenterpatch.train.Yneg.G15-H15/',
'HEcenterpatch.train.Ypos.B7-D5/',
'HEcenterpatch.train.Ypos.E7-F9/',
'HEcenterpatch.train.Ypos.G3-I13/',
'HEcenterpatch.valid.Yneg/',
'HEcenterpatch.valid.Ypos/']
FILENAMES=[
'Process100_Image.csv',
'Process100_Cells.csv',
'Process100_ExpandCells.csv',
'Process100_Experiment.csv',
'Process100_MergeRBC.csv',
'Process100_Nucleus.csv',
'Process100_RBC.csv',
'Process100_ShrinkRBC.csv',
'Process100_Tissue.csv']

In [3]:
def drop_cols(df,cols):
    if len(cols)>0:
        df = df.drop(columns=cols) 
    return df

In [4]:
def drop_bad_cols(df):
    bad_cols = [c for c in df.columns if c.startswith('ExecutionTime_')]
    df = drop_cols(df,bad_cols)
    bad_cols = [c for c in df.columns if c.endswith('_Tumor')]
    df = drop_cols(df,bad_cols)
    bad_cols = ['Group_Index','Group_Number','ImageSet_ImageSet','ProcessingStatus','Patient']
    df = drop_cols(df,bad_cols)
    return df

In [5]:
def renumber_images(df):
    min_image_num = min(df['ImageNumber'])
    max_image_num = max(df['ImageNumber'])
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    for i in range(len(df)):
        old_num=df.at[i,'ImageNumber']
        df.at[i,'ImageNumber']=old_num-min_image_num+1
    min_image_num = min(df['ImageNumber'])
    max_image_num = max(df['ImageNumber'])
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [6]:
def save(df,dirname,filename):
    outdir = BASE_PATH+dirname
    try:
        os.mkdir(outdir)
    except FileExistsError:
        print('Directory',dirname,'already exists. Will overwrite files in there.')
    outfile = outdir+'/'+filename
    df.to_csv(outfile)

In [7]:
dirnum=0  # start with first input directory
filenum=0   # start with Image.csv
infile = BASE_PATH+INPUT_DIRS[dirnum]+FILENAMES[filenum]
print('Reading from',infile)
TUMOR_COL='FileName_Tumor'
PATIENT_COL='Patient'
df = pd.read_csv(infile)
print('Original dataframe shape:',df.shape)

Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/HEcenterpatch.train.Yneg.A3-A5/Process100_Image.csv
Original dataframe shape: (2121, 5347)


In [8]:
patient_column = df[TUMOR_COL].astype(str).str[:3]
df[PATIENT_COL] = patient_column
patients = df[PATIENT_COL].unique()
print('Patients to process:',patients)

Patients to process: ['A3_' 'A5_']


In [9]:
for one_patient in patients:
    patient_data = df.loc[df[PATIENT_COL]==one_patient]
    patient_data = patient_data.reset_index() # so iloc starts at zero
    patient_data = drop_bad_cols(patient_data)
    print('Patient',one_patient,'num rows=',len(patient_data))
    renumber_images(patient_data)
    save(patient_data,one_patient,FILENAMES[filenum])
    print('Saved patient',one_patient,'dataframe shape:',patient_data.shape)

Patient A3_ num rows= 1022
Original ImageNumber min,max: 1 1022
Revised ImageNumber min,max: 1 1022
Directory A3_ already exists. Will overwrite files in there.
Saved patient A3_ dataframe shape: (1022, 5308)
Patient A5_ num rows= 1099
Original ImageNumber min,max: 1023 2121
Revised ImageNumber min,max: 1 1099
Directory A5_ already exists. Will overwrite files in there.
Saved patient A5_ dataframe shape: (1099, 5308)
