# CSV Prep
Process the 28 H&E whole-slide images (WSI) from Martinez.
Previously, the CellProfiler Process100 pipeline ran on WSI patches in batches.
The batches were designed to meet processing time requirements,
with several patients per batch as time allowed.
Here, we reorganize the data by patient.
So far, only the processing of the Image.csv file has been worked out; the other Process100 CSV files are not yet handled.

In [1]:
import numpy as np
import pandas as pd
import os

In [12]:
# Print the current wall-clock time (presumably as a record of when this notebook was run).
from datetime import datetime
print(datetime.now())

2022-07-14 12:51:01.719205


In [2]:
# Root folder of this CellProfiler run. Batch folders are read from here, and
# save() creates the per-patient output folders directly under it as well.
# The trailing slash matters: paths are built by plain string concatenation.
BASE_PATH='/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'
# One sub-folder of BASE_PATH per CellProfiler batch (each ends with '/',
# because the CSV filename is appended directly to it).
INPUT_DIRS=[
'HEcenterpatch.train.Yneg.A3-A5/',
'HEcenterpatch.train.Yneg.B13/',
'HEcenterpatch.train.Yneg.C11-D3/',
'HEcenterpatch.train.Yneg.E5/',
'HEcenterpatch.train.Yneg.F3-F15/',
'HEcenterpatch.train.Yneg.G15-H15/',
'HEcenterpatch.train.Ypos.B7-D5/',
'HEcenterpatch.train.Ypos.E7-F9/',
'HEcenterpatch.train.Ypos.G3-I13/',
'HEcenterpatch.valid.Yneg/',
'HEcenterpatch.valid.Ypos/']
# CSV files expected in every batch folder. Only the first entry
# (Process100_Image.csv) is processed by this notebook so far.
FILENAMES=[
'Process100_Image.csv',
'Process100_Cells.csv',
'Process100_ExpandCells.csv',
'Process100_Experiment.csv',
'Process100_MergeRBC.csv',
'Process100_Nucleus.csv',
'Process100_RBC.csv',
'Process100_ShrinkRBC.csv',
'Process100_Tissue.csv']

In [3]:
def drop_cols(df,cols):
    """Return df without the columns listed in cols.

    When cols is empty the very same df object is handed back; otherwise a
    new DataFrame is returned and the input is left unmodified.
    """
    if len(cols)==0:
        return df
    return df.drop(columns=cols)

In [4]:
def drop_bad_cols(df):
    """Return a copy of df stripped of columns that are not wanted in the output.

    Removed, in this order:
      1. every column whose name starts with 'ExecutionTime_';
      2. every remaining column whose name ends with '_Tumor';
      3. a fixed list of bookkeeping columns, including the helper column
         'Patient'. All of these must be present in df.
    """
    # Pattern-based removals run one after the other, each on the frame
    # left over by the previous pass.
    for matches in (lambda name: name.startswith('ExecutionTime_'),
                    lambda name: name.endswith('_Tumor')):
        df = drop_cols(df,[name for name in df.columns if matches(name)])
    # Fixed names: the underlying DataFrame.drop raises KeyError if one is absent.
    return drop_cols(df,['Group_Index','Group_Number','ImageSet_ImageSet','ProcessingStatus','Patient'])

In [5]:
def renumber_images(df):
    """Shift the 'ImageNumber' column in place so its smallest value becomes 1.

    The same offset is subtracted from every row, so gaps and ordering are
    preserved. The min/max before and after the shift are printed.

    The shift is vectorised and therefore independent of the row index; the
    frame does not need a 0..n-1 index. An empty frame is left untouched.

    Parameters:
        df: DataFrame with a numeric 'ImageNumber' column. Modified in place.

    Returns:
        None.
    """
    if len(df)==0:
        return  # nothing to renumber; min/max of an empty column is undefined
    image_nums = df['ImageNumber']
    min_image_num = image_nums.min()
    max_image_num = image_nums.max()
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    # Assign back through the frame so the caller's object is updated.
    df['ImageNumber'] = image_nums - min_image_num + 1
    min_image_num = df['ImageNumber'].min()
    max_image_num = df['ImageNumber'].max()
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [6]:
def save(df,dirname,filename):
    """Write df as CSV to BASE_PATH + dirname + '/' + filename.

    The target folder is created first. If it already exists, a notice is
    printed and any file of the same name in it is overwritten. The CSV is
    written with pandas' default settings, so the row index is included.
    """
    target_dir = BASE_PATH+dirname
    try:
        os.mkdir(target_dir)
    except FileExistsError:
        print('Directory',dirname,'already exists. Will overwrite files in there.')
    df.to_csv('/'.join((target_dir,filename)))

In [7]:
def extract_patients(df,filename):
    """Split df by patient and save one cleaned CSV per patient.

    Reads the module-level PATIENT_COL to find the column holding the patient
    ID. For every distinct ID, in order of first appearance, the matching rows
    are extracted, cleaned with drop_bad_cols(), renumbered with
    renumber_images() and written with save() into a folder named after the
    patient, under the given filename.
    """
    patient_ids = df[PATIENT_COL].unique()
    print('Patients to process:',patient_ids)
    for patient_id in patient_ids:
        belongs_to_patient = df[PATIENT_COL]==patient_id
        # Re-index from zero because renumber_images() addresses rows by 0..n-1.
        # NOTE(review): reset_index() without drop=True keeps the old index as
        # a regular column named 'index', which ends up in the saved CSV --
        # confirm whether that is intended.
        subset = drop_bad_cols(df[belongs_to_patient].reset_index())
        renumber_images(subset)
        save(subset,patient_id,filename)
        print('Saved patient',patient_id,'filename',filename,'shape:',subset.shape)

In [8]:
filename = FILENAMES[0]  # So far, we only process Image.csv

# Read the chosen CSV from every batch folder and split it into per-patient files.
for indir in INPUT_DIRS:
    infile = BASE_PATH+indir+filename
    print('Reading from',infile)
    # NOTE(review): these two names are loop-invariant. extract_patients() reads
    # PATIENT_COL as a global, so it must be assigned before the call below.
    TUMOR_COL='FileName_Tumor'
    PATIENT_COL='Patient'
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    # Patient ID is in first 3 letters of the patch filename.
    # For convenience, add a column containing patient ID.
    # (Short IDs keep the following filename character, e.g. 'A3_'.)
    patient_column = df[TUMOR_COL].astype(str).str[:3]
    df[PATIENT_COL] = patient_column
    # Writes one output folder per patient; the helper column added above is
    # removed again by drop_bad_cols() before saving.
    extract_patients(df,filename)

Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/HEcenterpatch.train.Yneg.A3-A5/Process100_Image.csv
Original dataframe shape: (2121, 5347)
Patients to process: ['A3_' 'A5_']
Original ImageNumber min,max: 1 1022
Revised ImageNumber min,max: 1 1022
Directory A3_ already exists. Will overwrite files in there.
Saved patient A3_ filename Process100_Image.csv shape: (1022, 5308)
Original ImageNumber min,max: 1023 2121
Revised ImageNumber min,max: 1 1099
Directory A5_ already exists. Will overwrite files in there.
Saved patient A5_ filename Process100_Image.csv shape: (1099, 5308)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/HEcenterpatch.train.Yneg.B13/Process100_Image.csv
Original dataframe shape: (1209, 5347)
Patients to process: ['B13']
Original ImageNumber min,max: 1 1209
Revised ImageNumber min,max: 1 1209
Directory B13 already exists. Will overwrite files in there.
Saved patient B13 filename Process100_Image.csv shape: (1209, 5308)
Reading from /home/jrm/Ma

Saved patient F3_ filename Process100_Image.csv shape: (297, 5308)
Original ImageNumber min,max: 2839 3115
Revised ImageNumber min,max: 1 277
Directory F7_ already exists. Will overwrite files in there.
Saved patient F7_ filename Process100_Image.csv shape: (277, 5308)
Original ImageNumber min,max: 3116 3378
Revised ImageNumber min,max: 1 263
Directory G15 already exists. Will overwrite files in there.
Saved patient G15 filename Process100_Image.csv shape: (263, 5308)
Original ImageNumber min,max: 3379 3632
Revised ImageNumber min,max: 1 254
Directory H15 already exists. Will overwrite files in there.
Saved patient H15 filename Process100_Image.csv shape: (254, 5308)
Original ImageNumber min,max: 3633 3935
Revised ImageNumber min,max: 1 303
Directory H1_ already exists. Will overwrite files in there.
Saved patient H1_ filename Process100_Image.csv shape: (303, 5308)
Original ImageNumber min,max: 3936 4201
Revised ImageNumber min,max: 1 266
Directory H3_ already exists. Will overwrite f