# CSV Prep
Process the 30 H&E whole-slide images (WSI) from Martinez.
Previously, the CellProfiler Process100 pipeline ran on WSI patches in batches.
The batches were designed to meet processing time requirements,
with several patients per batch as time allowed.
Here, we reorganize the data by patient.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Root directory that holds one sub-directory per CellProfiler batch run.
BASE_PATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'

# Batch output directories, relative to BASE_PATH (each keeps its trailing slash
# so paths can be built by plain concatenation).
INPUT_DIRS = [
    'HEcenterpatch.train.Yneg.A3-A5/',
    'HEcenterpatch.train.Yneg.B13/',
    'HEcenterpatch.train.Yneg.C11-D3/',
    'HEcenterpatch.train.Yneg.E5/',
    'HEcenterpatch.train.Yneg.F3-F15/',
    'HEcenterpatch.train.Yneg.G15-H15/',
    'HEcenterpatch.train.Ypos.B7-D5/',
    'HEcenterpatch.train.Ypos.E7-F9/',
    'HEcenterpatch.train.Ypos.G3-I13/',
    'HEcenterpatch.valid.Yneg/',
    'HEcenterpatch.valid.Ypos/',
]

# CSV files found in every batch directory; index 0 is the per-image table.
FILENAMES = [
    'Process100_Image.csv',
    'Process100_Cells.csv',
    'Process100_ExpandCells.csv',
    'Process100_Experiment.csv',
    'Process100_MergeRBC.csv',
    'Process100_Nucleus.csv',
    'Process100_RBC.csv',
    'Process100_ShrinkRBC.csv',
    'Process100_Tissue.csv',
]

In [3]:
# Select which batch directory and which CSV to read, by position in the
# INPUT_DIRS and FILENAMES lists.
dirnum = 0    # start with first input directory
filenum = 0   # start with image.csv
# The pieces already carry their own slashes, so they are simply concatenated.
filename = ''.join([BASE_PATH, INPUT_DIRS[dirnum], FILENAMES[filenum]])
filename

'/home/jrm/Martinez/CellProfilerRuns/CP_20220705/HEcenterpatch.train.Yneg.A3-A5/Process100_Image.csv'

In [4]:
# Column holding the tumor image file name, and the name used for the
# patient-label column.
TUMOR_COL = 'FileName_Tumor'
PATIENT_COL = 'Patient'

# Load the selected CSV in full and report its size.
df = pd.read_csv(filename)
print(f'Reading {FILENAMES[filenum]}')
print(f'Rows in the input file: {len(df)}')

Reading Process100_Image.csv
Rows in the input file: 2121


In [5]:
# Label every row with a patient, taken as the first three characters of the
# tumor file name.
# NOTE(review): for two-character patient IDs this keeps the following
# character too (e.g. 'A3_') — confirm this is the intended label, since it is
# also used as the output directory name.
patient_column = df[TUMOR_COL].astype(str).str.slice(stop=3)
df[PATIENT_COL] = patient_column
print(f'Original dataframe shape: {df.shape}')

# Distinct patient labels present in this batch, in order of first appearance.
patients = pd.unique(df[PATIENT_COL])
patients

Original dataframe shape: (2121, 5348)


array(['A3_', 'A5_'], dtype=object)

In [6]:
# Pick one patient label by its position in `patients`.
patient_num = 0   # start with first patient
one_patient = patients[patient_num]
one_patient

'A3_'

In [7]:
# Keep only the rows that belong to the selected patient.
is_selected_patient = df[PATIENT_COL] == one_patient
patient_df = df.loc[is_selected_patient]

In [8]:
def drop_cols(df,cols):
    """Return `df` without the columns named in `cols`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to remove columns from; it is not modified.
    cols : list-like of column labels
        Columns to remove. May be empty.

    Returns
    -------
    pandas.DataFrame
        `df` itself when `cols` is empty, otherwise a new frame without
        the named columns.

    Raises
    ------
    KeyError
        If a label in `cols` is not a column of `df`.
    """
    # Use the argument, not a same-named global, so the function works
    # no matter what the caller's variable is called.
    if len(cols)>0:
        df = df.drop(columns=cols)
    return df
# Remove bookkeeping columns in three passes. Each pass rebuilds its list from
# the columns still present in patient_df.

# Pass 1: per-module execution timings.
bad_cols = [
    col for col in patient_df.columns
    if col.startswith('ExecutionTime_')
]
patient_df = drop_cols(patient_df, bad_cols)

# Pass 2: every column whose name ends in '_Tumor'.
bad_cols = [
    col for col in patient_df.columns
    if col.endswith('_Tumor')
]
patient_df = drop_cols(patient_df, bad_cols)

# Pass 3: a fixed list of columns, including the patient label itself.
bad_cols = [
    'Group_Index',
    'Group_Number',
    'ImageSet_ImageSet',
    'ProcessingStatus',
    'Patient',
]
patient_df = drop_cols(patient_df, bad_cols)

print(f'Patient dataframe shape: {patient_df.shape}')

Patient dataframe shape: (1022, 5307)


In [9]:
# Renumber images starting at 1
# Print the ImageNumber range before and after so the result can be checked.
min_image_num = patient_df['ImageNumber'].min()
max_image_num = patient_df['ImageNumber'].max()
print('Original min,max:',min_image_num,max_image_num)
# Assign by row position, not by index label. patient_df keeps the index
# labels of the rows it was filtered from, so labels 0..n-1 only address its
# rows when the selected patient happens to start at row 0 of the input file.
# A positional assignment renumbers the rows correctly for every patient.
# assign() returns a new frame and leaves the index itself untouched.
patient_df = patient_df.assign(ImageNumber=np.arange(1, len(patient_df) + 1))
min_image_num = patient_df['ImageNumber'].min()
max_image_num = patient_df['ImageNumber'].max()
print('final min,max:',min_image_num,max_image_num)


Original min,max: 1 1022
final min,max: 1 1022


In [10]:
# The output directory sits directly under BASE_PATH and is named after the
# patient label; the output file reuses the input CSV's name.
outdir = f'{BASE_PATH}{one_patient}'
outfile = f'{outdir}/{FILENAMES[filenum]}'

print('Make directory',outdir)
try:
    os.mkdir(outdir)
except FileExistsError:
    # An existing directory is not an error; just report it and carry on.
    print('Directory already exists. Will overwrite files in there.')

Make directory /home/jrm/Martinez/CellProfilerRuns/CP_20220705/A3_
Directory already exists. Will overwrite files in there.


In [11]:
# Write the selected patient's rows to the per-patient CSV.
# NOTE(review): with default settings to_csv also writes the row index as an
# unnamed first column — confirm that downstream readers expect it.
patient_df.to_csv(path_or_buf=outfile)