# CSV Prep
Process the 30 H&E whole-slide images (WSIs) from Martinez.
Previously, the CellProfiler Process100 pipeline ran on WSI patches in batches.
The batches were designed to meet processing time requirements,
with several patients per batch as time allowed.
Here, we reorganize the data by patient.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Root directory holding one sub-directory per CellProfiler batch run.
BASE_PATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'

# Batch output directories, relative to BASE_PATH (each ends with '/').
INPUT_DIRS = [
    'HEcenterpatch.train.Yneg.A3-A5/',
    'HEcenterpatch.train.Yneg.B13/',
    'HEcenterpatch.train.Yneg.C11-D3/',
    'HEcenterpatch.train.Yneg.E5/',
    'HEcenterpatch.train.Yneg.F3-F15/',
    'HEcenterpatch.train.Yneg.G15-H15/',
    'HEcenterpatch.train.Ypos.B7-D5/',
    'HEcenterpatch.train.Ypos.E7-F9/',
    'HEcenterpatch.train.Ypos.G3-I13/',
    'HEcenterpatch.valid.Yneg/',
    'HEcenterpatch.valid.Ypos/',
]

# CSV files expected inside every batch directory.
FILENAMES = [
    'Process100_Image.csv',
    'Process100_Cells.csv',
    'Process100_ExpandCells.csv',
    'Process100_Experiment.csv',
    'Process100_MergeRBC.csv',
    'Process100_Nucleus.csv',
    'Process100_RBC.csv',
    'Process100_ShrinkRBC.csv',
    'Process100_Tissue.csv',
]

In [3]:
# Pick which batch directory and which CellProfiler CSV to process.
dirnum = 0   # index into INPUT_DIRS: the first input directory
filenum = 0  # index into FILENAMES: Process100_Image.csv
# BASE_PATH and every INPUT_DIRS entry end with '/', so plain concatenation
# yields a valid path.
filename = ''.join((BASE_PATH, INPUT_DIRS[dirnum], FILENAMES[filenum]))
filename

'/home/jrm/Martinez/CellProfilerRuns/CP_20220705/HEcenterpatch.train.Yneg.A3-A5/Process100_Image.csv'

In [4]:
# Column names: the existing file-name column read from the CSV, and the
# patient column this notebook adds to the frame.
TUMOR_COL = 'FileName_Tumor'
PATIENT_COL = 'Patient'

# Load the selected batch CSV and report how many rows it holds.
df = pd.read_csv(filename)
rows_in = df.shape[0]
print('Reading', FILENAMES[filenum])
print('Rows in the input file:', rows_in)

Reading Process100_Image.csv
Rows in the input file: 2121


In [5]:
# Derive a patient label from the first three characters of the tumor file name.
# NOTE(review): the fixed-width slice keeps the separator for two-character IDs
# (the recorded output shows 'A3_'); confirm downstream code expects that form.
patient_column = df[TUMOR_COL].astype(str).str.slice(stop=3)
df[PATIENT_COL] = patient_column

# Distinct patient labels present in this batch, in order of first appearance.
patients = patient_column.unique()
patients

array(['A3_', 'A5_'], dtype=object)

In [6]:
# Choose which patient of this batch to extract.
patient_num = 0  # index into patients: the first patient
one_patient = patients[patient_num]
one_patient

'A3_'

In [7]:
# Keep only the rows whose patient label matches the selected patient.
is_selected = df[PATIENT_COL].eq(one_patient)
patient_df = df.loc[is_selected]
print('Rows in the output file:', len(patient_df))

Rows in the output file: 1022


In [8]:
# The per-patient output directory sits directly under BASE_PATH.
outdir = ''.join((BASE_PATH, one_patient))
print('Make directory', outdir)
try:
    os.mkdir(outdir)
except FileExistsError:
    # os.mkdir refuses to create a path that already exists; carry on and
    # reuse the directory.
    print('Directory already exists. Will overwrite files in there.')

# The output keeps the same file name as the input CSV.
outfile = '/'.join((outdir, FILENAMES[filenum]))

Make directory /home/jrm/Martinez/CellProfilerRuns/CP_20220705/A3_
Directory already exists. Will overwrite files in there.


In [10]:
# Write this patient's rows to the per-patient directory.
# index=False: by default to_csv also writes the DataFrame index as an unnamed
# leading column. Here that index is just the row position in the batch file,
# so it would add a column the CellProfiler input never had (and it comes back
# as 'Unnamed: 0' when the file is re-read).
patient_df.to_csv(outfile, index=False)