# Cell Profiler - Per Patient
Redistribute rows of Nucleus.csv files into one file per patient.  
In preparation for other csv files, store each file in its own directory.  
Also, drop the columns that won't be used for classification.  

Problem: the raw CellProfiler outputs were organized per run.  
Inputs like: Class0/Process100_Nucleus.csv (combines patients of cancer class 0)  
Outputs like: TCGA-HT-7482-01Z-00/Process100_Image.csv (specific to one patient)

Based on CP_PerPatient.01 which worked out the Image.csv mechanics.   

In [1]:
import numpy as np
import pandas as pd
import os
from datetime import datetime
print(datetime.now())

2022-08-08 09:42:24.980947


In [2]:
# Root directory that is read from, and root directory that is written to.
BASE_PATH_IN = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/'
BASE_PATH_OUT = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'

# Sub-directories of BASE_PATH_IN, in the order they are processed.
INPUT_DIRS = [
    'Output5/', 'Output5.1/',
    'Output4/', 'Output4.1/',
    'Output3/', 'Output3.1/',
    'Output2/',
    'Output1/',
    'Output0/',
]

# The two files this notebook refers to by name.
IMAGE_FILE = 'Process100_Image.csv'
NUCLEUS_FILE = 'Process100_Nucleus.csv'

# Full list of csv filenames; IMAGE_FILE is entry 0 and NUCLEUS_FILE is entry 5.
FILENAMES = [
    IMAGE_FILE,
    'Process100_Cells.csv',
    'Process100_ExpandCells.csv',
    'Process100_Experiment.csv',
    'Process100_MergeRBC.csv',
    NUCLEUS_FILE,
    'Process100_RBC.csv',
    'Process100_ShrinkRBC.csv',
    'Process100_Tissue.csv',
]

# Per-patient file that records the original ImageNumber / FileName_Tumor pairs.
TRACKING_FILE = 'PatchTracking.csv'

IMAGE_COL = 'ImageNumber'
TUMOR_COL = 'FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL = 'Patient'        # add this column to emphasize patient ID

# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
# The WSI ID is the first 23 characters of the patch filename.
# The patient (case) ID is the first 19 characters.
LEN_CASE_ID = 19
LEN_WSI_ID = 23

In [15]:
# Exploration: load the Nucleus file from the first input directory
# and list every column name, one per line.
TEST = ''.join([BASE_PATH_IN, INPUT_DIRS[0], NUCLEUS_FILE])
df = pd.read_csv(TEST)
# Bookkeeping columns that get summarized in the following cell.
bad_cols=[
    'Children_Cells_Count',
    'Number_Object_Number',
    'ImageNumber',
    'ObjectNumber',
]
for column_name in df.columns:
    print(column_name)

ImageNumber
ObjectNumber
AreaShape_Area
AreaShape_BoundingBoxArea
AreaShape_BoundingBoxMaximum_X
AreaShape_BoundingBoxMaximum_Y
AreaShape_BoundingBoxMinimum_X
AreaShape_BoundingBoxMinimum_Y
AreaShape_Center_X
AreaShape_Center_Y
AreaShape_CentralMoment_0_0
AreaShape_CentralMoment_0_1
AreaShape_CentralMoment_0_2
AreaShape_CentralMoment_0_3
AreaShape_CentralMoment_1_0
AreaShape_CentralMoment_1_1
AreaShape_CentralMoment_1_2
AreaShape_CentralMoment_1_3
AreaShape_CentralMoment_2_0
AreaShape_CentralMoment_2_1
AreaShape_CentralMoment_2_2
AreaShape_CentralMoment_2_3
AreaShape_Compactness
AreaShape_ConvexArea
AreaShape_Eccentricity
AreaShape_EquivalentDiameter
AreaShape_EulerNumber
AreaShape_Extent
AreaShape_FormFactor
AreaShape_HuMoment_0
AreaShape_HuMoment_1
AreaShape_HuMoment_2
AreaShape_HuMoment_3
AreaShape_HuMoment_4
AreaShape_HuMoment_5
AreaShape_HuMoment_6
AreaShape_InertiaTensorEigenvalues_0
AreaShape_InertiaTensorEigenvalues_1
AreaShape_InertiaTensor_0_0
AreaShape_InertiaTensor_0_1
Area

In [16]:
# Exploration: summary statistics for the bookkeeping columns.
df1 = df[bad_cols]
print(df1.describe())
# Positional columns: anything under Location_, plus the bounding-box
# min/max and center coordinates under AreaShape_.
loc_cols=[
    name for name in df.columns
    if name.startswith(('Location_',
                        'AreaShape_BoundingBoxM',
                        'AreaShape_Center'))
]
df2 = df[loc_cols]
df2

       Children_Cells_Count  Number_Object_Number   ImageNumber  ObjectNumber
count               53535.0          53535.000000  53535.000000  53535.000000
mean                    1.0             20.981078    690.112655     20.981078
std                     0.0             14.292415    421.273324     14.292415
min                     1.0              1.000000      1.000000      1.000000
25%                     1.0              9.000000    338.000000      9.000000
50%                     1.0             19.000000    647.000000     19.000000
75%                     1.0             31.000000   1023.000000     31.000000
max                     1.0             89.000000   1591.000000     89.000000


Unnamed: 0,AreaShape_BoundingBoxMaximum_X,AreaShape_BoundingBoxMaximum_Y,AreaShape_BoundingBoxMinimum_X,AreaShape_BoundingBoxMinimum_Y,AreaShape_Center_X,AreaShape_Center_Y,Location_CenterMassIntensity_X_Hematoxylin,Location_CenterMassIntensity_Y_Hematoxylin,Location_CenterMassIntensity_Z_Hematoxylin,Location_Center_X,Location_Center_Y,Location_Center_Z,Location_MaxIntensity_X_Hematoxylin,Location_MaxIntensity_Y_Hematoxylin,Location_MaxIntensity_Z_Hematoxylin
0,194,14,168,0,179.680934,5.241245,179.161527,5.006069,0.0,179.680934,5.241245,0,171.0,8.0,0.0
1,215,54,172,13,192.229268,31.889431,192.485777,31.985459,0.0,192.229268,31.889431,0,208.0,25.0,0.0
2,239,57,212,22,225.123711,38.482818,225.220652,38.576053,0.0,225.123711,38.482818,0,229.0,30.0,0.0
3,209,76,173,46,191.584634,60.704682,191.790409,60.779885,0.0,191.584634,60.704682,0,194.0,68.0,0.0
4,233,138,204,108,217.935380,122.276252,217.713390,122.412892,0.0,217.935380,122.276252,0,216.0,124.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
53530,231,300,206,289,218.536697,294.834862,218.527726,295.220703,0.0,218.536697,294.834862,0,222.0,299.0,0.0
53531,60,113,29,99,43.770408,104.454082,43.966185,104.072904,0.0,43.770408,104.454082,0,40.0,104.0,0.0
53532,122,141,90,110,104.073892,125.798030,103.518681,125.307928,0.0,104.073892,125.798030,0,100.0,123.0,0.0
53533,145,185,108,135,124.060908,161.367663,124.485395,162.197364,0.0,124.060908,161.367663,0,123.0,175.0,0.0


In [3]:
# Cell Profiler assigns an ImageNumber to each patch, starting with 1,
# so the second patient's patches start at some higher number.
# Here, renumber so every patient's patch numbers start at 1.
def renumber_images(df, image_col=None):
    """Shift the image-number column of df, in place, so its minimum becomes 1.

    Every value has (minimum - 1) subtracted from it, so gaps and repeated
    numbers in the original numbering are preserved. The shift is done as one
    vectorized column operation and therefore works with any row index.

    Parameters:
        df: dataframe holding the image-number column; modified in place.
        image_col: name of the column to renumber. Defaults to the
            module-level IMAGE_COL when omitted.

    Returns:
        None. An empty dataframe is left untouched and nothing is printed.
    """
    col = IMAGE_COL if image_col is None else image_col
    if len(df) == 0:
        return  # no rows, so there is no minimum to shift by
    original = df[col]
    lowest = original.min()
    print('Original ImageNumber min,max:',lowest,original.max())
    df[col] = original - lowest + 1
    revised = df[col]
    print('Revised ImageNumber min,max:',revised.min(),revised.max())

In [4]:
# For convenience, add a column containing patient ID.
def add_patient_column(df):
    """Add the PATIENT_COL column to df and return the same dataframe.

    The new column holds the first LEN_CASE_ID characters of the TUMOR_COL
    value (converted to string) of each row. df is modified in place.
    """
    tumor_names = df[TUMOR_COL].astype(str)
    df[PATIENT_COL] = tumor_names.str.slice(stop=LEN_CASE_ID)
    return df

In [5]:
def _drop_cols(df,cols):
    if len(cols)>0:
        df = df.drop(columns=cols) 
    return df

In [6]:
# Some tracking columns must be removed before training.
def drop_bad_cols(df):
    """Return a copy of df without the columns that must not reach training.

    Columns are removed only if they are present, so the function works on
    csv files that lack some (or all) of the bookkeeping columns.

    Removed columns:
      * names starting with 'ExecutionTime_' -- timing stats can give away
        classes if classes were processed differently;
      * names starting with 'AreaOccupied_TotalArea_' -- the total patch area
        is constant so it is no use for training;
      * names ending with '_Tumor' -- mostly empty except FileName_Tumor,
        which can give away the class or at least the patient;
      * the named string/bookkeeping columns listed below, including the
        'Patient' column that was added to split the dataframe by patient;
      * 'ProcessingStatus' -- Windows runs have this, Linux runs don't.
        A message is printed when it is removed.

    Parameters:
        df: dataframe whose column labels are strings.

    Returns:
        A new dataframe; df itself is not modified.
    """
    ps = 'ProcessingStatus'
    bad_prefixes = ('ExecutionTime_','AreaOccupied_TotalArea_')
    bad_names = {
        'Group_Index','Group_Number','ImageSet_ImageSet',
        'Patient','Metadata_FileLocation',ps}
    # Build the list from the columns actually present so that a missing
    # bookkeeping column can never raise KeyError in drop().
    bad_cols = [c for c in df.columns
                if c.startswith(bad_prefixes)
                or c.endswith('_Tumor')
                or c in bad_names]
    if ps in bad_cols:
        print('Removed column ProcessingStatus')
    return df.drop(columns=bad_cols)

In [7]:
# Convert NaN to zero
# Usual cause of NaN is mean_RBC_diameter where Count_RBC=0
def drop_bad_vals(df):
    """Replace NaN with 0 in df (in place) and verify the result is all numeric.

    The dtype check runs first so that a dataframe containing text columns is
    rejected with the intended error (rather than a TypeError from np.isinf)
    and is left unmodified. Infinite values are not replaced; they are
    reported as an error.

    Parameters:
        df: dataframe to clean; NaN values are filled in place.

    Returns:
        The same dataframe object, with NaN replaced by 0.

    Raises:
        ValueError: if df has any column that is neither numeric nor boolean,
            or if any value is infinite.
    """
    non_numeric = df.select_dtypes(exclude=['number','bool'])
    if len(non_numeric.columns)>0:
        raise ValueError('Non-numeric values in df')
    df.fillna(0,inplace=True)
    # Every remaining column is numeric or boolean, so isinf is well defined.
    if np.isinf(df).values.any():
        raise ValueError('Non-numeric values in df')
    return df

In [8]:
# Save csv to the output directory which is named after the patient/case ID.
# Create the output directory if it does not already exist from a prior run.
def _save_datafile(patient,df,filename):
    """Write df as csv to BASE_PATH_OUT/<patient>/<filename>, without the index.

    The patient directory, and any missing parent directories, are created on
    demand. An existing directory is reused and an existing file of the same
    name is overwritten.

    Parameters:
        patient: patient/case ID, used as the directory name.
        df: dataframe to write.
        filename: name of the csv file inside the patient directory.
    """
    outdir = BASE_PATH_OUT + patient + '/'
    # makedirs also creates BASE_PATH_OUT itself on a first run, and
    # exist_ok=True makes a directory left over from a prior run a non-event.
    os.makedirs(outdir, exist_ok=True)
    df.to_csv(outdir + filename,index=False)

In [9]:
# Process all patients within a given dataframe.
# Expect a multi-patient dataframe.
def extract_patients(df):
    """Split df by patient and write two csv files per patient.

    For each distinct value of PATIENT_COL, in order of first appearance:
      1. save the original IMAGE_COL and TUMOR_COL values to TRACKING_FILE,
         before any renumbering or column removal;
      2. renumber the images, drop the unwanted columns, replace NaN, and
         save the result to IMAGE_FILE.
    Progress and the before/after shapes are printed.

    Parameters:
        df: dataframe that already contains PATIENT_COL, IMAGE_COL and
            TUMOR_COL. It is not modified.
    """
    # Use the shared constants so the tracking file cannot drift from the
    # column names used everywhere else.
    keep_cols=[IMAGE_COL,TUMOR_COL]
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for one_patient in patients:
        print('Start patient',one_patient)
        patient_rows = df.loc[df[PATIENT_COL]==one_patient]
        shape1 = patient_rows.shape
        _save_datafile(one_patient,patient_rows[keep_cols],TRACKING_FILE)
        # reset_index returns a new frame whose row labels run 0..n-1,
        # so the in-place renumbering below does not touch df.
        patient_data = patient_rows.reset_index(drop=True)
        renumber_images(patient_data)
        patient_data = drop_bad_cols(patient_data)
        patient_data = drop_bad_vals(patient_data)
        _save_datafile(one_patient,patient_data,IMAGE_FILE)
        shape2 = patient_data.shape
        print('Patient',one_patient,'shape changed from',shape1,'to',shape2)
In [10]:
# Main loop.
# Process Image.csv
# For every input directory: read its Image.csv, tag each row with its
# patient ID, then write the per-patient files. A timestamp is printed at the
# start and after each directory to show how long each one takes.
# (datetime is imported once, in the first cell; it is not re-imported here.)
print(datetime.now())
for indir in INPUT_DIRS:
    infile = BASE_PATH_IN+indir+IMAGE_FILE
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df = add_patient_column(df)
    extract_patients(df)
    print(datetime.now())
print('Done')

2022-08-01 13:37:43.062487
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5/Process100_Image.csv
Original dataframe shape: (1591, 5346)
Patients to process: ['TCGA-DB-A4XG-01Z-00' 'TCGA-HT-A5R9-01Z-00' 'TCGA-QH-A6CU-01Z-00']
Start patient TCGA-DB-A4XG-01Z-00
Original ImageNumber min,max: 1 790
Revised ImageNumber min,max: 1 790
Patient TCGA-DB-A4XG-01Z-00 shape changed from (790, 5347) to (790, 5302)
Start patient TCGA-HT-A5R9-01Z-00
Original ImageNumber min,max: 791 1186
Revised ImageNumber min,max: 1 396
Patient TCGA-HT-A5R9-01Z-00 shape changed from (396, 5347) to (396, 5302)
Start patient TCGA-QH-A6CU-01Z-00
Original ImageNumber min,max: 1187 1591
Revised ImageNumber min,max: 1 405
Patient TCGA-QH-A6CU-01Z-00 shape changed from (405, 5347) to (405, 5302)
2022-08-01 13:37:49.455740
Reading from /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5.1/Process100_Image.csv
Original dataframe shape: (4402, 5347)
Patients to process: ['TCGA-DB-A64P-0

Patient TCGA-DU-6402-01Z-00 shape changed from (393, 5347) to (393, 5302)
Start patient TCGA-DU-7299-01Z-00
Original ImageNumber min,max: 1519 1914
Revised ImageNumber min,max: 1 396
Patient TCGA-DU-7299-01Z-00 shape changed from (396, 5347) to (396, 5302)
Start patient TCGA-FG-5963-01Z-00
Original ImageNumber min,max: 1915 2310
Revised ImageNumber min,max: 1 396
Patient TCGA-FG-5963-01Z-00 shape changed from (396, 5347) to (396, 5302)
Start patient TCGA-FG-7636-01Z-00
Original ImageNumber min,max: 2311 2711
Revised ImageNumber min,max: 1 401
Patient TCGA-FG-7636-01Z-00 shape changed from (401, 5347) to (401, 5302)
Start patient TCGA-FG-A60L-01Z-00
Original ImageNumber min,max: 2712 3117
Revised ImageNumber min,max: 1 406
Patient TCGA-FG-A60L-01Z-00 shape changed from (406, 5347) to (406, 5302)
Start patient TCGA-HT-7606-01Z-00
Original ImageNumber min,max: 3118 3913
Revised ImageNumber min,max: 1 796
Patient TCGA-HT-7606-01Z-00 shape changed from (796, 5347) to (796, 5302)
Start patie

Patient TCGA-HT-7605-01Z-00 shape changed from (1537, 5347) to (1537, 5302)
Start patient TCGA-HT-7616-01Z-00
Original ImageNumber min,max: 5485 6274
Revised ImageNumber min,max: 1 790
Patient TCGA-HT-7616-01Z-00 shape changed from (790, 5347) to (790, 5302)
Start patient TCGA-HT-7676-01Z-00
Original ImageNumber min,max: 6275 6647
Revised ImageNumber min,max: 1 373
Patient TCGA-HT-7676-01Z-00 shape changed from (373, 5347) to (373, 5302)
Start patient TCGA-HT-7693-01Z-00
Original ImageNumber min,max: 6648 7783
Revised ImageNumber min,max: 1 1136
Patient TCGA-HT-7693-01Z-00 shape changed from (1136, 5347) to (1136, 5302)
Start patient TCGA-HT-7881-01Z-00
Original ImageNumber min,max: 7784 9701
Revised ImageNumber min,max: 1 1918
Patient TCGA-HT-7881-01Z-00 shape changed from (1918, 5347) to (1918, 5302)
Start patient TCGA-HT-A617-01Z-00
Original ImageNumber min,max: 9702 10101
Revised ImageNumber min,max: 1 400
Patient TCGA-HT-A617-01Z-00 shape changed from (400, 5347) to (400, 5302)
St

Patient TCGA-06-5412-01Z-00 shape changed from (348, 5347) to (348, 5302)
Start patient TCGA-08-0517-01Z-00
Original ImageNumber min,max: 21231 21623
Revised ImageNumber min,max: 1 393
Patient TCGA-08-0517-01Z-00 shape changed from (393, 5347) to (393, 5302)
Start patient TCGA-08-0518-01Z-00
Original ImageNumber min,max: 21624 24819
Revised ImageNumber min,max: 1 3196
Patient TCGA-08-0518-01Z-00 shape changed from (3196, 5347) to (3196, 5302)
Start patient TCGA-08-0520-01Z-00
Original ImageNumber min,max: 24820 25215
Revised ImageNumber min,max: 1 396
Patient TCGA-08-0520-01Z-00 shape changed from (396, 5347) to (396, 5302)
Start patient TCGA-14-0786-01Z-00
Original ImageNumber min,max: 25216 26413
Revised ImageNumber min,max: 1 1198
Patient TCGA-14-0786-01Z-00 shape changed from (1198, 5347) to (1198, 5302)
Start patient TCGA-14-0787-01Z-00
Original ImageNumber min,max: 26414 27036
Revised ImageNumber min,max: 1 623
Patient TCGA-14-0787-01Z-00 shape changed from (623, 5347) to (623, 5