# Data 04
Break up the CellProfiler Nucleus.csv files.
The data were clumped by cancer class (all of class 0 in one Nucleus.csv file) 
merely because they were processed in that order.

Here, create one Nucleus.csv file per case/patient. (We still need to split up other object files like Cells.csv, but we'll do that next.)

Whereas the Image.csv files contained patient ID in the FileName_Tumor field,
the Nucleus.csv files do not.
It is necessary to get the patient ID from the Image.csv file,
using their common field ImageNumber.

Note there may be 2 or more WSI per case.
We aim to predict cancer class per patch, then roll up to WSI or case.

In [11]:
import numpy as np
import pandas as pd
import os

In [12]:
from datetime import datetime
print(datetime.now())

2022-07-21 15:56:11.478243


In [13]:
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_OUT='/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/'
INPUT_DIRS=[
'Output5/',
'Output4/',
'Output3/',
'Output2/',
'Output1/',
'Output0/'
]
FILENAMES=[
'Process100_Image.csv',
'Process100_Cells.csv',
'Process100_ExpandCells.csv',
'Process100_Experiment.csv',
'Process100_MergeRBC.csv',
'Process100_Nucleus.csv',
'Process100_RBC.csv',
'Process100_ShrinkRBC.csv',
'Process100_Tissue.csv']
IMAGE_FILENAME = FILENAMES[0]  # Image.csv  
OBJECT_FILENAME = FILENAMES[5]  # Nucleus.csv
IMAGE_COL='ImageNumber'
TUMOR_COL='FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL='Patient'
LEN_CASE_ID=19 # prefix of TCGA-06-0129-01Z-00-DX1_5400_5100.png
LEN_WSI_ID=23

In [18]:
def get_mapping(indir):
    infile = BASE_PATH_IN+indir+IMAGE_FILENAME
    print('Reading from',infile)
    df = pd.read_csv(infile)
    mapping = {}
    for row in df:
        image_number = row[IMAGE_COL]
        tumor_file = row[TUMOR_COL]
        patient_ID = str(tumor_file)[:LEN_CASE_ID]
        # TO DO: verify that we always get the same mapping!
        mapping[image_number]=patient_ID
    return mapping

In [20]:
# test
IMAGE_TO_PATIENT = get_mapping(INPUT_DIRS[0])
print(IMAGE_TO_PATIENT)

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Image.csv


TypeError: string indices must be integers

In [4]:
def renumber_images(df):
    min_image_num = min(df[IMAGE_COL])
    max_image_num = max(df[IMAGE_COL])
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    for i in range(len(df)):
        old_num=df.at[i,IMAGE_COL]
        df.at[i,'ImageNumber']=old_num-min_image_num+1
    min_image_num = min(df[IMAGE_COL])
    max_image_num = max(df[IMAGE_COL])
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [5]:
def save(df,dirname,filename):
    outdir = BASE_PATH_OUT+dirname
    try:
        os.mkdir(outdir)
    except FileExistsError:
        print('Directory',dirname,'already exists. May overwrite files in there.')
    outfile = outdir+'/'+filename
    df.to_csv(outfile,index=False)

In [6]:
# This loop will have to operate on cancer class
# since Image_number starts at 1 per class.

def extract_patients(df,filename):
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for one_patient in patients:
        print('Start patient',one_patient)
        patient_data = df.loc[df[PATIENT_COL]==one_patient]
        patient_data = patient_data.reset_index(drop=True) # so iloc starts at zero
        renumber_images(patient_data)
        save(patient_data,one_patient,filename)
        print('Saved patient',one_patient,'filename',filename,'shape:',patient_data.shape)

In [7]:
def add_patient_column(df):
    # Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
    # For WSI ID, use first 23 letters.
    # For patient or case ID, use first 19 letters. 
    patient_column = df[TUMOR_COL].astype(str).str[:LEN_CASE_ID]
    # For convenience, add a column containing patient ID.
    df[PATIENT_COL] = patient_column
    return df

In [8]:


for indir in INPUT_DIRS:
    infile = BASE_PATH_IN+indir+filename
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df = add_patient_column(df)
    extract_patients(df,filename)

We will operate on files named Process100_Nucleus.csv
Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv
Original dataframe shape: (53535, 650)


KeyError: 'FileName_Tumor'

In [None]:
from datetime import datetime
print(datetime.now())

In [9]:
df = pd.read_csv('/home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv')

In [10]:
for c in df.columns:
    print(c)

ImageNumber
ObjectNumber
AreaShape_Area
AreaShape_BoundingBoxArea
AreaShape_BoundingBoxMaximum_X
AreaShape_BoundingBoxMaximum_Y
AreaShape_BoundingBoxMinimum_X
AreaShape_BoundingBoxMinimum_Y
AreaShape_Center_X
AreaShape_Center_Y
AreaShape_CentralMoment_0_0
AreaShape_CentralMoment_0_1
AreaShape_CentralMoment_0_2
AreaShape_CentralMoment_0_3
AreaShape_CentralMoment_1_0
AreaShape_CentralMoment_1_1
AreaShape_CentralMoment_1_2
AreaShape_CentralMoment_1_3
AreaShape_CentralMoment_2_0
AreaShape_CentralMoment_2_1
AreaShape_CentralMoment_2_2
AreaShape_CentralMoment_2_3
AreaShape_Compactness
AreaShape_ConvexArea
AreaShape_Eccentricity
AreaShape_EquivalentDiameter
AreaShape_EulerNumber
AreaShape_Extent
AreaShape_FormFactor
AreaShape_HuMoment_0
AreaShape_HuMoment_1
AreaShape_HuMoment_2
AreaShape_HuMoment_3
AreaShape_HuMoment_4
AreaShape_HuMoment_5
AreaShape_HuMoment_6
AreaShape_InertiaTensorEigenvalues_0
AreaShape_InertiaTensorEigenvalues_1
AreaShape_InertiaTensor_0_0
AreaShape_InertiaTensor_0_1
Area