# Nucleus
Break up the CellProfiler Nucleus.csv files.
The data were clumped by cancer class (all of class 0 in one Nucleus.csv file) 
merely because they were processed in that order.

Here, create one Nucleus.csv file per case/patient. (We still need to split up other object files like Cells.csv, but we'll do that next.)

Whereas the Image.csv files contained patient ID in the FileName_Tumor field,
the Nucleus.csv files do not.
It is necessary to get the patient ID from the Image.csv file,
using their common field ImageNumber.

Note there may be 2 or more WSI per case.
We aim to predict cancer class per patch, then roll up to WSI or case.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from datetime import datetime
print(datetime.now())

2022-07-23 09:08:53.297346


In [3]:
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_OUT='/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/'
INPUT_DIRS=[
'Output5/',
'Output4/',
'Output3/',
'Output2/',
'Output1/',
'Output0/'
]
FILENAMES=[
'Process100_Image.csv',
'Process100_Cells.csv',
'Process100_ExpandCells.csv',
'Process100_Experiment.csv',
'Process100_MergeRBC.csv',
'Process100_Nucleus.csv',
'Process100_RBC.csv',
'Process100_ShrinkRBC.csv',
'Process100_Tissue.csv']
IMAGE_FILENAME = FILENAMES[0]  # Image.csv  
OBJECT_FILENAME = FILENAMES[5]  # Nucleus.csv
IMAGE_COL='ImageNumber'
TUMOR_COL='FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL='Patient'
LEN_CASE_ID=19 # prefix of TCGA-06-0129-01Z-00-DX1_5400_5100.png
LEN_WSI_ID=23

## Process one Image.csv file

In [4]:
def add_patient_column(df):
    # Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
    # For WSI ID, use first 23 letters.
    # For patient or case ID, use first 19 letters. 
    patient_column = df[TUMOR_COL].astype(str).str[:LEN_CASE_ID]
    # For convenience, add a column containing patient ID.
    df[PATIENT_COL] = patient_column
    return df

In [5]:
# The mapping from patient to image number exists only in the Image.csv file.
# All object files (like Nucleus) contain only image number.
def get_mapping(indir):
    infile = BASE_PATH_IN+indir+IMAGE_FILENAME
    print('Reading from',infile)
    df = pd.read_csv(infile)
    df = add_patient_column(df)
    df = df[ [IMAGE_COL,PATIENT_COL] ]
    df = df.set_index(IMAGE_COL)
    return df

In [6]:
# test
image_to_patient = get_mapping(INPUT_DIRS[0])
image_to_patient

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Image.csv


Unnamed: 0_level_0,Patient
ImageNumber,Unnamed: 1_level_1
1,TCGA-DB-A4XG-01Z-00
2,TCGA-DB-A4XG-01Z-00
3,TCGA-DB-A4XG-01Z-00
4,TCGA-DB-A4XG-01Z-00
5,TCGA-DB-A4XG-01Z-00
...,...
1587,TCGA-QH-A6CU-01Z-00
1588,TCGA-QH-A6CU-01Z-00
1589,TCGA-QH-A6CU-01Z-00
1590,TCGA-QH-A6CU-01Z-00


## Process one Nucleus.csv file

In [7]:
def get_nuclei(indir):
    infile = BASE_PATH_IN+indir+OBJECT_FILENAME
    print('Reading from',infile)
    df = pd.read_csv(infile)
    df = df.set_index(IMAGE_COL)
    # In our pipeline, every nucleus has exactly 1 child cell.
    # We have no need for the Number_Object_Number or ObjectNumber.
    df = df.drop(columns=['Children_Cells_Count','ObjectNumber','Number_Object_Number'])
    df[PATIENT_COL]=''
    return df

In [8]:
def populate_patient(df,mapper):
    for index, row in df.iterrows():
        patient = mapper.loc[index][PATIENT_COL]
        #print(index,patient)
        df.at[index,PATIENT_COL]=patient
    return df

In [9]:
# test
mapper = get_mapping(INPUT_DIRS[0])
df = get_nuclei(INPUT_DIRS[0])
df = populate_patient(df,mapper)
df

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Image.csv
Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv


Unnamed: 0_level_0,AreaShape_Area,AreaShape_BoundingBoxArea,AreaShape_BoundingBoxMaximum_X,AreaShape_BoundingBoxMaximum_Y,AreaShape_BoundingBoxMinimum_X,AreaShape_BoundingBoxMinimum_Y,AreaShape_Center_X,AreaShape_Center_Y,AreaShape_CentralMoment_0_0,AreaShape_CentralMoment_0_1,...,Texture_Variance_Hematoxylin_4_03_256,Texture_Variance_Hematoxylin_5_00_256,Texture_Variance_Hematoxylin_5_01_256,Texture_Variance_Hematoxylin_5_02_256,Texture_Variance_Hematoxylin_5_03_256,Texture_Variance_Hematoxylin_7_00_256,Texture_Variance_Hematoxylin_7_01_256,Texture_Variance_Hematoxylin_7_02_256,Texture_Variance_Hematoxylin_7_03_256,Patient
ImageNumber,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,257,364,194,14,168,0,179.680934,5.241245,257.0,-4.263256e-14,...,564.722473,648.004869,678.803563,644.486105,556.292714,589.667692,861.632147,645.127433,520.479410,TCGA-DB-A4XG-01Z-00
1,1230,1763,215,54,172,13,192.229268,31.889431,1230.0,3.126388e-13,...,427.195252,464.462349,470.453898,463.417370,411.709306,456.646188,514.630194,486.037547,412.162634,TCGA-DB-A4XG-01Z-00
1,582,945,239,57,212,22,225.123711,38.482818,582.0,1.705303e-13,...,255.022783,257.044711,272.115662,240.692317,265.408639,282.593527,268.629002,258.640336,308.727980,TCGA-DB-A4XG-01Z-00
1,833,1080,209,76,173,46,191.584634,60.704682,833.0,6.288303e-13,...,209.908539,204.716900,216.001172,210.714399,203.749999,213.471251,230.988281,194.248272,197.194280,TCGA-DB-A4XG-01Z-00
1,619,870,233,138,204,108,217.935380,122.276252,619.0,1.397993e-12,...,360.099220,402.140038,309.685058,334.911450,370.105404,422.193783,263.134268,322.224580,426.499802,TCGA-DB-A4XG-01Z-00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1590,218,275,231,300,206,289,218.536697,294.834862,218.0,-4.263256e-14,...,1038.210680,991.631311,1097.601128,1126.843023,1190.290937,964.271943,1630.624628,1697.607939,1432.979592,TCGA-QH-A6CU-01Z-00
1591,196,434,60,113,29,99,43.770408,104.454082,196.0,3.392842e-13,...,1095.310204,814.818886,762.034722,961.244051,1109.953056,771.915748,711.678571,1945.555556,1326.931332,TCGA-QH-A6CU-01Z-00
1591,406,992,122,141,90,110,104.073892,125.798030,406.0,6.679102e-13,...,1417.674260,1393.986496,1515.761414,1412.736777,1188.149908,1235.288744,1692.522912,1576.714266,781.383551,TCGA-QH-A6CU-01Z-00
1591,903,1850,145,185,108,135,124.060908,161.367663,903.0,2.700062e-13,...,1210.211681,1189.410863,1146.025525,1160.344858,1269.514888,1229.177456,1140.045958,1187.982750,1347.723797,TCGA-QH-A6CU-01Z-00


In [4]:
def renumber_images(df):
    min_image_num = min(df[IMAGE_COL])
    max_image_num = max(df[IMAGE_COL])
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    for i in range(len(df)):
        old_num=df.at[i,IMAGE_COL]
        df.at[i,'ImageNumber']=old_num-min_image_num+1
    min_image_num = min(df[IMAGE_COL])
    max_image_num = max(df[IMAGE_COL])
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [5]:
def save(df,dirname,filename):
    outdir = BASE_PATH_OUT+dirname
    try:
        os.mkdir(outdir)
    except FileExistsError:
        print('Directory',dirname,'already exists. May overwrite files in there.')
    outfile = outdir+'/'+filename
    df.to_csv(outfile,index=False)

In [6]:
# This loop will have to operate on cancer class
# since Image_number starts at 1 per class.

def extract_patients(df,filename):
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for one_patient in patients:
        print('Start patient',one_patient)
        patient_data = df.loc[df[PATIENT_COL]==one_patient]
        patient_data = patient_data.reset_index(drop=True) # so iloc starts at zero
        renumber_images(patient_data)
        save(patient_data,one_patient,filename)
        print('Saved patient',one_patient,'filename',filename,'shape:',patient_data.shape)

## Process all Nucleus.csv files

In [8]:


for indir in INPUT_DIRS:
    infile = BASE_PATH_IN+indir+filename
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df = add_patient_column(df)
    extract_patients(df,filename)

We will operate on files named Process100_Nucleus.csv
Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv
Original dataframe shape: (53535, 650)


KeyError: 'FileName_Tumor'

In [None]:
from datetime import datetime
print(datetime.now())

In [9]:
df = pd.read_csv('/home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv')

In [10]:
for c in df.columns:
    print(c)

ImageNumber
ObjectNumber
AreaShape_Area
AreaShape_BoundingBoxArea
AreaShape_BoundingBoxMaximum_X
AreaShape_BoundingBoxMaximum_Y
AreaShape_BoundingBoxMinimum_X
AreaShape_BoundingBoxMinimum_Y
AreaShape_Center_X
AreaShape_Center_Y
AreaShape_CentralMoment_0_0
AreaShape_CentralMoment_0_1
AreaShape_CentralMoment_0_2
AreaShape_CentralMoment_0_3
AreaShape_CentralMoment_1_0
AreaShape_CentralMoment_1_1
AreaShape_CentralMoment_1_2
AreaShape_CentralMoment_1_3
AreaShape_CentralMoment_2_0
AreaShape_CentralMoment_2_1
AreaShape_CentralMoment_2_2
AreaShape_CentralMoment_2_3
AreaShape_Compactness
AreaShape_ConvexArea
AreaShape_Eccentricity
AreaShape_EquivalentDiameter
AreaShape_EulerNumber
AreaShape_Extent
AreaShape_FormFactor
AreaShape_HuMoment_0
AreaShape_HuMoment_1
AreaShape_HuMoment_2
AreaShape_HuMoment_3
AreaShape_HuMoment_4
AreaShape_HuMoment_5
AreaShape_HuMoment_6
AreaShape_InertiaTensorEigenvalues_0
AreaShape_InertiaTensorEigenvalues_1
AreaShape_InertiaTensor_0_0
AreaShape_InertiaTensor_0_1
Area