# Nucleus
Break up the CellProfiler Nucleus.csv files.
The data were clumped by cancer class (all of class 0 in one Nucleus.csv file) 
merely because they were processed in that order.

Here, create one Nucleus.csv file per case/patient. (We still need to split up other object files like Cells.csv, but we'll do that next.)

Whereas the Image.csv files contained patient ID in the FileName_Tumor field,
the Nucleus.csv files do not.
It is necessary to get the patient ID from the Image.csv file,
using their common field ImageNumber.

Note that a case may have two or more whole-slide images (WSIs).
We aim to predict the cancer class per patch, then roll the predictions up to the WSI or case level.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Print the current wall-clock time so the start of the run is recorded in the output.
from datetime import datetime
print(datetime.now())

2022-07-23 09:23:34.565151


In [3]:
# Root directory holding the CellProfiler outputs that this notebook reads.
BASE_PATH_IN='/home/jrm/Adjeroh/Naved/CP_80K/'
# Root directory under which one sub-directory per patient is written (see save()).
BASE_PATH_OUT='/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/'
# Input sub-directories, relative to BASE_PATH_IN; each ends with '/' because
# paths are built by plain string concatenation.
# Presumably one directory per cancer class (0-5) -- TODO confirm.
INPUT_DIRS=[
'Output5/',
'Output4/',
'Output3/',
'Output2/',
'Output1/',
'Output0/'
]
# CSV files expected in every input directory.
# The index positions matter: IMAGE_FILENAME and OBJECT_FILENAME below select by position.
FILENAMES=[
'Process100_Image.csv',
'Process100_Cells.csv',
'Process100_ExpandCells.csv',
'Process100_Experiment.csv',
'Process100_MergeRBC.csv',
'Process100_Nucleus.csv',
'Process100_RBC.csv',
'Process100_ShrinkRBC.csv',
'Process100_Tissue.csv']
IMAGE_FILENAME = FILENAMES[0]  # Image.csv
OBJECT_FILENAME = FILENAMES[5]  # Nucleus.csv
# Column shared by Image.csv and the object files; used to join them.
IMAGE_COL='ImageNumber'
TUMOR_COL='FileName_Tumor'   # use this column to disambiguate patients
# Name of the patient/case ID column that this notebook adds.
PATIENT_COL='Patient'
LEN_CASE_ID=19 # prefix of TCGA-06-0129-01Z-00-DX1_5400_5100.png
# Length of the WSI ID prefix of the same file name (first 23 characters).
# Not referenced by any other cell visible in this notebook.
LEN_WSI_ID=23

## Process one Image.csv file

In [4]:
def add_patient_column(df):
    """Add a patient (case) ID column derived from the patch file name.

    Patch file names look like ``TCGA-06-0129-01Z-00-DX1_5400_5100.png``.
    The first LEN_CASE_ID characters are taken as the patient/case ID
    (the first 23 characters would give the WSI ID instead).

    The frame is modified in place: the IDs are stored under PATIENT_COL,
    and the same frame object is returned for convenience.
    """
    filenames_as_text = df[TUMOR_COL].astype(str)
    df[PATIENT_COL] = filenames_as_text.str.slice(stop=LEN_CASE_ID)
    return df

In [5]:
def get_mapping(indir):
    """Build the ImageNumber -> patient lookup for one input directory.

    The link between an image number and its patient exists only in the
    Image.csv file; object files such as Nucleus.csv carry just the image
    number.

    indir: sub-directory name (with trailing '/') under BASE_PATH_IN.
    Returns a DataFrame indexed by IMAGE_COL whose only column is PATIENT_COL.
    """
    infile = BASE_PATH_IN + indir + IMAGE_FILENAME
    print('Reading from', infile)
    images = add_patient_column(pd.read_csv(infile))
    return images[[IMAGE_COL, PATIENT_COL]].set_index(IMAGE_COL)

In [6]:
# Smoke test: build the ImageNumber -> patient mapping for INPUT_DIRS[5]
# (the last entry, 'Output0/') and display it as the cell output.
image_to_patient = get_mapping(INPUT_DIRS[5])
image_to_patient

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output0/Process100_Image.csv


Unnamed: 0_level_0,Patient
ImageNumber,Unnamed: 1_level_1
1,TCGA-02-0004-01Z-00
2,TCGA-02-0004-01Z-00
3,TCGA-02-0004-01Z-00
4,TCGA-02-0004-01Z-00
5,TCGA-02-0004-01Z-00
...,...
44962,TCGA-41-5651-01Z-00
44963,TCGA-41-5651-01Z-00
44964,TCGA-41-5651-01Z-00
44965,TCGA-41-5651-01Z-00


## Process one Nucleus.csv file

In [7]:
def get_nuclei(indir):
    """Load one Nucleus.csv file and strip the columns this analysis ignores.

    indir: sub-directory name (with trailing '/') under BASE_PATH_IN.
    Returns a DataFrame indexed by IMAGE_COL, with an empty-string
    PATIENT_COL appended as a placeholder to be filled in later.
    """
    infile = BASE_PATH_IN + indir + OBJECT_FILENAME
    print('Reading from', infile)
    nuclei = pd.read_csv(infile).set_index(IMAGE_COL)
    # In our pipeline every nucleus has exactly 1 child cell, and the
    # per-object numbering is not needed, so those columns go.
    unwanted = ['Children_Cells_Count', 'ObjectNumber', 'Number_Object_Number']
    # All 'Location_*' columns are discarded as well.
    unwanted += [name for name in nuclei.columns if name.startswith('Location_')]
    nuclei = nuclei.drop(columns=unwanted)
    nuclei[PATIENT_COL] = ''
    return nuclei

In [8]:
def rollup(df):
    """Summarise the per-object rows into one row of statistics per image.

    Groups by IMAGE_COL and applies describe(), then flattens the resulting
    two-level (feature, statistic) column labels into single
    'feature_statistic' strings.
    """
    per_image = df.groupby([IMAGE_COL]).describe()  # this step is slow
    # Flat string column names help the downstream random forest code.
    per_image.columns = per_image.columns.map(lambda pair: '_'.join(pair))
    return per_image

In [9]:
def populate_patient(df,mapper):
    """Fill PATIENT_COL of ``df`` by looking up each row's image number.

    df:     DataFrame whose index holds image numbers.
    mapper: DataFrame indexed by image number with a PATIENT_COL column
            (as returned by get_mapping); its index must be unique.

    The column is written in place (created if absent) and ``df`` is returned.
    Raises KeyError if any image number in ``df`` is absent from ``mapper``.

    The lookup is done once for the whole index instead of row by row:
    iterating with iterrows() while assigning into the same frame is slow
    and is discouraged by pandas.
    """
    patients = mapper[PATIENT_COL]
    unknown = df.index[~df.index.isin(patients.index)]
    if len(unknown) > 0:
        # Fail as a per-row .loc lookup would for an unknown image number.
        raise KeyError('Image numbers missing from mapper: %s'
                       % list(unknown.unique()[:5]))
    # reindex aligns the patient IDs to df's row order (duplicates allowed);
    # to_numpy() makes the assignment positional.
    df[PATIENT_COL] = patients.reindex(df.index).to_numpy()
    return df

In [None]:
# End-to-end test on INPUT_DIRS[5] ('Output0/'): load the image->patient
# mapping and the nuclei, roll the nuclei up to per-image statistics, then
# attach the patient ID to every rolled-up row.
# NOTE: per the notes below this cell, the describe() call inside rollup()
# is very slow and runs out of memory on this input.
mapper = get_mapping(INPUT_DIRS[5])
df = get_nuclei(INPUT_DIRS[5])
print(df.shape)
print('Rollup...')
df = rollup(df)
print(df.shape)
print('Patient...')
df = populate_patient(df,mapper)
print(df.shape)
df

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output0/Process100_Image.csv
Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output0/Process100_Nucleus.csv


# This is where we stopped
The describe function is very slow and runs out of memory.
We already have the mean, median, and standard deviation in the Image.csv file,
so we can live without the Nucleus.csv file.

In [None]:
# Print the current wall-clock time as a progress checkpoint.
print(datetime.now())

In [None]:
def renumber_images(df):
    """Shift the IMAGE_COL values of ``df`` in place so the smallest becomes 1.

    Every value is offset by the same amount, so gaps and repeated image
    numbers are preserved. The minimum and maximum are printed before and
    after the shift. Nothing is returned; ``df`` itself is modified.

    The shift is applied to the whole column at once, so it does not depend
    on the frame having a 0..n-1 index. An empty frame is left untouched.
    """
    if len(df) == 0:
        return  # nothing to renumber; min()/max() are undefined here
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    # Vectorised equivalent of: new = old - min + 1 for every row.
    df[IMAGE_COL] = df[IMAGE_COL] - min_image_num + 1
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [None]:
def save(df,dirname,filename):
    """Write ``df`` as CSV to BASE_PATH_OUT/<dirname>/<filename>.

    df:       DataFrame to write (the index is not written).
    dirname:  sub-directory under BASE_PATH_OUT; created if missing, along
              with any missing parent directories, so the first run works
              even when BASE_PATH_OUT does not exist yet.
    filename: name of the CSV file inside that directory.

    If the directory already exists a notice is printed and any file of the
    same name in it is overwritten.
    """
    outdir = os.path.join(BASE_PATH_OUT, dirname)
    if os.path.isdir(outdir):
        print('Directory',dirname,'already exists. May overwrite files in there.')
    else:
        # makedirs (unlike mkdir) also creates missing parents.
        os.makedirs(outdir, exist_ok=True)
    outfile = os.path.join(outdir, filename)
    df.to_csv(outfile,index=False)

In [None]:
# This loop has to be run separately for each cancer class,
# since ImageNumber restarts at 1 within each class.

def extract_patients(df,filename):
    """Split ``df`` by patient and save one CSV per patient.

    For each distinct PATIENT_COL value, in order of first appearance, the
    matching rows are copied out, re-indexed from zero, renumbered with
    renumber_images() and written with save() into a directory named after
    the patient.

    df:       DataFrame containing PATIENT_COL (and the image-number column).
    filename: file name to use inside every patient directory.
    """
    patients = df[PATIENT_COL].unique()
    print('Patients to process:', patients)
    for patient_id in patients:
        print('Start patient', patient_id)
        belongs_to_patient = df[PATIENT_COL] == patient_id
        # Fresh 0-based index so positional access starts at zero.
        subset = df.loc[belongs_to_patient].reset_index(drop=True)
        renumber_images(subset)
        save(subset, patient_id, filename)
        print('Saved patient', patient_id, 'filename', filename, 'shape:', subset.shape)

## Process all Nucleus.csv files

In [None]:


# Split every Nucleus.csv into one file per patient.
# Nucleus.csv has no FileName_Tumor column, so the patient ID cannot be
# derived from it directly; it is looked up through the ImageNumber ->
# patient mapping built from the Image.csv file in the same directory.
for indir in INPUT_DIRS:
    patient_by_image = get_mapping(indir)[PATIENT_COL]
    infile = BASE_PATH_IN+indir+OBJECT_FILENAME
    print('Reading from',infile)
    df = pd.read_csv(infile)
    print('Original dataframe shape:',df.shape)
    df[PATIENT_COL] = df[IMAGE_COL].map(patient_by_image)
    # An image number absent from Image.csv would yield a missing patient ID
    # and silently drop those rows from every per-patient file; stop instead.
    n_unmatched = int(df[PATIENT_COL].isna().sum())
    if n_unmatched:
        raise ValueError('%d rows in %s have an ImageNumber not found in %s'
                         % (n_unmatched, infile, IMAGE_FILENAME))
    extract_patients(df,OBJECT_FILENAME)

In [None]:
# Print the current wall-clock time so the end of the run is recorded in the output.
# (datetime is imported again here; the same import appears in an earlier cell.)
from datetime import datetime
print(datetime.now())

In [None]:
# Ad-hoc inspection: load the raw Output5 Nucleus.csv so its columns can be listed below.
# The path is the same as BASE_PATH_IN + INPUT_DIRS[0] + OBJECT_FILENAME.
df = pd.read_csv('/home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Nucleus.csv')

In [None]:
# Print every column name of the loaded frame, one per line.
for c in df.columns:
    print(c)