# Cell Profiler - Per Patient
Redistribute rows of Nucleus.csv files into one file per patient.  
In preparation for other csv files, store each file in its own directory.  
Also, drop the columns that won't be used for classification.  

Problem: the raw CellProfiler outputs were organized per run.  
Inputs like: Output0/Process100_Nucleus.csv (combines patients of cancer class 0)  
Outputs like: TCGA-HT-7482-01Z-00/Process100_Nucleus.csv (specific to one patient)

Based on CP_PerPatient.01 which worked out the Image.csv mechanics.   

Because the class 0 nucleus file is so large (1 million lines), we avoid reading it into RAM.

For RF, change NaN to zero.

In [1]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
# Print the start time; the last cell prints the time again, so the two
# timestamps bracket the whole run.
print(datetime.now())

2022-08-09 10:01:09.303257


In [2]:
# --- Locations ---------------------------------------------------------------
# Per-run CellProfiler results are read from BASE_PATH_IN;
# per-patient results are written under BASE_PATH_OUT.
BASE_PATH_IN = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/'
BASE_PATH_OUT = '/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'
# CSV that maps each original (directory, image number) to a patient and a
# new image number.
TRACKING_FILE = BASE_PATH_OUT + 'PatchTracking.csv'

# --- Input layout ------------------------------------------------------------
# One sub-directory per CellProfiler run, processed in this order.
INPUT_DIRS = [
    'Output5/',
    'Output5.1/',
    'Output4/',
    'Output4.1/',
    'Output3/',
    'Output3.1/',
    'Output2/',
    'Output1/',
    'Output0/',
]
# CSV files found in every run directory.
FILENAMES = [
    'Process100_Image.csv',
    'Process100_Cells.csv',
    'Process100_ExpandCells.csv',
    'Process100_Experiment.csv',
    'Process100_MergeRBC.csv',
    'Process100_Nucleus.csv',
    'Process100_RBC.csv',
    'Process100_ShrinkRBC.csv',
    'Process100_Tissue.csv',
]
# Positions 0 and 5 in FILENAMES are the Image and Nucleus files.
IMAGE_FILE, NUCLEUS_FILE = FILENAMES[0], FILENAMES[5]

# --- Column names ------------------------------------------------------------
IMAGE_COL = 'ImageNumber'
TUMOR_COL = 'FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL = 'Patient'        # add this column to emphasize patient ID

# --- Identifier lengths ------------------------------------------------------
# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
#   patient / case ID = first 19 letters  (TCGA-06-0129-01Z-00)
#   WSI ID            = first 23 letters  (TCGA-06-0129-01Z-00-DX1)
LEN_CASE_ID = 19
LEN_WSI_ID = 23

In [3]:
# Look at the column names of one Nucleus file to list columns that are
# candidates for removal. Only the header is needed, so read zero data rows
# (nrows=0) rather than loading the whole file into RAM.
TEST = BASE_PATH_IN + INPUT_DIRS[0] + NUCLEUS_FILE
df = pd.read_csv(TEST, nrows=0)
# Columns listed by exact name.
bad_cols=['Children_Cells_Count',
          'Number_Object_Number','ImageNumber','ObjectNumber']
# Columns selected purely by name prefix.
loc_cols=[c for c in df.columns
          if c.startswith(('Location_',
                           'AreaShape_BoundingBoxM',
                           'AreaShape_Center'))]
# NOTE(review): neither bad_cols nor loc_cols is applied anywhere else in this
# notebook, so the per-patient files keep every column -- confirm whether the
# drop was meant to happen here or in a later step.
#for c in df.columns:
#    print(c)

In [4]:
def load_imagenum_converter(tracking_file=None):
    """Load the image-number lookup used when rows move to per-patient files.

    Parameters
    ----------
    tracking_file : str or path-like, optional
        CSV containing the columns 'patient_directory', 'orig_directory',
        'orig_imagenum' and 'new_imagenum'. When omitted, the module-level
        TRACKING_FILE is read.

    Returns
    -------
    dict
        Maps (orig_directory, orig_imagenum) to (patient_directory,
        new_imagenum). Both image numbers are Python ints. If a key occurs
        more than once in the file, the last row wins.

    Raises
    ------
    KeyError
        If one of the four required columns is missing.
    ValueError
        If an image number cannot be converted to int (e.g. a blank cell).
    """
    if tracking_file is None:
        tracking_file = TRACKING_FILE
    tracking = pd.read_csv(tracking_file)
    # Walk the four columns in lockstep; unlike iterrows() this does not
    # build a Series object for every row.
    records = zip(tracking['patient_directory'],
                  tracking['orig_directory'],
                  tracking['orig_imagenum'],
                  tracking['new_imagenum'])
    converter = {}
    for patient, orig_directory, orig_imagenum, new_imagenum in records:
        key = (orig_directory, int(orig_imagenum))
        converter[key] = (patient, int(new_imagenum))
    return converter

In [5]:
def save_patient(patient,list_of_dict):
    """Write one patient's rows to BASE_PATH_OUT/<patient>/NUCLEUS_FILE.

    Parameters
    ----------
    patient : str or None
        Name of the patient directory. None means "nothing to save yet" and
        makes the call a no-op.
    list_of_dict : list of dict
        One dict per output row (column name -> value).

    The literal string 'nan' is turned into a real NaN and every NaN is then
    replaced by 0 before writing. The NaN counts before and after the fill
    are printed.
    """
    if patient is None:
        return
    filename = BASE_PATH_OUT + patient + '/' + NUCLEUS_FILE
    # Make sure the patient directory exists; to_csv does not create it.
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    print('Save',filename)
    df = pd.DataFrame(list_of_dict)
    # Values read through the csv module are strings, so missing values
    # arrive as the text 'nan' rather than as float NaN.
    df = df.replace('nan',np.nan)
    before = df.isna().sum().sum()
    df = df.fillna(0)
    after = df.isna().sum().sum()
    print('Nan counts before and after:',before,after)
    df.to_csv(filename,index=False) #,na_rep='nan')

In [6]:
# Stream every per-run Nucleus file and split its rows into one file per patient.
print(datetime.now())
print('Load image number conversions')
converter = load_imagenum_converter()
patients_processed = []
for orig_directory in INPUT_DIRS:
    filename = BASE_PATH_IN + orig_directory + NUCLEUS_FILE
    print(datetime.now())
    print('Processing',filename)
    # newline='' is what the csv module documentation asks for, so that
    # newlines inside quoted fields are handled by the csv reader itself.
    with open(filename,'r',newline='') as infile:
        patient = None
        patient_rows = []
        prev_patient = None
        # Read row by row so the input file is never held in RAM as a whole;
        # only the rows of the current patient are buffered.
        reader = csv.DictReader(infile)
        for row in reader:
            orig_imagenum = int(row[IMAGE_COL])
            key = (orig_directory,orig_imagenum)
            if key not in converter:
                # Same exception type as a plain dict lookup, but say where.
                raise KeyError('No tracking entry for '+str(key)+' while reading '+filename)
            (patient,new_imagenum) = converter[key]
            if patient != prev_patient:
                # Patient changed: flush the rows collected so far
                # (save_patient ignores the initial None).
                save_patient(prev_patient,patient_rows)
                prev_patient = patient
                patient_rows = []
                # Each save overwrites the patient's file, so a patient whose
                # rows are not contiguous would lose data; stop instead.
                if patient in patients_processed:
                    raise Exception(patient+' seen twice')
                patients_processed.append(patient)
            # Renumber the image to its per-patient image number.
            row[IMAGE_COL] = new_imagenum
            patient_rows.append(row)
        save_patient(patient,patient_rows)  # special case: last patient per file
print('Saved',len(patients_processed),'patients')

2022-08-09 10:01:13.147650
Load image number conversions
2022-08-09 10:01:15.409708
Processing /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5/Process100_Nucleus.csv
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-DB-A4XG-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 96016 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-HT-A5R9-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 41286 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-QH-A6CU-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 23311 0
2022-08-09 10:01:49.361434
Processing /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerOutputs/Output5.1/Process100_Nucleus.csv
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-DB-A64P-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 17763 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-DH-A669-01Z-00/Process100_Nucleus.csv
Nan coun

Nan counts before and after: 182160 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-HT-7483-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 79908 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-HT-7681-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 110315 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-HT-7873-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 85204 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-HT-7902-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 31951 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-QH-A6CZ-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 14092 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-QH-A6XA-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 12566 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-S9-A6WI-01Z-00/Process100_Nu

Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-06-5412-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 26058 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-08-0517-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 34725 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-08-0518-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 177560 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-08-0520-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 24605 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-14-0786-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 74335 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-14-0787-01Z-00/Process100_Nucleus.csv
Nan counts before and after: 50270 0
Save /home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/TCGA-14-0789-01Z-00/Process100_Nucleus.csv
Nan counts before and after:

In [7]:
# Print the finish time; compare with the timestamp printed by the first cell
# to get the total run time.
print(datetime.now())


2022-08-09 10:21:11.476070
