# Cell Profiler - Per Patient
Roll up the per-nucleus CellProfiler measurements into per-patch summary statistics, writing one rollup file per patient.

In [1]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
# Print the current timestamp so the notebook output records when this cell ran.
print(datetime.now())

2022-08-10 07:57:17.357715


In [2]:
# Root directory holding one sub-directory per patient (names start with 'TCGA-').
# Choose the location from the host OS rather than assigning BASE_PATH twice and
# relying on the second assignment silently overwriting the first.
if os.name == 'nt':
    BASE_PATH='D:\\Adjeroh\\Glioma\\August_Run\\CellProfilerPerPatient\\'   # Windows
else:
    BASE_PATH='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/' # Alien
NUCLEUS_INPUT='Process100_Nucleus.csv'     # per-patient file read by the main loop
NUCLEUS_OUTPUT='NucleusToPatchRollup.csv'  # per-patient file written by save_patient
IMAGE_COL='ImageNumber'                    # column used as the group-by key in rollup

In [3]:
def save_patient(patient,df):
    """Write `df` as CSV to NUCLEUS_OUTPUT inside the given patient's directory.

    Nothing is written when `patient` is None.
    """
    if patient is None:
        return
    target = ''.join((BASE_PATH, patient, '/', NUCLEUS_OUTPUT))
    # Keep to_csv's default index handling: the index is written on purpose
    # (the original note says it holds ImageNumber), so do not pass index=False.
    df.to_csv(target)

In [4]:
def drop_columns(df):
    """Return `df` without the bookkeeping and nucleus-position columns.

    ImageNumber is deliberately kept: the patch-level rollup needs it.
    """
    bookkeeping = ('Children_Cells_Count','Number_Object_Number','ObjectNumber')
    # These fields give the nucleus location relative to the patch, which is irrelevant.
    position_prefixes = ('Location_','AreaShape_BoundingBoxM','AreaShape_Center')
    unwanted = [name for name in df.columns
                if name in bookkeeping or name.startswith(position_prefixes)]
    return df.drop(columns=unwanted)

In [5]:
def rollup(df, group_col=None):
    """Summarise the numeric columns of `df` per group with `describe`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the grouping column plus the columns to summarise.
    group_col : str, optional
        Column whose values define the groups. When omitted, the notebook-level
        IMAGE_COL is used, which matches the previously hard-coded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value of `group_col`. The two-level column labels
        produced by `describe` are flattened to '<column>_<statistic>'.
    """
    if group_col is None:
        group_col = IMAGE_COL
    # This is the slow step: the original author measured about 3 min/patient.
    summary = df.groupby([group_col]).describe(include=[np.number])
    # Flat string column names help the downstream random forest code.
    summary.columns = summary.columns.map('_'.join)
    return summary

In [6]:
print(datetime.now())
# Only entries whose name starts with 'TCGA-' are treated as patient directories.
patient_dirs = [entry for entry in os.listdir(BASE_PATH) if entry.startswith('TCGA-')]
for patient in patient_dirs:
    print(datetime.now())
    print('Processing',patient)
    # Stage 1: load the raw nucleus table for this patient.
    nuclei_raw = pd.read_csv(BASE_PATH + patient +'/' + NUCLEUS_INPUT)
    # Stage 2: remove the columns that drop_columns filters out.
    nuclei_reduced = drop_columns(nuclei_raw)
    # Stage 3: aggregate the remaining columns with rollup.
    patch_summary = rollup(nuclei_reduced)
    print('Shape initial, reduced, described:',
          nuclei_raw.shape,nuclei_reduced.shape,patch_summary.shape)
    save_patient(patient,patch_summary)

2022-08-10 07:57:17.442322
2022-08-10 07:57:17.442322
Processing TCGA-HW-7495-01Z-00
Shape initial, reduced, described: (4294, 650) (4294, 632) (396, 5048)
2022-08-10 08:01:33.429943
Processing TCGA-HW-A5KJ-01Z-00
Shape initial, reduced, described: (5119, 650) (5119, 632) (409, 5048)
2022-08-10 08:05:58.957632
Processing TCGA-QH-A65R-01Z-00
Shape initial, reduced, described: (8770, 650) (8770, 632) (403, 5048)
2022-08-10 08:10:19.208370
Processing TCGA-QH-A65Z-01Z-00
Shape initial, reduced, described: (6273, 650) (6273, 632) (408, 5048)
2022-08-10 08:14:42.564152
Processing TCGA-QH-A6CS-01Z-00
Shape initial, reduced, described: (5884, 650) (5884, 632) (410, 5048)
2022-08-10 08:19:07.184638
Processing TCGA-QH-A6CU-01Z-00
Shape initial, reduced, described: (7769, 650) (7769, 632) (405, 5048)
2022-08-10 08:23:29.325910
Processing TCGA-QH-A6CZ-01Z-00
Shape initial, reduced, described: (4690, 650) (4690, 632) (394, 5048)
2022-08-10 08:27:43.274398
Processing TCGA-QH-A6X8-01Z-00
Shape initia

In [7]:
# Print the current timestamp; compared with the earlier timestamps it shows the elapsed time.
print(datetime.now())


2022-08-10 10:07:45.461140
