# Cell Profiler - Per Patient
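Roll up per-nucleus CellProfiler measurements into summary statistics per patch (one row per `ImageNumber`), processing one patient at a time and saving one CSV per patient.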

In [1]:
import numpy as np
import pandas as pd
import os
import csv
from datetime import datetime
# Print the current time when this cell runs, so the output shows when the notebook run started.
print(datetime.now())

2022-08-09 13:10:29.374325


In [2]:
# Root directory; every entry beneath it whose name starts with 'TCGA-' is treated as one patient.
# Must end with '/' because file paths are built by plain string concatenation.
BASE_PATH='/home/jrm/Adjeroh/Glioma/August_Run/CellProfilerPerPatient/'
# Name of the per-patient input CSV read by the processing loop.
# presumably one row per nucleus, as exported by CellProfiler — TODO confirm
NUCLEUS_INPUT='Process100_Nucleus.csv'
# Name of the per-patient output CSV written by save_patient().
# NOTE(review): the name says "ToPatient", but rollup() groups by IMAGE_COL (patch level) — confirm intent
NUCLEUS_OUTPUT='NucleusRollupToPatient.csv'
# Column used as the group-by key in rollup(); it ends up as the index of the saved CSV.
IMAGE_COL='ImageNumber'

In [3]:
def save_patient(patient,df):
    """Write a patient's table to NUCLEUS_OUTPUT inside that patient's directory.

    patient: name of the patient directory under BASE_PATH; None means do nothing.
    df: DataFrame to write; its index is written as the first CSV column.
    """
    if patient is None:
        return
    out_path = BASE_PATH + patient + '/' + NUCLEUS_OUTPUT
    # Keep the index in the file (never pass index=False): after the rollup,
    # ImageNumber lives in the index and would otherwise be lost.
    df.to_csv(out_path)

In [4]:
def drop_columns(df):
    """Return a copy of df without bookkeeping and nucleus-location columns.

    Removed when present:
      * 'Children_Cells_Count', 'Number_Object_Number', 'ObjectNumber'
      * any column starting with 'Location_', 'AreaShape_BoundingBoxM'
        or 'AreaShape_Center' (nucleus position relative to the patch,
        which is irrelevant here)

    ImageNumber is deliberately kept: the patch-level rollup needs it.
    The input frame is not modified.
    """
    bookkeeping_names = ('Children_Cells_Count', 'Number_Object_Number', 'ObjectNumber')
    location_prefixes = ('Location_', 'AreaShape_BoundingBoxM', 'AreaShape_Center')
    bookkeeping = [name for name in df.columns if name in bookkeeping_names]
    positional = [name for name in df.columns if name.startswith(location_prefixes)]
    return df.drop(columns=bookkeeping).drop(columns=positional)

In [5]:
def rollup(df):
    """Summarize the numeric columns of df per IMAGE_COL group.

    Groups rows by IMAGE_COL and runs describe() on the numeric columns,
    then flattens the resulting two-level column labels into single
    strings joined with '_'. The group key becomes the index of the result.
    """
    per_image = df.groupby([IMAGE_COL])
    stats = per_image.describe(include=[np.number])  ## this is slow: 3min/patient
    # Flat string column names help the downstream random forest code.
    stats.columns = stats.columns.map('_'.join)
    return stats

In [6]:
# Process every patient directory under BASE_PATH: read the nucleus table,
# drop unwanted columns, roll up per image, and save the result next to the input.
print(datetime.now())
for patient in os.listdir(BASE_PATH):
    if not patient.startswith('TCGA-'):
        continue  # only TCGA-* entries are treated as patient directories
    print(datetime.now())
    print('Processing',patient)
    in_path = f'{BASE_PATH}{patient}/{NUCLEUS_INPUT}'
    frame = pd.read_csv(in_path)
    shapes = [frame.shape]
    # Rebind one name through the stages so earlier frames can be released.
    for stage in (drop_columns, rollup):
        frame = stage(frame)
        shapes.append(frame.shape)
    print('Shape initial, reduced, described:',*shapes)
    save_patient(patient,frame)

2022-08-09 13:10:29.391280
2022-08-09 13:10:29.391889
Processing TCGA-HT-7483-01Z-00
Shape initial, reduced, described: (26625, 650) (26625, 632) (1267, 5048)
2022-08-09 13:20:13.024580
Processing TCGA-02-0025-01Z-00
Shape initial, reduced, described: (18540, 650) (18540, 632) (790, 5048)
2022-08-09 13:26:13.501183
Processing TCGA-DH-A66B-01Z-00
Shape initial, reduced, described: (7156, 650) (7156, 632) (391, 5048)
2022-08-09 13:29:11.442766
Processing TCGA-08-0517-01Z-00
Shape initial, reduced, described: (11552, 650) (11552, 632) (392, 5048)
2022-08-09 13:32:10.063443
Processing TCGA-26-5139-01Z-00
Shape initial, reduced, described: (8843, 650) (8843, 632) (402, 5048)
2022-08-09 13:35:05.072403
Processing TCGA-S9-A6WE-01Z-00
Shape initial, reduced, described: (2800, 650) (2800, 632) (405, 5048)
2022-08-09 13:37:59.997335
Processing TCGA-S9-A6U1-01Z-00
Shape initial, reduced, described: (2956, 650) (2956, 632) (399, 5048)
2022-08-09 13:40:52.549214
Processing TCGA-DH-A66G-01Z-00
Shape

KeyboardInterrupt: 

In [None]:
# Print the current time when this cell runs, marking the end of the notebook run.
print(datetime.now())
