# Process CellProfiler Image.csv
Here, we filter the Image.csv files so that they have:

* fewer columns (e.g. remove strings)
* fewer rows (e.g. remove high-RBC patches).

Assume there is one Image.csv file per patient (output from previous notebook).

Our hypothesis is that the slimmed CSVs are a better substrate for learning.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
from datetime import datetime
# Print the current wall-clock time as a start-of-run timestamp.
print(datetime.now())

2022-07-19 14:21:51.246047


In [3]:
# Input root: contains one sub-directory of CellProfiler CSV output per patient.
BASE_PATH_IN = '/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/'

# Output root: the filtered CSVs are written here, one sub-directory per patient.
BASE_PATH_OUT = '/home/jrm/Adjeroh/Naved/July_Run/CellProfilerFiltered/'

# CSV file names found in each patient directory.
# Only the first entry (Image.csv) is processed by this notebook.
FILENAMES = [
    'Process100_Image.csv',
    'Process100_Cells.csv',
    'Process100_ExpandCells.csv',
    'Process100_Experiment.csv',
    'Process100_MergeRBC.csv',
    'Process100_Nucleus.csv',
    'Process100_RBC.csv',
    'Process100_ShrinkRBC.csv',
    'Process100_Tissue.csv',
]

In [4]:
# One entry per patient: the names of the sub-directories of BASE_PATH_IN.
# - sorted(): os.listdir returns entries in arbitrary order, so sorting makes
#   the processing order (and the printed log) reproducible between runs.
# - isdir filter: every entry is later used as a directory name, so stray
#   plain files in BASE_PATH_IN are skipped instead of breaking the loop.
ALL_PATIENTS = sorted(
    entry for entry in os.listdir(BASE_PATH_IN)
    if os.path.isdir(os.path.join(BASE_PATH_IN, entry)))

In [5]:
def drop_cols(df,cols):
    """Return df without the columns named in cols.

    An empty cols returns df itself, untouched. Otherwise the result is
    the new frame produced by DataFrame.drop, which raises KeyError if a
    requested column is absent.
    """
    if len(cols)==0:
        return df
    return df.drop(columns=cols)

In [6]:
def drop_bad_cols(df):
    """Return df without columns that are uninformative or could leak the class.

    Columns are removed in successive passes through drop_cols: three
    passes select columns by name prefix/suffix, and a final pass removes
    a fixed list of bookkeeping columns.
    """
    name_tests = (
        # Timing stats give away classes if classes were processed differently.
        lambda name: name.startswith('ExecutionTime_'),
        # These columns are 100% uniform -- total area of the patch.
        lambda name: name.startswith('AreaOccupied_TotalArea_'),
        # These columns are mostly empty except for FileName_Tumor.
        lambda name: name.endswith('_Tumor'),
    )
    for is_bad in name_tests:
        df = drop_cols(df, [name for name in df.columns if is_bad(name)])
    # String columns with filenames or directory names can give away the class.
    # Patient was added by us in the previous notebook.
    # Oddly, we don't have this column: 'ProcessingStatus'
    leaky_cols = [
        'Group_Index','Group_Number','ImageSet_ImageSet',
        'Patient','Metadata_FileLocation']
    return drop_cols(df, leaky_cols)

In [7]:
# By removing bad patches, we can improve accuracy at patch level.
# Assume we would then aggregate patch predictions to WSI level.
# Thresholds were chosen by looking for elbows in these histograms:
# pd.DataFrame.hist(df,column='AreaOccupied_AreaOccupied_MergeRBC')
# pd.DataFrame.hist(df,column='AreaOccupied_AreaOccupied_Tissue')
# See notebook Data 03
def drop_bad_rows(df, max_rbc_area=5000):
    """Keep only the rows (patches) whose merged-RBC area is below a threshold.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column 'AreaOccupied_AreaOccupied_MergeRBC'.
    max_rbc_area : number, default 5000
        Exclusive upper bound on 'AreaOccupied_AreaOccupied_MergeRBC'.
        Rows at or above this value are dropped. The default is the
        threshold previously hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The rows of df that pass the filter; df itself is not modified.

    Raises
    ------
    KeyError
        If the merged-RBC area column is missing from df.
    """
    # This filter would remove entirely one WSI. See odd case below.
    # df = df[df.AreaOccupied_AreaOccupied_Tissue>30000]
    rbc_area = df['AreaOccupied_AreaOccupied_MergeRBC']
    return df[rbc_area < max_rbc_area]

In [8]:
def drop_bad_vals(df):
    """Return a copy of df with NaN replaced by 0, after validating its contents.

    The input frame is not modified. The previous version filled NaN in
    place, which altered the caller's frame and, when that frame was a
    filtered slice, could trigger pandas' SettingWithCopyWarning.

    Parameters
    ----------
    df : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame in which every NaN has been replaced by 0.

    Raises
    ------
    ValueError
        With the message 'Non-numeric values in df' if the frame holds any
        column that is neither numeric nor boolean, or any infinite value.
        ValueError is a subclass of Exception, so existing
        ``except Exception`` handlers still catch it.
    """
    # Expect nan for mean_RBC_diameter where #RBC=0
    df = df.fillna(0)
    # Check column types BEFORE calling np.isinf: np.isinf raises TypeError on
    # string/object data, which used to pre-empt the intended error below.
    non_numeric = df.select_dtypes(exclude=['number', 'bool'])
    if len(non_numeric.columns) > 0:
        raise ValueError('Non-numeric values in df')
    # Defensive re-check: nothing should be missing after fillna.
    nan = df.isna().sum().sum()
    inf = np.isinf(df).values.sum()
    if nan > 0 or inf > 0:
        raise ValueError('Non-numeric values in df')
    return df

In [9]:
def save_file(df,directory,filename):
    """Write df as a CSV file to BASE_PATH_OUT/<directory>/<filename>.

    The output directory is created when missing, including BASE_PATH_OUT
    itself on a first run. If the directory already exists, a notice is
    printed and any file with the same name is overwritten.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save; the index is not written.
    directory : str
        Sub-directory of BASE_PATH_OUT (here: the patient directory name).
    filename : str
        Name of the CSV file to write inside that sub-directory.
    """
    out_dir = os.path.join(BASE_PATH_OUT, directory)
    try:
        # makedirs, unlike mkdir, also creates missing parent directories.
        os.makedirs(out_dir)
    except FileExistsError:
        print('Directory',directory,'already exists. Will overwrite files in there.')
    output = os.path.join(out_dir, filename)
    df.to_csv(output,index=False)

In [10]:
# Filter the Image.csv of every patient and save the slimmed copy.
infile = FILENAMES[0]  # So far, we only process Image.csv
print(datetime.now())
for directory in ALL_PATIENTS:
    # Named in_path rather than `input`: rebinding `input` at notebook level
    # would shadow the builtin input() for the rest of the kernel session.
    in_path = os.path.join(BASE_PATH_IN, directory, infile)
    print('Reading from',in_path)
    df = pd.read_csv(in_path)
    shape1 = df.shape  # (rows, columns) before filtering
    df = drop_bad_cols(df)
    df = drop_bad_rows(df)
    df = drop_bad_vals(df)
    shape2 = df.shape  # (rows, columns) after filtering
    print('Shape change from',shape1,'to',shape2)
    save_file(df,directory,infile)

Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-HT-7483-01Z-00/Process100_Image.csv
Shape change from (1267, 5347) to (829, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-02-0025-01Z-00/Process100_Image.csv
Shape change from (797, 5347) to (698, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-DH-A66B-01Z-00/Process100_Image.csv
Shape change from (391, 5347) to (389, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-08-0517-01Z-00/Process100_Image.csv
Shape change from (393, 5347) to (368, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-26-5139-01Z-00/Process100_Image.csv
Shape change from (402, 5347) to (391, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-S9-A6WE-01Z-00/Process100_Image.csv
Shape change from (405, 5347) to (405, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-15-1446-01Z-00/P

Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-DU-7015-01Z-00/Process100_Image.csv
Shape change from (381, 5347) to (381, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-HT-7873-01Z-00/Process100_Image.csv
Shape change from (861, 5347) to (769, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-26-5134-01Z-00/Process100_Image.csv
Shape change from (398, 5347) to (389, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-HT-8104-01Z-00/Process100_Image.csv
Shape change from (399, 5347) to (393, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-HT-A617-01Z-00/Process100_Image.csv
Shape change from (400, 5347) to (394, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-S9-A6WL-01Z-00/Process100_Image.csv
Shape change from (388, 5347) to (386, 5302)
Reading from /home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-HT-7881-01Z-00/Pr

In [11]:
# Print the current wall-clock time now that the processing loop above has finished.
print(datetime.now())

2022-07-19 14:27:06.220264


In [12]:
# Load the unfiltered Image.csv of one patient for a closer look.
# The path is built from BASE_PATH_IN and FILENAMES rather than repeating the
# absolute path, so it follows any change made in the configuration cell.
df = pd.read_csv(os.path.join(BASE_PATH_IN, 'TCGA-HW-7495-01Z-00', FILENAMES[0]))
df.shape

(396, 5347)

In [13]:
# Print the name of every column that starts with 'Area'.
for c in filter(lambda name: name.startswith('Area'), df.columns):
    print(c)

AreaOccupied_AreaOccupied_ExpandCells
AreaOccupied_AreaOccupied_MergeRBC
AreaOccupied_AreaOccupied_Nucleus
AreaOccupied_AreaOccupied_Tissue
AreaOccupied_Perimeter_ExpandCells
AreaOccupied_Perimeter_MergeRBC
AreaOccupied_Perimeter_Nucleus
AreaOccupied_Perimeter_Tissue
AreaOccupied_TotalArea_ExpandCells
AreaOccupied_TotalArea_MergeRBC
AreaOccupied_TotalArea_Nucleus
AreaOccupied_TotalArea_Tissue


We seem to have one WSI with very low tissue area, even though the WSI looks fine at 
[TCGA](https://portal.gdc.cancer.gov/files/6d237796-dfd7-4a00-b44d-78ea26ec3a49).

In [14]:
# Summary statistics of the nucleus, expanded-cell and tissue area columns.
col = df.loc[:, [
    'AreaOccupied_AreaOccupied_Nucleus',
    'AreaOccupied_AreaOccupied_ExpandCells',
    'AreaOccupied_AreaOccupied_Tissue',
]]
col.describe()

Unnamed: 0,AreaOccupied_AreaOccupied_Nucleus,AreaOccupied_AreaOccupied_ExpandCells,AreaOccupied_AreaOccupied_Tissue
count,396.0,396.0,396.0
mean,5169.277778,89998.934343,1052.719697
std,1993.628727,21.206298,2824.147793
min,317.0,89578.0,0.0
25%,3876.0,90000.0,0.0
50%,5159.0,90000.0,52.0
75%,6418.75,90000.0,779.75
max,12466.0,90000.0,24920.0


For comparison, here is another WSI.

In [15]:
# Load the unfiltered Image.csv of a second patient for comparison.
# The path is built from BASE_PATH_IN and FILENAMES rather than repeating the
# absolute path, so it follows any change made in the configuration cell.
df = pd.read_csv(os.path.join(BASE_PATH_IN, 'TCGA-HT-7482-01Z-00', FILENAMES[0]))
df.shape

(3480, 5347)

In [16]:
# Summary statistics of the nucleus, expanded-cell and tissue area columns.
col = df.loc[:, [
    'AreaOccupied_AreaOccupied_Nucleus',
    'AreaOccupied_AreaOccupied_ExpandCells',
    'AreaOccupied_AreaOccupied_Tissue',
]]
col.describe()

Unnamed: 0,AreaOccupied_AreaOccupied_Nucleus,AreaOccupied_AreaOccupied_ExpandCells,AreaOccupied_AreaOccupied_Tissue
count,3480.0,3480.0,3480.0
mean,9421.950862,89969.141667,73694.986782
std,10096.498242,1529.51126,26367.658635
min,0.0,0.0,0.0
25%,3983.75,90000.0,63920.5
50%,6374.5,90000.0,89927.0
75%,9906.25,90000.0,90000.0
max,71679.0,90000.0,90000.0
