# CSV filter
Assume one CSV per patient (output from CSV_prep notebook).
Write a slimmed down CSV with fewer columns (e.g. remove strings)
and fewer rows (e.g. remove high-RBC patches).
The slimmed CSVs are a better substrate for learning.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Print a wall-clock timestamp near the start of the run; the last cell of
# the notebook prints another one, so the two can be compared by eye.
from datetime import datetime
print(datetime.now())

2022-07-15 08:45:17.850104


In [3]:
# Root of the CellProfiler run; the processing loop expects one
# sub-directory per patient underneath it.
BASE_PATH = '/home/jrm/Martinez/CellProfilerRuns/CP_20220705/'

# Each pair is (CSV to read, CSV to write). Keeping the two names side by
# side guarantees the input and output lists stay index-aligned.
_FILENAME_PAIRS = (
    ('Process100_Image.csv', 'Filtered_Image.csv'),
    ('Process100_Cells.csv', 'Filtered_Cells.csv'),
    ('Process100_ExpandCells.csv', 'Filtered_ExpandCells.csv'),
    ('Process100_Experiment.csv', 'Filtered_Experiment.csv'),
    ('Process100_MergeRBC.csv', 'Filtered_MergeRBC.csv'),
    ('Process100_Nucleus.csv', 'Filtered_Nucleus.csv'),
    ('Process100_RBC.csv', 'Filtered_RBC.csv'),
    ('Process100_ShrinkRBC.csv', 'Filtered_ShrinkRBC.csv'),
    ('Process100_Tissue.csv', 'Filtered_Tissue.csv'),
)
INPUT_FILENAMES = [pair[0] for pair in _FILENAME_PAIRS]
OUTPUT_FILENAMES = [pair[1] for pair in _FILENAME_PAIRS]

In [4]:
# Patient identifiers; each one is also the name of that patient's
# sub-directory under BASE_PATH.
# NOTE(review): Ypos / Yneg presumably denote the positive and negative
# class labels -- confirm against the notebook that trains on these CSVs.
DF_Ypos = [
    'B7_', 'B15', 'D1_', 'D5_', 'E7_', 'E9_',
    'F9_', 'G3_', 'H13', 'I1_', 'I5_', 'I13',
]
DF_Yneg = [
    'A3_', 'A5_', 'B13', 'C1_', 'C11', 'D3_', 'E5_', 'F3_',
    'F7_', 'F11', 'F15', 'G15', 'H1_', 'H3_', 'H7_', 'H15',
]
# Every patient, Ypos group first, then Yneg.
ALL_PATIENTS = [*DF_Ypos, *DF_Yneg]

In [5]:
def drop_cols(df,cols):
    """Return df without the columns named in cols.

    When cols is empty the very same df object is handed back untouched;
    otherwise the result is whatever DataFrame.drop(columns=cols) returns.
    """
    nothing_to_drop = len(cols)==0
    if nothing_to_drop:
        return df
    return df.drop(columns=cols)

In [6]:
def drop_bad_cols(df):
    """Return df with the columns that must not reach the learner removed.

    Columns are removed either by name pattern or from a fixed list:
      * 'ExecutionTime_*'          -- timing stats give away classes if
                                      classes were processed differently.
      * 'AreaOccupied_TotalArea_*' -- 100% uniform (total area of the patch).
      * '*_Tumor'                  -- mostly empty except for FileName_Tumor.
      * a fixed list of bookkeeping / filename / directory columns that can
        give away the class.
    """
    unwanted_prefixes = ('ExecutionTime_', 'AreaOccupied_TotalArea_')
    # One pass over the header collects every pattern-matched column.
    by_pattern = [
        name for name in df.columns
        if name.startswith(unwanted_prefixes) or name.endswith('_Tumor')]
    df = drop_cols(df,by_pattern)
    # These are dropped unconditionally, so each one must exist in df.
    by_name = [
        'Group_Index','Group_Number','ImageSet_ImageSet',
        'ProcessingStatus','Patient','Metadata_FileLocation']
    return drop_cols(df,by_name)

In [7]:
# By removing bad patches, we can improve accuracy at patch level.
# Assume we would then aggregate patch predictions to WSI level.
# The default thresholds were chosen by looking for elbows in these histograms:
# pd.DataFrame.hist(df,column='AreaOccupied_AreaOccupied_MergeRBC')
# pd.DataFrame.hist(df,column='AreaOccupied_AreaOccupied_Tissue')
def drop_bad_rows(df, min_tissue_area=10000, max_rbc_area=6000):
    """Return only the rows (patches) with enough tissue and not too much RBC.

    Parameters:
        df: DataFrame with the columns 'AreaOccupied_AreaOccupied_Tissue'
            and 'AreaOccupied_AreaOccupied_MergeRBC'.
        min_tissue_area: a row is kept only if its tissue area is strictly
            greater than this value.
        max_rbc_area: a row is kept only if its merged-RBC area is strictly
            less than this value.

    Returns:
        A new DataFrame holding the surviving rows; their original index
        labels are preserved. Rows where either value is NaN are dropped,
        because comparisons against NaN are False.
    """
    enough_tissue = df['AreaOccupied_AreaOccupied_Tissue'] > min_tissue_area
    little_rbc = df['AreaOccupied_AreaOccupied_MergeRBC'] < max_rbc_area
    return df[enough_tissue & little_rbc]

In [8]:
def drop_bad_vals(df):
    """Replace NaN with 0 and verify that df is purely finite and numeric.

    NaN is expected in some columns, e.g. mean_RBC_diameter where #RBC=0,
    so missing values are filled with 0 rather than treated as an error.

    Parameters:
        df: DataFrame to clean. It is not modified; a filled copy is returned.

    Returns:
        A new DataFrame equal to df with every NaN replaced by 0.

    Raises:
        ValueError: if any object-dtype (e.g. string) column is present, or
            if any value is +/-infinity. The message is
            'Non-numeric values in df'.
    """
    # fillna without inplace: the caller's frame may be a filtered slice of
    # another frame, and mutating it in place is both surprising and a
    # source of pandas SettingWithCopy warnings.
    df = df.fillna(0)
    # Check for object columns BEFORE calling np.isinf: np.isinf raises an
    # unrelated TypeError on object data, which would mask this diagnosis.
    if len(df.select_dtypes(include='object').columns)>0:
        raise ValueError('Non-numeric values in df')
    # fillna does not touch infinities, so they must be checked separately.
    inf = np.isinf(df).values.sum()
    if inf>0:
        raise ValueError('Non-numeric values in df')
    return df

In [9]:
# So far, we only process Image.csv, i.e. the first entry of each list.
infile = INPUT_FILENAMES[0]
outfile = OUTPUT_FILENAMES[0]

for directory in ALL_PATIENTS:
    # Each patient has its own sub-directory under BASE_PATH.
    # The names in_path/out_path avoid shadowing the builtin input(), which
    # would otherwise stay broken for the rest of the kernel session.
    in_path = os.path.join(BASE_PATH, directory, infile)
    out_path = os.path.join(BASE_PATH, directory, outfile)
    print('Reading from',in_path)
    df = pd.read_csv(in_path)
    shape1 = df.shape
    # Order matters only in that the row filter needs the AreaOccupied_*
    # columns, which the column filter leaves in place.
    df = drop_bad_cols(df)
    df = drop_bad_rows(df)
    df = drop_bad_vals(df)
    shape2 = df.shape
    print('Shape change from',shape1,'to',shape2)
    # NOTE(review): to_csv writes the row index as an unnamed first column.
    # After row filtering that index is the original patch row number;
    # confirm that downstream readers expect it (e.g. index_col=0).
    df.to_csv(out_path)

Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/B7_/Process100_Image.csv
Shape change from (283, 5350) to (278, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/B15/Process100_Image.csv
Shape change from (280, 5350) to (265, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/D1_/Process100_Image.csv
Shape change from (298, 5350) to (286, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/D5_/Process100_Image.csv
Shape change from (270, 5350) to (252, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/E7_/Process100_Image.csv
Shape change from (222, 5350) to (203, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/E9_/Process100_Image.csv
Shape change from (310, 5350) to (293, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/F9_/Process100_Image.csv
Shape change from (318, 5350) to (309, 5304)
Reading from /home/jrm/Martinez/CellProfilerRuns/CP_20220705/G3_/Process100_Image.c

In [10]:
# Print a wall-clock timestamp at the end of the run.
# datetime is already imported by an earlier cell; repeating the import is
# harmless and lets this cell run on its own.
from datetime import datetime
print(datetime.now())

2022-07-15 08:45:46.598849
