# Process CellProfiler Image.csv
Redistribute Image.csv rows
from one file per cancer class
to one file per patient.

The data were clumped by cancer class (all of class 0 in one Image.csv file) 
merely because they were processed in that order.

Note we have not addressed splitting up object files like the Nucleus.csv files.
Note there may be 2 or more WSI per case.
We intend to predict cancer class per patch, then roll up to WSI or case.

In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Print the current wall-clock time when this cell runs (marks the start of the run).
from datetime import datetime
print(datetime.now())

2022-07-18 16:51:10.133899


In [3]:
# Root directories: CellProfiler results are read from BASE_PATH_IN and the
# per-patient files are written under BASE_PATH_OUT.
BASE_PATH_IN = '/home/jrm/Adjeroh/Naved/CP_80K/'
BASE_PATH_OUT = '/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/'

# Input sub-directories, listed from Output5/ down to Output0/.
INPUT_DIRS = ['Output%d/' % n for n in range(5, -1, -1)]

# CSV files found in each input directory; the Image table comes first.
FILENAMES = [
    'Process100_%s.csv' % table
    for table in (
        'Image',
        'Cells',
        'ExpandCells',
        'Experiment',
        'MergeRBC',
        'Nucleus',
        'RBC',
        'ShrinkRBC',
        'Tissue',
    )
]

# Column names used throughout the notebook.
IMAGE_COL = 'ImageNumber'
TUMOR_COL = 'FileName_Tumor'   # use this column to disambiguate patients
PATIENT_COL = 'Patient'        # add this column to emphasize patient ID

In [4]:
def renumber_images(df):
    """Shift the image-number column in place so its smallest value becomes 1.

    Every value in ``df[IMAGE_COL]`` has the column minimum subtracted and 1
    added, so the spacing between image numbers is preserved. The column's
    min and max are printed before and after the shift.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``IMAGE_COL`` column. Modified in place; any
        index is accepted (the shift does not rely on row labels).

    Returns
    -------
    None
    """
    if len(df) == 0:
        # An empty column has no min/max; there is nothing to shift.
        print('No rows to renumber.')
        return
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Original ImageNumber min,max:',min_image_num,max_image_num)
    # One vectorized column operation replaces the per-row label lookups,
    # which were slow and required a 0..n-1 index.
    df[IMAGE_COL] = df[IMAGE_COL] - min_image_num + 1
    min_image_num = df[IMAGE_COL].min()
    max_image_num = df[IMAGE_COL].max()
    print('Revised ImageNumber min,max:',min_image_num,max_image_num)

In [5]:
def save(df,dirname,filename):
    """Write ``df`` as CSV to ``BASE_PATH_OUT/dirname/filename``.

    The target directory is created if needed, including any missing parent
    directories. If it already exists, a notice is printed and the file is
    written anyway, replacing any previous file of the same name.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to write; the index is not written.
    dirname : str
        Sub-directory of ``BASE_PATH_OUT`` to write into.
    filename : str
        Name of the CSV file inside that sub-directory.

    Returns
    -------
    None
    """
    outdir = os.path.join(BASE_PATH_OUT, dirname)
    try:
        # makedirs (unlike mkdir) also creates BASE_PATH_OUT itself when absent.
        os.makedirs(outdir)
    except FileExistsError:
        print('Directory',dirname,'already exists. Will overwrite files in there.')
    outfile = os.path.join(outdir, filename)
    df.to_csv(outfile,index=False)

In [6]:
def extract_patients(df,filename):
    """Split ``df`` by patient and save one renumbered CSV per patient.

    Patients are handled in order of first appearance in ``df[PATIENT_COL]``.
    For each one, the matching rows are copied into a frame with a fresh
    0-based index, passed to ``renumber_images``, and then written via
    ``save`` using the patient ID as the directory name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``PATIENT_COL`` column.
    filename : str
        File name handed to ``save`` for every patient.

    Returns
    -------
    None
    """
    patients = df[PATIENT_COL].unique()
    print('Patients to process:',patients)
    for patient_id in patients:
        print('Start patient',patient_id)
        belongs_to_patient = df[PATIENT_COL] == patient_id
        # Fresh 0..n-1 row labels for the per-patient frame.
        rows = df[belongs_to_patient].reset_index(drop=True)
        renumber_images(rows)
        save(rows,patient_id,filename)
        print('Saved patient',patient_id,'filename',filename,'shape:',rows.shape)

In [7]:
# Patch filename format: TCGA-06-0129-01Z-00-DX1_5400_5100.png
# The first 23 characters identify the WSI; the first 19 identify the
# patient (case).
LEN_CASE_ID = 19
LEN_WSI_ID = 23


def add_patient_column(df):
    """Add a patient-ID column derived from the tumor file name.

    The values of ``df[TUMOR_COL]`` are converted to strings and their first
    ``LEN_CASE_ID`` characters are stored in ``df[PATIENT_COL]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``TUMOR_COL`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, now carrying the ``PATIENT_COL`` column.
    """
    tumor_names = df[TUMOR_COL].astype(str)
    df[PATIENT_COL] = tumor_names.str.slice(stop=LEN_CASE_ID)
    return df

In [8]:
# TODO: come back to this!
# Only Image.csv (the first entry of FILENAMES) is processed so far.
# Nucleus.csv and the other object tables still need handling; their key is
# more complicated: ImageNumber+ObjectNumber.
filename = FILENAMES[0]

for indir in INPUT_DIRS:
    infile = ''.join((BASE_PATH_IN, indir, filename))
    print('Reading from',infile)
    df = pd.read_csv(infile)
    # Report the shape before the patient column is appended.
    print('Original dataframe shape:',df.shape)
    # add_patient_column modifies df in place and returns the same object.
    extract_patients(add_patient_column(df),filename)

Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output5/Process100_Image.csv
Original dataframe shape: (1591, 5346)
Patients to process: ['TCGA-DB-A4XG-01Z-00' 'TCGA-HT-A5R9-01Z-00' 'TCGA-QH-A6CU-01Z-00']
Start patient TCGA-DB-A4XG-01Z-00
Original ImageNumber min,max: 1 790
Revised ImageNumber min,max: 1 790
Saved patient TCGA-DB-A4XG-01Z-00 filename Process100_Image.csv shape: (790, 5347)
Start patient TCGA-HT-A5R9-01Z-00
Original ImageNumber min,max: 791 1186
Revised ImageNumber min,max: 1 396
Saved patient TCGA-HT-A5R9-01Z-00 filename Process100_Image.csv shape: (396, 5347)
Start patient TCGA-QH-A6CU-01Z-00
Original ImageNumber min,max: 1187 1591
Revised ImageNumber min,max: 1 405
Saved patient TCGA-QH-A6CU-01Z-00 filename Process100_Image.csv shape: (405, 5347)
Reading from /home/jrm/Adjeroh/Naved/CP_80K/Output4/Process100_Image.csv
Original dataframe shape: (3197, 5346)
Patients to process: ['TCGA-DB-A4XF-01Z-00' 'TCGA-DH-A66B-01Z-00' 'TCGA-HT-A5RA-01Z-00'
 'TCGA-HT-A618-01Z-00' 'TCGA

Saved patient TCGA-HT-7902-01Z-00 filename Process100_Image.csv shape: (778, 5347)
Start patient TCGA-QH-A6CZ-01Z-00
Original ImageNumber min,max: 11954 12347
Revised ImageNumber min,max: 1 394
Saved patient TCGA-QH-A6CZ-01Z-00 filename Process100_Image.csv shape: (394, 5347)
Start patient TCGA-QH-A6XA-01Z-00
Original ImageNumber min,max: 12348 12744
Revised ImageNumber min,max: 1 397
Saved patient TCGA-QH-A6XA-01Z-00 filename Process100_Image.csv shape: (397, 5347)
Start patient TCGA-S9-A6WI-01Z-00
Original ImageNumber min,max: 12745 13140
Revised ImageNumber min,max: 1 396
Saved patient TCGA-S9-A6WI-01Z-00 filename Process100_Image.csv shape: (396, 5347)
Start patient TCGA-S9-A7IQ-01Z-00
Original ImageNumber min,max: 13141 13541
Revised ImageNumber min,max: 1 401
Saved patient TCGA-S9-A7IQ-01Z-00 filename Process100_Image.csv shape: (401, 5347)
Start patient TCGA-S9-A7QY-01Z-00
Original ImageNumber min,max: 13542 13950
Revised ImageNumber min,max: 1 409
Saved patient TCGA-S9-A7QY-01Z

Saved patient TCGA-02-0439-01Z-00 filename Process100_Image.csv shape: (794, 5347)
Start patient TCGA-02-0446-01Z-00
Original ImageNumber min,max: 5169 5559
Revised ImageNumber min,max: 1 391
Saved patient TCGA-02-0446-01Z-00 filename Process100_Image.csv shape: (391, 5347)
Start patient TCGA-06-0125-01Z-00
Original ImageNumber min,max: 5560 6218
Revised ImageNumber min,max: 1 659
Saved patient TCGA-06-0125-01Z-00 filename Process100_Image.csv shape: (659, 5347)
Start patient TCGA-06-0129-01Z-00
Original ImageNumber min,max: 6219 7329
Revised ImageNumber min,max: 1 1111
Saved patient TCGA-06-0129-01Z-00 filename Process100_Image.csv shape: (1111, 5347)
Start patient TCGA-06-0185-01Z-00
Original ImageNumber min,max: 7330 7726
Revised ImageNumber min,max: 1 397
Saved patient TCGA-06-0185-01Z-00 filename Process100_Image.csv shape: (397, 5347)
Start patient TCGA-06-0189-01Z-00
Original ImageNumber min,max: 7727 9279
Revised ImageNumber min,max: 1 1553
Saved patient TCGA-06-0189-01Z-00 fil

In [9]:
# Print the wall-clock time again; comparing it with the timestamp printed
# near the top of the notebook gives the elapsed time of the run.
from datetime import datetime
print(datetime.now())

2022-07-18 16:57:14.250197


In [11]:
# Spot check: read one per-patient Image.csv back from the output directory
# and display it (presumably a file written by the processing loop above).
pd.read_csv('/home/jrm/Adjeroh/Naved/July_Run/CellProfilerOutputs/TCGA-CS-4943-01Z-00/Process100_Image.csv')

Unnamed: 0,AreaOccupied_AreaOccupied_ExpandCells,AreaOccupied_AreaOccupied_MergeRBC,AreaOccupied_AreaOccupied_Nucleus,AreaOccupied_AreaOccupied_Tissue,AreaOccupied_Perimeter_ExpandCells,AreaOccupied_Perimeter_MergeRBC,AreaOccupied_Perimeter_Nucleus,AreaOccupied_Perimeter_Tissue,AreaOccupied_TotalArea_ExpandCells,AreaOccupied_TotalArea_MergeRBC,...,Threshold_SumOfEntropies_Nucleus,Threshold_SumOfEntropies_RBC,Threshold_SumOfEntropies_Tissue,Threshold_WeightedVariance_Cells,Threshold_WeightedVariance_Nucleus,Threshold_WeightedVariance_RBC,Threshold_WeightedVariance_Tissue,URL_Tumor,Width_Tumor,Patient
0,90000.0,0.0,13388.0,90000.0,9006.0,0.0,2408.0,1196.0,90000.0,90000.0,...,-12.536856,0.000000,0.000000,0.334282,0.203251,0.129709,0.129709,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
1,90000.0,0.0,10596.0,89788.0,8416.0,0.0,2006.0,1254.0,90000.0,90000.0,...,-12.511983,0.000000,-11.725671,0.211245,0.227444,0.183138,0.172522,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
2,90000.0,0.0,10717.0,89757.0,8528.0,0.0,2038.0,1292.0,90000.0,90000.0,...,-12.662106,0.000000,-13.508905,0.241006,0.284926,0.234737,0.221361,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
3,90000.0,0.0,7939.0,87325.0,6168.0,0.0,1468.0,1282.0,90000.0,90000.0,...,-12.687723,0.000000,-13.289617,0.250596,0.352307,0.359925,0.245922,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
4,90000.0,0.0,6727.0,84594.0,5486.0,0.0,1248.0,2241.0,90000.0,90000.0,...,-12.546795,0.000000,-13.816005,0.370128,0.541765,0.484045,0.314457,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,90000.0,0.0,6868.0,83355.0,5074.0,0.0,1182.0,2050.0,90000.0,90000.0,...,-12.790515,0.000000,-13.276381,0.309078,0.505641,0.538051,0.251302,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
397,90000.0,0.0,4643.0,34121.0,4543.0,0.0,821.0,1533.0,90000.0,90000.0,...,-12.459374,0.000000,-12.515993,0.690531,0.821835,1.719831,0.574727,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
398,90000.0,0.0,6916.0,46545.0,4700.0,0.0,1160.0,1350.0,90000.0,90000.0,...,-12.801891,0.000000,-12.285753,0.986854,0.999417,3.139283,0.896096,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
399,90000.0,0.0,9833.0,90000.0,5862.0,0.0,1464.0,2386.0,90000.0,90000.0,...,-12.144768,-7.512157,0.000000,0.090304,0.221089,0.066730,0.066737,file:///largeDataVolume/LeHou_GBMLGG_300x300_a...,300,TCGA-CS-4943-01Z-00
