# Quantifying cell counts from Cytation
Applying `py-seg` to Cytation 5 data generated in the HTS (VAPR) core by Clayton Wandishin. A single 384-well plate was imaged multiple times in 2 channels (red for nuclei and green (Sytox) for dead cells). Still needed from Clayton: a plate map of cell line(s), drugs and drug concentrations.

Steps needed to perform processing and assemble data:

* Identify all image files (saved on vu1file quaranta2 share)
* Parse file names to determine time point, channel, well, and position
* Assemble task arguments for `py-seg` processing
* Send jobs to RabbitMQ/Celery for processing
* Collect cell counts per time point (similar to `plate.id` from ImageXpress HTS core output)

In [1]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, date

In [6]:
# Top-level directory holding this experiment's Cytation images; every path
# built in this notebook (task file, Segmentation output, count file) hangs off it.
TOPDIR = '/mnt/darren/quaranta2/Cytation/2020-10-31'

def parseFileName(filename):
    """Split a Cytation image file name into its descriptive fields.

    Any directory part is dropped, everything from the first "." onwards is
    discarded, and the remainder is split on "_".  For a name such as
    ``B10_04_1_1_RFP_001.tif`` the fields at positions 0, 3, 4 and 5 are the
    well, position number, channel name and time index.

    Returns a list of strings ordered ``[well, ch, pos, time_i]``.
    Raises IndexError when the name has fewer than six "_"-separated fields.
    """
    stem = os.path.basename(filename).split(".")[0]
    fields = stem.split("_")
    well, pos, ch, time_i = (fields[i] for i in (0, 3, 4, 5))
    return [well, ch, pos, time_i]

# [x+1 if x >= 45 else x+5 for x in l]

def fixWellName(well_name):
    """Zero-pad well names so the column number always has two digits.

    A two-character name such as ``'B2'`` becomes ``'B02'``; names that are
    already three or more characters long (``'B10'``) are returned as they are.

    Parameters
    ----------
    well_name : str or list of str
        A single well name, or a list of well names.

    Returns
    -------
    str or list of str
        The padded name(s), in the same form as the input.  Any other input
        type is returned unchanged.
    """
    def _pad(wn):
        # Insert a "0" between the row letter and a single-digit column number
        return f'{wn[0]}0{wn[1]}' if len(wn) < 3 else wn

    if isinstance(well_name, list):
        return [_pad(wn) for wn in well_name]
    if isinstance(well_name, str):
        # Previously a name that needed no padding fell through and gave None
        return _pad(well_name)
    # Previously this value was evaluated but never returned
    return well_name

def getDateTime(filepath):
    """Extract the acquisition timestamp embedded in each file path.

    Each path is searched for the first ``YYMMDD_HHMMSS`` stamp (six digits,
    underscore, six digits), e.g. ``201031_183840`` in
    ``.../201031_183840_Experiment1/...``.

    Parameters
    ----------
    filepath : iterable of str
        File paths containing a ``YYMMDD_HHMMSS`` stamp.

    Returns
    -------
    list of str
        One ``"YYYY-MM-DD HH:MM:SS"`` string per input path, in input order.

    Raises
    ------
    TypeError
        If a path contains no timestamp (the failed search yields None).
    """
    stamp_pat = re.compile(r"\d{6}_\d{6}")
    times = []
    for path in filepath:
        stamp = stamp_pat.search(path)[0]
        # The format must stop at %S: with a trailing %f the parser backtracks
        # and hands the last digit of the seconds to the microsecond field,
        # so "183840" would be read as 18:38:04 instead of 18:38:40.
        acquired = datetime.strptime(stamp, '%y%m%d_%H%M%S')
        times.append(acquired.strftime("%Y-%m-%d %H:%M:%S"))
    return times

def getTimeIdx(filepath):
    """Extract the time point index from the ``Experiment<N>`` part of each path.

    Parameters
    ----------
    filepath : iterable of str
        File paths containing ``Experiment`` followed by one or more digits.

    Returns
    -------
    list of int
        The number following the first ``Experiment`` in each path, in input
        order.

    Raises
    ------
    TypeError
        If a path contains no ``Experiment<N>`` part (the failed search
        yields None).
    """
    # Capture the digits directly instead of using str.strip("Experiment"),
    # which removes a *set of characters* rather than a prefix.  Accepting any
    # number of digits avoids truncating indices of 100 or more.
    idx_pat = re.compile(r"Experiment(\d+)")
    return [int(idx_pat.search(path)[1]) for path in filepath]

#### Find all image files

In [7]:
os.chdir(TOPDIR)

# Recursively collect the full path of every file (fn) and every
# directory (dn) found below TOPDIR.
fn = []
dn = []
for (dirpath, dirnames, filenames) in os.walk(TOPDIR):
    fn.extend(os.path.join(dirpath, f) for f in filenames)
    dn.extend(os.path.join(dirpath, d) for d in dirnames)

# Keep only the raw image files.  This is done by exclusion, not by checking
# the extension: any path containing one of these substrings is dropped
#   .DS_Store     hidden macOS metadata files, if present
#   Segmentation  anything in (or named after) the segmentation output
#   csv           task-argument / cell-count tables
excluded_tags = (".DS_Store", "Segmentation", "csv")
fn = [f for f in fn if not any(tag in f for tag in excluded_tags)]

print(f"{len(fn)} files were found.")
print(f"{len(dn)} directories were found")

# Sanity check that the collected names are usable as full paths.
# Guard the empty case so a wrong or unmounted TOPDIR does not end in IndexError.
if not fn:
    print("No image files were found; check TOPDIR.")
elif os.path.isfile(fn[0]):
    print(f"The file {os.path.basename(fn[0])} has a complete path.")
else:
    print(f"The file {os.path.basename(fn[0])} does NOT have a complete path.")

13440 files were found.
43 directories were found
The file E7_02_1_1_RFP_001.tif has a complete path.


In [8]:
# Sort the paths in place (lexicographic order).  Later cells pair RFP and GFP
# files by row position, so a deterministic order matters.
fn.sort()
# Preview the first six paths
fn[:6]
# fn[10550:]

['/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B10_02_1_1_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B10_02_1_2_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B10_02_2_1_GFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B10_02_2_2_GFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B11_02_1_1_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-10-31/201031_183840_Experiment1/201031_183840_Plate 1/B11_02_1_2_RFP_001.tif']

#### Filename structure
Example filename: `B10_04_1_1_RFP_001.tif`  

* `B10` = well  
* `04` = unknown  
* `1` = channel number (`1` or `2` in these data)  
* `1` = position number (`1` or `2` in these data)  
* `RFP` = channel name (`RFP` or `GFP` in these data)  
* `001` = time point index (only `001` in these data; actual time point index in enclosing directory (2 up) `Experiment[0-9]{1,2}`)  
* `tif` = image file format (only `tif` in these data)  




In [9]:
# One row per image file: the fields parsed from its name plus its full path
parsed_rows = list(map(parseFileName, fn))
file_info = pd.DataFrame(parsed_rows)
file_info.columns = ['well', 'ch', 'pos', 'time_i']
file_info['file_name'] = fn

In [10]:
file_info.head()

Unnamed: 0,well,ch,pos,time_i,file_name
0,B10,RFP,1,1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...
1,B10,RFP,2,1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...
2,B10,GFP,1,1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...
3,B10,GFP,2,1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...
4,B11,RFP,1,1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...


In [11]:
# Full paths of the RFP and GFP images, each renumbered from 0 so the two
# Series can be combined row-by-row.
# NOTE(review): assumes every RFP image has a GFP partner at the same
# position in the sorted file list -- confirm no images are missing.
red = file_info['file_name'][file_info['ch'] == 'RFP'].reset_index(drop=True)
green = file_info['file_name'][file_info['ch'] == 'GFP'].reset_index(drop=True)

In [12]:
# Well name of each RFP image, passed through fixWellName and stored as a
# fresh Series indexed from 0.
wells = pd.Series(
    fixWellName(file_info['well'][file_info['ch'] == 'RFP'].tolist())
)

In [13]:
# Acquisition time and time point index of each RFP image, both parsed from
# the image's full path.
temp = pd.DataFrame({
    'image_time': getDateTime(file_info['file_name'][file_info['ch'] == 'RFP']),
    'time_i': getTimeIdx(file_info['file_name'][file_info['ch'] == 'RFP']),
})

In [14]:
# Assemble one row of task arguments per nuclear (RFP) image.
# The Series (green, red, temp['time_i'], wells) are aligned on their index;
# the scalar values are repeated on every row.
# 'overwrite' and 'regprops' are the strings 'TRUE'/'FALSE', not Python booleans.
# 'plate_id' carries the time point index parsed from the Experiment directory.
# NOTE(review): a row is later handed to processIm as a plain list, so the
# column order here presumably has to match its positional argument order --
# confirm before reordering.
taskargs = pd.DataFrame({
                        'ch2_im_path': green,
                        'nuc_im_path': red,
                        'overwrite': 'TRUE',
                        'plate_id': temp['time_i'],
                        'regprops': 'FALSE',
                        'save_path': os.path.join(TOPDIR,'Segmentation'),
                        'well': wells
})

In [15]:
taskargs.head()

Unnamed: 0,ch2_im_path,nuc_im_path,overwrite,plate_id,regprops,save_path,well
0,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B10
1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B10
2,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B11
3,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B11
4,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B12


#### Save Task Arguments to file
(Will not overwrite if file exists; must delete previous to write new file.)

In [16]:
# Date stamp that goes into the task-argument file name
today = date.today()
# (number of tasks, number of argument columns)
taskargs.shape

(6720, 7)

In [17]:
# Write the task arguments to a dated CSV file under TOPDIR.  An existing
# file is never overwritten; delete it first to regenerate.
argfilepath = os.path.join(TOPDIR, f'TaskArgs_{today}.csv')
if not os.path.isfile(argfilepath):
    taskargs.to_csv(argfilepath, index=False)
    print(f'Saving task file as {argfilepath}')
else:
    # Report the skip so a stale task file is not used unnoticed
    print(f'Task file {argfilepath} already exists; not overwriting')

Saving task file as /mnt/darren/quaranta2/Cytation/2020-10-31/TaskArgs_2020-11-02.csv


#### Examine some processing output

In [18]:
import sys
sys.path.append(r'/home/darren/git-repos/Segmentation-other/py-seg')

In [19]:
from MXtasksTempo import processIm
import cv2
import numpy as np
from pylab import imshow, gray

In [20]:
# Spot-check the processing on a single task before queueing everything:
# row 200 of taskargs is passed to processIm as a plain list of its values.
processIm(taskargs.loc[200].to_list())

Output worked ok

### Set up celery workers and send jobs to RabbitMQ
This is done via `ssh` to `tempo` in the `improc` Conda environment. Must also be in `~/git-repos/Segmentation-other/py-seg/`  

Must specify maximum concurrency when calling Celery worker.  

Then execute:  
`screen`  
`celery -A MXtasksTempo worker --concurrency=120`  
`Ctrl-A, D` (detach from the `screen` session)  


`python sendMXtempoJobs.py /mnt/darren/quaranta2/Cytation/2020-10-31/TaskArgs_2020-11-02.csv`  



In [21]:
# Directory searched for processing output; same path as the 'save_path'
# task argument.
SEGDIR = os.path.join(TOPDIR,"Segmentation")

In [22]:
# Walk SEGDIR and gather the full path of every file (ccfn) and directory (ccdn)
ccfn = []
ccdn = []
for (dirpath, dirnames, filenames) in os.walk(SEGDIR):
    ccdn.extend(os.path.join(dirpath, d) for d in dirnames)
    ccfn.extend(os.path.join(dirpath, f) for f in filenames)

# Keep only the cell-count tables, ignoring hidden macOS .DS_Store entries
ccfn = [f for f in ccfn if ".DS_Store" not in f and "cellcount.csv" in f]
# Directories: drop .DS_Store entries and put them in sorted order
ccdn = sorted(d for d in ccdn if ".DS_Store" not in d)



In [23]:
# Read every cell-count CSV and stack them into one table with a fresh
# 0..n-1 index.  pd.concat replaces the per-file DataFrame.append calls:
# append was removed in pandas 2.0 and re-copied the whole table each time.
# pd.concat raises on an empty list, so fall back to an empty DataFrame
# when no count files were found.
if ccfn:
    d = pd.concat([pd.read_csv(f) for f in ccfn], ignore_index=True)
else:
    d = pd.DataFrame()



In [24]:
d.head()

Unnamed: 0,file_name,cell_count,file_name_ch2,ch2_pos,plate_id,well
0,B13_02_1_2_RFP_001.tif,22,B13_02_2_2_GFP_001.tif,9,2,B13
1,H15_02_1_2_RFP_001.tif,16,H15_02_2_2_GFP_001.tif,8,2,H15
2,C15_02_1_2_RFP_001.tif,22,C15_02_2_2_GFP_001.tif,7,2,C15
3,K16_02_1_2_RFP_001.tif,26,K16_02_2_2_GFP_001.tif,11,2,K16
4,F21_02_1_2_RFP_001.tif,13,F21_02_2_2_GFP_001.tif,10,2,F21


In [42]:
# Order the counts by nuclear image file name, then plate_id, modifying d in
# place and renumbering its rows from 0.
d.sort_values(by=['file_name','plate_id'], inplace=True, ignore_index=True)

In [44]:
d.head(20)

Unnamed: 0,file_name,cell_count,file_name_ch2,ch2_pos,plate_id,well
0,B10_02_1_1_RFP_001.tif,60,B10_02_2_1_GFP_001.tif,28,4,B10
1,B10_02_1_1_RFP_001.tif,29,B10_02_2_1_GFP_001.tif,1,9,B10
2,B10_02_1_1_RFP_001.tif,66,B10_02_2_1_GFP_001.tif,42,10,B10
3,B10_02_1_1_RFP_001.tif,25,B10_02_2_1_GFP_001.tif,1,11,B10
4,B10_02_1_1_RFP_001.tif,74,B10_02_2_1_GFP_001.tif,40,12,B10
5,B10_02_1_1_RFP_001.tif,26,B10_02_2_1_GFP_001.tif,1,13,B10
6,B10_02_1_1_RFP_001.tif,71,B10_02_2_1_GFP_001.tif,49,14,B10
7,B10_02_1_2_RFP_001.tif,8,B10_02_2_2_GFP_001.tif,0,1,B10
8,B10_02_1_2_RFP_001.tif,31,B10_02_2_2_GFP_001.tif,19,2,B10
9,B10_02_1_2_RFP_001.tif,4,B10_02_2_2_GFP_001.tif,0,3,B10


Add back directory name to enable parsing of image acquisition time

In [27]:
# NOTE(review): this binds `temp` to the same DataFrame object as `taskargs`
# (no copy is made), so later in-place changes to `taskargs` also show up in
# `temp`.  Use `taskargs.copy()` if an independent snapshot is intended.
temp = taskargs

In [28]:
# Destination file for the combined cell-count table
count_fn = os.path.join(TOPDIR,"20201031_Cytation_data.csv")

Write file only if it doesn't exist already

In [29]:
# Save the combined cell counts, never overwriting an existing file.
if not os.path.isfile(count_fn):
    d.to_csv(count_fn, index=False)
    print(f'Saving cell counts as {count_fn}')
else:
    # Report the skip so an outdated count file does not go unnoticed
    print(f'Cell count file {count_fn} already exists; not overwriting')

In [30]:
d.shape

(3334, 6)

In [31]:
d.head()

Unnamed: 0,file_name,cell_count,file_name_ch2,ch2_pos,plate_id,well
0,B2_02_1_2_RFP_001.tif,3,B2_02_2_2_GFP_001.tif,0,1,B02
1,B3_02_1_2_RFP_001.tif,8,B3_02_2_2_GFP_001.tif,2,1,B03
2,B4_02_1_2_RFP_001.tif,15,B4_02_2_2_GFP_001.tif,0,1,B04
3,B5_02_1_2_RFP_001.tif,3,B5_02_2_2_GFP_001.tif,2,1,B05
4,B6_02_1_2_RFP_001.tif,8,B6_02_2_2_GFP_001.tif,2,1,B06


In [32]:
# Sort the tasks by plate_id (the time point index) and then well, in place,
# renumbering rows from 0.
# NOTE(review): `temp` was bound to this same DataFrame earlier, so it is
# reordered as well.
taskargs.sort_values(by=['plate_id','well'], inplace=True, ignore_index=True)

In [33]:
taskargs.head()

Unnamed: 0,ch2_im_path,nuc_im_path,overwrite,plate_id,regprops,save_path,well
0,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B02
1,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B02
2,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B03
3,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B03
4,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,/mnt/darren/quaranta2/Cytation/2020-10-31/2010...,True,1,False,/mnt/darren/quaranta2/Cytation/2020-10-31/Segm...,B04


In [38]:
# Base names of the first ten nuclear-image paths, to compare with d['file_name']
taskargs['nuc_im_path'].map(os.path.basename).to_list()[:10]

['B2_02_1_1_RFP_001.tif',
 'B2_02_1_2_RFP_001.tif',
 'B3_02_1_1_RFP_001.tif',
 'B3_02_1_2_RFP_001.tif',
 'B4_02_1_1_RFP_001.tif',
 'B4_02_1_2_RFP_001.tif',
 'B5_02_1_1_RFP_001.tif',
 'B5_02_1_2_RFP_001.tif',
 'B6_02_1_1_RFP_001.tif',
 'B6_02_1_2_RFP_001.tif']

In [39]:
d['file_name'].to_list()[:10]

['B2_02_1_2_RFP_001.tif',
 'B3_02_1_2_RFP_001.tif',
 'B4_02_1_2_RFP_001.tif',
 'B5_02_1_2_RFP_001.tif',
 'B6_02_1_2_RFP_001.tif',
 'B7_02_1_2_RFP_001.tif',
 'B8_02_1_2_RFP_001.tif',
 'B9_02_1_2_RFP_001.tif',
 'B10_02_1_2_RFP_001.tif',
 'B11_02_1_2_RFP_001.tif']

In [None]:
# Attach the full image paths to the cell counts by matching on the nuclear
# image base name and plate_id.  Plain column assignment would align the two
# tables on row number, but they differ in length and row order, so each
# count would be paired with an unrelated image path.
path_lookup = pd.DataFrame({
    'file_name': taskargs['nuc_im_path'].map(os.path.basename),
    'plate_id': taskargs['plate_id'],
    'full_path': taskargs['nuc_im_path'],
    'full_path_ch2': taskargs['ch2_im_path'],
})
# Drop any path columns left over from a previous run of this cell, then
# left-merge so every count row is kept (paths are NaN where no task matches).
# validate= raises if a (file_name, plate_id) key is not unique among the
# tasks, which would otherwise silently duplicate count rows.
# NOTE(review): assumes plate_id has the same (integer) dtype in both tables.
d = d.drop(columns=['full_path', 'full_path_ch2'], errors='ignore').merge(
    path_lookup,
    on=['file_name', 'plate_id'],
    how='left',
    validate='many_to_one',
)

In [None]:
d.head()

In [None]:
d.shape


In [None]:
# Sorted copy of the counts (by well, then plate_id); d itself is left unchanged
a = d.sort_values(by=['well','plate_id'])

In [None]:
a.head()

In [None]:
a['full_path'].sort_values().to_list()