### Image processing Cytation data
Clayton Wandishin experiment  
2020-12-03

In [1]:
import os
import re
import pandas as pd
import numpy as np
from datetime import datetime, date

#### Define location of data

In [2]:
TOPDIR = "/mnt/darren/quaranta2/Cytation/2020-11-02"
IMDIR = os.path.join(TOPDIR,"images")

def parseFileName(filename):
    filename = os.path.basename(filename)
    x = filename.split(".")[0]
    well = x.split("_")[0]
    pos = x.split("_")[3]
    ch = x.split("_")[4]
    time_i = x.split("_")[5]
    out = [well,ch,pos,time_i]
    return(out)

# [x+1 if x >= 45 else x+5 for x in l]

def fixWellName(well_name):
    # function to fix well names by ensuring 3-digit length 
    # (i.e., include preceding 0 in single-digit column numbers) 

    if isinstance(well_name, list):
        return([f'{wn[0]}0{wn[1]}' if len(wn) < 3 else wn for wn in well_name])
    elif(isinstance(well_name, str)):
        if len(well_name) < 3:
            return(f'{well_name[0]}0{well_name[1]}')
    else:
        well_name

def getDateTime(filepath):
    pat = "\d{6}_\d{6}"

    d = [re.search(pat, x) for x in filepath]
    d = [x[0] for x in d]
    d = [datetime.strptime(x, '%y%m%d_%H%M%S%f') for x in d]
    o = [x.strftime("%Y-%m-%d %H:%M:%S") for x in d]
    return(o)

def getTimeIdx(filepath):
    pat = "Experiment\d{1,2}"
    i = [re.search(pat,x) for x in filepath]
    i = [x[0] for x in i]
    i = [int(x.strip("Experiment")) for x in i]
    return(i)


#### Find all image files

In [3]:
os.chdir(IMDIR)
fn = []
dn = []

for (dirpath, dirnames, filenames) in os.walk(IMDIR):
    fn += [os.path.join(dirpath, f) for f in filenames]
    dn += [os.path.join(dirpath, d) for d in dirnames]

# remove .DS_Store (hiddent Spotlight) files, if present
fn = [f for f in fn if ".DS_Store" not in f]

# keep only TIFF files
fn = [f for f in fn if "Segmentation" not in f]
fn = [f for f in fn if "csv" not in f]

print(f"{len(fn)} files were found.")
print(f"{len(dn)} directories were found")

if(os.path.isfile(fn[0])):
    print(f"The file {os.path.basename(fn[0])} has a complete path.")
else:
    print(f"The file {os.path.basename(fn[0])} does NOT have a complete path.")

164620 files were found.
316 directories were found
The file E7_02_1_1_RFP_001.tif has a complete path.


In [4]:
fn.sort()
fn[:6]

['/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B10_02_1_1_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B10_02_1_2_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B10_02_2_1_GFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B10_02_2_2_GFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B11_02_1_1_RFP_001.tif',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/images/201103_040216_Experiment1/201103_040216_!PLATE_BARCODE!/B11_02_1_2_RFP_001.tif']

#### Filename structure
Example filename: `B10_04_1_1_RFP_001.tif`  

* `B10` = well  
* `04` = unknown  
* `1` = channel number (`1` or `2` in these data)  
* `1` = position number (only single position per well in these data)  
* `RFP` = channel name (`RFP` or `GFP` in these data)  
* `001` = time point index (`001` through `010` in these data)  
* `tif` = image file format (only `tif` in these data)  



In [5]:
file_info = pd.DataFrame([parseFileName(x) for x in fn])
file_info.columns = ['well','ch','pos','time_i']
file_info['file_name'] = fn

In [6]:
file_info.head()

Unnamed: 0,well,ch,pos,time_i,file_name
0,B10,RFP,1,1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...
1,B10,RFP,2,1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...
2,B10,GFP,1,1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...
3,B10,GFP,2,1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...
4,B11,RFP,1,1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...


In [7]:
red = file_info.loc[file_info['ch']=='RFP','file_name']
red = red.reset_index(drop=True)
green = file_info.loc[file_info['ch']=='GFP','file_name']
green = green.reset_index(drop=True)

In [8]:
wells = file_info.loc[file_info['ch']=='RFP','well']
wells = wells.reset_index(drop=True)
wells = fixWellName(wells.tolist())
wells = pd.Series(wells)


In [9]:
wells.head()

0    B10
1    B10
2    B11
3    B11
4    B12
dtype: object

In [10]:
temp = pd.DataFrame({'image_time': getDateTime(file_info.loc[file_info['ch']=='RFP','file_name']),
                     'time_i': getTimeIdx(file_info.loc[file_info['ch']=='RFP','file_name'])})

In [11]:
temp.head()

Unnamed: 0,image_time,time_i
0,2020-11-03 04:02:01,1
1,2020-11-03 04:02:01,1
2,2020-11-03 04:02:01,1
3,2020-11-03 04:02:01,1
4,2020-11-03 04:02:01,1


#### Generate plate lookup table
Currently (as of 2020-11-24) the `processIm` function of py-seg expects an integer value for `plate_id`. To bypass this I will make a table of integers that correspond to the unique `


In [12]:
pid_str = [os.path.basename(os.path.dirname(os.path.dirname(x))) for x in red]
pid_unique = np.unique(pid_str)
pid_int = [10000+int(np.where(pid_unique == x)[0]) for x in pid_str]
pid_lookup = pd.DataFrame({'PlateId':pid_unique, 'PlateInt':np.unique(pid_int)})

In [13]:
pid_lookup.shape

(158, 2)

In [14]:
if not os.path.exists(os.path.join(TOPDIR, "PlateID_lookup_table.csv")):
    pid_lookup.to_csv(os.path.join(TOPDIR, "PlateID_lookup_table.csv"))

In [15]:
taskargs = pd.DataFrame({
                        'ch2_im_path': green,
                        'nuc_im_path': red,
                        'overwrite': 'FALSE',
                        'plate_id': pid_int,
                        'regprops': 'FALSE',
                        'save_path': os.path.join(TOPDIR,'Segmentation'),
                        'well': wells
})

In [16]:
taskargs.head()

Unnamed: 0,ch2_im_path,nuc_im_path,overwrite,plate_id,regprops,save_path,well
0,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,False,10000,False,/mnt/darren/quaranta2/Cytation/2020-11-02/Segm...,B10
1,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,False,10000,False,/mnt/darren/quaranta2/Cytation/2020-11-02/Segm...,B10
2,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,False,10000,False,/mnt/darren/quaranta2/Cytation/2020-11-02/Segm...,B11
3,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,False,10000,False,/mnt/darren/quaranta2/Cytation/2020-11-02/Segm...,B11
4,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,/mnt/darren/quaranta2/Cytation/2020-11-02/imag...,False,10000,False,/mnt/darren/quaranta2/Cytation/2020-11-02/Segm...,B12


#### Save Task Arguments to file
(Will not overwrite if file exists; must delete previous to write new file.)

In [17]:
today = date.today()
taskargs.shape

(82311, 7)

In [18]:
argfilepath = os.path.join(TOPDIR,f'TaskArgs_{today}.csv')
if not os.path.isfile(argfilepath):
    taskargs.to_csv(argfilepath, index=False)
    print(f'Saving task file as {argfilepath}')

Saving task file as /mnt/darren/quaranta2/Cytation/2020-11-02/TaskArgs_2020-12-04.csv


#### Examine some processing output

In [19]:
import sys
sys.path.append(r'/home/darren/git-repos/Segmentation-other/py-seg')

In [20]:
from MXtasksTempo import processIm
import cv2
import numpy as np
from pylab import imshow, gray

In [21]:
# _ = [processIm(taskargs.loc[i].to_list()) for i in range(ta_test.shape[0])]

### Set up celery workers and send jobs to RabbitMQ
This is done via `ssh` to `tempo` in the `improc` Conda environment. Must also be in `~/git-repos/Segmentation-other/py-seg/`  

Must specify maximum concurrency when calling Celery worker.  

Then execute:  
`screen`
`celery -A MXtasksTempo worker --concurrency=120`  
<ctrl-A,D>  


`python sendMXtempoJobs.py /mnt/darren/quaranta2/Cytation/2020-11-02/TaskArgs_2020-11-24.csv`  



## Assembling processing output
Cell count data and segmentation overlays have been saved in the `Segmentation` directory in the `/vu1file/quaranta2/Cytation/2020-11-02`. Attempted to assemble data using the `assemPlateData` function from the `diprate` R package, but it appears the summarized output files from each plate may not contain all output data files. Will examine first and attempt to reassemble.

In [22]:
SEGDIR = os.path.join(TOPDIR,"Segmentation")
SEGDIR

'/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation'

Find all output files

In [23]:
os.chdir(SEGDIR)
fn = []
dn = []

for (dirpath, dirnames, filenames) in os.walk(SEGDIR):
    fn += [os.path.join(dirpath, f) for f in filenames]
    dn += [os.path.join(dirpath, d) for d in dirnames]

# remove .DS_Store (hiddent Spotlight) files, if present
fn = [f for f in fn if ".DS_Store" not in f]

# keep only CSV files
fn = [f for f in fn if "csv" in f]

print(f"{len(fn)} files were found.")
print(f"{len(dn)} directories were found")


82309 files were found.
158 directories were found


In [24]:
dn.sort()

In [25]:
dn[0:5]

['/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10001',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10002',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10003',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10004']

In [26]:
fn.sort()
fn[0:5]

['/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000/Plate10000_B02_B2_02_1_1_RFP_001.tif_cellcount.csv',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000/Plate10000_B02_B2_02_1_2_RFP_001.tif_cellcount.csv',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000/Plate10000_B03_B3_02_1_1_RFP_001.tif_cellcount.csv',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000/Plate10000_B03_B3_02_1_2_RFP_001.tif_cellcount.csv',
 '/mnt/darren/quaranta2/Cytation/2020-11-02/Segmentation/Plate10000/Plate10000_B04_B4_02_1_1_RFP_001.tif_cellcount.csv']

In [27]:
fn = [x for x in fn if "cellcount.csv" in x]

In [28]:
len(fn)

82309

In [29]:
len([x for x in fn if "Plate10000" in x])

480

In [30]:
taskargs[taskargs['plate_id']==10000].shape

(480, 7)

In [51]:
ad = pd.read_csv(fn[0])

In [52]:
for i in range(len(fn)):
    if i != 0:
        df = pd.read_csv(fn[i])
        ad = ad.append(df, ignore_index=True)

In [53]:
ad.shape

(82309, 6)

In [54]:
ad.head()

Unnamed: 0,file_name,cell_count,file_name_ch2,ch2_pos,plate_id,well
0,B2_02_1_1_RFP_001.tif,52,B2_02_2_1_GFP_001.tif,2,10000,B02
1,B2_02_1_2_RFP_001.tif,29,B2_02_2_2_GFP_001.tif,4,10000,B02
2,B3_02_1_1_RFP_001.tif,47,B3_02_2_1_GFP_001.tif,3,10000,B03
3,B3_02_1_2_RFP_001.tif,38,B3_02_2_2_GFP_001.tif,2,10000,B03
4,B4_02_1_1_RFP_001.tif,30,B4_02_2_1_GFP_001.tif,5,10000,B04


In [56]:
ad.to_csv(os.path.join(TOPDIR,"20201102_Cytation_cellcounts.csv"), index=False)