# CellProfiler on Ukko2
This notebook contains cells that implement the tasks needed to get CellProfiler running on Ukko2 and to collect the results.

The variables you set in the "Project settings" cell also affect the cells below.

## Mounting Ukko2 folders on the local computer

On Linux (and probably also on Mac) you can use sshfs:
```
mkdir /home/hajaalin/ukko2-proj
mkdir /home/hajaalin/ukko2-wrk/
sshfs ukko2.cs.helsinki.fi:/wrk/hajaalin /home/hajaalin/ukko2-wrk/
sshfs ukko2.cs.helsinki.fi:/proj/hajaalin /home/hajaalin/ukko2-proj/

```
On Windows:
- To mount \$WRKDIR, type \\\\ukko2-smb.cs.helsinki.fi\\YOUR_UH_USERNAME in File Explorer.

- To mount \$PROJ, type \\\\nas-fs2.cs.helsinki.fi\\proj\\YOUR_UH_USERNAME in File Explorer.

## Project settings
You might change these back and forth between different projects/runs as you run the cells below.

In [60]:
from jinja2 import Environment, FileSystemLoader
from pathlib import Path,PosixPath,PurePosixPath,WindowsPath

# User name (used to build the per-user paths below) and e-mail address.
user = 'hajaalin'
email = 'harri.jaalinoja@helsinki.fi'

# Ukko2 $PROJ and $WRKDIR folders mounted on the local computer.
# On Windows, comment out the PosixPath line and uncomment the WindowsPath
# alternative that follows it.
proj_local = PosixPath('/home/%s/ukko2-proj' % (user))
#proj_local = WindowsPath('\\\\nas-fs2.cs.helsinki.fi\\proj\\%s' % (user))
wrk_local = PosixPath('/home/%s/ukko2-wrk' % (user))
#wrk_local = WindowsPath('\\\\ukko2-smb.cs.helsinki.fi\\%s' % (user))
# $PROJ path on Ukko2.
proj_remote = PurePosixPath('/proj/%s' % (user))
# $WRKDIR path on Ukko2.
wrk_remote = PurePosixPath('/wrk/%s' % (user))

# This is where the results will be copied on the local machine.
results_root_local = PosixPath('/home/%s/tmp/cp_on_ukko2' % (user))
#results_root_local = WindowsPath('D:\\temp\\%s' % (user))


# Project name and run identifier; together they select the per-run
# directories. Earlier runs are kept as commented-out history, and exactly
# one `run` assignment should be active.
project = 'tskarhu'
#run = '20190204b'
#run = '20190205a'
#run = '20190205b'
#run = '20190206b'
#run = '20190207a'
run = '20190207b'

# number of wells and sites per well
nwells = 36
sites_per_well = 4

# Space-separated names of the objects whose measurements are combined.
objects = 'Image Nuclei Cells'


###
### You should not need to modify anything below this line. ###
###

# Common prefix for the names of this run's result files.
results_prefix = '%s_%s_' % (project, run)

# Per-run directories as seen through the local mounts.
sbatch_dir_local = proj_local.joinpath('Projects', 'CellProfiler', project, run)
cp_batch_dir_local = wrk_local.joinpath('CellProfiler', 'cp_batch_files', project, run)
cp_output_dir_local = wrk_local.joinpath('CellProfiler', 'output', project, run)
results_local = results_root_local.joinpath('CellProfiler', 'output', project, run)

# The same three cluster-side directories, as seen on the cluster itself.
sbatch_dir_remote = proj_remote.joinpath('Projects', 'CellProfiler', project, run)
cp_batch_dir_remote = wrk_remote.joinpath('CellProfiler', 'cp_batch_files', project, run)
cp_output_dir_remote = wrk_remote.joinpath('CellProfiler', 'output', project, run)

# Templates are loaded from the directory the notebook runs in.
templatedir = '.'
j2_env = Environment(loader=FileSystemLoader(templatedir))

# Show one of the derived directory names to confirm the settings.
print(cp_output_dir_remote)

/wrk/hajaalin/CellProfiler/output/tskarhu/20190207b


## Create batch job script

In [54]:
#
# Memory options
#mem_per_cpu = 2048
#mem_per_cpu = 4096
mem_per_cpu = 6144
java_opts = "-Xmx512m"


#
# Parameters for splitting the run in batches.

# Number of threads available, assuming we request 4 nodes but use only
# 1 thread per core
# (see https://wiki.helsinki.fi/display/it4sci/Technical+Specifications)
nnodes = 4
cpus_per_node = 2
cores_per_cpu = 14
threads_available = nnodes * cpus_per_node * cores_per_cpu
# Something on Ukko2 seems to not work when trying to use the maximum number
# of cores, so let's settle for half. Floor division keeps the thread count
# an int; true division would turn it into a float.
threads_available = threads_available // 2

#https://stackoverflow.com/questions/9761562/how-many-factors-in-an-integer
def factors(n):
    """Return the positive divisors of n in ascending order.

    Every integer from 1 through n is tried as a divisor, so the result is
    an empty list when n is smaller than 1.
    """
    return [candidate for candidate in range(1, n + 1) if n % candidate == 0]

# Total number of sites in the run.
nsites = nwells * sites_per_well

# To keep all batches the same size, the number of batches must be a factor
# of nsites...
f = factors(nsites)
# ... and must not exceed the number of threads available.
f2 = [i for i in f if i <= threads_available]
if not f2:
    # max() on an empty list raises a ValueError that does not say what is
    # wrong; raise the same exception type with the offending values.
    raise ValueError("no batch count fits: nsites=%s, threads_available=%s"
                     % (nsites, threads_available))
nbatches = max(f2)
#print(f)
#print(f2)
# nbatches divides nsites exactly, so integer division loses nothing and
# avoids a round trip through float.
batch_size = nsites // nbatches

# manual batch size settings
#nbatches = 36
#batch_size = 4

# One task per batch, one CPU per task.
ntasks = nbatches
cpus_per_task = 1
threads_requested = ntasks * cpus_per_task

# First site of the last batch, assuming sites are numbered from 1.
batch_last_start = (nbatches - 1) * batch_size + 1
# Should equal nsites; it can differ only if the manual settings above are
# enabled with values that do not multiply up to nsites.
nsites_sanity_check = nbatches * batch_size
print("threads_available: %d" % threads_available)
print("threads_requested: %d" % threads_requested)
print("nsites: %d" % nsites)
print("nbatches: %d" % nbatches)
print("nbatch_size: %d" % batch_size)
print("nsites_sanity_check: %d" % nsites_sanity_check)

#import os
#print(os.listdir(wrk_local))
#print(os.listdir(proj_local))
#print(os.listdir(proj_local / 'Projects' / 'CellProfiler'))

# Create the directories for the sbatch script, the batch data file (.h5)
# and the output.
# The script directory is created with exist_ok=False, so this raises
# FileExistsError if the directory for this project/run already exists.
sbatch_dir_local.mkdir(parents=True, exist_ok=False)
for reusable_dir in (cp_batch_dir_local, cp_output_dir_local):
    reusable_dir.mkdir(parents=True, exist_ok=True)

# Values handed to the sbatch script template. All paths are the
# cluster-side (remote) ones.
context = {
    # job resources
    'email': email,
    'nnodes': nnodes,
    'ntasks': ntasks,
    'cpus_per_task': cpus_per_task,
    'mem_per_cpu': mem_per_cpu,
    'java_opts': java_opts,
    # directories and file names
    'workdir': cp_output_dir_remote,
    'outputroot': cp_output_dir_remote,
    'resultsdir': sbatch_dir_remote,
    'project': project,
    'run': run,
    'resultsprefix': results_prefix,
    'cp_batchfile': cp_batch_dir_remote / 'Batch_data.h5',
    # batch layout
    'nwells': nwells,
    'sites_per_well': sites_per_well,
    'nsites': nsites,
    'nbatches': nbatches,
    'nsites_sanity_check': nsites_sanity_check,
    'batch_size': batch_size,
    'batch_last_start': batch_last_start,
    'objects': objects,
}

# Render the sbatch script from its template.
script = j2_env.get_template('cp_sbatch_template.sh.j2').render(context)

# Save the script through the local mount of the sbatch directory.
# newline='\n' writes line endings untranslated, whatever the local platform.
scriptfilename = Path("sbatch_%s_%s_%d_%d.sh" % (project, run, nsites, batch_size))
scriptfile = sbatch_dir_local / scriptfilename
with scriptfile.open("w", newline='\n') as fh:
    fh.write(script)
#print(scriptfile)

# Print the commands to run on the cluster, using the cluster-side path.
remote_script = sbatch_dir_remote / scriptfilename
print("# To submit the batch job, run the following command on Ukko2:")
print("sbatch --exclude=ukko2-paavo,ukko2-pekka %s > jobid" % (remote_script,))
print("# To check the job que:")
print("squeue | grep %s" % (user,))


threads_available: 56
threads_requested: 48
nsites: 144
nbatches: 48
nbatch_size: 3
nsites_sanity_check: 144
# To submit the batch job, run the following command on Ukko2:
sbatch --exclude=ukko2-paavo,ukko2-pekka /proj/hajaalin/Projects/CellProfiler/tskarhu/20190207b/sbatch_tskarhu_20190207b_144_3.sh > jobid
# To check the job que:
squeue | grep hajaalin


## Run batch job
At this point you have to log in to Ukko2 from the terminal (Linux, Mac) or with PuTTY (Windows), and run the sbatch command printed out by the previous cell. See the instructions at https://wiki.helsinki.fi/display/LMU/CellProfiler+on+Ukko2+cluster.

## Copy results to local computer

In [57]:
import os
import shutil

# The results directory must NOT be created beforehand: copytree creates the
# destination itself and raises FileExistsError if it already exists.
#results_local.mkdir(parents=True, exist_ok=False)

# Copy the Ukko2 output directory (through its local mount) to the local
# results directory.
shutil.copytree(cp_output_dir_local, results_local)

# Copy the combined .csv files, which are in the sbatch directory.
for f in sbatch_dir_local.glob('*.csv'):
    shutil.copy(f, results_local)

print('copied folder: %s' %(cp_output_dir_local))
# The listing is of the parent directory (one entry per copied run), so the
# label names that directory rather than results_local itself.
runs_dir = results_local.parent
print('listing of %s:' % (runs_dir))
print(sorted(os.listdir(runs_dir)))

copied folder: /home/hajaalin/ukko2-wrk/CellProfiler/output/tskarhu/20190207b
listing of /home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b:
['20190205b', '20190206a', '20190206b', '20190207b']


## Create result files with local pathnames

In [58]:
import pandas as pd

# Data location mappings (cluster path -> local path), same as specified in
# CreateBatchFiles.
datamap = {'/wrk/hajaalin/data/tskarhu' : '/mnt/lmu-netapp/instruments/Nano/MDCStore/tskarhu'}

for f in results_local.glob('*.csv'):
    # Skip the "*_local.csv" files written by an earlier run of this cell.
    # Only the file name is tested, so a directory name cannot trigger the skip.
    if f.name.endswith('_local.csv'):
        continue
    print(f)

    df = pd.read_csv(f, float_precision='round_trip')
    paths = [k for k in df.columns if "PathName" in k]
    originals = [k for k in paths if "Overlay" not in k]
    overlays = [k for k in paths if "Overlay" in k]

    #print(originals)
    #print(overlays)

    # regex=False: the paths are literal text. Treated as regular expressions,
    # '.' would match any character and a Windows replacement path containing
    # backslashes would be read as escape sequences.
    for p in originals:
        for remote_prefix, local_prefix in datamap.items():
            df[p] = df[p].str.replace(remote_prefix, local_prefix, regex=False)

    # Overlay images were written to the cluster output directory, which has
    # been copied to results_local.
    for p in overlays:
        df[p] = df[p].str.replace(str(cp_output_dir_remote), str(results_local), regex=False)

    # Write the modified .csv.
    # Default na_rep='', which will cause columns with missing values to have type VARCHAR() in SQLite.
    # To prevent this, set na_rep='nan', as it is in the .csv files produced by CP.
    # TODO: check why the result file has more 'nan' values than the original.
    # NOTE(review): possibly because read_csv also parses empty fields and
    # other default NA markers as missing, and all of them are written back
    # as 'nan' -- confirm.
    local_name = f.stem + "_local" + f.suffix
    df.to_csv(f.with_name(local_name), index=False, na_rep='nan')
    print(local_name)

/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Cells.csv
tskarhu_20190207b_Combined_Cells_local.csv
/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Image.csv
tskarhu_20190207b_Combined_Image_local.csv
/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Nuclei.csv
tskarhu_20190207b_Combined_Nuclei_local.csv


## Merge all object measurements in one .csv

In [96]:
# Object tables to merge. 'Image' is left out of the merge; filtering also
# works when 'Image' is not listed in `objects`.
objs = [name for name in objects.split(' ') if name != 'Image']
#print(objs)

# Rows of the different object tables are matched on these columns.
merge_on = ['ImageNumber', 'ObjectNumber']
# Columns whose names start with one of these are not renamed.
keep_col_names = ['ImageNumber', 'ObjectNumber', 'Metadata']

# None means "nothing merged yet". An explicit sentinel is used so that a
# first table with zero rows is not mistaken for "not started".
df = None
ofile = None
for o in objs:

    candidates = list(results_local.glob('*Combined_' + o + '.csv'))
    if not candidates:
        raise FileNotFoundError(
            "no '*Combined_%s.csv' file found in %s" % (o, results_local))
    if len(candidates) > 1:
        # Best effort: report the ambiguity and carry on with the first match.
        print('Error: ambiguous object files')
        print(candidates)
    ofile = candidates[0]
    print(ofile)
    tmp = pd.read_csv(ofile, float_precision='round_trip')

    # Rename measurement columns by appending the object name, so that
    # columns from different object tables cannot collide.
    columns = {k: k + '_' + o
               for k in tmp.columns
               if not k.startswith(tuple(keep_col_names))}
    tmp = tmp.rename(columns=columns)

    #print('tmp.head')
    #print(tmp.head(1))

    if df is None:
        df = tmp
    else:
        # Find and drop metadata columns; the first table already has them.
        metadata = [k for k in tmp.columns if k.startswith('Metadata')]
        tmp = tmp.drop(metadata, axis=1)

        # pd.merge defaults to an inner join: rows without a partner in the
        # other table are dropped.
        df = pd.merge(left=df, right=tmp, on=merge_on)

    #print('df.head')
    #print(df.head(1))

if df is None:
    raise ValueError("no object tables to merge (objects=%r)" % (objects,))

# Name the merged file after the last object file, swapping only the trailing
# '<object>.csv' for 'Objects.csv'. The glob pattern guarantees that ending,
# and slicing leaves any earlier occurrence of the object name untouched.
object_suffix = o + '.csv'
ofile = ofile.with_name(ofile.name[:-len(object_suffix)] + 'Objects.csv')
df.to_csv(ofile, index=False, na_rep='nan')
print(ofile)

/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Nuclei.csv
/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Cells.csv
/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_Combined_Objects.csv


## Create .properties file for CellProfiler Analyst

In [97]:
#
# Settings for CellProfiler Analyst
#

# Input tables: the image table with local path names and the merged
# object table.
image_csv_file = results_local / (results_prefix + 'Combined_Image_local.csv')
object_csv_file = results_local / (results_prefix + 'Combined_Objects.csv')

image_names = 'Blue,Red,NucleiOverlay,CellsOverlay'
# One PathName_/FileName_ column per image name. Every entry is followed by
# a comma, so both strings end with a trailing comma.
channel_names = image_names.split(',')
image_path_cols = ''.join('PathName_' + name + ',' for name in channel_names)
image_file_cols = ''.join('FileName_' + name + ',' for name in channel_names)

# Values for the CPA properties template.
context = {
    'image_csv_file': image_csv_file,
    'object_csv_file': object_csv_file,
    'cell_x_loc': 'Location_Center_X_Nuclei',
    'cell_y_loc': 'Location_Center_Y_Nuclei',
    'image_path_cols': image_path_cols,
    'image_file_cols': image_file_cols,
    'image_names': image_names,
    'image_channel_colors': 'blue,red,gray,gray',
    'object_name': 'cell, cells,',
    'plate_type': 96,
    'image_tile_size': 50,
}

properties = j2_env.get_template('DefaultDB_MyExpt.properties').render(context)

# Save the rendered properties file next to the result tables.
properties_filename = Path("%s_%s_nan.properties" % (project, run))
properties_file = results_local / properties_filename
with properties_file.open("w", newline='\n') as fh:
    fh.write(properties)
print(properties_file)

/home/hajaalin/tmp/cp_on_ukko2/CellProfiler/output/tskarhu/20190207b/tskarhu_20190207b_nan.properties
