# Perform parallel inference on a GWAS 2022 Axioscan7 images

In [1]:
import argparse
import os
import random
import shutil
import time
import warnings
import pickle
import numpy as np
import math
import sys
import copy
import re
import pandas as pd
import matplotlib.pyplot as plt
import json
import cv2
from itertools import compress

import torch
import torch.nn as nn
import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor,DefaultTrainer,HookBase
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer,ColorMode,GenericMask
from detectron2.structures import BoxMode
from detectron2.evaluation import COCOEvaluator,inference_on_dataset
from detectron2.data import build_detection_test_loader,DatasetMapper,build_detection_train_loader,MetadataCatalog,DatasetCatalog
import detectron2.data.transforms as T
import detectron2.utils.comm as comm

import ray
import time

import uuid as uuid
from operator import itemgetter
import seaborn as sns

import shapely
import shapely.geometry
from shapely.geometry import Polygon,MultiPolygon,GeometryCollection
from shapely.validation import make_valid
from shapely.geometry import mapping
#import geopandas as gpd

#import imgfileutils as imf
#import segmentation_tools as sgt
from aicsimageio import AICSImage, imread
from skimage import measure, segmentation
from skimage.measure import regionprops
from skimage.color import label2rgb
#import progressbar
from IPython.display import display, HTML
#from MightyMosaic import MightyMosaic

import glob
from PIL import Image
import csv

In [2]:
# setup directory
root = r'/Users/lovely_shufan/'

project_dir = root + r'Dropbox (Edison_Lab@UGA)/AMF/AMF Imaging 2022/0_inference_using_MaskRCNN_2021/'
output_dir = project_dir + r'2_infer_result/GA_GWAS_2022/'

model_dir = root + r'Dropbox (Edison_Lab@UGA)/AMF/AMF Imaging 2021/2_computer_vision/'

data_dir = r'/Volumes/easystore/GWAS 2022/'

blocks = ['Block2/','Block3/','Block8/']

## check available resources on the computer
Total 20 cores
Estimated time to do inference on 1 czi image is 40 min
to finish inference on 337 accessions x 3 untreated blocks x 40 min = 40440 mins = 674 hrs = 28 days on one core or 2 days on 15 cores

In [5]:
RAY_GCE_TPU_ACCELERATOR_ENDPOINT = (
    "http://metadata.google.internal/computeMetadata/"
    "v1/instance/attributes/accelerator-type"
)
RAY_GCE_TPU_HEADERS = {"Metadata-Flavor": "Google"}
import requests
accelerator_type_request = requests.get(
    RAY_GCE_TPU_ACCELERATOR_ENDPOINT,
    headers=RAY_GCE_TPU_HEADERS,
)

In [6]:
print(accelerator_type_request)

<Response [200]>


In [3]:
ray.init()
ray.available_resources()['CPU']

2023-09-29 12:36:47,304	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


20.0

In [3]:
ray.init(num_cpus=18, ignore_reinit_error=True)

2023-10-01 20:54:41,550	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.18
Ray version:,2.6.3
Dashboard:,http://127.0.0.1:8265


## test parallel computing using Ray

### Trial 1
everything seems great, code is working

In [None]:
@ray.remote
def timer_ray(x):
    time.sleep(1)
    return x

In [None]:
t0 = time.time()
values = [timer_ray.remote(x) for x in range(4)]
print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print(values)

In [None]:
print(ray.get(values))

### Trial 2
Based on code below, ray.get()is extremely slow. Need to optimize ray.get() behavior

In [None]:
def busy(i):
    time.sleep(i)
    return i

In [None]:
@ray.remote
def rbusy(i):
    return busy(i)

In [None]:
# no parallel
t0 = time.time()
objs = [(i, busy(1.0)) for i in range(5)]
print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
for i,obj in list(objs):
    print(f'{i}: {obj}')

In [None]:
# with parallel
t0 = time.time()
objs = [(i, ray.get(rbusy.remote(1.0)), time.time() - t0) for i in range(5)]
end = time.time() - t0
print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))

for i,obj,t in list(objs):
    print('%d: %s (time: %6.3f seconds)' % (i, obj, t))

This example recreates the problem below. Immediately calling ray.get() on each object ID is effectively blocking the progress, in another word, waited for each remote call to finish before starting the next task. Now we try to solve this by making ray.get() non-blocking.

In [None]:
t0 = time.time()
ids = [(i, rbusy.remote(1.0), time.time() - t0) for i in range(5)] # only calling object id now
t1 = time.time() - t0
print('Time Elapsed:\t{:.5f}'.format(time.time() - t0))

for i,obj_id,t in list(ids):
    print('%d: %s time: %.5f SECONDS' % (i, obj_id, t))

In [None]:
t0 = time.time()
for i, obj_id, t in list(ids):
    print('%d: (%s) time: %.5f SECONDS' % (i, ray.get(obj_id), time.time() - t0))

print('Time Elapsed:\t{:.5f}'.format(time.time() - t0))

The two step process is much faster than directly calling ray.get() after each task. The reason is that while we are waiting for the first task to finish in the second loop, the other four tasks are running in parallel and would finish almost at the same time. We no longer need to wait for the second, third ... task to finish. Therefore, the later ray.get() does not wait long before the object is ready. Now we will generalize this method to task that requires different running time. 

### Trial 3

In [None]:
times = range(5,30,5) # 5, 10, ...
ids = [rbusy.remote(i) for i in times]  # Each waits progressively longer
for x in list(zip(times, ids)):        # Show a list of ids and the times each one will take 
    print(x)

In [None]:
for i in range(1,40): 
    ready, not_ready = ray.wait(ids, num_returns = 2)
    print('iteration:', i) 
    print('Ready length, values: ', len(ready), ray.get(ready))
    print('Not Ready length:', len(not_ready))
    ids = not_ready
    if not ids:
        break

In [None]:
ids = [rbusy.remote(i) for i in times]  # New set of ids
for i in range(1,40):
    num_returns = 2 if len(ids) >= 2 else len(ids)
    ready, not_ready = ray.wait(ids, num_returns = num_returns)
    print('iteration:', i) 
    print('Ready length, values: ', len(ready), ray.get(ready))
    print('Not Ready length:', len(not_ready))
    ids = not_ready
    if not ids: 
        break

## Calculate AXIOSCAN per-channel mean
1. by randomly choosing 30 scenes
    with 1 core
    with 15 cores
2. by randomly choosing 100 images
    with 15 cores

In [9]:
def findBGRmean(file):
    czi = AICSImage(file)
    scenes = czi.scenes
    num_scene = len(scenes)
    x = random.randint(0, num_scene-1)
    czi.set_scene(x)
    img = czi.get_image_data("YXS", T=0,C=0,Z=0)
    
    return np.array([np.mean(img[:,:,0]),np.mean(img[:,:,1]),np.mean(img[:,:,2])])

In [10]:
@ray.remote
def rfindBGRmean(file):
    czi = AICSImage(file)
    scenes = czi.scenes
    x = random.choice(scenes)
    czi.set_scene(x)
    img = czi.get_image_data("YXS", T=0,C=0,Z=0)
    
    return [np.mean(img[:,:,0]),np.mean(img[:,:,1]),np.mean(img[:,:,2])]

In [None]:
# 64G memory limit
memory_limit = 64 * 1024 * 1024 * 1024
print(memory_limit)

In [None]:
# list all files in block2
all_file = [os.path.join(path, name) 
 for path, dirs, files in os.walk(os.path.join(data_dir, 'Block2/'), topdown=False)
 for name in files
if name.endswith('.czi')]
        

In [None]:
print(len(all_file))
print(all_file[0])

### randomly choose 30 scenes and run with 1 core

In [None]:
# randomly choose 30 scenes
sample30 = random.sample(all_file,30)

In [None]:
# calculate channel means without parallel
n=0
t0 = time.time()
for czi in sampleA:
    n = n+1
    print(n)
    print(findBGRmean(czi))
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

### randomly choose 30 scenes and run with 15 cores

In [None]:
# test to make sure the code is finishing in reasonable amount of time
id30 = [rfindBGRmean.remote(czi) for czi in sample30]
t0 = time.time()
for i in range(0,30):
    ready, not_ready = ray.wait(id30, num_returns = 1)
    print('iteration:', i) 
    print('Ready length, values: ', len(ready), ray.get(ready))
    print('Not Ready length:', len(not_ready))
    id30 = not_ready
    if not ids: 
        break
print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
#print(np.mean(np.array(ray.get(values30)),axis=0))
#print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))

In [None]:
id30 = [rfindBGRmean.remote(czi) for czi in sample30]
results = []
t0 = time.time()
for i in range(1,40):
    ready, not_ready = ray.wait(id30, num_returns = 1)
    print('iteration:', i) 
    results.append(ray.get(ready))
    id30 = not_ready
    if not id30: 
        break
print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))

In [None]:
block2_means = np.mean(results,axis=0)[0]

In [None]:
diff_block2 = np.subtract(train_means, block2_means)
print(block2_means)
print(diff_block2)

In [11]:
# do the same things above for block 3 and 8
means = []
for block in ['Block2/','Block3/','Block8/']:
    print(block)
    results = []
    all_file = []
    t0 = time.time()
    for path, dirs, files in os.walk(os.path.join(data_dir, block), topdown=False):
        for name in files:
            if name.endswith('.czi'):
                all_file.append(os.path.join(path, name))
    sample30 = random.sample(all_file, 30)
    id30 = [rfindBGRmean.remote(czi) for czi in sample30]
    for i in range(1,40):
        ready, not_ready = ray.wait(id30, num_returns = 1)
        print('iteration:', i) 
        results.append(ray.get(ready))
        id30 = not_ready
        if not id30: 
            break
    block_means = np.mean(results,axis=0)[0].tolist()
    means.append(block_means)
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))        

Block2/
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
Time Elapsed:	119.3953
Block3/
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
iteration: 10
iteration: 11
iteration: 12
iteration: 13
iteration: 14
iteration: 15
iteration: 16
iteration: 17
iteration: 18
iteration: 19
iteration: 20
iteration: 21
iteration: 22
iteration: 23
iteration: 24
iteration: 25
iteration: 26
iteration: 27
iteration: 28
iteration: 29
iteration: 30
Time Elapsed:	112.0988
Block8/
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iter

In [13]:
train_means = np.array([145.85450147081923, 152.11404784018066, 161.0139541272665])

### Calculate diff between per channels means for centering 

In [9]:
#print(means)
#print(train_means)
#print(np.subtract(train_means, means))
#diffs = np.subtract(train_means, means)
#diffs = np.reshape(np.append(diff_block2,np.subtract(train_means, means)), (3,3))

diffs = [[10.12746633, 18.06137262, 27.510427], 
         [16.24330208, 23.29101365, 34.06829238], 
         [10.1628206,  16.92133011, 26.18143223]]
print(diffs[0])

[10.12746633, 18.06137262, 27.510427]


## parallel inference step-by-step
1. for loops to read in czi images
2. parallelize at the step of looping by tiles

In [12]:
classes=['root','AMF internal hypha','AMF external hypha','AMF arbuscule','AMF vesicle','AMF spore','others']

In [13]:
cfg = get_cfg() # return default configuration
cfg.merge_from_file(model_zoo.get_config_file("COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_3x.yaml")) # copy config files from open source projects

# training configuration
cfg.DATASETS.TEST=()
cfg.DATALOADER.NUM_WORKERS=2
#cfg.SOLVER.IMS_PER_BATCH=args.batch_size

cfg.MODEL.ROI_HEADS.BATCH_SIZE_PER_IMAGE=128 #Number of regions per image used to train RPN. faster, and good enough for this toy dataset (default: 512)
cfg.MODEL.ROI_HEADS.NUM_CLASSES=len(classes)# (see https://detectron2.readthedocs.io/tutorials/datasets.html#update-the-config-for-new-datasets)
cfg.MODEL.BACKBONE.FREEZE_AT=2
cfg.SEED=1
cfg.AUG_FLAG=1

# inference configuration
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.8  # set threshold for this model
cfg.MODEL.WEIGHTS=os.path.join(model_dir, "Trainset1_model_best.pth") # path to the best model trained
cfg.MODEL.DEVICE='cpu' # use cpu for inference


inf_metadata = MetadataCatalog.get("inference").set(thing_classes=['root','AMF internal hypha','AMF external hypha','AMF arbuscule','AMF vesicle','AMF spore','others'])
predictor = DefaultPredictor(cfg)

[32m[09/29 12:40:39 d2.checkpoint.detection_checkpoint]: [0m[DetectionCheckpointer] Loading from /Users/lovely_shufan/Dropbox (Edison_Lab@UGA)/AMF/AMF Imaging 2021/2_computer_vision/Trainset1_model_best.pth ...


The checkpoint state_dict contains keys that are not used by the model:
  [35mpixel_mean[0m
  [35mpixel_std[0m


In [4]:
diff = [10.12746633, 18.06137262, 27.510427]

Terminated with signal 15
  File "/Users/lovely_shufan/opt/anaconda3/envs/detectron2-py38/lib/python3.8/site-packages/ray/autoscaler/_private/monitor.py", line 720, in <module>
    monitor.run()
  File "/Users/lovely_shufan/opt/anaconda3/envs/detectron2-py38/lib/python3.8/site-packages/ray/autoscaler/_private/monitor.py", line 595, in run
    self._run()
  File "/Users/lovely_shufan/opt/anaconda3/envs/detectron2-py38/lib/python3.8/site-packages/ray/autoscaler/_private/monitor.py", line 449, in _run
    time.sleep(AUTOSCALER_UPDATE_INTERVAL_S)



In [None]:
diff_expanded = diff[:, np.newaxis, np.newaxis]

In [14]:
def centering2train(diff, img):
    row = img.shape[0]
    col = img.shape[1]
    times = row * col
    for i in range(0,3,1):
        diffarray = np.repeat(diff[i], times, axis = 0)
        diffmx = diffarray.reshape((row,col))
        img[:,:,i] = np.add(img[:,:,i],diffmx)
        img[img > 255] = 255
        img[img < 0] = 0
    return img

def padImg(img, tilex, tiley):
    '''
    :param img:
    :return padded img:
    
    :rtype ndnumpy.array:
    Objective: output a padded image dividle by tile size
    '''
    y = img.shape[0]
    x = img.shape[1]
    pad_top = tiley - (y % tiley)
    pad_lft = tilex - (x % tilex)
    img = cv2.copyMakeBorder(img,pad_top,0,pad_lft,0,cv2.BORDER_CONSTANT,value=[0,0,0])
    
    return img

In [34]:
@ray.remote
def inference(pathtofile, block, diff, predictor):
    blklist = []
    imgidlist = []
    sceneidlist = []
    tileidlist = []
    classlist = []
    confscorelist=[]
    arealist = []

    # read in czi
    czi = AICSImage(pathtofile)
   
    for scene in czi.scenes[0:2]:
        # extract image by scene
        czi.set_scene(scene)
        img = czi.get_image_data("YXS", T=0,C=0,Z=0) # numpy.ndarray
                
        # pad image
        img = padImg(img, 2560, 1920)
    
        # centering
        img = centering2train(diff, img)
                 
        # tiling
        for i in range(0,img.shape[0],1920):
            for j in range(0,img.shape[1],2560):
                xmin = j
                xmax = j + 2560
                ymin = i
                ymax = i + 1920
                tile_id = str(xmin)+"_"+str(ymin)+"_"+str(xmax)+"_"+str(ymax)
                subimg = img[ymin:ymax,xmin:xmax]
                outputs = predictor(subimg)
                        
                #inference outputs
                clasind = outputs['instances'].get('pred_classes')
                allmasks = outputs['instances'].get('pred_masks')
                allscores = outputs['instances'].get('scores')
            
                num_seg = clasind.size()[0]
            
                if num_seg != 0: # only save an entry when the image contains a segmentation
                    blklist = blklist + np.repeat(block[:-1], num_seg).tolist()
                    imgidlist = imgidlist + np.repeat(pathtofile, num_seg).tolist()
                    sceneidlist = sceneidlist + np.repeat(scene, num_seg).tolist()
                    tileidlist = tileidlist + np.repeat(tile_id, num_seg).tolist()
                    confscorelist = confscorelist + allscores.tolist()
                
                    # calculate the area of segmentation
                    v = Visualizer(subimg[:, :, ::-1], MetadataCatalog.get("inference"), scale=1.0)
                    for i in range(0,num_seg,1):
                        #calculate mask area
                        locmask = np.asarray(allmasks[i,:,:])
                        gmask = GenericMask(locmask,v.output.height,v.output.width)
                        if gmask.polygons:
                            mergpolygon = gmask.polygons[0]
                            all_points_x = mergpolygon[::2]
                            all_points_y = mergpolygon[1::2]
                            pgon = Polygon(zip(all_points_x,all_points_y))
                            arealist.append(pgon.area)
                            # class index to class name
                            classlist.append(classes[clasind.tolist()[i]])
                        else:
                            continue  # Skip this iteration

    # export inference result as df
    infresults = pd.DataFrame({
    'block': blklist,
    'filename': imgidlist,
    'scene': sceneidlist,
    'tile': tileidlist,
    'annotations': classlist,
    'area': arealist,
    'confidenceScore': confscorelist})
    
    # delete 
    return infresults

## make a test inference run for only two images and two scenes per image

In [227]:
#test_images = os.listdir(os.path.join(data_dir, 'Block2/'))[0:2]
allpath2block2img = [os.path.join(path,name) for path, dirs, files in os.walk(os.path.join(data_dir,'Block2/'))
for name in files
if name.endswith('.czi')]

In [228]:
print(allpath2block2img[0:2])

['/Volumes/easystore/GWAS 2022/Block2/e2.1_PI6/e2.1_PI656065_C0.czi', '/Volumes/easystore/GWAS 2022/Block2/PI144134/PI144134_C12_R16.czi']


In [229]:
ids = [inference.remote(path, 'Block2/', diffs[0], predictor) 
       for path in allpath2block2img[0:2]]

results = list()
t0 = time.time()
for i in range(1,340,1):
    ready, not_ready = ray.wait(ids, num_returns = 1)
    print('working on image:', i) 
    results.append(ray.get(ready))
    ids = not_ready
    if not ids: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

[2m[36m(inference pid=21380)[0m The checkpoint state_dict contains keys that are not used by the model:
[2m[36m(inference pid=21380)[0m   [35mpixel_mean[0m
[2m[36m(inference pid=21380)[0m   [35mpixel_std[0m
[2m[36m(inference pid=21380)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21380)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21380)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21380)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21380)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21380)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21376)[0m The checkpoint state_dict co

working on image: 1
Time Elapsed:	159.0395


[2m[36m(inference pid=21376)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 3x across cluster][0m
[2m[36m(inference pid=21376)[0m   [35mpixel_std[0m[32m [repeated 6x across cluster][0m


working on image: 2
Total Time Elapsed:	166.3434


In [230]:
print(type(results[0][0]))

<class 'pandas.core.frame.DataFrame'>


In [240]:
print(len(results))
print(len(results[0][0]))
print(len(results[1][0]))

2
18
11


In [241]:
df = results[0][0]
for i in range(1,len(results),1):
    newdf = results[i][0]
    df = pd.concat([df, newdf], axis=0)
    
display(df)
print(len(df))

Unnamed: 0,block,filename,scene,tile,annotations,area,confidenceScore
0,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,10240_0_12800_1920,AMF external hypha,11913.0,0.995756
1,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_1920_5120_3840,AMF external hypha,38379.0,0.997879
2,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_1920_5120_3840,root,1274540.0,0.992442
3,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_1920_5120_3840,AMF internal hypha,119245.5,0.900109
4,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,5120_2_7680_1922,AMF external hypha,3243.0,0.878535
5,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,10240_0_12800_1920,AMF external hypha,11913.0,0.995756
6,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_3840_5120_5760,root,2192908.0,0.989744
7,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_3840_5120_5760,AMF external hypha,26620.5,0.955502
8,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,2560_3840_5120_5760,AMF external hypha,3856.0,0.934331
9,Block2,/Volumes/easystore/GWAS 2022/Block2/PI144134/P...,ScanRegion0,5120_2_7680_1922,AMF external hypha,3243.0,0.878535


29


In [None]:
a = list()
a.append(results[0][0])
a.append(results[1][0])

In [None]:
b = pd.concat(a, ignore_index=True)

In [None]:
print(type(a))
print(type(a[1]))
print(a[0].shape)
print(a[1].shape)
print(type(b))
print(b.shape)

In [None]:
print(a[0])

In [None]:
print(a[1])

## Inference on all block 2 images but only two scenes per image

In [36]:
del df

NameError: name 'df' is not defined

In [None]:
ids = [inference.remote(os.path.join(path,name), 'Block2/', diffs[0], predictor) 
        for path, dirs, files in os.walk(os.path.join(data_dir, 'Block2/'))
        for name in files
        if name.endswith('.czi')]

results = list() #creates a new dataframe that's empty
t0 = time.time()
for i in range(1,340,1):
    ready, not_ready = ray.wait(ids, num_returns = 1)
    print('working on image:', i) 
    results.append(ray.get(ready))
    ids = not_ready
    if not ids: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

#%% Clean-up object store data 
del ids


In [246]:
df = results[0][0]
for i in range(1,len(results),1):
    newdf = results[i][0]
    df = pd.concat([df, newdf], axis=0)
df.to_csv(os.path.join(output_dir,"block2_segmentation.txt"),index=False)

[2m[36m(inference pid=21896)[0m The checkpoint state_dict contains keys that are not used by the model:
[2m[36m(inference pid=21896)[0m   [35mpixel_mean[0m
[2m[36m(inference pid=21896)[0m   [35mpixel_std[0m
[2m[36m(inference pid=21895)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21895)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21877)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21877)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21379)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21379)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21896)[0m The checkpoint state_dict

In [254]:
print(len(results))
print(len(allpath2block2img))

339
344


There are 344 images under block 2 for unknown reasons, expected for < 337 images 

In [261]:
# run the remaining images in block 2
remaining_blk2 = allpath2block2img[-5:]
print(len(remaining_blk2))
print(remaining_blk2)

5
['/Volumes/easystore/GWAS 2022/Block2/PI655981/PI655981_C16_R14.czi', '/Volumes/easystore/GWAS 2022/Block2/PI655983/PI655983_C03_R07.czi', '/Volumes/easystore/GWAS 2022/Block2/PI656015/PI656015_C04_R19.czi', '/Volumes/easystore/GWAS 2022/Block2/PI656035/PI656035_C15_R05.czi', '/Volumes/easystore/GWAS 2022/Block2/PI92270/PI92270_C08_R10.czi']


In [262]:
ids_remain_blk2 = [inference.remote(imgpath, 'Block2/', diffs[0], predictor) 
        for imgpath in remaining_blk2]

results_remain_blk2 = list() #creates a new dataframe that's empty
t0 = time.time()
for i in range(1,340,1):
    ready, not_ready = ray.wait(ids_remain_blk2, num_returns = 1)
    print('working on image:', i) 
    results_remain_blk2.append(ray.get(ready))
    ids_remain_blk2 = not_ready
    if not ids_remain_blk2: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

[2m[36m(inference pid=21379)[0m The checkpoint state_dict contains keys that are not used by the model:
[2m[36m(inference pid=21379)[0m   [35mpixel_mean[0m
[2m[36m(inference pid=21379)[0m   [35mpixel_std[0m
[2m[36m(inference pid=21880)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21880)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21898)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21898)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21877)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21877)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21379)[0m The checkpoint state_dict

working on image: 1
Time Elapsed:	176.0011


[2m[36m(inference pid=21379)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 6x across cluster][0m
[2m[36m(inference pid=21379)[0m   [35mpixel_std[0m[32m [repeated 12x across cluster][0m
[2m[36m(inference pid=21879)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 5x across cluster][0m
[2m[36m(inference pid=21879)[0m   [35mpixel_std[0m[32m [repeated 10x across cluster][0m
[2m[36m(inference pid=21898)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 5x across cluster][0m
[2m[36m(inference pid=21898)[0m   [35mpixel_std[0m[32m [repeated 10x across cluster][0m
[2m[36m(inference pid=21877)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 5x across cluster][0m
[2m[36m(inference pid=21877)[0m   [35mpixel_std[0m[32m [repeated 10x across cluster][0m
[2m[36m(inference pid=21379)[0m The c

working on image: 2
Time Elapsed:	211.5510


[2m[36m(inference pid=21877)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21877)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21879)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21879)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21898)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21898)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21877)[0m The checkpoint state_dict contains keys that are not used by the model:[32m [repeated 4x across cluster][0m
[2m[36m(inference pid=21877)[0m   [35mpixel_std[0m[32m [repeated 8x across cluster][0m
[2m[36m(inference pid=21879)[0m The check

working on image: 3
Time Elapsed:	245.6804
working on image: 4
Time Elapsed:	247.1074
working on image: 5
Time Elapsed:	248.2457
working on image: 6
Time Elapsed:	248.2457
working on image: 7
Time Elapsed:	248.2458
working on image: 8
Time Elapsed:	248.2458
working on image: 9
Time Elapsed:	248.2458
working on image: 10
Time Elapsed:	248.2458
working on image: 11
Time Elapsed:	248.2459
working on image: 12
Time Elapsed:	248.2459
working on image: 13
Time Elapsed:	248.2459
working on image: 14
Time Elapsed:	248.2459
working on image: 15
Time Elapsed:	248.2460
working on image: 16
Time Elapsed:	248.2460
working on image: 17
Time Elapsed:	248.2460
working on image: 18
Time Elapsed:	248.2461
working on image: 19
Time Elapsed:	248.2461
working on image: 20
Time Elapsed:	248.2461
working on image: 21
Time Elapsed:	248.2461
working on image: 22
Time Elapsed:	248.2462
working on image: 23
Time Elapsed:	248.2462
working on image: 24
Time Elapsed:	248.2462
working on image: 25
Time Elapsed:	248.

In [270]:
print(len(results_remain_blk2))
print(results_remain_blk2[5])
print([i for i in range(1,5,1)])

339
[]
[1, 2, 3, 4]


In [271]:
df_remain_blk2 = results_remain_blk2[0][0]
for i in range(1,5,1):
    newdf = results_remain_blk2[i][0]
    df_remain_blk2 = pd.concat([df_remain_blk2, newdf], axis=0)
df_remain_blk2.to_csv(os.path.join(output_dir,"block2_segmentation_part2.txt"),index=False)

# Inference on block 3 images and only two scenes per image

### ray.shutdown()


In [19]:
all_files_blk3 = [os.path.join(path,name) 
        for path, dirs, files in os.walk(os.path.join(data_dir, 'Block3/'))
        for name in files
        if name.endswith('.czi')]

In [16]:
ids_blk3 = [inference.remote(os.path.join(path,name), 'Block3/', diffs[1], predictor) 
        for path, dirs, files in os.walk(os.path.join(data_dir, 'Block3/'))
        for name in files
        if name.endswith('.czi')]

#results_blk3 = list() #creates a new dataframe that's empty
t0 = time.time()
for i in range(1,400,1):
    ready, not_ready = ray.wait(ids_blk3, num_returns = 1)
    print('image:', i) 
    results_blk3.append(ray.get(ready))
    ids_blk3 = not_ready
    if not ids_blk3: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

[2m[36m(raylet)[0m Spilled 2186 MiB, 13 objects, write throughput 836 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 4205 MiB, 25 objects, write throughput 780 MiB/s.
[2m[36m(raylet)[0m Spilled 8241 MiB, 49 objects, write throughput 659 MiB/s.
[2m[36m(raylet)[0m Spilled 16820 MiB, 100 objects, write throughput 623 MiB/s.
[2m[36m(raylet)[0m Spilled 32799 MiB, 195 objects, write throughput 610 MiB/s.


image: 1
Time Elapsed:	0.0081
image: 2
Time Elapsed:	20.9100
image: 3
Time Elapsed:	35.3598
image: 4
Time Elapsed:	54.0779
image: 5
Time Elapsed:	98.7557
image: 6
Time Elapsed:	116.8642
image: 7
Time Elapsed:	129.0020
image: 8
Time Elapsed:	153.8187
image: 9
Time Elapsed:	197.2197
image: 10
Time Elapsed:	321.8907
image: 11
Time Elapsed:	323.5673
image: 12
Time Elapsed:	329.8097
image: 13
Time Elapsed:	379.1117
image: 14
Time Elapsed:	395.9407
image: 15
Time Elapsed:	402.5625
image: 16
Time Elapsed:	412.4789
image: 17
Time Elapsed:	437.1791
image: 18
Time Elapsed:	483.1547
image: 19
Time Elapsed:	492.8017
image: 20
Time Elapsed:	590.9222
image: 21
Time Elapsed:	645.0212
image: 22
Time Elapsed:	717.7729
image: 23
Time Elapsed:	726.7687
image: 24
Time Elapsed:	731.1083
image: 25
Time Elapsed:	747.7204
image: 26
Time Elapsed:	756.3941
image: 27
Time Elapsed:	780.4761
image: 28
Time Elapsed:	895.8038
image: 29
Time Elapsed:	954.6699
image: 30
Time Elapsed:	994.6259
image: 31
Time Elapsed:	1

RayTaskError(IndexError): [36mray::inference()[39m (pid=973, ip=127.0.0.1)
  File "/var/folders/mv/p4jr8xs52gl7rk_wf4pn6mn80000gn/T/ipykernel_929/813364551.py", line 58, in inference
IndexError: list index out of range

In [18]:
print(len(results_blk3))

88


In [29]:
print(len(all_files_blk3))
print(all_files_blk3[332])
print(all_files_blk3[331:333])

333
/Volumes/easystore/GWAS 2022/Block3/PI92270/PI92270_C01_R12.czi
['/Volumes/easystore/GWAS 2022/Block3/PI656065/PI656065_C06_R15.czi', '/Volumes/easystore/GWAS 2022/Block3/PI92270/PI92270_C01_R12.czi']


In [22]:
ray.init(num_cpus=18, ignore_reinit_error=True)

2023-09-29 16:05:23,661	INFO worker.py:1612 -- Started a local Ray instance. View the dashboard at [1m[32mhttp://127.0.0.1:8265 [39m[22m


0,1
Python version:,3.8.18
Ray version:,2.6.3
Dashboard:,http://127.0.0.1:8265


In [31]:
ids_blk3 = [inference.remote(path, 'Block3/', diffs[1], predictor) 
        for path in all_files_blk3[88:333]]

#results_blk3 = list() #creates a new dataframe that's empty
t0 = time.time()
for i in range(1,400,1):
    ready, not_ready = ray.wait(ids_blk3, num_returns = 1)
    print('image:', i) 
    results_blk3.append(ray.get(ready))
    ids_blk3 = not_ready
    if not ids_blk3: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))
print('Total Time Elapsed:\t{:.4f}'.format(time.time() - t0))

[2m[36m(raylet)[0m Spilled 2691 MiB, 16 objects, write throughput 940 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
[2m[36m(raylet)[0m Spilled 4541 MiB, 27 objects, write throughput 976 MiB/s.
[2m[36m(raylet)[0m Spilled 8410 MiB, 50 objects, write throughput 1011 MiB/s.
[2m[36m(raylet)[0m Spilled 16483 MiB, 98 objects, write throughput 884 MiB/s.
[2m[36m(raylet)[0m Spilled 32799 MiB, 195 objects, write throughput 732 MiB/s.


image: 1
Time Elapsed:	133.3312
image: 2
Time Elapsed:	135.5714
image: 3
Time Elapsed:	144.0336
image: 4
Time Elapsed:	157.1833
image: 5


RayTaskError(IndexError): [36mray::inference()[39m (pid=3663, ip=127.0.0.1)
  File "/var/folders/mv/p4jr8xs52gl7rk_wf4pn6mn80000gn/T/ipykernel_929/813364551.py", line 58, in inference
IndexError: list index out of range

In [33]:
ray memory

SyntaxError: invalid syntax (1159013323.py, line 1)

In [278]:
allpath2block3img = [os.path.join(path,name) for path, dirs, files in os.walk(os.path.join(data_dir,'Block3/'))
for name in files
if name.endswith('.czi')]

KeyboardInterrupt: 

In [281]:
print(len(results_blk3[0]))

IndexError: list index out of range

In [None]:
df_blk3 = results_blk3[0][0]
for i in range(1,len(results_blk3),1):
    newdf = results_blk3[i][0]
    df_blk3 = pd.concat([df_blk3, newdf], axis=0)
#df.to_csv(os.path.join(output_dir,"block3_segmentation.txt"),index=False)

In [None]:
ids = [inference.remote(os.path.join(path,name), 'Block8/', diffs[2], cfg) 
        for path, dirs, files in os.walk(os.path.join(data_dir, 'Block8/'))
        for name in files
        if name.endswith('.czi')]

results = list() #creates a new dataframe that's empty
t0 = time.time()
for i in range(1,400,1):
    ready, not_ready = ray.wait(ids, num_returns = 1)
    print('working on image:', i) 
    results.append(ray.get(ready))
    ids = not_ready
    if not ids: 
        break
    print('Time Elapsed:\t{:.4f}'.format(time.time() - t0))

In [None]:
df = results[0][0]
for i in range(1,len(results),1):
    newdf = results[i][0]
    df = pd.concat([df, newdf], axis=0)
df.to_csv(os.path.join(output_dir,"block8_segmentation.txt"),index=False)

In [2]:
import math
a = math.nan
print(type(a))

<class 'float'>
