In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
                    
from dask.distributed import LocalCluster, SSHCluster 
from laserfarm import Retiler, DataProcessing, GeotiffWriter, MacroPipeline, Classification
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote

def last_modified(opts, remote_path):
    """
    Return the modification time reported for a remote path.

    :param opts: options forwarded to `get_wdclient` to build the client
    :param remote_path: path-like object exposing `as_posix()`
    :return: naive `datetime.datetime` parsed from the 'modified' entry of
        the remote info record (expected as e.g. 'Mon, 01 Jan 2021 10:00:00 GMT')
    """
    wdclient = get_wdclient(opts)
    remote_info = get_info_remote(wdclient, remote_path.as_posix())
    timestamp = remote_info['modified']
    return datetime.datetime.strptime(timestamp, '%a, %d %b %Y %H:%M:%S GMT')

# Macro-Pipeline Workflow - Classify Points Using Cadastre Data

## Set Run-Specific Input

Choose whether you want to i) run all input files, ii) run only the input files listed in `filename`, or iii) run only the input files that were updated since the last workflow run.

In [21]:
# root directory of the project data
path_root = pathlib.Path('/project/lidarac/Data')

# dCache path holding the set of targets to classify
path_input = path_root.joinpath('Targets_all/point_density')

# dCache path where the classified targets are copied
path_output = path_root.joinpath('AHN4_mask/TOP10NL_2021_shapefiles/classified')

# run mode, one of: 'all', 'updated', 'from_file'
run = 'all'
#filename = 'classification_failed.json'  # if run is 'from_file', set name of file with input file names
assert run in ('all', 'updated', 'from_file')

## Check Connection to Remote Storage

In [22]:

# List the candidate tiles directly from the input directory. Every element
# of `tiles` is a `pathlib.Path` to a file matching 'tile_*_*.ply'.
tiles = [el for el in path_input.iterdir() if el.match('tile_*_*.ply')]
print('Found: {} tiles'.format(len(tiles)))

if run == 'updated':
    # Keep only the tiles modified since the last run. The tiles are listed
    # from the file system, so their modification time is read with `stat()`
    # and converted to a naive UTC datetime (the same convention used by
    # `last_modified`, which parses GMT timestamps into naive datetimes).
    # NOTE(review): `last_run` is not defined in the visible cells - it must
    # be set (presumably as a naive UTC datetime) before using this mode.
    tiles = [
        t for t in tiles
        if datetime.datetime.fromtimestamp(
            t.stat().st_mtime, tz=datetime.timezone.utc
        ).replace(tzinfo=None) > last_run
    ]
elif run == 'from_file':
    with open(filename, 'r') as f:
        tiles_read = json.load(f)
    # The JSON file stores strings while `tiles` holds Path objects, so a
    # direct membership test never matches. Compare on the file name instead,
    # which accepts both bare file names and full paths in the JSON file.
    tiles_by_name = {t.name: t for t in tiles}
    names_read = [pathlib.Path(t).name for t in tiles_read]
    missing = [name for name in names_read if name not in tiles_by_name]
    # check whether all files are available in the input directory
    assert not missing, f'Some of the files in {filename} are not in remote dir'
    # keep `tiles` as Path objects, consistently with the other run modes
    tiles = [tiles_by_name[name] for name in names_read]
print('Retrieve and classify: {} tiles'.format(len(tiles)))

Found: 25427 tiles
Retrieve and classify: 25427 tiles


## Setup Cluster

Set up the Dask cluster used for all the macro-pipeline calculations.

In [15]:
from dask.distributed import Client

# Connect to the Dask scheduler listening at the given address; the bare
# `client` expression at the end of the cell displays its summary.
client = Client(address="tcp://10.0.1.207:41553")
client

0,1
Connection method: Direct,
Dashboard: /proxy/8787/status,

0,1
Comm: tcp://10.0.1.207:41553,Workers: 2
Dashboard: /proxy/8787/status,Total threads: 12
Started: 28 minutes ago,Total memory: 96.00 GiB

0,1
Comm: tcp://10.0.0.88:42737,Total threads: 6
Dashboard: /proxy/8787/status,Memory: 48.00 GiB
Nanny: tcp://10.0.0.88:41593,
Local directory: /tmp/dask-scratch-space/worker-k1xon92g,Local directory: /tmp/dask-scratch-space/worker-k1xon92g
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 162.21 MiB,Spilled bytes: 0 B
Read bytes: 1.03 GiB,Write bytes: 909.60 kiB

0,1
Comm: tcp://10.0.2.120:33031,Total threads: 6
Dashboard: /proxy/8787/status,Memory: 48.00 GiB
Nanny: tcp://10.0.2.120:35801,
Local directory: /tmp/dask-scratch-space/worker-tfr986gf,Local directory: /tmp/dask-scratch-space/worker-tfr986gf
Tasks executing:,Tasks in memory:
Tasks ready:,Tasks in flight:
CPU usage: 2.0%,Last seen: Just now
Memory usage: 162.84 MiB,Spilled bytes: 0 B
Read bytes: 786.77 MiB,Write bytes: 786.51 kiB


## Classify Target Points

Classify the target points according to the ground type, using Cadastre data.

In [23]:
# directory holding the shapefiles extracted from the cadastre data
shp_dir = path_root.joinpath('AHN4_mask/TOP10NL_2021_shapefiles')

# Input dictionary configuring the classification pipeline: each key names a
# pipeline step and maps to the keyword arguments passed to that step.
# NOTE: for the classification we have mounted the dCache storage with rclone to access shp files
classification_input = dict(
    setup_local_fs=dict(
        input_folder=path_input.as_posix(),
        output_folder=path_output.as_posix(),
    ),
    locate_shp=dict(shp_dir=shp_dir.as_posix()),
    classification=dict(ground_type=6),
    export_point_cloud=dict(),
)


# write input dictionary to JSON file
#with open('classification_input.json', 'w') as f:
#    json.dump(classification_input, f)

In [None]:
macro = MacroPipeline()

# Add one classification pipeline per tile to the macro-pipeline object.
macro.tasks = [Classification(t).config(classification_input) for t in tiles]

# Label every pipeline with the tile name without extension. `Path.stem` is
# used instead of `os.path.splitext` because the tiles are Path objects:
# splitting the extension off the full path would turn the labels (and the
# entries written to 'classification_failed.json') into absolute paths.
macro.set_labels([pathlib.Path(tile).stem for tile in tiles])

#macro.setup_cluster(cluster=cluster)
macro.setup_cluster(cluster="tcp://10.0.1.207:41553")

# run!
macro.run()

# save outcome results and write name of failed pipelines to file
macro.print_outcome(to_file='classification.out')
failed = macro.get_failed_pipelines()
if failed:
    # store the failed tiles as file names so that they can be re-run by
    # setting run = 'from_file' and filename = 'classification_failed.json'
    with open('classification_failed.json', 'w') as f:
        json.dump(['.'.join([pip.label, 'ply']) for pip in failed], f)
    raise RuntimeError('Some of the pipelines have failed')

## Terminate cluster

In [None]:
# macro.shutdown()

In [9]:
# Close the Dask client held by the macro-pipeline object.
# NOTE(review): presumably this only disconnects from the scheduler and leaves
# the cluster running, unlike the commented-out `macro.shutdown()` - confirm.
macro.client.close()