In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
                    
from dask.distributed import Client, SSHCluster
from laserfarm import Retiler, DataProcessing, GeotiffWriter, MacroPipeline
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote

# Macro-Pipeline AHN4 Workflow - Normalization


Choose whether to process all input tiles (`run = 'all'`) or only the tiles listed in the JSON file named by `filename` (`run = 'from_file'`).

In [52]:
# Root directory of the AHN4 data set.
path_root = pathlib.Path('/project/lidarac/Data/AHN4')

# Retiled point clouds (input) and normalized point clouds (output) are
# sibling directories under the root.
path_input = path_root.joinpath('retiled')
path_output = path_root.joinpath('normalized')

# Run mode: 'all' processes every tile found in the input directory,
# 'from_file' processes only the tiles listed in `filename`.
run = 'from_file'
# Name of the file with the input tile names; only used when run is 'from_file'.
filename = 'normalize_failed.json'
assert run in ('all', 'from_file')

In [None]:
# Collect the entries of the input directory whose name matches 'tile_*_*'.
tiles = [el for el in path_input.iterdir() if el.match('tile_*_*/')]
print('Found: {} tiles'.format(len(tiles)))
if run == 'from_file':
    # `filename` is read as JSON; each entry is joined onto the input directory.
    with open(filename, 'r') as f:
        tiles_read = [path_input / name for name in json.load(f)]
    # Check that every requested tile exists in the input directory. A set
    # keeps the lookup O(1) per tile instead of scanning the full list each time.
    tiles_available = set(tiles)
    tiles_missing = [tile for tile in tiles_read if tile not in tiles_available]
    # Name the offending tiles so the input file can be fixed without guessing.
    assert not tiles_missing, (
        f'Some of the tiles in {filename} are not in input dir: '
        f'{[tile.name for tile in tiles_missing]}'
    )
    tiles = tiles_read
print('Normalize: {} tiles'.format(len(tiles)))

## Setup Cluster

Connect to the (already running) Dask cluster used for the macro-pipeline calculation.

In [None]:
from dask.distributed import Client

# Address of the Dask scheduler this notebook attaches to.
# NOTE(review): this differs from the address passed to macro.setup_cluster()
# further down in the notebook - confirm which one is current.
scheduler_address = "tcp://10.0.2.38:35305"
client = Client(scheduler_address)
# Leave the client as the last expression so the notebook displays it.
client

## Normalization

Generate the normalized height for each point.

In [54]:
# Configuration shared by all normalization pipelines. The key order is kept
# exactly as listed because it is also the order written to the JSON file.
normalization_input = {
    'setup_local_fs': dict(input_folder=path_input.as_posix(),
                           output_folder=path_output.as_posix()),
    'load': dict(attributes='all'),
    # Filter out artificially high points - they give an overflow error when
    # the point cloud is written. The threshold removes non-physically high points.
    'apply_filter': dict(filter_type='select_below',
                         attribute='z',
                         threshold=10000.),
    # Alternative filter (disabled): select points using polygons read from a shapefile.
    # 'apply_filter': {'filter_type': 'select_polygon',
    #                  'polygon_string': '/project/lidarac/Data/Cliptest/shapefile/Clip_shape_exploded.shp',
    #                  'read_from_file': True},
    'normalize': 1,
    'clear_cache': dict(),
}

# Keep a JSON copy of the configuration next to the notebook.
with open('normalize.json', 'w') as f:
    f.write(json.dumps(normalization_input))
    

In [55]:
macro = MacroPipeline()

# Register one DataProcessing task per tile, labelled with the tile name.
# Each task gets its own deep copy of the shared configuration, extended with
# an export step that writes '<tile name>.laz' (overwriting existing output).
for tile in tiles:
    tile_config = {
        **copy.deepcopy(normalization_input),
        'export_point_cloud': {'filename': f'{tile.name}.laz',
                               'overwrite': True},
    }
    task = DataProcessing(tile.name, label=tile.name)
    task.config(tile_config)
    macro.add_task(task)

# Attach the macro-pipeline to the Dask scheduler at this address.
macro.setup_cluster(cluster="tcp://10.0.1.12:43843")

# Execute all registered tasks.
macro.run()

# Record the outcome of every task, then stop if any of them failed.
macro.print_outcome(to_file='normalize.out')

failed = macro.get_failed_pipelines()
if failed:
    # Save the labels of the failed pipelines so they can be re-run later
    # (this is the default `filename` of the 'from_file' run mode).
    with open('normalize_failed.json', 'w') as f:
        json.dump([pipeline.label for pipeline in failed], f)
    raise RuntimeError('Some of the pipelines have failed')

## Terminate cluster

In [None]:
# Last step of the main workflow: call shutdown() on the macro-pipeline object.
# NOTE(review): presumably this releases the Dask client/cluster attached via
# setup_cluster() - confirm against the Laserfarm documentation.
macro.shutdown()

## Troubleshooting 

### Cancel all jobs and restart the notebook

Copy and paste these lines into a separate Python shell. If the Dask dashboard shows that some tasks are still queued for processing, run the lines again - this should clear the scheduler and give control back to the current notebook. Then proceed as usual: terminate the cluster and restart the notebook.

In [None]:
from dask.distributed import Client, Future

# Connect to the scheduler whose tasks should be cancelled.
client = Client('tcp://145.100.59.123:8786')
# Wrap every key reported by who_has() in a Future so it can be passed to cancel().
futures = list(map(Future, client.who_has().keys()))
client.cancel(futures)