In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
from dask.distributed import Client, SSHCluster
from laserfarm import Retiler, GeotiffWriter, MacroPipeline
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote                    

# Macro-Pipeline AHN4 Workflow - GeoTIFF Export (Powerline Mask)

## Set Run-Specific Input

Choose whether to process all input files or only the input files listed in `filename`.

In [98]:
# root directory used for this run
path_root = pathlib.Path('/project/lidarac/Data/AHN4')

# input directory: its entries are the features to be exported
path_input = path_root.joinpath('targets_powerline')

# output directory handed to the GeoTIFF export (sibling of the input directory)
path_output = path_input.with_name('geotiff_powerline')

# run mode: 'all' processes every feature found in the input directory,
# 'from_file' restricts the run to the names listed in `filename`
run = 'all'
# if run is 'from_file', uncomment the next line and set the name of the file with input file names
#filename = 'powerline_geotiff_export_non-ground_failed.json'
assert run in ('all', 'from_file')

In [None]:
# Collect the entries of the input directory, skipping anything named like 'tile_*_*.log'.
features = [el for el in path_input.iterdir() if not el.match('tile_*_*.log')]
print('Found: {} features'.format(len(features)))
if run == 'from_file':
    # `filename` has to be defined by the user whenever run is 'from_file'
    with open(filename, 'r') as f_in:
        features_read = [path_input / name for name in json.load(f_in)]
    # check whether all files are available on dCache; a set keeps each look-up
    # O(1) instead of scanning the full list for every requested feature
    available = set(features)
    missing = [feat.name for feat in features_read if feat not in available]
    # report the offending names so the input file can be fixed directly
    assert not missing, (
        f'Some of the features in {filename} are not in input dir: {missing}'
    )
    features = features_read
print('Extract geotiffs for: {} features'.format(len(features)))

## Setup Cluster

Setup Dask cluster used for all the macro-pipeline calculations.

In [None]:
from dask.distributed import Client

# Connect to the Dask scheduler at the given address; the bare `client`
# expression on the last line makes the notebook display the client summary.
client = Client(address="tcp://10.0.1.12:33245")
client

## GeoTIFF Export

Export the rasterized features from the target grid to GeoTIFF files.

In [100]:
# value passed to the 'create_subregion_geotiffs' step as `output_handle`
# (presumably the prefix of the exported file names - TODO confirm)
# NOTE(review): the previous note described the handle as "AHN4 dataset, features,
# target grid spacing 10m, normalization grid spacing 1m, all points", which does
# not match the value below - confirm which one is current
output_handle = 'AHN4_powerline_mask_'

# input dictionary used to configure the geotiff export pipeline
geotiff_export_input_nonground = dict(
    parse_point_cloud={},
    data_split=dict(xSub=1, ySub=1),
    create_subregion_geotiffs=dict(output_handle=output_handle),
)

# keep a JSON copy of the input dictionary in the current working directory
with open('geotiff_export_input_non-ground.json', 'w') as json_file:
    json_file.write(json.dumps(geotiff_export_input_nonground))

In [101]:
# one GeoTIFF export task per feature, all collected in a single macro-pipeline
macro = MacroPipeline()

for feature in features:
    # every task works on its own deep copy of the shared input dictionary,
    # extended with the folders specific to this feature
    task_input = {
        **copy.deepcopy(geotiff_export_input_nonground),
        'setup_local_fs': {
            'input_folder': feature.as_posix(),
            'output_folder': path_output.as_posix(),
        },
    }
    # the feature name is used both as the band to export and as the task label
    writer = GeotiffWriter(bands=feature.name, label=feature.name)
    writer.config(task_input)
    macro.add_task(writer)

# hand the scheduler address to the macro-pipeline
macro.setup_cluster(cluster="tcp://10.0.1.12:33245")

# run!
macro.run()

# save outcome results and write name of failed pipelines to file
macro.print_outcome(to_file='geotiff_export_non-ground.out')
failed = macro.get_failed_pipelines()
if failed:
    # the labels are the feature names, so the file can be fed back via `filename`
    failed_labels = [pip.label for pip in failed]
    with open('geotiff_export_non-ground_failed.json', 'w') as f:
        json.dump(failed_labels, f)
    raise RuntimeError('Some of the pipelines have failed')

## Terminate cluster

In [27]:
# Shut the macro-pipeline down; per the section title this terminates the cluster.
# Note: this cell is not reached by "Run All" when the export cell above raises.
macro.shutdown()

## Troubleshooting 

### Cancel all jobs and restart the notebook

Copy and paste these lines into a separate Python shell. If the Dask dashboard shows that some tasks are still queued, run the lines again - this should clear up the scheduler and return control to the current notebook. Then proceed as usual: terminate the cluster and restart the notebook.

In [None]:
from dask.distributed import Client, Future

# NOTE(review): this address differs from the scheduler address used in the
# cells above - confirm it points at the scheduler that should be cleared
client = Client(address='tcp://145.100.59.123:8786')
# wrap every key reported by who_has() in a Future so that it can be cancelled
to_cancel = [Future(task_key) for task_key in client.who_has().keys()]
client.cancel(to_cancel)