In [1]:
import copy
import fnmatch
import json
import getpass
import os
import pathlib
import datetime
                    
from dask.distributed import Client, SSHCluster
from laserchicken import register_new_feature_extractor
from laserchicken.feature_extractor.band_ratio_feature_extractor import BandRatioFeatureExtractor

from laserfarm import Retiler, DataProcessing, GeotiffWriter, MacroPipeline
from laserfarm.remote_utils import get_wdclient, get_info_remote, list_remote

# Macro-Pipeline AHN4 Workflow - Powerline Extraction (mask)

## Set Run-Specific Input

Choose whether to process all the input files (`run = 'all'`) or only the input files listed in `filename` (`run = 'from_file'`).

In [46]:
# root directory of the AHN4 data
path_root = pathlib.Path('/project/lidarac/Data/AHN4')
# the normalized input tiles are stored in a sub-directory of the root
path_input = path_root.joinpath('normalized')
# the targets are written to a sibling directory of the input one
path_output = path_input.parent.joinpath('targets_powerline')
# run mode: either 'all' or 'from_file'
run = 'from_file'
# only relevant if run is 'from_file': name of the file listing the input file names
filename = 'feature_extraction_non-ground_failed.json'
# fail early on a mistyped run mode
assert run in ('all', 'from_file')

In [None]:
# collect all the tiles present in the input directory (files named 'tile_*_*.laz')
tiles = [el for el in path_input.iterdir() if el.match('tile_*_*.laz')]
print('Found: {} tiles'.format(len(tiles)))
if run == 'from_file':
    # `filename` is a JSON file with a list of tile file names, which are
    # resolved here relative to the input directory
    with open(filename, 'r') as f:
        tiles_read = json.load(f)
    tiles_read = [path_input / tile_name for tile_name in tiles_read]
    # check whether all the requested files are available in the input directory;
    # the set gives constant-time lookups instead of scanning the full list of
    # tiles once per requested tile
    tiles_available = set(tiles)
    tiles_missing = sorted(tile for tile in tiles_read if tile not in tiles_available)
    # report which tiles are missing (capped, to keep the message readable)
    assert not tiles_missing, (
        f'Some of the tiles in {filename} are not in input dir'
        f' ({len(tiles_missing)} missing, e.g. '
        + ', '.join(tile.name for tile in tiles_missing[:10])
        + ')'
    )
    tiles = tiles_read
print('Retrieve and extract features for: {} tiles'.format(len(tiles)))

## Setup Cluster

Set up the Dask cluster used for all the macro-pipeline calculations.

In [None]:
# connect a client to the Dask scheduler listening at the given address;
# the address is deployment-specific and must match the running scheduler
from dask.distributed import Client

client = Client(address="tcp://10.0.1.12:43843")
# leave the client as last expression so that the notebook displays it
client

## Feature Extraction

We extract features only for the points classified as wire conductor (class 14), i.e. the points that belong to powerlines.

In [66]:
# retiling schema: extent of the grid and number of tiles per side
grid = dict(
    min_x=-113107.81,
    max_x=398892.19,
    min_y=214783.87,
    max_y=726783.87,
    n_tiles_side=512,
)

# mesh size of the targets (reused below as size of the 'cell' volume)
tile_mesh_size = 10

# feature to extract (given as a single feature name)
features = 'max_norm_z'

# input dictionary that configures the feature extraction pipeline
feature_extraction_input_non_ground = dict(
    setup_local_fs=dict(input_folder=path_input.as_posix(),
                        output_folder=path_output.as_posix()),
    load=dict(attributes=['raw_classification', 'normalized_height']),
    # keep only the points whose raw classification equals the given value;
    # classes: never classified (0), unclassified (1), ground (2), buildings (6),
    # water (9), wire conductor (14), artificial objects (26)
    apply_filter=dict(
        filter_type='select_equal',
        attribute='raw_classification',
        value=14,  # points that belong to powerlines
    ),
    generate_targets=dict(
        tile_mesh_size=tile_mesh_size,
        validate=True,
        # solves numerical issues for 6 tiles which have points on the edge
        validate_precision=0.01,
        **grid,
    ),
    extract_features=dict(
        feature_names=features,
        volume_type='cell',
        volume_size=tile_mesh_size,
    ),
    export_targets=dict(
        attributes=features,
        multi_band_files=False,
        overwrite=True,
    ),
    clear_cache=dict(),
)

# keep a copy of the input dictionary on disk, as a JSON file
with open('feature_extraction_non-ground.json', 'w') as config_file:
    config_file.write(json.dumps(feature_extraction_input_non_ground))

In [None]:
macro = MacroPipeline()

# the tile indices are the integers encoded in the tile names after the
# first underscore-separated token (e.g. 'tile_<i>_<j>.laz' -> [i, j])
tile_indices = []
for tile in tiles:
    name_without_suffix = tile.name.split('.')[0]
    tile_indices.append([int(token) for token in name_without_suffix.split('_')[1:]])

# one data-processing pipeline per tile, all configured with the same input
pipelines = []
for tile, tile_index in zip(tiles, tile_indices):
    pipeline = DataProcessing(tile.name, tile_index=tile_index)
    pipelines.append(pipeline.config(feature_extraction_input_non_ground))
macro.tasks = pipelines

# label each pipeline with the name of its tile, without file extension
labels = [os.path.splitext(tile.name)[0] for tile in tiles]
macro.set_labels(labels)

# use the Dask scheduler available at the given address
macro.setup_cluster(cluster="tcp://10.0.1.12:43843")

# run!
macro.run()

# save the outcome of all the pipelines to file
macro.print_outcome(to_file='feature_extraction_non-ground.out')

# write the file names of the tiles whose pipeline has failed, then stop;
# the names are rebuilt from the pipeline labels by re-adding the 'laz' extension
failed = macro.get_failed_pipelines()
if failed:
    with open('feature_extraction_non-ground_failed.json', 'w') as failed_file:
        failed_names = [failed_pipeline.label + '.laz' for failed_pipeline in failed]
        json.dump(failed_names, failed_file)
    raise RuntimeError('Some of the pipelines have failed')

## Terminate cluster

In [None]:
# shut the macro-pipeline down; per this section's heading, this is the step
# that terminates the cluster
# NOTE(review): if the previous cell raised because of failed pipelines, this
# cell still has to be run by hand - confirm that this is the intended workflow
macro.shutdown()

## Troubleshooting 

### Cancel all jobs and restart the notebook

Copy and paste these lines into a separate Python shell. If the Dask dashboard shows that some tasks are still queued to be processed, run the lines again - this should clear up the scheduler and give control back to the current notebook. Then proceed as usual: terminate the cluster and restart the notebook.

In [None]:
from dask.distributed import Client, Future

# NOTE(review): this scheduler address differs from the one used in the rest
# of the notebook - confirm that it points to the same cluster
client = Client('tcp://145.100.59.123:8786')

# build a future for every key reported by the scheduler, then cancel them all
futures = []
for key in client.who_has():
    futures.append(Future(key))
client.cancel(futures)