# Splitting large LAZ files

In [1]:
import laspy
import numpy as np
import os

import dask.bag as db
from dask.distributed import LocalCluster

In [2]:
local_path_AHN4 = "/project/lidarac/Data/Regions/Oostvaarderplassen/AHN4"

laz_files = [f for f in os.listdir(local_path_AHN4) if f.endswith('.LAZ')]


In [3]:
laz_files

['C_26BN2.LAZ',
 'C_26AZ2.LAZ',
 'C_26BN1.LAZ',
 'C_26AN2.LAZ',
 'C_26BZ1.LAZ',
 'C_26BZ2.LAZ',
 'C_20DZ2.LAZ',
 'C_20DZ1.LAZ']

In [14]:
os.getcwd()

'/project/lidarac/Software/Yifang/JupyterDaskOnSLURM/AHN4_processing'

In [15]:
os.chdir('/project/lidarac/Data/Regions/Oostvaarderplassen/AHN4')

In [4]:
max_filesize = 250 * 2**20  # desired max file size (in bytes)

In [5]:
max_filesize

262144000

In [6]:
len(laz_files)

8

In [16]:
LAZ_COMPRESSION_FACTOR = 7

def save_chunk_to_laz_file(in_filename, 
                           out_filename, 
                           offset, 
                           n_points):
    """Read points from a LAS/LAZ file and write them to a new file."""
    
    points = np.array([])
    
    with laspy.open(in_filename) as in_file:
        with laspy.open(out_filename, 
                        mode="w", 
                        header=in_file.header) as out_file:
            in_file.seek(offset)
            points = in_file.read_points(n_points)
            out_file.write_points(points)
    return len(points)

def split_strategy(filename, max_filesize):
    """Set up splitting strategy for a LAS/LAZ file."""
    with laspy.open(filename) as f:
        bytes_per_point = (
            f.header.point_format.num_standard_bytes +
            f.header.point_format.num_extra_bytes
        )
        n_points = f.header.point_count
    n_points_target = int(
        max_filesize * LAZ_COMPRESSION_FACTOR / bytes_per_point
    )
    stem, ext = os.path.splitext(filename)
    return [
        (filename, f"{stem}-{n}{ext}", offset, n_points_target)
        for n, offset in enumerate(range(0, n_points, n_points_target))
    ]

In [17]:
# set up calculation
files = db.from_sequence(laz_files) 
input_args = files.map(split_strategy, max_filesize=max_filesize) \
    .flatten() \
    .unzip(4)  # unpack input arguments
res = db.map(save_chunk_to_laz_file, *input_args)

In [19]:
pwd

'/project/lidarac/Data/Regions/Oostvaarderplassen/AHN4'

In [7]:

cluster = LocalCluster(processes=True, 
                       n_workers=2, 
                       threads_per_worker=1, 
                       local_directory='C:/temp')

cluster

VBox(children=(HTML(value='<h2>LocalCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n    …

In [9]:
from dask.distributed import Client

client = Client("tcp://10.0.1.203:34053")
client

0,1
Connection method: Direct,
Dashboard: /proxy/8787/status,

0,1
Comm: tcp://10.0.1.203:34053,Workers: 2
Dashboard: /proxy/8787/status,Total threads: 4
Started: 12 hours ago,Total memory: 32.00 GiB

0,1
Comm: tcp://10.0.0.226:38569,Total threads: 2
Dashboard: /proxy/8787/status,Memory: 16.00 GiB
Nanny: tcp://10.0.0.226:44325,
Local directory: /tmp/dask-worker-space/worker-uqc25hyu,Local directory: /tmp/dask-worker-space/worker-uqc25hyu
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 4.13 GiB,Spilled bytes: 0 B
Read bytes: 286.1677036799539 B,Write bytes: 1.06 kiB

0,1
Comm: tcp://10.0.0.146:46405,Total threads: 2
Dashboard: /proxy/8787/status,Memory: 16.00 GiB
Nanny: tcp://10.0.0.146:34355,
Local directory: /tmp/dask-worker-space/worker-po392j05,Local directory: /tmp/dask-worker-space/worker-po392j05
Tasks executing: 0,Tasks in memory: 0
Tasks ready: 0,Tasks in flight: 0
CPU usage: 2.0%,Last seen: Just now
Memory usage: 4.01 GiB,Spilled bytes: 0 B
Read bytes: 2.32 MiB,Write bytes: 17.39 kiB


In [18]:
# start Dask cluster before this cell!
tot_points = res.compute()

FileNotFoundError: [Errno 2] No such file or directory: 'C_26AZ2.LAZ'

In [15]:
# splitted points
sum(tot_points)

NameError: name 'tot_points' is not defined

In [11]:
pwd

'/project/lidarac/Software/Yifang/JupyterDaskOnSLURM/AHN4_processing'

In [None]:
472795263

In [90]:
cluster.close()