# Splitting large LAZ files

In [1]:
import glob
import os

import dask.bag as db
import laspy
import numpy as np
from dask.distributed import Client, LocalCluster

In [2]:
# Directory holding the AHN4 point-cloud tiles that have to be split.
local_path_AHN4 = "/project/lidarac/Data/AHN4/las"

# os.listdir() yields bare file names; join them with the directory so the
# files can be opened regardless of the notebook's current working directory.
# Sorting makes the processing order reproducible between runs.
# NOTE(review): the chunks are written next to their input with the same
# extension, so re-running this cell after a split will also list the chunks.
laz_files = sorted(
    os.path.join(local_path_AHN4, f)
    for f in os.listdir(local_path_AHN4)
    if f.endswith('.LAZ')
)


In [3]:
# Upper bound for the size of each output file, in bytes (250 MiB).
max_filesize = 250 * 1024**2

In [10]:
# Multiplier applied to `max_filesize` in `split_strategy` when the target
# file size is converted into a number of points per chunk.
# NOTE(review): presumably the expected ratio between uncompressed point data
# and the resulting LAZ file size — confirm against real tiles.
LAZ_COMPRESSION_FACTOR = 7

def save_chunk_to_laz_file(in_filename, 
                           out_filename, 
                           offset, 
                           n_points):
    """Copy a contiguous run of points from one LAS/LAZ file into a new file.

    The output file is created with the header of the input file. The reader
    is positioned at ``offset`` and asked for ``n_points`` points, which are
    then written to the output file in a single call.

    Parameters
    ----------
    in_filename : path of the file to read from.
    out_filename : path of the file to create.
    offset : position passed to the reader's ``seek`` before reading.
    n_points : number of points requested from the reader.

    Returns
    -------
    int
        Length of the point record returned by the reader.
    """
    # Both files are open at the same time: the writer needs the reader's
    # header, and the points go straight from one to the other.
    with laspy.open(in_filename) as reader, \
            laspy.open(out_filename, mode="w", header=reader.header) as writer:
        reader.seek(offset)
        chunk = reader.read_points(n_points)
        writer.write_points(chunk)
    return len(chunk)

def split_strategy(filename, max_filesize, compression_factor=None):
    """Set up splitting strategy for a LAS/LAZ file.

    Only the header of ``filename`` is read. The per-point record size and the
    total point count are used to work out how many points fit in one output
    file of roughly ``max_filesize`` bytes, and the file is then divided into
    consecutive chunks of that many points.

    Parameters
    ----------
    filename : path of the LAS/LAZ file to split.
    max_filesize : desired maximum size of each output file, in bytes.
    compression_factor : multiplier applied to ``max_filesize`` before it is
        divided by the per-point record size. Defaults to the module-level
        ``LAZ_COMPRESSION_FACTOR`` when ``None``.

    Returns
    -------
    list of tuple
        One ``(in_filename, out_filename, offset, n_points)`` tuple per chunk,
        in the argument order of ``save_chunk_to_laz_file``. Output names are
        ``<stem>-<n><ext>``, next to the input file. The list is empty when
        the file contains no points.

    Raises
    ------
    ValueError
        If ``max_filesize`` is too small to hold a single point.
    """
    if compression_factor is None:
        compression_factor = LAZ_COMPRESSION_FACTOR

    with laspy.open(filename) as f:
        point_format = f.header.point_format
        bytes_per_point = (
            point_format.num_standard_bytes + point_format.num_extra_bytes
        )
        n_points = f.header.point_count

    n_points_target = int(
        max_filesize * compression_factor / bytes_per_point
    )
    # A step of zero makes range() fail with an unrelated-looking message and
    # a negative step silently yields no chunks at all, so reject both here.
    if n_points_target < 1:
        raise ValueError(
            f"max_filesize={max_filesize} is too small to hold a single "
            f"point of {bytes_per_point} bytes"
        )

    stem, ext = os.path.splitext(filename)
    return [
        # The last chunk only gets the points that are actually left.
        (filename, f"{stem}-{n}{ext}", offset,
         min(n_points_target, n_points - offset))
        for n, offset in enumerate(range(0, n_points, n_points_target))
    ]

In [11]:
# Set up the calculation as a lazy Dask graph; it only runs on `compute()`.
files = db.from_sequence(laz_files)
input_args = (
    files
    .map(split_strategy, max_filesize=max_filesize)  # one chunk list per file
    .flatten()                                       # one 4-tuple per chunk
    .unzip(4)                                        # one bag per argument
)
# Call the writer once per chunk, taking one argument from each bag.
res = db.map(save_chunk_to_laz_file, *input_args)

In [12]:
# Start a local Dask cluster and connect a client to it, so that the
# computation below runs on its workers and `cluster` exists when the notebook
# is executed top to bottom on a fresh kernel.
cluster = LocalCluster()
client = Client(cluster)

# Run the graph: one entry per written chunk, holding its number of points.
tot_points = res.compute()

In [None]:
# total number of points written across all chunks
sum(tot_points)

In [90]:
# shut down the Dask cluster once the results have been collected
cluster.close()