In [1]:
import os

import dask.dataframe as dd
import dask_geopandas
import dcachefs
import fsspec
import laspy
import numpy as np
import pandas as pd

from dask.delayed import delayed

ERROR 1: PROJ: proj_create_from_database: Open of /opt/conda/envs/jupyterdask/share/proj failed


In [2]:
from dask.distributed import Client
# Connect to the Dask scheduler expected to be listening on localhost:8786;
# the scheduler must already be running before this cell is executed.
client = Client("localhost:8786")

In [3]:
# Input LAZ file (AHN3, Netherlands), addressed through the dcache:// fsspec protocol.
laz_file = "dcache://pnfs/grid.sara.nl/data/projects.nl/eecolidar/01_Escience/ALS/Netherlands/ahn3_current/ahn3_current_TOP10NL_ud20200323/C_25GN1.LAZ"
# Target size in bytes of one partition; read_laz divides it by the per-point
# record size to get the number of points per partition.
chunk_size = 100 * 2**20  # 100 MiB

In [4]:
# Size of the compressed LAZ file in GiB (~2.5 GiB on disk).
# Original author's estimate: the data grows to ~20 GB once decompressed.
fs = fsspec.get_filesystem_class("dcache")()
fs.du(laz_file) / 2**30

2.508162605576217

In [5]:
@delayed
def read_chunk_from_laz_file(urlpath, offset, n_points):
    """Read a chunk of points from a LAZ file and return it as a DataFrame.

    The function is wrapped in ``dask.delayed``: calling it only records a
    task, the file is read when that task is executed.

    Parameters
    ----------
    urlpath : str
        URL of the LAZ file, opened through ``fsspec``.
    offset : int
        Position passed to ``las_file.seek`` before reading, i.e. where the
        chunk starts.
    n_points : int
        Number of points requested from ``las_file.read_points``.

    Returns
    -------
    pandas.DataFrame
        One column per field of the point record. ``X``, ``Y`` and ``Z`` are
        converted from the stored raw values to coordinates using the header
        scales and offsets (``offset + raw * scale``).
    """
    # Kept inside the function on purpose: presumably importing dcachefs
    # registers the "dcache" protocol with fsspec in the worker process that
    # runs this task -- confirm before moving or removing it.
    import dcachefs  # noqa: F401
    with fsspec.open(urlpath, mode="rb", block_size=2**20) as f:
        with laspy.open(f) as las_file:
            las_file.seek(offset)
            points = las_file.read_points(n_points)
            # Read the header values while the reader is still open instead
            # of touching `las_file` after its context manager has exited.
            offsets = las_file.header.offsets
            scales = las_file.header.scales
    df = pd.DataFrame.from_records(points.array)
    xyz = ["X", "Y", "Z"]
    df[xyz] = offsets + df[xyz] * scales
    return df

In [6]:
def read_laz(urlpath, chunk_size):
    """Read a LAZ file as a lazy Dask DataFrame.

    Only the LAZ header is read here (point layout and point count); the
    points themselves are loaded by one delayed
    ``read_chunk_from_laz_file`` task per partition.

    Parameters
    ----------
    urlpath : str
        URL of the LAZ file, opened through ``fsspec``.
    chunk_size : int
        Target size of one partition in bytes. The per-point size used for
        the division counts ``X``, ``Y`` and ``Z`` as float64, matching the
        columns of the resulting DataFrame.

    Returns
    -------
    dask.dataframe.DataFrame
        One partition per chunk of points, with ``X``, ``Y``, ``Z`` typed as
        float64 and every other column typed as in the point record.

    Raises
    ------
    ValueError
        If ``chunk_size`` is too small to hold a single point.
    """
    with fsspec.open(urlpath, mode="rb", block_size=2**20) as f:
        with laspy.open(f) as las_file:
            dtype = las_file.header.point_format.dtype()
            n_points = las_file.header.point_count

    coordinate_fields = ("X", "Y", "Z")
    # Iterate over dtype.names: it is the documented, ordered list of fields,
    # so the meta columns follow the layout of the point record.
    meta = {
        name: (
            np.dtype("float64")
            if name in coordinate_fields
            else dtype.fields[name][0]
        )
        for name in dtype.names
    }
    nbytes_per_point = sum(field.itemsize for field in meta.values())
    points_per_partition = chunk_size // nbytes_per_point
    if points_per_partition < 1:
        # Without this guard range() below fails with an unrelated
        # "arg 3 must not be zero" message (or silently yields nothing).
        raise ValueError(
            f"chunk_size={chunk_size} is smaller than the size of a single "
            f"point ({nbytes_per_point} bytes)"
        )
    dfs = [
        read_chunk_from_laz_file(
            urlpath,
            offset=offset,
            # Do not request more points than remain for the last partition.
            n_points=min(points_per_partition, n_points - offset),
        )
        for offset in range(0, n_points, points_per_partition)
    ]
    return dd.from_delayed(dfs, meta=meta)

In [7]:
# Build the lazy Dask DataFrame: read_laz reads only the LAZ header here,
# the point chunks are wrapped in delayed tasks and loaded on demand.
ddf = read_laz(laz_file, chunk_size)
ddf

Unnamed: 0_level_0,X,Y,Z,intensity,bit_fields,raw_classification,scan_angle_rank,user_data,point_source_id,gps_time
npartitions=202,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
,float64,float64,float64,uint16,uint8,uint8,int8,uint8,uint16,float64
,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...


In [8]:
# Derive a point geometry for every row from its X and Y columns.
geometry = dask_geopandas.points_from_xy(ddf, x="X", y="Y")
geometry

Dask GeoSeries Structure:
npartitions=202
    geometry
         ...
      ...   
         ...
         ...
dtype: geometry
Dask Name: points_from_xy, 606 tasks

In [9]:
# Attach the geometries as a column and turn the Dask DataFrame into a Dask
# GeoDataFrame. NOTE: this adds the "geometry" column to `ddf` itself.
ddf["geometry"] = geometry
dgdf = dask_geopandas.from_dask_dataframe(ddf, geometry="geometry")
dgdf

Unnamed: 0_level_0,X,Y,Z,intensity,bit_fields,raw_classification,scan_angle_rank,user_data,point_source_id,gps_time,geometry
npartitions=202,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
,float64,float64,float64,uint16,uint8,uint8,int8,uint8,uint16,float64,geometry
,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...


In [10]:
# Declare the coordinate reference system of the geometries as EPSG:28992
# (presumably Amersfoort / RD New, the Dutch national grid -- confirm).
dgdf = dgdf.set_crs("EPSG:28992")

In [11]:
# Draw a random 0.01 % sample of the points. The fixed seed keeps the sample
# identical across kernel restarts; change or drop it for a different draw.
sample = dgdf.sample(frac=0.0001, random_state=42)

In [12]:
# Trigger the actual computation and pull the sampled points back to the
# client process as an in-memory result.
res = sample.compute()