# GlobalPointData development sandbox: GEDI as a cloud parquet dataset

In this notebook, I use `pyarrow`, `s3fs`, and `polars` to interact with GEDI, a cloud-hosted point dataset (~1.2 TB).

`s3fs` sets up the S3 endpoint, and `anon=True` keeps the script credential-free.

`pyarrow` is used to access the Parquet dataset, which sits one level above a single Parquet file: it treats a whole folder structure of Parquet files as one dataset.

`polars` provides a lightning-fast `DataFrame` library. It is a higher-level package with Python and Rust APIs, allowing **lazy loading** and **parallel reading**.


The GEDI L2 dataset is laid out as `<dataset>.parquet/tile=<tile>/year=<year>/month=<month>`. For example, tile 072E_32S in 2019-09 is stored at: `gedi-ard/level2/gedi.l2v002_pnt_20190418_20230316_go_epsg.4326_v20231219.parquet/tile=072E_32S/year=2019/month=9`

## Set up packages, s3 file system and file path

In [24]:
import os
os.environ['USE_PYGEOS'] = '0'
from pyarrow.dataset import dataset
from s3fs import S3FileSystem
import polars as pl
import pyarrow.fs as fs

In [11]:
# Anonymous (credential-free) s3fs filesystem pointed at the Wasabi
# eu-central-1 endpoint. Despite the variable name, this is an S3 filesystem.
httpfs = S3FileSystem(anon=True, endpoint_url='https://s3.eu-central-1.wasabisys.com')

In [12]:
# Bucket-relative root of the GEDI L2 parquet dataset; the tile=/year=/month=
# partition folders live underneath this path.
object_path = (
    'gedi-ard/level2/gedi.l2v002_pnt_20190418_20230316_go_epsg.4326_v20231219.parquet'
)

## Access a single tile of the dataset across the whole time series

In [17]:
# Tile identifier; it is interpolated into the `tile=<id>` folder name of the dataset path.
tile = '002W_07N'

In [14]:
# Point the dataset at a single tile's partition folder, so every year/month
# stored beneath that tile is included.
subset_path = object_path + f'/tile={tile}'
pyarrow_dataset = dataset(subset_path, filesystem=httpfs, format='parquet')

In [15]:
# Build a lazy frame over the tile and multiply each group of columns by its
# scale factor. Nothing is read from storage until `.collect()` is called.
#
# NOTE(review): the factors presumably convert integer-packed storage back to
# physical units -- confirm against the dataset specification.
# NOTE(review): the displayed sample contains rows with rh100 < rh99
# (e.g. -3.0472 vs 2.7164), which is unexpected if the rh columns are monotone
# height percentiles. Possibly integer wraparound in the stored values --
# TODO confirm the stored dtype upstream.

# Relative-height percentiles present in the dataset.
RH_PERCENTILES = (100, 99, 98, 97, 95, 75, 50, 25)
# '' is the default column; '_a1'..'_a6' are the per-algorithm variants.
ALGORITHM_SUFFIXES = [''] + [f'_a{i}' for i in range(1, 7)]

# scale factor -> columns multiplied by that factor
SCALE_FACTORS = {
    0.0001: (
        [f'rh{pct}{sfx}' for sfx in ALGORITHM_SUFFIXES for pct in RH_PERCENTILES]
        + [f'sensitivity{sfx}' for sfx in ALGORITHM_SUFFIXES]
        + ['omega', 'pgap_theta', 'cover', 'rhog', 'rhov']
    ),
    0.01: [f'elev_lowestmode{sfx}' for sfx in ALGORITHM_SUFFIXES] + ['fhd_normal'],
    0.1: ['rg', 'rv'],
    0.001: ['pai'],
}

# `with_columns` overwrites the named columns in place, so column order in the
# result is unchanged.
df_default = pl.scan_pyarrow_dataset(pyarrow_dataset).with_columns(
    *(pl.col(*names) * factor for factor, names in SCALE_FACTORS.items())
)

In [16]:
%%time
df_default.collect().to_pandas()

CPU times: user 9.76 s, sys: 1.95 s, total: 11.7 s
Wall time: 10.8 s


Unnamed: 0,delta_time,beamname,shotnumber,latitude,longitude,elev_lowestmode,rh100,rh99,rh98,rh97,...,rg,rv,rhog,selected_rg_algorithm,rhov,selected_l2a_algorithm,fhd_normal,surface_flag,leaf_off_flag,l2b_quality_flag
0,55804047,0,46570000300284960,7.978240,-2.937224,314.55,-1.9272,-2.3773,-2.6772,-2.8673,...,184.7,2220.5,0.4,1,0.6,10,2.21,True,True,True
1,55804047,0,46570000300284960,7.977406,-2.936614,316.21,-1.3653,-1.9252,-2.3352,-2.6352,...,437.1,3834.5,0.4,1,0.6,10,2.30,True,True,True
2,55804047,0,46570000300284960,7.976572,-2.936004,320.83,2.5364,1.7464,1.1463,0.5863,...,3649.0,1718.7,0.4,1,0.6,2,1.92,True,True,True
3,55804047,0,46570000300284968,7.976155,-2.935698,322.47,0.8864,0.1764,-0.2736,-0.6136,...,3771.3,1085.0,0.4,1,0.6,2,1.95,True,True,True
4,55804047,0,46570000300284968,7.975318,-2.935085,321.65,-3.0472,2.7164,2.3063,1.9664,...,4327.1,1784.1,0.4,1,0.6,2,2.19,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294969,161712314,7,236621100300485760,7.182092,-2.003121,314.25,1.2328,0.2927,-0.1573,-0.4873,...,2140.4,4470.0,0.4,1,0.6,2,2.50,True,True,True
294970,161712314,7,236621100300485760,7.181255,-2.002517,314.90,-1.2773,-2.2473,-3.1173,2.7663,...,4866.8,2516.9,0.4,1,0.6,2,2.35,True,True,True
294971,161712314,7,236621100300485760,7.180835,-2.002214,321.99,0.2163,-0.6837,-1.1637,-1.5036,...,6075.7,869.3,0.4,1,0.6,2,1.76,True,True,True
294972,161712314,7,236621100300485760,7.180416,-2.001912,325.37,0.4064,-0.3836,-0.7936,-1.1237,...,5516.5,1387.2,0.4,1,0.6,2,1.86,True,True,True


## Access a single tile of the dataset for a specific year and month

In [20]:
# One tile and one calendar month; these values fill the
# tile=/year=/month= partition folders of the dataset path.
tile, year, month = '002W_07N', 2020, 9

In [21]:
# Narrow the dataset to the partition folder of one tile, one year and one month.
subset_path = object_path + f'/tile={tile}/year={year}/month={month}'
pyarrow_dataset = dataset(subset_path, filesystem=httpfs, format='parquet')

In [22]:
# Build a lazy frame over the selected tile/month and multiply each group of
# columns by its scale factor. Nothing is read from storage until `.collect()`.
#
# NOTE(review): the factors presumably convert integer-packed storage back to
# physical units -- confirm against the dataset specification.

# Relative-height percentiles present in the dataset.
RH_PERCENTILES = (100, 99, 98, 97, 95, 75, 50, 25)
# '' is the default column; '_a1'..'_a6' are the per-algorithm variants.
ALGORITHM_SUFFIXES = [''] + [f'_a{i}' for i in range(1, 7)]

# scale factor -> columns multiplied by that factor
SCALE_FACTORS = {
    0.0001: (
        [f'rh{pct}{sfx}' for sfx in ALGORITHM_SUFFIXES for pct in RH_PERCENTILES]
        + [f'sensitivity{sfx}' for sfx in ALGORITHM_SUFFIXES]
        + ['omega', 'pgap_theta', 'cover', 'rhog', 'rhov']
    ),
    0.01: [f'elev_lowestmode{sfx}' for sfx in ALGORITHM_SUFFIXES] + ['fhd_normal'],
    0.1: ['rg', 'rv'],
    0.001: ['pai'],
}

# `with_columns` overwrites the named columns in place, so column order in the
# result is unchanged.
df_default = pl.scan_pyarrow_dataset(pyarrow_dataset).with_columns(
    *(pl.col(*names) * factor for factor, names in SCALE_FACTORS.items())
)

In [23]:
%%time
df_default.collect().to_pandas()

CPU times: user 474 ms, sys: 99.2 ms, total: 573 ms
Wall time: 515 ms


Unnamed: 0,delta_time,beamname,shotnumber,latitude,longitude,elev_lowestmode,rh100,rh99,rh98,rh97,...,rg,rv,rhog,selected_rg_algorithm,rhov,selected_l2a_algorithm,fhd_normal,surface_flag,leaf_off_flag,l2b_quality_flag
0,84494046,0,98060000200079984,7.916067,-2.998779,311.66,-3.1872,2.6863,2.2064,1.8664,...,2612.8,2355.7,0.4,1,0.6,2,2.23,True,True,True
1,84494046,0,98060000200080000,7.921085,-2.995140,306.31,1.9763,1.4564,1.1564,0.8963,...,813.5,2492.0,0.4,1,0.6,10,1.96,True,True,True
2,84494046,0,98060000200080000,7.921504,-2.994836,302.59,-2.2572,-2.5872,-2.8173,-3.0373,...,583.1,1608.5,0.4,1,0.6,10,2.29,True,True,True
3,84494046,0,98060000200080080,7.956258,-2.969684,357.10,0.5964,0.0663,-0.2637,-0.5337,...,2525.8,554.1,0.4,1,0.6,2,2.03,True,True,True
4,84494046,0,98060000200080080,7.956678,-2.969381,351.71,-0.3472,-1.0572,-1.5373,-1.9873,...,2104.4,2261.4,0.4,1,0.6,2,2.41,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5967,85514057,7,99891100200107712,7.998098,-2.953189,344.86,0.0328,-0.9372,-1.3572,-1.6173,...,2530.2,11612.8,0.4,1,0.6,1,2.24,True,True,True
5968,85514057,7,99891100200107712,7.998516,-2.952886,345.28,-0.9073,-2.3273,-2.7773,-3.0773,...,2498.1,12476.5,0.4,1,0.6,2,2.19,True,True,True
5969,85514057,7,99891100200107712,7.998935,-2.952583,346.45,2.7692,1.5992,0.7792,-0.5309,...,2203.2,7248.8,0.4,1,0.6,2,2.41,True,True,True
5970,85514057,7,99891100200107712,7.999353,-2.952280,347.39,0.6327,-0.6472,-1.2373,-1.6173,...,2160.1,9029.9,0.4,1,0.6,1,2.20,True,True,True
