# Explore PyArrow, GeoArrow, GeoParquet for Optimal Performance

Work on Issue [#1 Leverage GeoArrow to speedup vector data processing & viz](https://github.com/WikiWatershed/global-hydrography/issues/1)


# Imports & Setup

In [1]:
from pathlib import Path

import fsspec
import numpy as np
import pandas as pd
import geopandas as gpd
import pyogrio
import pyarrow as pa

## Importing geoarrow slows down some functions, so we do it later for testing
# import geoarrow.pyarrow as ga
# import geoarrow.pandas as _

In [2]:
# Establish the key directories for this notebook: the current working
# directory (cwd), its parent as the repo/project directory, and a scratch
# data directory inside the project that is excluded from git via .gitignore.
working_dir = Path.cwd()
project_dir = working_dir.parent
data_dir = project_dir.joinpath('data_temp')
data_dir

PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp')

In [3]:
# Create local file system using fsspec library
local_fs = fsspec.filesystem('local') 

In [4]:
tdx_dir = data_dir / 'nga'
local_fs.ls(tdx_dir)

['/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_1020011530_01.gpkg',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01.parquet',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamreach_basins_7020038340_01.gpkg',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/.DS_Store',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/test.json',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/download.php?file=hydrobasins_level2',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/test.zip',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamreach_basins_1020011530_01.gpkg',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/hydrobasins_level2.geojson',
 '/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamreach_

In [5]:
# GeoPackage inputs for TDX basin 7020038340: stream-reach basins and stream network.
tdx_basins_7020038340_fp = tdx_dir.joinpath('TDX_streamreach_basins_7020038340_01.gpkg')
tdx_stream_7020038340_fp = tdx_dir.joinpath('TDX_streamnet_7020038340_01.gpkg')

In [6]:
local_fs.info(tdx_stream_7020038340_fp)['size']

702283776

In [7]:
pyogrio.list_layers(tdx_stream_7020038340_fp)

array([['TDX_streamnet_7020038340_01', 'LineString']], dtype=object)

In [8]:
pyogrio.read_info(tdx_stream_7020038340_fp, layer=0)

{'layer_name': 'TDX_streamnet_7020038340_01',
 'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['LINKNO', 'DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'DSNODEID',
        'strmOrder', 'Length', 'Magnitude', 'DSContArea', 'strmDrop',
        'Slope', 'StraightL', 'USContArea', 'WSNO', 'DOUTEND', 'DOUTSTART',
        'DOUTMID'], dtype=object),
 'dtypes': array(['int32', 'int32', 'int32', 'int32', 'int64', 'int32', 'float64',
        'int32', 'float64', 'float64', 'float64', 'float64', 'float64',
        'int32', 'float64', 'float64', 'float64'], dtype=object),
 'fid_column': 'fid',
 'geometry_name': 'geom',
 'geometry_type': 'LineString',
 'features': 140097,
 'total_bounds': (-89.8212222222222,
  24.5589999999989,
  -66.1413333333321,
  46.4454444444444),
 'driver': 'GPKG',
 'capabilities': {'random_read': True,
  'fast_set_next_by_index': True,
  'fast_spatial_filter': True,
  'fast_feature_count': True,
  'fast_total_bounds': True},
 'layer_metadata': {'DBF_DATE_LAST_UPDATE': '202

# Read Arrow directly
https://pyogrio.readthedocs.io/en/latest/api.html#pyogrio.read_arrow

Returns a tuple! 
- dict of meta information about the data source
- pyarrow Table with data.

In [9]:
# Read the stream-network GeoPackage straight into Arrow memory.
# pyogrio.read_arrow() avoids conversion to GeoDataframe and returns a tuple:
# (dict of metadata about the data source, pyarrow Table with the data).
# Takes about 3 sec for Anthony
# NOTE(review): the %%timeit benchmark later in this notebook reports ~0.5 s
# for the same call — presumably the 3 s figure was a first (cold) read; confirm.
pa_info, pa_table = pyogrio.read_arrow(
    tdx_stream_7020038340_fp,
)
pa_info

{'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['LINKNO', 'DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'DSNODEID',
        'strmOrder', 'Length', 'Magnitude', 'DSContArea', 'strmDrop',
        'Slope', 'StraightL', 'USContArea', 'WSNO', 'DOUTEND', 'DOUTSTART',
        'DOUTMID'], dtype=object),
 'geometry_type': 'LineString',
 'geometry_name': 'geom',
 'fid_column': 'fid'}

In [10]:
pa_table

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geom: binary
----
LINKNO: [[0,1,593,1777,2,...,114546,114547,115730,115731,116915],[117507,118099,118691,119283,119876,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,146515,115139,164275,116323,117507],[118099,141779,119875,119875,128163,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,113954,113955,62448,106852,108034],[116915,1476,100930,16867,119284,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,91459,89682,63042,89091,883],[882,117507,12130,16275,42915,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
D

In [11]:
pa_info['crs']

'EPSG:4326'

In [12]:
type(pa_table)

pyarrow.lib.Table

# Benchmark Read Methods on local file

`timeit` is more accurate than `time`, for three reasons:

- it repeats the tests many times to eliminate the influence of other tasks on your machine, such as disk flushing and OS scheduling.
- it disables the garbage collector to prevent that process from skewing the results by scheduling a collection run at an inopportune moment.
- it picks the most accurate timer for your OS. See timeit.default_timer.

From https://stackoverflow.com/questions/17579357/time-time-vs-timeit-timeit


## Before GeoArrow Import

In [13]:
%%timeit
pyogrio.list_layers(tdx_stream_7020038340_fp)

3.52 ms ± 104 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:
%%timeit
pyogrio.read_info(tdx_stream_7020038340_fp, layer=0)

3.88 ms ± 73.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### `pyogrio.read_arrow()`

In [15]:
%%timeit
pyogrio.read_arrow(tdx_stream_7020038340_fp)

522 ms ± 10.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [16]:
pa_info, pa_table = pyogrio.read_arrow(tdx_stream_7020038340_fp)

In [17]:
%%timeit
# Looks like the conversion to pandas is fast
# So combining methods could be performant
pa_table.to_pandas()

434 ms ± 5.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


#### to Pandas DF
NOTE that geom is a non-typed binary object

In [18]:
pa_geo_df = pa_table.to_pandas()
pa_geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   LINKNO      140097 non-null  int32  
 1   DSLINKNO    140097 non-null  int32  
 2   USLINKNO1   140097 non-null  int32  
 3   USLINKNO2   140097 non-null  int32  
 4   DSNODEID    140097 non-null  int64  
 5   strmOrder   140097 non-null  int32  
 6   Length      140097 non-null  float64
 7   Magnitude   140097 non-null  int32  
 8   DSContArea  140097 non-null  float64
 9   strmDrop    140097 non-null  float64
 10  Slope       140097 non-null  float64
 11  StraightL   140097 non-null  float64
 12  USContArea  140097 non-null  float64
 13  WSNO        140097 non-null  int32  
 14  DOUTEND     140097 non-null  float64
 15  DOUTSTART   140097 non-null  float64
 16  DOUTMID     140097 non-null  float64
 17  geom        140097 non-null  object 
dtypes: float64(9), int32(7), int64(1), object(1)

In [19]:
type(pa_geo_df.geom[0])

bytes

### `pyogrio.read_dataframe()`

In [20]:
%%timeit
pyogrio.read_dataframe(
    tdx_stream_7020038340_fp, 
    layer=0,
    use_arrow=True, 
)

2.02 s ± 18.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [21]:
# %%timeit
# pyogrio.read_dataframe(
#     tdx_stream_7020038340_fp, 
#     layer=0,
#     use_arrow=False, 
# )
# 8.54 s ± 4.94 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [22]:
pyo_gdf = pyogrio.read_dataframe(
    tdx_stream_7020038340_fp, 
    layer=0,
    use_arrow=True, 
)
pyo_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

### `gpd.read_file()`

In [23]:
%%timeit
gpd.read_file(
    tdx_stream_7020038340_fp, 
    engine='pyogrio',
    use_arrow=True,
)

1.73 s ± 11.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [24]:
gpd_gdf = gpd.read_file(
    tdx_stream_7020038340_fp, 
    engine='pyogrio',
    use_arrow=True,
)
gpd_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

In [25]:
%%timeit
gpd.read_file(
    tdx_stream_7020038340_fp, 
    engine='pyogrio',
    use_arrow=False,
)

2.9 s ± 78.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [26]:
# %%timeit
# gpd.read_file(
#     tdx_stream_7020038340_fp, 
#     # engine='pyogrio',
#     use_arrow=False,
# )
# 50.3 s ± 1.34 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

## After `geoarrow.pyarrow.io` import
http://geoarrow.org/geoarrow-python/main/pyarrow.html#module-geoarrow.pyarrow.io

In [27]:
from geoarrow.pyarrow import io

### `ga.io.read_pyogrio_table()`

A new approach using `geoarrow-pyarrow` [IO helpers](http://geoarrow.org/geoarrow-python/main/pyarrow.html#module-geoarrow.pyarrow.io):
- http://geoarrow.org/geoarrow-python/main/pyarrow.html#geoarrow.pyarrow.io.read_pyogrio_table

In [28]:
%%timeit
io.read_pyogrio_table(tdx_stream_7020038340_fp)

516 ms ± 8.91 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [29]:
io.read_pyogrio_table(tdx_stream_7020038340_fp)

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geom: extension<geoarrow.wkb<WkbType>>
----
LINKNO: [[0,1,593,1777,2,...,114546,114547,115730,115731,116915],[117507,118099,118691,119283,119876,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,146515,115139,164275,116323,117507],[118099,141779,119875,119875,128163,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,113954,113955,62448,106852,108034],[116915,1476,100930,16867,119284,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,91459,89682,63042,89091,883],[882,117507,12130,16275,42915,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1

In [30]:
%%time
gpd.read_file(
    tdx_stream_7020038340_fp, 
    engine='pyogrio',
    use_arrow=True,
).info()
# 15.7 s after importing geoarrow.pyarrow.io 

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

## After GeoArrow Import

In [31]:
import geoarrow.pyarrow as ga
import geoarrow.pandas as _

### `pyogrio.read_arrow()` with `geoarrow`

In [32]:
%%timeit
pyogrio.read_arrow(tdx_stream_7020038340_fp)

548 ms ± 46.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [33]:
ga_pa_info, ga_pa_table = pyogrio.read_arrow(tdx_stream_7020038340_fp)
ga_pa_info

{'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['LINKNO', 'DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'DSNODEID',
        'strmOrder', 'Length', 'Magnitude', 'DSContArea', 'strmDrop',
        'Slope', 'StraightL', 'USContArea', 'WSNO', 'DOUTEND', 'DOUTSTART',
        'DOUTMID'], dtype=object),
 'geometry_type': 'LineString',
 'geometry_name': 'geom',
 'fid_column': 'fid'}

In [34]:
ga_pa_table

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geom: extension<geoarrow.wkb<WkbType>>
----
LINKNO: [[0,1,593,1777,2,...,114546,114547,115730,115731,116915],[117507,118099,118691,119283,119876,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,146515,115139,164275,116323,117507],[118099,141779,119875,119875,128163,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,113954,113955,62448,106852,108034],[116915,1476,100930,16867,119284,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,91459,89682,63042,89091,883],[882,117507,12130,16275,42915,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1

NOTE: same read speed, but `geom` is now `extension<geoarrow.wkb<WkbType>>`

#### to Pandas Geoarrow DF

`geom` type is `geoarrow.pandas.lib.GeoArrowExtensionScalar`

In [35]:
%%timeit
# Looks like the conversion to pandas is exceptionally fast!
# So combining methods could be performant
ga_pa_table.to_pandas()

1.86 ms ± 57.4 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [36]:
ga_pa_df = ga_pa_table.to_pandas()

In [37]:
ga_pa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [38]:
type(ga_pa_df.geom[0])

geoarrow.pandas.lib.GeoArrowExtensionScalar

In [39]:
%%timeit
# Can convert to geopandas geometry
ga.to_geopandas(ga_pa_df.geom)

1.09 s ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [40]:
# Can convert to geopandas geometry
gpd_geometry = ga.to_geopandas(ga_pa_df.geom)
gpd_geometry

0         LINESTRING (-69.67822 46.41356, -69.67822 46.4...
1         LINESTRING (-69.68589 46.40778, -69.686 46.407...
2         LINESTRING (-69.67822 46.41356, -69.67811 46.4...
3         LINESTRING (-69.68589 46.40778, -69.68589 46.4...
4         LINESTRING (-69.687 46.37911, -69.687 46.379, ...
                                ...                        
140092    LINESTRING (-81.59922 24.64033, -81.59911 24.6...
140093    LINESTRING (-81.63022 24.61767, -81.63011 24.6...
140094    LINESTRING (-81.60144 24.58478, -81.60156 24.5...
140095    LINESTRING (-81.64478 24.57489, -81.64489 24.5...
140096    LINESTRING (-81.68 24.559, -81.68011 24.55911,...
Length: 140097, dtype: geometry

In [41]:
type(gpd_geometry[0])

shapely.geometry.linestring.LineString

### `pyogrio.read_dataframe()` with geoarrow

In [42]:
# %%timeit
# pyogrio.read_dataframe(
#     tdx_stream_7020038340_fp, 
#     layer=0,
#     use_arrow=True, # 50% faster, but doesn't seem to work with s3
# )
# 24.5 s ± 1.01 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

NOTE: this is 6.4 times slower!  13.7 sec vs 2.15 s before importing geoarrow

### `gpd.read_file()` with geoarrow

In [43]:
# %%timeit
# gpd.read_file(
#     tdx_stream_7020038340_fp, 
#     engine='pyogrio',
#     use_arrow=True,
# )
# 22.5 s ± 1.26 s per loop (mean ± std. dev. of 7 runs, 1 loop each)

NOTE: this is 9.0 times slower!  14.1 sec vs 1.57 s before importing geoarrow

## Conclusions on Read Method Performance

Reading all fields in the 702.3 MB 'TDX_streamnet_7020038340_01.gpkg' file.

| Function                   | engine  | use_arrow | geoarrow | time             | relative |
|----------------------------|---------|-----------|----------|------------------|---------:|
| `pyogrio.read_arrow()`     | NA      | NA        | No       | 535 ms ± 12.9 ms | 1.0      |
| `pyogrio.read_dataframe()` | NA      | True      | No       | 2.24 s ± 18.8 ms | 4.2      |
| `pyogrio.read_dataframe()` | NA      | False     | No       | 8.54 s ± 4.94 s  | 16.0     |
| `gpd.read_file()`          | pyogrio | True      | No       | 1.97 s ± 10.5 ms | 3.7      |
| `gpd.read_file()`          | pyogrio | False     | No       | 3.06 s ± 29.1 ms | 5.7      |
| `gpd.read_file()`          | fiona   | False     | No       | 50.3 s ± 1.34 s  | 94.0     |
| `pyogrio.read_arrow()`     | NA      | NA        | Yes      | 552 ms ± 18.2 ms | 1.0      |
| `pyogrio.read_dataframe()` | NA      | True      | Yes      | 13.7 s ± 391 ms  | 25.6     |
| `gpd.read_file()`          | pyogrio | True      | Yes      | 14.1 s ± 1.11 s  | 26.4     |
| `io.read_pyogrio_table()`  | pyogrio | True      | Yes      | 563 ms ± 27.9 ms | 1.1      |



`pyogrio.read_arrow()` is ~4x faster than the fastest alternative method.

`gpd.read_file(fp, engine='pyogrio', use_arrow=True)` is the 2nd fastest method, but only before importing GeoArrow.

`gpd.read_file()` with its default `fiona` engine is the slowest method, 94x slower than `pyogrio.read_arrow()` and 25.5x slower than adding `engine='pyogrio', use_arrow=True` arguments!

Importing GeoArrow massively slows down `pyogrio.read_dataframe()` and `gpd.read_file()`. Read speeds for `pyogrio.read_arrow()` do not change.

NOTE:
- Based on commit 06e2d6d after updating the environment to leverage [GDAL v3.9](https://github.com/OSGeo/gdal/blob/v3.9.0/NEWS.md) and install the new [geoarrow-python](https://github.com/geoarrow/geoarrow-python) in commit 6552d34, with the following library versions:
    ```yml
    - gdal  =3.9.0
    - pyogrio =0.8.0
    - pyarrow =16.1.0
    - geopandas =0.14.4
    ```
- `pyogrio.read_dataframe(use_arrow=True)` took 5-8 seconds (2-3 times slower!) when running a previous environment with gdal-3.8.5. It is critical to use the specified environment!

# Convert Arrow Dtypes

## Convert non-geom only

In [44]:
# Build two otherwise identical attribute tables -- one NumPy-backed, one
# Arrow-backed -- to see whether Arrow dtypes save storage or speed computation.
# NOTE: The geometry column can't be converted to geoarrow using this method,
# so it is dropped before converting.
df = gpd_gdf.drop(columns='geometry')
pa_df = df.convert_dtypes(dtype_backend='pyarrow')

# Show dtypes and memory usage of each frame, NumPy-backed first.
df.info()
pa_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   LINKNO      140097 non-null  int32  
 1   DSLINKNO    140097 non-null  int32  
 2   USLINKNO1   140097 non-null  int32  
 3   USLINKNO2   140097 non-null  int32  
 4   DSNODEID    140097 non-null  int64  
 5   strmOrder   140097 non-null  int32  
 6   Length      140097 non-null  float64
 7   Magnitude   140097 non-null  int32  
 8   DSContArea  140097 non-null  float64
 9   strmDrop    140097 non-null  float64
 10  Slope       140097 non-null  float64
 11  StraightL   140097 non-null  float64
 12  USContArea  140097 non-null  float64
 13  WSNO        140097 non-null  int32  
 14  DOUTEND     140097 non-null  float64
 15  DOUTSTART   140097 non-null  float64
 16  DOUTMID     140097 non-null  float64
dtypes: float64(9), int32(7), int64(1)
memory usage: 14.4 MB
<class 'pandas.core.frame.Data

In [45]:
type(df.LINKNO[0])

numpy.int32

In [46]:
type(pa_df.LINKNO[0])

int

PyArrow doesn't seem to save memory for non-geometry fields

## Convert GeoArrow PyArrow Table

In [47]:
# converting other dtypes has an error
# ga_pa_df.convert_dtypes(dtype_backend='pyarrow')

In [48]:
# TODO: Try `ga_pa_table[1].to_pandas()` with a type mapper
# test_ga_df = ga_pa_table[1].to_pandas()

# Benchmark Write Methods

## Compare Numpy vs Arrow backed dataframes with no geometry

from otherwise identical Pandas dataframes created above.

### Numpy-backed DF

In [49]:
test_df_parquet_path = data_dir / 'test_df.parquet'

In [50]:
%%timeit
df.to_parquet(test_df_parquet_path, compression='zstd',)

138 ms ± 2.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [51]:
local_fs.info(test_df_parquet_path)['size']

9244772

In [52]:
%%timeit
pd.read_parquet(test_df_parquet_path)

11.2 ms ± 1.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Arrow-backed DF

In [53]:
test_pa_df_parquet_path = data_dir / 'test_pa_df.parquet'

In [54]:
%%timeit
pa_df.to_parquet(test_pa_df_parquet_path, compression='zstd',)

147 ms ± 9.35 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [55]:
local_fs.info(test_pa_df_parquet_path)['size']

9245108

In [56]:
%%timeit
pd.read_parquet(test_pa_df_parquet_path)

9.53 ms ± 80.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Conclusion: about the same write speed and storage size!

## Compare geometry dtypes
Three different types of dataframes with geometry info:
- `gpd_gdf`: GeoPandas Dataframe with Shapely geometry
- `pa_geo_df`: Pandas Dataframe converted from pyarrow table BEFORE importing GeoArrow
- `ga_pa_df`: Pandas Dataframe converted from pyarrow table AFTER importing GeoArrow

## Geopandas GDF

In [57]:
# Geometry
gpd_gdf.geometry.dtype

<geopandas.array.GeometryDtype at 0x16a813050>

In [58]:
type(gpd_gdf.geometry[0])

shapely.geometry.linestring.LineString

In [59]:
gpd_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

In [60]:
test_gpd_gdf_parquet_path = data_dir / 'test_gpd_gdf.parquet'

## Test Geoparquet compressions

In [61]:
%%timeit
gpd_gdf.to_parquet(
    test_gpd_gdf_parquet_path, 
    compression='zstd', # default is 'snappy'
)
# snappy: 3.52 s ± 54.8 ms
# brotli: 53.8 s ± 1.06 s
#    lz4: 3.75 s ± 39.5 ms
#   zstd: 4.15 s ± 47.8 ms
#   None: 2.72 s ± 37.6 ms

4.2 s ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
local_fs.info(test_gpd_gdf_parquet_path)['size']

# snappy: 300048527
# brotli: 109245316
#    lz4: 314102847
#   zstd: 186790117
#   None: 593366150

186790357

In [63]:
%%timeit
# Re-read entire file
gpd.read_parquet(test_gpd_gdf_parquet_path)

# snappy: 2.22 s ± 43.2 ms
# brotli: 3.26 s ± 40.1 ms
#    lz4: 2.10 s ± 65 ms
#   zstd: 2.37 s ± 16.2 ms
#   None: 2.02 s ± 64.2 ms

2.49 s ± 29.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [92]:
# Names of the attribute (non-geometry) columns, used by the following cells
# to benchmark reading only a subset of the Parquet columns.
# The list is built in one pass, without in-place mutation, and excludes the
# GeoDataFrame's *active* geometry column by name rather than the hard-coded
# 'geometry' -- list.remove('geometry') would raise ValueError if the
# geometry column were named differently (e.g. 'geom').
columns_to_read = [
    col for col in gpd_gdf.columns if col != gpd_gdf.geometry.name
]
print(columns_to_read)

['LINKNO', 'DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'DSNODEID', 'strmOrder', 'Length', 'Magnitude', 'DSContArea', 'strmDrop', 'Slope', 'StraightL', 'USContArea', 'WSNO', 'DOUTEND', 'DOUTSTART', 'DOUTMID']


In [66]:
%%timeit
# Re-read non-geometry columns, using Pandas method
pd.read_parquet(test_gpd_gdf_parquet_path, columns=columns_to_read)

10.7 ms ± 152 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### vs Geopackage

In [68]:
# Save to Geopackage
test_gpd_gdf_gpkg_path = data_dir / 'test_gpd_gdf.gpkg'

In [69]:
%%timeit
gpd_gdf.to_file(test_gpd_gdf_gpkg_path, layer='TDX_streamnet_7020038340_01', driver='GPKG')

10.1 s ± 269 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [70]:
local_fs.info(test_gpd_gdf_gpkg_path)['size']

703127552

## Write Compression Performance

Benchmarks based on reading all fields in the **702.3 MB** `TDX_streamnet_7020038340_01.gpkg` file.

| Compression | `gdf.to_parquet()` time | relative write speed | size in bytes             | relative size | `gpd.read_parquet()` time | relative read speed |
|-------------|-------------------------|----------------------|---------------------------|---------------|---------------------------|---------------------|
| snappy      |  3.52 s ± 54.8 ms       | 1.0                  |              300,048,527  | 2.7           |  2.22 s ± 43.2 ms         | 1.1                 |
| brotli      |  53.8 s ± 1.06 s        | 15.3                 |              109,245,316  | 1.0           |  3.26 s ± 40.1 ms         | 1.6                 |
| lz4         |  3.75 s ± 39.5 ms       | 1.1                  |              314,102,847  | 2.9           |  2.10 s ± 65 ms           | 1.0                 |
| zstd        |  4.15 s ± 47.8 ms       | 1.2                  |              186,790,117  | 1.7           |  2.37 s ± 16.2 ms         | 1.2                 |
| none        | 2.72 s ± 37.6 ms        | 0.8                  |              593,366,150  | 5.4           | 2.02 s ± 64.2 ms          | 1.0                 |

NOTE: Writing the GeoDataFrame back to GeoPackage, using `gdf.to_file(path, driver='GPKG')`, takes 11.4 s ± 823 ms and produces a file size of 702.1 MB. SOZip compression does not appear to have been applied, since the output is as large as the original file; per the GDAL docs it applies to `.gpkg.zip` files rather than plain `.gpkg` files: https://gdal.org/drivers/vector/gpkg.html#compressed-files. So writing to GeoPackage is both much slower and uses more storage than GeoParquet.

### Conclusions

Use `zstd` compression, because it provides a high level of compression (31% the size of non-compressed; only `brotli` is smaller, but it is ~13x slower to write) with read speeds that are only 17% slower.

# Benchmark other Write Methods

Three different types of dataframes with geometry info:
- `gpd_gdf`: GeoPandas Dataframe with Shapely geometry (DONE above)
- `pa_geo_df`: Pandas Dataframe converted from pyarrow table BEFORE importing GeoArrow
- `ga_pa_df`: Pandas Dataframe converted from pyarrow table AFTER importing GeoArrow

### Pandas DF

In [71]:
pa_geo_df.info()
type(pa_geo_df.geom[0])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   LINKNO      140097 non-null  int32  
 1   DSLINKNO    140097 non-null  int32  
 2   USLINKNO1   140097 non-null  int32  
 3   USLINKNO2   140097 non-null  int32  
 4   DSNODEID    140097 non-null  int64  
 5   strmOrder   140097 non-null  int32  
 6   Length      140097 non-null  float64
 7   Magnitude   140097 non-null  int32  
 8   DSContArea  140097 non-null  float64
 9   strmDrop    140097 non-null  float64
 10  Slope       140097 non-null  float64
 11  StraightL   140097 non-null  float64
 12  USContArea  140097 non-null  float64
 13  WSNO        140097 non-null  int32  
 14  DOUTEND     140097 non-null  float64
 15  DOUTSTART   140097 non-null  float64
 16  DOUTMID     140097 non-null  float64
 17  geom        140097 non-null  object 
dtypes: float64(9), int32(7), int64(1), object(1)

bytes

In [72]:
test_pa_geo_df_parquet_path = data_dir / 'test_pa_geo_df.parquet'

In [73]:
%%timeit
pa_geo_df.to_parquet(test_pa_geo_df_parquet_path, compression='zstd')

2.74 s ± 45.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [74]:
# File size
local_fs.info(test_pa_geo_df_parquet_path)['size']

186784470

In [75]:
%%timeit
# Re-read entire file
pd.read_parquet(test_pa_geo_df_parquet_path)

1.59 s ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
%%timeit
# Re-read non-geometry columns, using Pandas method
pd.read_parquet(test_pa_geo_df_parquet_path, columns=columns_to_read)

10 ms ± 35.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Pandas GeoArrow DF

In [77]:
ga_pa_df.geom.dtype

GeoArrowExtensionDtype(geoarrow.wkb <PROJJSON:{
  "$schema": "https://p...>)

In [78]:
type(ga_pa_df.geom[0])

geoarrow.pandas.lib.GeoArrowExtensionScalar

In [79]:
test_ga_pa_df_parquet_path = data_dir / 'test_ga_pa_df.parquet'

In [80]:
%%timeit
# Standard Pandas write method
ga_pa_df.to_parquet(test_ga_pa_df_parquet_path, compression='zstd')

1.98 s ± 36 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [81]:
# File size
local_fs.info(test_ga_pa_df_parquet_path)['size']

186793666

In [82]:
%%timeit
io.read_geoparquet_table(test_ga_pa_df_parquet_path)

1.43 s ± 14.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [83]:
io.read_geoparquet_table(test_ga_pa_df_parquet_path)

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geom: extension<geoarrow.wkb<WkbType>>
----
LINKNO: [[0,1,593,1777,2,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
DSNODEID: [[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
strmOrder: [[1,1,1,2,1,...,1,1,1,1,1],[1,1,1,1,1,...,1,1,1,1,1]]
Length: [[3847.9,2251.3,1469.3,1050.9,3551,...,1871.3,2777.5,4006,2436.9,719.9],[4964,3361.3,5683.5,1705.4,1552.7

#### GeoArrow only works with Pandas, not GeoPandas!

In [84]:
# Re-read entire file
# The standard Pandas method produces an error, so the call is left commented out:
# pd.read_parquet(test_ga_pa_df_parquet_path)

### GeoArrow PyArrow Table

Docs for `pyarrow.Table`: https://arrow.apache.org/docs/python/generated/pyarrow.Table.html

In [85]:
%%timeit
# Geoarrow write method
# NOTE: this reuses test_ga_pa_df_parquet_path, so it overwrites the file
# previously written there by ga_pa_df.to_parquet().
io.write_geoparquet_table(ga_pa_table, test_ga_pa_df_parquet_path, compression='zstd')

1.97 s ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [86]:
# File size (in bytes) of the file at test_ga_pa_df_parquet_path
local_fs.info(test_ga_pa_df_parquet_path)['size']

186782750

#### Read GeoParquet saved by GeoArrow

In [87]:
%%timeit
# Read GeoParquet saved by GeoArrow, timing the read into a pyarrow.Table
io.read_geoparquet_table(test_ga_pa_df_parquet_path)

1.43 s ± 17.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [88]:
# Display the schema and a preview of the table read from the GeoArrow-written file
io.read_geoparquet_table(test_ga_pa_df_parquet_path)

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geom: extension<geoarrow.wkb<WkbType>>
----
LINKNO: [[0,1,593,1777,2,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
DSNODEID: [[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
strmOrder: [[1,1,1,2,1,...,1,1,1,1,1],[1,1,1,1,1,...,1,1,1,1,1]]
Length: [[3847.9,2251.3,1469.3,1050.9,3551,...,1871.3,2777.5,4006,2436.9,719.9],[4964,3361.3,5683.5,1705.4,1552.7

#### Read GeoParquet saved by GeoPandas

In [89]:
%%timeit
# Read GeoParquet saved by GeoPandas, timing the read into a pyarrow.Table
# (`test_gpd_gdf_parquet_path` is defined in an earlier cell)
io.read_geoparquet_table(test_gpd_gdf_parquet_path)

1.32 s ± 16.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [90]:
# Display the schema and a preview of the table read from the GeoPandas-written file
io.read_geoparquet_table(test_gpd_gdf_parquet_path)

pyarrow.Table
LINKNO: int32
DSLINKNO: int32
USLINKNO1: int32
USLINKNO2: int32
DSNODEID: int64
strmOrder: int32
Length: double
Magnitude: int32
DSContArea: double
strmDrop: double
Slope: double
StraightL: double
USContArea: double
WSNO: int32
DOUTEND: double
DOUTSTART: double
DOUTMID: double
geometry: extension<geoarrow.wkb<WkbType>>
----
LINKNO: [[0,1,593,1777,2,...,478,1070,1071,1662,2254],[2845,2846,2847,3438,3439,...,587,1180,1772,588,589]]
DSLINKNO: [[1777,2369,1777,2369,4146,...,35406,33630,35406,38366,39550],[33630,38958,41326,41918,42510,...,-1,-1,-1,-1,-1]]
USLINKNO1: [[-1,-1,-1,0,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
USLINKNO2: [[-1,-1,-1,593,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
DSNODEID: [[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1],[-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1]]
strmOrder: [[1,1,1,2,1,...,1,1,1,1,1],[1,1,1,1,1,...,1,1,1,1,1]]
Length: [[3847.9,2251.3,1469.3,1050.9,3551,...,1871.3,2777.5,4006,2436.9,719.9],[4964,3361.3,5683.5,1705.4,15

# Downcast Numerical Dtypes
Ideas and functions from https://medium.com/@atanudan/pandas-dataframe-performance-optimization-8b87db24c2c4

In [116]:
# Work on a deep copy so the downcasting assignments below do not modify gpd_gdf
downcast_gdf = gpd_gdf.copy(deep=True)
# Baseline dtypes before any downcasting
downcast_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

In [117]:
# Summarize only the integer columns (dtypes and memory usage) before downcasting
downcast_gdf.select_dtypes(int).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   LINKNO     140097 non-null  int32
 1   DSLINKNO   140097 non-null  int32
 2   USLINKNO1  140097 non-null  int32
 3   USLINKNO2  140097 non-null  int32
 4   DSNODEID   140097 non-null  int64
 5   strmOrder  140097 non-null  int32
 6   Magnitude  140097 non-null  int32
 7   WSNO       140097 non-null  int32
dtypes: int32(7), int64(1)
memory usage: 4.8 MB


In [119]:
# Downcast each integer column to the smallest integer type pandas selects
# for its values, storing the results as PyArrow-backed dtypes.
int_columns = list(downcast_gdf.select_dtypes(int).columns)
downcast_gdf[int_columns] = downcast_gdf[int_columns].apply(
    lambda column: pd.to_numeric(column, downcast='integer', dtype_backend='pyarrow')
)
# Show the resulting integer dtypes and memory usage
downcast_gdf[int_columns].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 8 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   LINKNO     140097 non-null  int32[pyarrow]
 1   DSLINKNO   140097 non-null  int32[pyarrow]
 2   USLINKNO1  140097 non-null  int32[pyarrow]
 3   USLINKNO2  140097 non-null  int32[pyarrow]
 4   DSNODEID   140097 non-null  int8[pyarrow] 
 5   strmOrder  140097 non-null  int8[pyarrow] 
 6   Magnitude  140097 non-null  int16[pyarrow]
 7   WSNO       140097 non-null  int32[pyarrow]
dtypes: int16[pyarrow](1), int32[pyarrow](5), int8[pyarrow](2)
memory usage: 3.2 MB


In [120]:
# Downcast each float column where pandas can do so, storing the results as
# PyArrow-backed dtypes; columns that are not downcast stay double precision.
float_columns = list(downcast_gdf.select_dtypes(float).columns)
downcast_gdf[float_columns] = downcast_gdf[float_columns].apply(
    lambda column: pd.to_numeric(column, downcast='float', dtype_backend='pyarrow')
)

In [121]:
# Review all column dtypes after the integer and float downcasts
downcast_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype          
---  ------      --------------   -----          
 0   LINKNO      140097 non-null  int32[pyarrow] 
 1   DSLINKNO    140097 non-null  int32[pyarrow] 
 2   USLINKNO1   140097 non-null  int32[pyarrow] 
 3   USLINKNO2   140097 non-null  int32[pyarrow] 
 4   DSNODEID    140097 non-null  int8[pyarrow]  
 5   strmOrder   140097 non-null  int8[pyarrow]  
 6   Length      140097 non-null  double[pyarrow]
 7   Magnitude   140097 non-null  int16[pyarrow] 
 8   DSContArea  140097 non-null  float[pyarrow] 
 9   strmDrop    140097 non-null  float[pyarrow] 
 10  Slope       140097 non-null  float[pyarrow] 
 11  StraightL   140097 non-null  double[pyarrow]
 12  USContArea  140097 non-null  float[pyarrow] 
 13  WSNO        140097 non-null  int32[pyarrow] 
 14  DOUTEND     140097 non-null  double[pyarrow]
 15  DOUTSTART   140097 non-nul

In [122]:
# Output path (inside the temporary data directory) for the Parquet file written from downcast_gdf
test_downcast_gdf_parquet_path = data_dir / 'test_downcast_gdf.parquet'

In [123]:
%%timeit
# Time writing the downcast GeoDataFrame to a zstd-compressed Parquet file
downcast_gdf.to_parquet(
    test_downcast_gdf_parquet_path, 
    compression='zstd', 
)
# Write times recorded for each variant (mean ± std. dev.):
#         original: 4.15 s ± 47.8 ms
#   downcast numpy: 4.4 s ± 82.9 ms
# downcast pyarrow: 4.52 s ± 117 ms


4.52 s ± 117 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [124]:
# File size (in bytes) of the Parquet file written from downcast_gdf
local_fs.info(test_downcast_gdf_parquet_path)['size']
# File sizes recorded for each variant:
#         original: 186790117
#   downcast numpy: 186190994
# downcast pyarrow: 186191314


186191314

In [125]:
%%timeit
# Re-read entire file with the GeoPandas reader
gpd.read_parquet(test_downcast_gdf_parquet_path)

# Read times recorded for each variant (mean ± std. dev.):
#         original: 2.37 s ± 16.2 ms
#   downcast numpy: 2.57 s ± 35.7 ms
# downcast pyarrow: 2.6 s ± 28 ms

2.6 s ± 28 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [126]:
# Check which dtypes come back after a round trip through the Parquet file
gpd.read_parquet(test_downcast_gdf_parquet_path).info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype          
---  ------      --------------   -----          
 0   LINKNO      140097 non-null  int32[pyarrow] 
 1   DSLINKNO    140097 non-null  int32[pyarrow] 
 2   USLINKNO1   140097 non-null  int32[pyarrow] 
 3   USLINKNO2   140097 non-null  int32[pyarrow] 
 4   DSNODEID    140097 non-null  int8[pyarrow]  
 5   strmOrder   140097 non-null  int8[pyarrow]  
 6   Length      140097 non-null  double[pyarrow]
 7   Magnitude   140097 non-null  int16[pyarrow] 
 8   DSContArea  140097 non-null  float[pyarrow] 
 9   strmDrop    140097 non-null  float[pyarrow] 
 10  Slope       140097 non-null  float[pyarrow] 
 11  StraightL   140097 non-null  double[pyarrow]
 12  USContArea  140097 non-null  float[pyarrow] 
 13  WSNO        140097 non-null  int32[pyarrow] 
 14  DOUTEND     140097 non-null  double[pyarrow]
 15  DOUTSTART   140097 non-nul

### Conclusions on Downcasting

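Downcasting doesn't save read/write time or meaningful disk space (the Parquet file is only about 0.3% smaller, and reads and writes are slightly slower), although it does reduce in-memory size (the integer columns shrink from 4.8 MB to 3.2 MB)!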