# Generate a Modified Nested Set Index from NGA TDX-Hydro

This notebook demonstrates how to use functions in the [WikiWatershed/global-hydrography](https://github.com/WikiWatershed/global-hydrography) package to generate a modified nested set index using the TDX-Hydro datasets released by the [US National Geospatial-Intelligence Agency (NGA)](https://www.nga.mil).

This example notebook assumes that you have already downloaded the applicable data using the example provided in the `1_GetData.ipynb` notebook. It also assumes that you have completed the necessary setup steps outlined in the **[Installation Instructions](README.md#get-started)** (which are also completed as part of the `1_GetData.ipynb` notebook).

# Python Imports

In this step we will import the necessary Python dependencies for this example.

In [1]:
from pathlib import Path
import re
from importlib import reload

import pyogrio
import geopandas as gpd
import pandas as pd

import global_hydrography as gh
from global_hydrography.delineation.mnsi import MNSI_FIELDS
from global_hydrography.preprocess import TDXPreprocessor
from global_hydrography.process import DISSOLVE_ROOT_ID, ELEMENT_COUNT


In [2]:
# Explore the namespace for global-hydrography modules, functions, etc.
dir(gh)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'delineation',
 'io',
 'mnsi',
 'preprocess',
 'process']

# Compile files that need to be processed

In this step we will compile a list of the files that need to be processed to have a modified nested set index. Note this step assumes that you have downloaded the files to the same directory and used the same naming convention as the `1_GetData.ipynb` example notebook. If you have opted to use a different location or naming convention you will need to modify this step accordingly.

In [3]:
# Derive the repo/project directories from the current working directory (cwd).
# NOTE(review): assumes the notebook is run from a direct subdirectory of the
# project root — confirm.
working_dir = Path.cwd()
project_dir = working_dir.parent
# A temporary data directory that we .gitignore
data_dir = project_dir.joinpath('data_temp')
# Subdirectory scanned below for the TDX-Hydro files
tdx_dir = data_dir.joinpath('nga')

In [4]:
# Scan the entries directly inside the data directory and keep only the
# regular files whose name contains 'streamnet' (the blueline files).
files_to_process = [
    entry
    for entry in tdx_dir.iterdir()
    if entry.is_file() and 'streamnet' in entry.name
]

In [5]:
files_to_process

[PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_1020011530_01.gpkg'),
 PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01.parquet'),
 PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_1020011530_01_mnsi_test.parquet'),
 PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01_mnsi_test.parquet'),
 PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01.gpkg'),
 PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01_mnsi.parquet')]

# Explore Workflows and Test Functions

## Read Files

In [6]:
# Pick the raw GeoPackage for the example region by suffix and name instead of
# by list position: `Path.iterdir()` yields entries in arbitrary order, so a
# hard-coded index can resolve to a different file on another machine or once
# more files are added to the directory.
gpkg_candidates = sorted(
    path
    for path in files_to_process
    if path.suffix == '.gpkg' and '7020038340' in path.name
)
assert gpkg_candidates, "No raw streamnet GeoPackage found for region 7020038340"
file = gpkg_candidates[0]
file.name

'TDX_streamnet_7020038340_01.gpkg'

In [7]:
# Confirm that we've selected a raw GeoPackage
assert file.suffix == '.gpkg'

In [8]:
# Parse the file name to get the 10-digit TDX-Hydro region ID.
# The pattern is a raw string so that `\d` reaches the regex engine unchanged;
# in a normal string literal `\d` is an invalid escape sequence.
tdx_hydro_region = int(re.search(r"\d{10}", file.name).group(0))
tdx_hydro_region

7020038340

In [9]:
# Confirm US East Coast HydroRegion
assert tdx_hydro_region == 7020038340

In [10]:
info = pyogrio.read_info(file, layer=0)
info

{'layer_name': 'TDX_streamnet_7020038340_01',
 'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['LINKNO', 'DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'DSNODEID',
        'strmOrder', 'Length', 'Magnitude', 'DSContArea', 'strmDrop',
        'Slope', 'StraightL', 'USContArea', 'WSNO', 'DOUTEND', 'DOUTSTART',
        'DOUTMID'], dtype=object),
 'dtypes': array(['int32', 'int32', 'int32', 'int32', 'int64', 'int32', 'float64',
        'int32', 'float64', 'float64', 'float64', 'float64', 'float64',
        'int32', 'float64', 'float64', 'float64'], dtype=object),
 'fid_column': 'fid',
 'geometry_name': 'geom',
 'geometry_type': 'LineString',
 'features': 140097,
 'total_bounds': (-89.8212222222222,
  24.5589999999989,
  -66.1413333333321,
  46.4454444444444),
 'driver': 'GPKG',
 'capabilities': {'random_read': True,
  'fast_set_next_by_index': True,
  'fast_spatial_filter': True,
  'fast_feature_count': True,
  'fast_total_bounds': True},
 'layer_metadata': {'DBF_DATE_LAST_UPDATE': '202

In [11]:
info['layer_name']

'TDX_streamnet_7020038340_01'

In [12]:
# The same 10-digit region ID can be parsed from the layer name (raw-string pattern).
int(re.search(r"\d{10}", info['layer_name']).group(0))

7020038340

In [13]:
info['layer_metadata']

{'DBF_DATE_LAST_UPDATE': '2021-12-08'}

In [14]:
#open the file as GeoDataFrame, using the fastest direct method from issue #1
gdf = gpd.read_file(file, engine='pyogrio', layer=0, use_arrow=True)
gdf.info()
gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

Unnamed: 0,LINKNO,DSLINKNO,USLINKNO1,USLINKNO2,DSNODEID,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,WSNO,DOUTEND,DOUTSTART,DOUTMID,geometry
0,0,1777,-1,-1,-1,1,3847.9,1,9567845.0,42.07,0.010933,3233.7,5254867.5,0,45853.6,49701.4,47777.5,"LINESTRING (-69.67822 46.41356, -69.67822 46.4..."
1,1,2369,-1,-1,-1,1,2251.3,1,8768556.0,34.66,0.015397,1749.2,4320561.0,1,44802.7,47054.1,45928.4,"LINESTRING (-69.68589 46.40778, -69.686 46.407..."
2,593,1777,-1,-1,-1,1,1469.3,1,8466694.0,11.98,0.008153,1286.2,4319318.0,593,45853.6,47322.9,46588.3,"LINESTRING (-69.67822 46.41356, -69.67811 46.4..."
3,1777,2369,0,593,-1,2,1050.9,2,19939082.0,0.91,0.00087,871.8,18034788.0,1777,44802.7,45853.6,45328.2,"LINESTRING (-69.68589 46.40778, -69.68589 46.4..."
4,2,4146,-1,-1,-1,1,3551.0,1,9120895.0,67.48,0.019002,2593.6,5267176.0,2,41041.1,44591.7,42816.4,"LINESTRING (-69.687 46.37911, -69.687 46.379, ..."


In [15]:
gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [16]:
gdf.strmOrder.value_counts()

strmOrder
1    70866
2    33723
3    17752
4     9098
5     4998
6     3386
7      274
Name: count, dtype: int64

## Preprocess GeoDataFrame

In [17]:
# Create an instance of the preprocessor class from our package
preprocessor = TDXPreprocessor()

In [18]:
test0_gdf = gdf.copy()

In [19]:
%%timeit
# apply preprocessing to make linkno globally unique
# Timed against the scratch copy `test0_gdf` rather than `gdf`.
# NOTE(review): %%timeit runs this call repeatedly on the same frame — confirm
# the helper can safely be applied more than once to `test0_gdf`.
preprocessor.tdx_to_global_linkno(test0_gdf, tdx_hydro_region)

2.78 ms ± 1.77 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [20]:
# apply preprocessing to make linkno globally unique
# NOTE(review): this helper is called elsewhere in this notebook without
# capturing a return value, so it presumably also modifies `gdf` in place —
# confirm whether `preprocessed_gdf` and `gdf` are the same object.
preprocessed_gdf = preprocessor.tdx_to_global_linkno(gdf, tdx_hydro_region)
preprocessed_gdf.info()
preprocessed_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 18 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   DSNODEID    140097 non-null  int64   
 5   strmOrder   140097 non-null  int32   
 6   Length      140097 non-null  float64 
 7   Magnitude   140097 non-null  int32   
 8   DSContArea  140097 non-null  float64 
 9   strmDrop    140097 non-null  float64 
 10  Slope       140097 non-null  float64 
 11  StraightL   140097 non-null  float64 
 12  USContArea  140097 non-null  float64 
 13  WSNO        140097 non-null  int32   
 14  DOUTEND     140097 non-null  float64 
 15  DOUTSTART   140097 non-null  float64 
 16  DOUTMID     140097 non-null  float64 
 17  geometry    140097 non-null  geometry
dtypes: float64(9), g

Unnamed: 0,LINKNO,DSLINKNO,USLINKNO1,USLINKNO2,DSNODEID,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,WSNO,DOUTEND,DOUTSTART,DOUTMID,geometry
0,750000000,750001777,-1,-1,-1,1,3847.9,1,9567845.0,42.07,0.010933,3233.7,5254867.5,0,45853.6,49701.4,47777.5,"LINESTRING (-69.67822 46.41356, -69.67822 46.4..."
1,750000001,750002369,-1,-1,-1,1,2251.3,1,8768556.0,34.66,0.015397,1749.2,4320561.0,1,44802.7,47054.1,45928.4,"LINESTRING (-69.68589 46.40778, -69.686 46.407..."
2,750000593,750001777,-1,-1,-1,1,1469.3,1,8466694.0,11.98,0.008153,1286.2,4319318.0,593,45853.6,47322.9,46588.3,"LINESTRING (-69.67822 46.41356, -69.67811 46.4..."
3,750001777,750002369,750000000,750000593,-1,2,1050.9,2,19939082.0,0.91,0.00087,871.8,18034788.0,1777,44802.7,45853.6,45328.2,"LINESTRING (-69.68589 46.40778, -69.68589 46.4..."
4,750000002,750004146,-1,-1,-1,1,3551.0,1,9120895.0,67.48,0.019002,2593.6,5267176.0,2,41041.1,44591.7,42816.4,"LINESTRING (-69.687 46.37911, -69.687 46.379, ..."


## Drop Useless Columns

In [21]:
preprocessor.tdx_drop_useless_columns(preprocessed_gdf)
preprocessed_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   strmOrder   140097 non-null  int32   
 5   Length      140097 non-null  float64 
 6   Magnitude   140097 non-null  int32   
 7   DSContArea  140097 non-null  float64 
 8   strmDrop    140097 non-null  float64 
 9   Slope       140097 non-null  float64 
 10  StraightL   140097 non-null  float64 
 11  USContArea  140097 non-null  float64 
 12  DOUTEND     140097 non-null  float64 
 13  DOUTSTART   140097 non-null  float64 
 14  DOUTMID     140097 non-null  float64 
 15  geometry    140097 non-null  geometry
dtypes: float64(9), geometry(1), int32(6)
memory usage: 13.9 MB


In [22]:
# Columns that carry no value. See `sandbox/explore_data_sources.ipynb`
useless_columns = [
    'WSNO',      # identical values to 'LINKNO'
    'DSNODEID',  # all -1
]
# Keep only the candidates that are still present in the frame, so that a
# subsequent drop does not fail on columns that were already removed.
columns_to_drop = [
    name
    for name in useless_columns
    if test0_gdf.columns.isin([name]).any()
]

columns_to_drop

['WSNO', 'DSNODEID']

In [23]:
# Produces error if already dropped
# preprocessed_gdf.drop(columns=columns_to_drop, inplace=True)

In [24]:
# Test function to avoid missing columns error
preprocessor.tdx_drop_useless_columns(preprocessed_gdf)

## Compute Modified Nested Set Index

In [29]:
test_gdf = preprocessed_gdf.copy(deep=True)

In [30]:
test_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 16 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   LINKNO      140097 non-null  int32   
 1   DSLINKNO    140097 non-null  int32   
 2   USLINKNO1   140097 non-null  int32   
 3   USLINKNO2   140097 non-null  int32   
 4   strmOrder   140097 non-null  int32   
 5   Length      140097 non-null  float64 
 6   Magnitude   140097 non-null  int32   
 7   DSContArea  140097 non-null  float64 
 8   strmDrop    140097 non-null  float64 
 9   Slope       140097 non-null  float64 
 10  StraightL   140097 non-null  float64 
 11  USContArea  140097 non-null  float64 
 12  DOUTEND     140097 non-null  float64 
 13  DOUTSTART   140097 non-null  float64 
 14  DOUTMID     140097 non-null  float64 
 15  geometry    140097 non-null  geometry
dtypes: float64(9), geometry(1), int32(6)
memory usage: 13.9 MB


In [32]:
%%timeit
# compute the modified nested set index is quite fast!! 
# test_gdf = preprocessed_gdf.copy(deep=True)
# gh.mnsi.modified_nest_set_index(test_gdf)
# 6.17 s ± 73.9 ms on old laptop
# 2.29 s ± 26.8 ms on new laptop

2.29 s ± 26.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [39]:
%%timeit
# Recompute with sorted index, in mnsi.py
# test_gdf = preprocessed_gdf.copy(deep=True)
# gh.mnsi.modified_nest_set_index(test_gdf)
# 2.34 s ± 27.4 ms on new laptop
# same speed because using a dict method internally

2.34 s ± 27.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


**CONCLUSION**: Sorting the index isn't faster, but it does save memory.

In [28]:
# Compute the modified nested set index on the preprocessed streamnet frame.
# `preprocessed_gdf` is the frame produced by the preprocessing cells above;
# no variable named `processed_gdf` is defined anywhere in this notebook.
mnsi_gdf = gh.mnsi.modified_nest_set_index(preprocessed_gdf)
mnsi_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140097 entries, 0 to 140096
Data columns (total 19 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   LINKNO         140097 non-null  int32   
 1   DSLINKNO       140097 non-null  int32   
 2   USLINKNO1      140097 non-null  int32   
 3   USLINKNO2      140097 non-null  int32   
 4   ROOT_ID        140097 non-null  int32   
 5   DISCOVER_TIME  140097 non-null  int32   
 6   FINISH_TIME    140097 non-null  int32   
 7   strmOrder      140097 non-null  int32   
 8   Length         140097 non-null  float64 
 9   Magnitude      140097 non-null  int32   
 10  DSContArea     140097 non-null  float64 
 11  strmDrop       140097 non-null  float64 
 12  Slope          140097 non-null  float64 
 13  StraightL      140097 non-null  float64 
 14  USContArea     140097 non-null  float64 
 15  DOUTEND        140097 non-null  float64 
 16  DOUTSTART      140097 non-null  float64 
 17  DO

In [29]:
mnsi_gdf.crs

<Geographic 2D CRS: EPSG:4326>
Name: WGS 84
Axis Info [ellipsoidal]:
- Lat[north]: Geodetic latitude (degree)
- Lon[east]: Geodetic longitude (degree)
Area of Use:
- name: World.
- bounds: (-180.0, -90.0, 180.0, 90.0)
Datum: World Geodetic System 1984 ensemble
- Ellipsoid: WGS 84
- Prime Meridian: Greenwich

In [30]:
mnsi_gdf.index

RangeIndex(start=0, stop=140097, step=1)

In [31]:
# Set 'LINKNO' as index, to facilitate selection
# and interoperability with basins
mnsi_gdf.set_index('LINKNO', inplace=True)

In [32]:
mnsi_gdf[['DSLINKNO', 'USLINKNO1', 'USLINKNO2', 
          'ROOT_ID', 'DISCOVER_TIME', 'FINISH_TIME' ]]

Unnamed: 0_level_0,DSLINKNO,USLINKNO1,USLINKNO2,ROOT_ID,DISCOVER_TIME,FINISH_TIME
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
750000000,750001777,-1,-1,750021317,52,53
750000001,750002369,-1,-1,750021317,49,50
750000593,750001777,-1,-1,750021317,51,52
750001777,750002369,750000000,750000593,750021317,50,53
750000002,750004146,-1,-1,750021317,47,48
...,...,...,...,...,...,...
750000587,-1,-1,-1,750000587,1,2
750001180,-1,-1,-1,750001180,1,2
750001772,-1,-1,-1,750001772,1,2
750000588,-1,-1,-1,750000588,1,2


In [33]:
mnsi_gdf.ROOT_ID.value_counts().sort_values()

ROOT_ID
750005168        1
750000464        1
750075251        1
750000462        1
750070517        1
             ...  
750170058     4979
750301090     5929
750093395     6453
750127466     9771
750236632    14095
Name: count, Length: 1635, dtype: int64

In [34]:
mnsi_gdf.DISCOVER_TIME.value_counts().sort_values()

DISCOVER_TIME
11091       1
11095       1
11119       1
11121       1
11099       1
         ... 
5         526
4         526
2         734
3         734
1        1635
Name: count, Length: 14095, dtype: int64

## Write GeoParquet File

In [35]:
tdx_parquet_path = tdx_dir / f"{info['layer_name']}_mnsi_test.parquet"
tdx_parquet_path


PosixPath('/Users/aaufdenkampe/Documents/Python/global-hydrography/data_temp/nga/TDX_streamnet_7020038340_01_mnsi_test.parquet')

In [36]:
# Test to see if adding metadata to gdf gets saved to parquet.
# The metadata goes into the frame's `attrs` dict: assigning a brand-new
# attribute name directly on a (Geo)DataFrame makes pandas emit a UserWarning.
mnsi_gdf.attrs['source_info'] = info['layer_metadata']
mnsi_gdf.attrs['source_info']

  super().__setattr__(attr, val)


{'DBF_DATE_LAST_UPDATE': '2021-12-08'}

In [37]:
#write back to a file
mnsi_gdf.to_parquet(tdx_parquet_path, compression='zstd')

In [38]:
# The metadata does not get written to the Parquet file.
# TODO: implement function for this. See work in NuvESG repo.
pyogrio.read_info(tdx_parquet_path)

{'layer_name': 'TDX_streamnet_7020038340_01_mnsi_test',
 'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['DSLINKNO', 'USLINKNO1', 'USLINKNO2', 'ROOT_ID', 'DISCOVER_TIME',
        'FINISH_TIME', 'strmOrder', 'Length', 'Magnitude', 'DSContArea',
        'strmDrop', 'Slope', 'StraightL', 'USContArea', 'DOUTEND',
        'DOUTSTART', 'DOUTMID', 'LINKNO'], dtype=object),
 'dtypes': array(['int32', 'int32', 'int32', 'int32', 'int32', 'int32', 'int32',
        'float64', 'int32', 'float64', 'float64', 'float64', 'float64',
        'float64', 'float64', 'float64', 'float64', 'int32'], dtype=object),
 'fid_column': '',
 'geometry_name': 'geometry',
 'geometry_type': 'LineString',
 'features': 140097,
 'total_bounds': (-89.82122222222222,
  24.558999999998896,
  -66.14133333333214,
  46.44544444444445),
 'driver': 'Parquet',
 'capabilities': {'random_read': False,
  'fast_set_next_by_index': True,
  'fast_spatial_filter': False,
  'fast_feature_count': True,
  'fast_total_bounds': Tr

# Alternate: Create Basins with MNSI

The goal is to only need to read the basins file when delineating the upstream watershed boundary.

Using new functions in source directory to streamline production.

In [39]:
# Reload module after editing
reload(gh.process)

<module 'global_hydrography.process' from '/Users/aaufdenkampe/Documents/Python/global-hydrography/src/global_hydrography/process.py'>

## `select_tdx_files()` function

In [40]:
tdx_hydro_region = 7020038340

In [41]:
# Test new select_tdx_files()
streamnet_file, basins_file = gh.process.select_tdx_files(
    tdx_dir, tdx_hydro_region,'.gpkg')

streamnet_file.name

'TDX_streamnet_7020038340_01.gpkg'

In [42]:
basins_file.name

'TDX_streamreach_basins_7020038340_01.gpkg'

## Process basins file

In [43]:
# open basins file as GeoDataFrame
basins_gdf = gpd.read_file(basins_file, engine='pyogrio', layer=0, use_arrow=True)
basins_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 140053 entries, 0 to 140052
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   streamID  140053 non-null  int64   
 1   geometry  140053 non-null  geometry
dtypes: geometry(1), int64(1)
memory usage: 2.1 MB


In [44]:
# Rename 'streamID' to 'LINKNO' and cast to int32 to facilitate interoperability
# with streamnet files
basins_gdf.rename(columns={'streamID':'LINKNO'}, inplace=True)
basins_gdf['LINKNO'] = basins_gdf['LINKNO'].astype('int32')

# apply preprocessing to make linkno globally unique
# NOTE(review): the return value is not captured, so this relies on the helper
# updating `basins_gdf` in place — confirm.
preprocessor.tdx_to_global_linkno(basins_gdf, tdx_hydro_region)

# Set 'LINKNO' as index, to facilitate selection
basins_gdf.set_index('LINKNO', inplace=True)

# Show the resulting frame summary and the new 'LINKNO' index
basins_gdf.info()
basins_gdf.index


<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140053 entries, 750000001 to 750327711
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype   
---  ------    --------------   -----   
 0   geometry  140053 non-null  geometry
dtypes: geometry(1)
memory usage: 1.6 MB


Index([750000001, 750000002, 750000003, 750000004, 750000005, 750000006,
       750000007, 750000008, 750000009, 750000010,
       ...
       750322975, 750323566, 750323567, 750324159, 750324751, 750325343,
       750325935, 750326527, 750327119, 750327711],
      dtype='int32', name='LINKNO', length=140053)

## Move MNSI fields from streamnet to basins

In [94]:
streamnet_mnsi_gdf = mnsi_gdf.copy(deep=True)

In [95]:
columns_to_merge = ['DSContArea', 'USContArea']

# Merge confirms that their LINKNO values match
# Although there are not as many basins as there are stream reaches!
basins_test_gdf = pd.merge(
    basins_gdf, 
    streamnet_mnsi_gdf[columns_to_merge], 
    how='right', 
    on='LINKNO',
)
basins_test_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   geometry    140053 non-null  geometry
 1   DSContArea  140097 non-null  float64 
 2   USContArea  140097 non-null  float64 
dtypes: float64(2), geometry(1)
memory usage: 3.7 MB


In [96]:
%%timeit
# Merges are very fast!
pd.merge(basins_gdf, streamnet_mnsi_gdf[columns_to_merge], how='right', on='LINKNO')

7.03 ms ± 183 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [97]:
# Pandas insert allows placement of column at a specified location.
ROOT = "ROOT_ID"
basins_test_gdf.insert(0, ROOT, streamnet_mnsi_gdf[ROOT])
basins_test_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 4 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   ROOT_ID     140097 non-null  int32   
 1   geometry    140053 non-null  geometry
 2   DSContArea  140097 non-null  float64 
 3   USContArea  140097 non-null  float64 
dtypes: float64(2), geometry(1), int32(1)
memory usage: 4.3 MB


In [98]:
%%time
# Insert is even faster!
DISCOVER = "DISCOVER_TIME"
basins_test_gdf.insert(0, DISCOVER, streamnet_mnsi_gdf[DISCOVER])

CPU times: user 357 μs, sys: 172 μs, total: 529 μs
Wall time: 422 μs


In [99]:
basins_test_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   DISCOVER_TIME  140097 non-null  int32   
 1   ROOT_ID        140097 non-null  int32   
 2   geometry       140053 non-null  geometry
 3   DSContArea     140097 non-null  float64 
 4   USContArea     140097 non-null  float64 
dtypes: float64(2), geometry(1), int32(2)
memory usage: 4.8 MB


## Streams with no Basins

In [None]:
# Explore stream links with no basin geometry.
# `isna()` is the supported way to flag missing geometries; an elementwise
# `== None` comparison is fragile and depends on library internals.
streams_no_basin_gdf = streamnet_mnsi_gdf[basins_test_gdf.geometry.isna()]
streams_no_basin_gdf.info()
streams_no_basin_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 44 entries, 750000000 to 750020103
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   DSLINKNO       44 non-null     int32   
 1   USLINKNO1      44 non-null     int32   
 2   USLINKNO2      44 non-null     int32   
 3   ROOT_ID        44 non-null     int32   
 4   DISCOVER_TIME  44 non-null     int32   
 5   FINISH_TIME    44 non-null     int32   
 6   strmOrder      44 non-null     int32   
 7   Length         44 non-null     float64 
 8   Magnitude      44 non-null     int32   
 9   DSContArea     44 non-null     float64 
 10  strmDrop       44 non-null     float64 
 11  Slope          44 non-null     float64 
 12  StraightL      44 non-null     float64 
 13  USContArea     44 non-null     float64 
 14  DOUTEND        44 non-null     float64 
 15  DOUTSTART      44 non-null     float64 
 16  DOUTMID        44 non-null     float64 
 17  geometry       44 n

Unnamed: 0_level_0,DSLINKNO,USLINKNO1,USLINKNO2,ROOT_ID,DISCOVER_TIME,FINISH_TIME,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,DOUTEND,DOUTSTART,DOUTMID,geometry
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
750000000,750001777,-1,-1,750021317,52,53,1,3847.9,1,9567845.0,42.07,0.010933,3233.7,5254868.0,45853.6,49701.4,47777.5,"LINESTRING (-69.67822 46.41356, -69.67822 46.4..."
750100709,750101301,750090644,750068149,750100710,5,16,3,0.0,6,95732660.0,0.0,0.0,0.0,95732660.0,7889.2,7889.2,7889.2,"LINESTRING (-69.74322 43.89322, -69.74322 43.8..."
750155209,750155801,750154617,750113177,750170058,1767,4980,7,0.0,1607,24103590000.0,0.0,0.0,0.0,24103590000.0,234019.3,234019.3,234019.3,"LINESTRING (-73.75744 42.54056, -73.75744 42.5..."
750127463,750128055,750141672,750010841,750129283,3471,4886,6,0.0,708,10990470000.0,0.0,0.0,0.0,10990470000.0,423795.8,423795.8,423795.8,"LINESTRING (-78.23989 39.65089, -78.23989 39.6..."
750099079,750099671,750055269,750055861,750102638,89,96,2,0.0,4,87144990.0,0.0,0.0,0.0,87144990.0,68889.3,68889.3,68889.3,"LINESTRING (-76.08133 38.47233, -76.08133 38.4..."


In [None]:
streams_no_basin_gdf.Length.value_counts()

Length
0.0       43
3847.9     1
Name: count, dtype: int64

**NOTE:** All but one have zero stream length. The one with a nonzero length is a headwater stream at the edge of the TDX-Hydro region.

In [None]:
# Write to GeoPackage to explore in QGIS
pyogrio.write_dataframe(streams_no_basin_gdf, tdx_dir / 'streams_no_basins.gpkg')
# looks like random mistakes


In [None]:
# Drop no-geometry rows from basins gdf
basins_test_gdf.drop(streams_no_basin_gdf.index.to_list(), inplace=True)
basins_test_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140053 entries, 750000001 to 750000589
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   DISCOVER_TIME  140053 non-null  int32   
 1   ROOT_ID        140053 non-null  int32   
 2   geometry       140053 non-null  geometry
 3   DSContArea     140053 non-null  float64 
 4   USContArea     140053 non-null  float64 
dtypes: float64(2), geometry(1), int32(2)
memory usage: 4.8 MB


## Use `create_basins_mnsi()` function
This function combines the steps above.

In [None]:
reload(gh.process)
reload(gh.mnsi)

<module 'global_hydrography.delineation.mnsi' from '/Users/aaufdenkampe/Documents/Python/global-hydrography/src/global_hydrography/delineation/mnsi.py'>

In [None]:
basins_mnsi_gdf, streams_no_basin_gdf2 = gh.process.create_basins_mnsi(
    basins_gdf,
    streamnet_mnsi_gdf,
)
basins_mnsi_gdf.info()
basins_mnsi_gdf.head()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140053 entries, 750000001 to 750000589
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   geometry       140053 non-null  geometry
 1   ROOT_ID        140053 non-null  int32   
 2   FINISH_TIME    140053 non-null  int32   
 3   DISCOVER_TIME  140053 non-null  int32   
dtypes: geometry(1), int32(3)
memory usage: 3.2 MB


Unnamed: 0_level_0,geometry,ROOT_ID,FINISH_TIME,DISCOVER_TIME
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
750000001,"POLYGON ((-69.71706 46.42639, -69.71572 46.426...",750021317,50,49
750000593,"POLYGON ((-69.70117 46.4495, -69.70106 46.4495...",750021317,52,51
750001777,"POLYGON ((-69.68828 46.41661, -69.68783 46.416...",750021317,53,50
750000002,"POLYGON ((-69.71939 46.39428, -69.71928 46.394...",750021317,48,47
750000592,"POLYGON ((-69.61317 46.46194, -69.61272 46.461...",750021317,41,40


In [None]:
streamnet_mnsi_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   DSLINKNO       140097 non-null  int32   
 1   USLINKNO1      140097 non-null  int32   
 2   USLINKNO2      140097 non-null  int32   
 3   ROOT_ID        140097 non-null  int32   
 4   DISCOVER_TIME  140097 non-null  int32   
 5   FINISH_TIME    140097 non-null  int32   
 6   strmOrder      140097 non-null  int32   
 7   Length         140097 non-null  float64 
 8   Magnitude      140097 non-null  int32   
 9   DSContArea     140097 non-null  float64 
 10  strmDrop       140097 non-null  float64 
 11  Slope          140097 non-null  float64 
 12  StraightL      140097 non-null  float64 
 13  USContArea     140097 non-null  float64 
 14  DOUTEND        140097 non-null  float64 
 15  DOUTSTART      140097 non-null  float64 
 16  DOUTMID        140097 non-null  float64 
 

## Use `compute_dissolve_groups()`

In [114]:
reload(gh.process)

<module 'global_hydrography.process' from '/Users/aaufdenkampe/Documents/Python/global-hydrography/src/global_hydrography/process.py'>

In [136]:
streamnet_mnsi_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   DSLINKNO       140097 non-null  int32   
 1   USLINKNO1      140097 non-null  int32   
 2   USLINKNO2      140097 non-null  int32   
 3   ROOT_ID        140097 non-null  int32   
 4   DISCOVER_TIME  140097 non-null  int32   
 5   FINISH_TIME    140097 non-null  int32   
 6   strmOrder      140097 non-null  int32   
 7   Length         140097 non-null  float64 
 8   Magnitude      140097 non-null  int32   
 9   DSContArea     140097 non-null  float64 
 10  strmDrop       140097 non-null  float64 
 11  Slope          140097 non-null  float64 
 12  StraightL      140097 non-null  float64 
 13  USContArea     140097 non-null  float64 
 14  DOUTEND        140097 non-null  float64 
 15  DOUTSTART      140097 non-null  float64 
 16  DOUTMID        140097 non-null  float64 
 

In [137]:
streamnet_mnsi_gdf.sort_index(inplace=True)
streamnet_mnsi_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750327711
Data columns (total 18 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   DSLINKNO       140097 non-null  int32   
 1   USLINKNO1      140097 non-null  int32   
 2   USLINKNO2      140097 non-null  int32   
 3   ROOT_ID        140097 non-null  int32   
 4   DISCOVER_TIME  140097 non-null  int32   
 5   FINISH_TIME    140097 non-null  int32   
 6   strmOrder      140097 non-null  int32   
 7   Length         140097 non-null  float64 
 8   Magnitude      140097 non-null  int32   
 9   DSContArea     140097 non-null  float64 
 10  strmDrop       140097 non-null  float64 
 11  Slope          140097 non-null  float64 
 12  StraightL      140097 non-null  float64 
 13  USContArea     140097 non-null  float64 
 14  DOUTEND        140097 non-null  float64 
 15  DOUTSTART      140097 non-null  float64 
 16  DOUTMID        140097 non-null  float64 
 

In [138]:
# Compute predissolve group on streamnet file
# NOTE: takes 6m 11s on Anthony's Mac M3-Max
streamnet_mnsi_groups_gdf = gh.process.compute_dissolve_groups(
    streamnet_mnsi_gdf, 
    max_elements=200, 
    min_elements=125,
)
# 6m 22.1s for non-sorted index
# 6m 27.4s for sorted index

Previous elements 140097, new elements 90439
Previous elements 90439, new elements 73800
Previous elements 73800, new elements 64200
Previous elements 64200, new elements 58686
Previous elements 58686, new elements 54406
Previous elements 54406, new elements 51169
Previous elements 51169, new elements 48577
Previous elements 48577, new elements 46602
Previous elements 46602, new elements 45320
Previous elements 45320, new elements 44449
Previous elements 44449, new elements 43522
Previous elements 43522, new elements 42686
Previous elements 42686, new elements 42172
Previous elements 42172, new elements 41974
Previous elements 41974, new elements 41974
No progress was made. New min threshold is 100
Previous elements 41974, new elements 33149
Min threshold reset to 125
Previous elements 33149, new elements 29377
Previous elements 29377, new elements 26257
Previous elements 26257, new elements 23494
Previous elements 23494, new elements 21897
Previous elements 21897, new elements 20811
P

In [116]:
# Summarize the columns and dtypes, then show the frame itself as the cell output
streamnet_mnsi_groups_gdf.info()
streamnet_mnsi_groups_gdf

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   DSLINKNO          140097 non-null  int32   
 1   USLINKNO1         140097 non-null  int32   
 2   USLINKNO2         140097 non-null  int32   
 3   ROOT_ID           140097 non-null  int32   
 4   DISCOVER_TIME     140097 non-null  int32   
 5   FINISH_TIME       140097 non-null  int32   
 6   ELEMENT_COUNT     140097 non-null  int32   
 7   DISSOLVE_ROOT_ID  140097 non-null  int32   
 8   strmOrder         140097 non-null  int32   
 9   Length            140097 non-null  float64 
 10  Magnitude         140097 non-null  int32   
 11  DSContArea        140097 non-null  float64 
 12  strmDrop          140097 non-null  float64 
 13  Slope             140097 non-null  float64 
 14  StraightL         140097 non-null  float64 
 15  USContArea        140097 non-null  fl

Unnamed: 0_level_0,DSLINKNO,USLINKNO1,USLINKNO2,ROOT_ID,DISCOVER_TIME,FINISH_TIME,ELEMENT_COUNT,DISSOLVE_ROOT_ID,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,DOUTEND,DOUTSTART,DOUTMID,geometry
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
750000000,750001777,-1,-1,750021317,52,53,1,750021317,1,3847.9,1,9567845.0,42.07,0.010933,3233.7,5254867.5,45853.6,49701.4,47777.5,"LINESTRING (-69.67822 46.41356, -69.67822 46.4..."
750000001,750002369,-1,-1,750021317,49,50,1,750021317,1,2251.3,1,8768556.0,34.66,0.015397,1749.2,4320561.0,44802.7,47054.1,45928.4,"LINESTRING (-69.68589 46.40778, -69.686 46.407..."
750000593,750001777,-1,-1,750021317,51,52,1,750021317,1,1469.3,1,8466694.0,11.98,0.008153,1286.2,4319318.0,45853.6,47322.9,46588.3,"LINESTRING (-69.67822 46.41356, -69.67811 46.4..."
750001777,750002369,750000000,750000593,750021317,50,53,3,750021317,2,1050.9,2,19939082.0,0.91,0.000870,871.8,18034788.0,44802.7,45853.6,45328.2,"LINESTRING (-69.68589 46.40778, -69.68589 46.4..."
750000002,750004146,-1,-1,750021317,47,48,1,750021317,1,3551.0,1,9120895.0,67.48,0.019002,2593.6,5267176.0,41041.1,44591.7,42816.4,"LINESTRING (-69.687 46.37911, -69.687 46.379, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750000587,-1,-1,-1,750000587,1,2,1,750000587,1,2354.1,1,10233235.0,0.00,0.000000,1721.9,7569312.0,0.0,2354.1,1177.1,"LINESTRING (-81.59922 24.64033, -81.59911 24.6..."
750001180,-1,-1,-1,750001180,1,2,1,750001180,1,1326.7,1,9136435.0,0.00,0.000000,1072.3,4495984.5,0.0,1326.7,663.4,"LINESTRING (-81.63022 24.61767, -81.63011 24.6..."
750001772,-1,-1,-1,750001772,1,2,1,750001772,1,1000.1,1,4879280.0,0.00,0.000000,738.8,4387448.5,0.0,1000.1,500.0,"LINESTRING (-81.60144 24.58478, -81.60156 24.5..."
750000588,-1,-1,-1,750000588,1,2,1,750000588,1,2044.7,1,5911555.0,0.76,0.000370,1396.2,4346421.0,0.0,2044.7,1022.4,"LINESTRING (-81.64478 24.57489, -81.64489 24.5..."


In [117]:
# Distribution of group sizes: how many reaches carry each ELEMENT_COUNT value
streamnet_mnsi_groups_gdf[ELEMENT_COUNT].value_counts()

ELEMENT_COUNT
1      70882
3      16073
5       8087
7       5168
9       3675
       ...  
154       48
194       47
200       43
192       43
188       41
Name: count, Length: 200, dtype: int64

In [132]:
# Scratch check: add the two dissolve-group columns to a copy of the stream network
test2_gdf = streamnet_mnsi_gdf.copy()
# Both new columns go directly after the MNSI finish-time column
insert_loc = test2_gdf.columns.get_loc(gh.mnsi.FINISH) + 1
test2_gdf.insert(
    loc=insert_loc,
    column=ELEMENT_COUNT,
    value=test2_gdf[gh.mnsi.FINISH] - test2_gdf[gh.mnsi.DISCOVER],
)
# Placeholder column with no values yet
test2_gdf.insert(loc=insert_loc + 1, column=DISSOLVE_ROOT_ID, value=None)
test2_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140097 entries, 750000000 to 750000589
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   DSLINKNO          140097 non-null  int32   
 1   USLINKNO1         140097 non-null  int32   
 2   USLINKNO2         140097 non-null  int32   
 3   ROOT_ID           140097 non-null  int32   
 4   DISCOVER_TIME     140097 non-null  int32   
 5   FINISH_TIME       140097 non-null  int32   
 6   ELEMENT_COUNT     140097 non-null  int32   
 7   DISSOLVE_ROOT_ID  0 non-null       object  
 8   strmOrder         140097 non-null  int32   
 9   Length            140097 non-null  float64 
 10  Magnitude         140097 non-null  int32   
 11  DSContArea        140097 non-null  float64 
 12  strmDrop          140097 non-null  float64 
 13  Slope             140097 non-null  float64 
 14  StraightL         140097 non-null  float64 
 15  USContArea        140097 non-null  fl

In [133]:
## Copy the MNSI and dissolve-group fields from the stream network onto the basins ##
fields_to_copy = list(MNSI_FIELDS) + [ELEMENT_COUNT, DISSOLVE_ROOT_ID]
# Returns the basins with the copied fields, plus the stream reaches that have no basin
basins_mnsi_groups_gdf, streams_no_basin_gdf = gh.process.create_basins_mnsi(
    basins_gdf, streamnet_mnsi_groups_gdf, fields_to_copy=fields_to_copy
)
basins_mnsi_groups_gdf.info()
streams_no_basin_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140053 entries, 750000001 to 750000589
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   geometry          140053 non-null  geometry
 1   ROOT_ID           140053 non-null  int32   
 2   FINISH_TIME       140053 non-null  int32   
 3   DISCOVER_TIME     140053 non-null  int32   
 4   ELEMENT_COUNT     140053 non-null  int32   
 5   DISSOLVE_ROOT_ID  140053 non-null  int32   
dtypes: geometry(1), int32(5)
memory usage: 4.3 MB
<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 44 entries, 750000000 to 750020103
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype   
---  ------            --------------  -----   
 0   DSLINKNO          44 non-null     int32   
 1   USLINKNO1         44 non-null     int32   
 2   USLINKNO2         44 non-null     int32   
 3   ROOT_ID           44 non-null     int32   
 4   DISCOVER_TIME     4

In [134]:
# Number of basins assigned to each dissolve group root
basins_mnsi_groups_gdf[DISSOLVE_ROOT_ID].value_counts()

DISSOLVE_ROOT_ID
750231896    200
750180676    200
750207928    200
750311726    200
750136940    200
            ... 
750043999      1
750045775      1
750041038      1
750040446      1
750000589      1
Name: count, Length: 2328, dtype: int64

In [135]:
# Copy the fields onto the basins again, this time rebinding `basins_gdf` to the
# result instead of keeping a separately named output.
# NOTE(review): this overwrites the cell's own input, so a re-run feeds the
# already-processed frame back in; confirm that is safe.
basins_gdf, streams_no_basin_gdf = gh.process.create_basins_mnsi(
    basins_gdf, streamnet_mnsi_groups_gdf, fields_to_copy=fields_to_copy
)
basins_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 140053 entries, 750000001 to 750000589
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   geometry          140053 non-null  geometry
 1   ROOT_ID           140053 non-null  int32   
 2   FINISH_TIME       140053 non-null  int32   
 3   DISCOVER_TIME     140053 non-null  int32   
 4   ELEMENT_COUNT     140053 non-null  int32   
 5   DISSOLVE_ROOT_ID  140053 non-null  int32   
dtypes: geometry(1), int32(5)
memory usage: 4.3 MB


## Drop fields

In [None]:
# Preview the stream network without the MNSI columns (returns a new frame; not assigned)
streamnet_mnsi_gdf.drop(gh.mnsi.MNSI_FIELDS, axis="columns")


Unnamed: 0_level_0,DSLINKNO,USLINKNO1,USLINKNO2,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,DOUTEND,DOUTSTART,DOUTMID,geometry
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
750000000,750001777,-1,-1,1,3847.9,1,9567845.0,42.07,0.010933,3233.7,5254867.5,45853.6,49701.4,47777.5,"LINESTRING (-69.67822 46.41356, -69.67822 46.4..."
750000001,750002369,-1,-1,1,2251.3,1,8768556.0,34.66,0.015397,1749.2,4320561.0,44802.7,47054.1,45928.4,"LINESTRING (-69.68589 46.40778, -69.686 46.407..."
750000593,750001777,-1,-1,1,1469.3,1,8466694.0,11.98,0.008153,1286.2,4319318.0,45853.6,47322.9,46588.3,"LINESTRING (-69.67822 46.41356, -69.67811 46.4..."
750001777,750002369,750000000,750000593,2,1050.9,2,19939082.0,0.91,0.000870,871.8,18034788.0,44802.7,45853.6,45328.2,"LINESTRING (-69.68589 46.40778, -69.68589 46.4..."
750000002,750004146,-1,-1,1,3551.0,1,9120895.0,67.48,0.019002,2593.6,5267176.0,41041.1,44591.7,42816.4,"LINESTRING (-69.687 46.37911, -69.687 46.379, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
750000587,-1,-1,-1,1,2354.1,1,10233235.0,0.00,0.000000,1721.9,7569312.0,0.0,2354.1,1177.1,"LINESTRING (-81.59922 24.64033, -81.59911 24.6..."
750001180,-1,-1,-1,1,1326.7,1,9136435.0,0.00,0.000000,1072.3,4495984.5,0.0,1326.7,663.4,"LINESTRING (-81.63022 24.61767, -81.63011 24.6..."
750001772,-1,-1,-1,1,1000.1,1,4879280.0,0.00,0.000000,738.8,4387448.5,0.0,1000.1,500.0,"LINESTRING (-81.60144 24.58478, -81.60156 24.5..."
750000588,-1,-1,-1,1,2044.7,1,5911555.0,0.76,0.000370,1396.2,4346421.0,0.0,2044.7,1022.4,"LINESTRING (-81.64478 24.57489, -81.64489 24.5..."


In [None]:
# Column names of the modified nested set index (MNSI) fields
DISCOVER = "DISCOVER_TIME"
FINISH = "FINISH_TIME"
ROOT = "ROOT_ID"
# Add each MNSI field as an empty placeholder column at the front of the frame.
# Inserting at position 0 in this order leaves them as ROOT, DISCOVER, FINISH.
# Fields already present are skipped: DataFrame.insert raises ValueError on a
# duplicate column name, which would make this cell fail on re-run or when
# `basins_gdf` already carries the MNSI fields.
for f in (FINISH, DISCOVER, ROOT):
    if f not in basins_gdf.columns:
        basins_gdf.insert(0, f, None)

In [None]:
# Map each output dataset name to its (placeholder) value and echo the pairs
gdf_dict = dict(
    streamnet='x',
    streamreach_basins_mnsi='y',
    streams_no_basin='z',
)
for dataset, gdf in gdf_dict.items():
    print(dataset, gdf)

streamnet x
streamreach_basins_mnsi y
streams_no_basin z
