# Explore issues with duplicate basins

See the email thread "Upload Stream Data to S3" with Terence, where the problem was described on Dec. 3, 2024.

Anthony's first round of exploration in QGIS found that the duplicates come from two TDX-HydroRegions:

- "1020018110": "13" (Congo River Basin, Africa)
- "5020049720": "54" (Australia, Australia and Oceania)

## Python Imports

In [1]:
import os
from pathlib import Path
from importlib import reload

import fsspec
# import s3fs
# import numpy as np
import pandas as pd
import geopandas as gpd
import pyogrio
import pyarrow as pa

In [2]:
# Custom functions for Global Hydrography
import global_hydrography as gh

## Paths

In [13]:
# Directory layout, resolved relative to where the notebook kernel was started.
# The project directory is taken to be the parent of the working directory, so
# confirm these values if the notebook is launched from somewhere else.
working_dir = Path.cwd()
project_dir = working_dir.parent

# data_temp is a temporary data directory that we .gitignore
data_dir = project_dir.joinpath('data_temp')
tdx_dir = data_dir.joinpath('nga')
raw_dir = tdx_dir.joinpath('TDX_HydroRaw')
mnsi_dir = tdx_dir.joinpath('TDX_MNSI_Output')

# Congo
TDX-HydroRegion "1020018110": "13" (Congo River Basin, Africa)

## Processed parquet file

In [25]:
# Processed basins file delivered to E84 for TDX-HydroRegion 1020018110 (Congo).
# The file name is a plain string literal: it has no placeholders, so the
# f-string prefix was redundant.
basins_congo_mnsi_fp = mnsi_dir / 'TDX_streamreach_basins_mnsi_1020018110_01.parquet'

# Displayed as the cell output: confirms the file is present before reading it
basins_congo_mnsi_fp.exists()

True

In [27]:
# Load the delivered basins parquet file and summarize its columns, dtypes and row count
basins_congo_mnsi_gdf = gpd.read_parquet(basins_congo_mnsi_fp)
basins_congo_mnsi_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 924556 entries, 130000001 to 130006845
Data columns (total 6 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   geometry          924556 non-null  geometry
 1   ROOT_ID           924556 non-null  int32   
 2   FINISH_TIME       924556 non-null  int32   
 3   DISCOVER_TIME     924556 non-null  int32   
 4   ELEMENT_COUNT     924556 non-null  int32   
 5   DISSOLVE_ROOT_ID  924556 non-null  int32   
dtypes: geometry(1), int32(5)
memory usage: 28.2 MB


In [32]:
# Inspect the index of the processed basins; repeated values in it indicate duplicated basin IDs
basins_congo_mnsi_gdf.index

Index([130000001, 130001717, 130000003, 130000005, 130001715, 130003429,
       130000006, 130001718, 130003430, 130005141,
       ...
       130008557, 130008557, 130008557, 130001709, 130001710, 130003421,
       130003421, 130003421, 130006845, 130006845],
      dtype='int32', name='LINKNO', length=924556)

In [33]:
# Count how many rows share each index value; any count above 1 is a duplicated basin ID
basins_congo_mnsi_gdf.index.value_counts()

LINKNO
130565963    2105
130356958    1923
130690836    1774
130642990    1772
130565970    1767
             ... 
130070783       1
130069072       1
130069071       1
130065646       1
130495509       1
Name: count, Length: 515349, dtype: int64

In [34]:
# Distinct index values; compare its length with the total row count reported by .info()
basins_congo_mnsi_gdf.index.unique()

Index([130000001, 130001717, 130000003, 130000005, 130001715, 130003429,
       130000006, 130001718, 130003430, 130005141,
       ...
       130003419, 130005131, 130001708, 130003420, 130005133, 130008557,
       130001709, 130001710, 130003421, 130006845],
      dtype='int32', name='LINKNO', length=515349)

In [36]:
# Show every row for 130565963, the most-repeated index value in the value_counts() output above
basins_congo_mnsi_gdf.loc[130565963]

Unnamed: 0_level_0,geometry,ROOT_ID,FINISH_TIME,DISCOVER_TIME,ELEMENT_COUNT,DISSOLVE_ROOT_ID
LINKNO,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
130565963,"POLYGON ((29.97561 -6.76683, 29.97561 -6.76717...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.9755 -6.76717, 29.9755 -6.76728, ...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.97539 -6.76728, 29.97539 -6.76739...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.97528 -6.76739, 29.97528 -6.7675,...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.97517 -6.7675, 29.97517 -6.76761,...",130747391,321017,321004,13,130586507
...,...,...,...,...,...,...
130565963,"POLYGON ((29.74228 -7.00039, 29.74228 -7.0005,...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.74217 -7.0005, 29.74217 -7.00061,...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.74206 -7.00061, 29.74206 -7.00072...",130747391,321017,321004,13,130586507
130565963,"POLYGON ((29.74194 -7.00072, 29.74194 -7.00083...",130747391,321017,321004,13,130586507


### We definitely delivered duplicate rows: 924,556 rows but only 515,349 unique `LINKNO` values!

## Raw GeoPackage File

In [28]:
# Raw basins GeoPackage for TDX-HydroRegion 1020018110 (Congo).
# The file name is a plain string literal: it has no placeholders, so the
# f-string prefix was redundant.
basins_congo_raw_fp = raw_dir / '1020018110-basins.gpkg'

# Displayed as the cell output: confirms the file is present before reading it
basins_congo_raw_fp.exists()

True

In [29]:
# Show the layer metadata of the raw basins GeoPackage (layer name, CRS, fields, feature count, bounds)
pyogrio.read_info(basins_congo_raw_fp)

{'layer_name': 'basins',
 'crs': 'EPSG:4326',
 'encoding': 'UTF-8',
 'fields': array(['streamID'], dtype=object),
 'dtypes': array(['int32'], dtype=object),
 'fid_column': 'fid',
 'geometry_name': 'geom',
 'geometry_type': 'Polygon',
 'features': 924556,
 'total_bounds': (6.4583888888889,
  -18.0690555555561,
  34.018944444445,
  9.26127777777778),
 'driver': 'GPKG',
 'capabilities': {'random_read': True,
  'fast_set_next_by_index': True,
  'fast_spatial_filter': True,
  'fast_feature_count': True,
  'fast_total_bounds': True},
 'layer_metadata': None,
 'dataset_metadata': None}

In [31]:
# Get number of features in the raw basins layer, to compare with the row count of the processed file
pyogrio.read_info(basins_congo_raw_fp)['features']

924556

In [39]:
# Read only the attribute table of the raw basins layer (first layer in the file).
# Skipping geometry takes about half the time, and the Arrow reader is ~50%
# faster, although it doesn't seem to work when reading from s3.
basins_congo_raw_df = pyogrio.read_dataframe(
    basins_congo_raw_fp, layer=0, read_geometry=False, use_arrow=True
)

In [40]:
# Summarize the attribute table read from the raw basins layer (columns, dtypes, row count)
basins_congo_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 924556 entries, 0 to 924555
Data columns (total 1 columns):
 #   Column    Non-Null Count   Dtype
---  ------    --------------   -----
 0   streamID  924556 non-null  int32
dtypes: int32(1)
memory usage: 3.5 MB


In [41]:
# Count rows per streamID in the raw basins; any count above 1 is a duplicated basin ID
basins_congo_raw_df['streamID'].value_counts()

streamID
565963    2105
356958    1923
690836    1774
642990    1772
565970    1767
          ... 
236846       1
200894       1
481662       1
103310       1
1710         1
Name: count, Length: 515349, dtype: int64

### The raw data has the same duplicates: 924,556 basin features but only 515,349 unique `streamID` values!

In [42]:
# Check streams
streams_congo_raw_fp = (
    raw_dir /
    f'1020018110-streamnet.gpkg'
)
streams_congo_raw_fp.exists()

True

In [43]:
# Get number of features
# Number of features in the raw stream network layer, to compare with the basins feature count
pyogrio.read_info(streams_congo_raw_fp)['features']

515435

In [44]:
# Read only the attribute table of the raw stream network (first layer in the file).
# Skipping geometry takes about half the time, and the Arrow reader is ~50%
# faster, although it doesn't seem to work when reading from s3.
streams_congo_raw_df = pyogrio.read_dataframe(
    streams_congo_raw_fp, layer=0, read_geometry=False, use_arrow=True
)

In [45]:
# Summarize the attribute table read from the raw stream network (columns, dtypes, row count)
streams_congo_raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 515435 entries, 0 to 515434
Data columns (total 17 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   LINKNO      515435 non-null  int32  
 1   DSLINKNO    515435 non-null  int32  
 2   USLINKNO1   515435 non-null  int32  
 3   USLINKNO2   515435 non-null  int32  
 4   DSNODEID    515435 non-null  int64  
 5   strmOrder   515435 non-null  int32  
 6   Length      515435 non-null  float64
 7   Magnitude   515435 non-null  int32  
 8   DSContArea  515435 non-null  float64
 9   strmDrop    515435 non-null  float64
 10  Slope       515435 non-null  float64
 11  StraightL   515435 non-null  float64
 12  USContArea  515435 non-null  float64
 13  WSNO        515435 non-null  int32  
 14  DOUTEND     515435 non-null  float64
 15  DOUTSTART   515435 non-null  float64
 16  DOUTMID     515435 non-null  float64
dtypes: float64(9), int32(7), int64(1)
memory usage: 53.1 MB


In [46]:
# Count rows per LINKNO in the raw stream network, to check whether stream IDs repeat
streams_congo_raw_df['LINKNO'].value_counts()

LINKNO
1         1
440923    1
464889    1
463182    1
463179    1
         ..
188862    1
188861    1
188859    1
187150    1
6845      1
Name: count, Length: 515435, dtype: int64

In [48]:
# Look up the stream reach whose LINKNO equals 565963, the most-repeated streamID in the raw basins
streams_congo_raw_df.loc[streams_congo_raw_df['LINKNO'].eq(565963)]

Unnamed: 0,LINKNO,DSLINKNO,USLINKNO1,USLINKNO2,DSNODEID,strmOrder,Length,Magnitude,DSContArea,strmDrop,Slope,StraightL,USContArea,WSNO,DOUTEND,DOUTSTART,DOUTMID
370844,565963,586507,454698,312595,-1,2,41.9,7,188982528.0,0.0,0.0,38.9,188662496.0,565963,3710881.8,3710923.5,3710902.5
