In [50]:
# imports 
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from shapely.geometry import LineString, Point

from processing import *

## Read the shapefile width and depth data
A simple near-global database of bankfull widths and depths (along with confidence intervals) was developed based on hydraulic geometry equations and the HydroSHEDS hydrography data set. The bankfull width estimates were evaluated with widths derived from Landsat imagery for reaches of nine major rivers, showing errors ranging from 8 to 62% (correlation of 0.88), although it was difficult to verify whether the satellite observations corresponded to bankfull conditions. Bankfull depth estimates were compared with in situ measurements at sites in the Ohio and Willamette rivers, producing a mean error of 24%. 

In [43]:
# note: this takes long to read in
shapefile_dir = "../data/wd_data/asia"
gdf = gpd.read_file(shapefile_dir)

gdf.head()

KeyboardInterrupt: 

In [46]:
gdf.columns

Index(['cat', 'a_cat', 'a_ARCID', 'a_UP_CELLS', 'a_AREA', 'a_WIDTH',
       'a_WIDTH5', 'a_WIDTH95', 'a_DEPTH', 'a_DEPTH5', 'a_DEPTH95', 'b_cat',
       'geometry', 'points', 'keep'],
      dtype='object')

In [47]:
filtered_gdf = gdf[['a_AREA', 'a_WIDTH', 'a_DEPTH', 'geometry']].copy()
filtered_gdf.rename(columns={'a_AREA': 'area', 'a_WIDTH': 'width', 'a_DEPTH': 'depth'}, inplace=True)
filtered_gdf.head()


Unnamed: 0,area,width,depth,geometry
0,54.92,8.8,0.32,"LINESTRING (93.77500 59.99375, 93.78125 59.99375)"
1,57.18,4.47,0.19,"LINESTRING (99.39167 60.00000, 99.39792 59.993..."
2,35.49,7.08,0.27,"LINESTRING (114.46250 59.99167, 114.45625 59.9..."
3,2179.32,75.47,1.69,"LINESTRING (161.53333 59.99375, 161.55208 59.9..."
4,442.76,25.02,0.71,"LINESTRING (91.80208 59.99375, 91.80625 59.98958)"


In [51]:
# get midpoints of geometries column LINESTRINGS to use for matching 
# also takes long to run

filtered_gdf['points'] = None
for index, row in filtered_gdf.iterrows():
    # Compute the midpoint of the LINESTRING
    geometry = row['geometry']
    midpoint = geometry.interpolate(0.5, normalized=True)
    filtered_gdf.at[index, 'points'] = midpoint

In [52]:
filtered_gdf.shape

(928305, 5)

In [54]:
# Filter gdf to keep only the points in the right area
filtered_gdf['keep'] = filtered_gdf['points'].apply(filter_points)
filtered_gdf = filtered_gdf[filtered_gdf['keep']]
filtered_gdf = filtered_gdf.drop(columns=['keep', 'geometry'])
filtered_gdf = filtered_gdf.rename(columns={'points': 'geometry'})

filtered_gdf.shape

(38064, 4)

In [55]:
filtered_gdf.head()

Unnamed: 0,area,width,depth,geometry
564359,1307.9,96.03,2.04,POINT (86.58480647246033 30.998526860871845)
564369,1671.54,109.7,2.26,POINT (94.7526935275386 30.998526860871845)
564426,63.24,18.57,0.57,POINT (83.08602686087211 30.997916666665983)
564427,63.24,18.57,0.57,POINT (83.09809538841131 30.993749999999316)
564428,402.03,50.65,1.24,POINT (83.30416666666625 30.999999999999314)


## Matching

In [11]:
# read the cleaned data 
dis = pd.read_csv('../clean_data/concatenated_data.csv') # same as overlapping_coords in geo_exploration notebook
dis.head()

Unnamed: 0,time,lon,lat,dis24,geometry
0,2015-01-01,83.75,29.85,31.251619,POINT (83.75 29.85)
1,2015-01-01,84.05,29.55,40.950394,POINT (84.05000000000001 29.549999999999997)
2,2015-01-01,84.35,29.55,44.086777,POINT (84.35000000000002 29.549999999999997)
3,2015-01-01,84.75,29.25,56.088715,POINT (84.75 29.25)
4,2015-01-01,85.45,29.25,78.481102,POINT (85.45000000000005 29.25)


In [14]:
unique_river_points = dis['geometry'].unique()

In [68]:
def get_matched_width_depth(discharge_data, width_depth_data, nn=1):
    """Match width and depth data to discharge data based on geometry."""
    
    # Create a BallTree using the geometry coordinates from width_depth_data
    tree = BallTree(width_depth_data['geometry'].apply(lambda x: [x.coords[0][0], x.coords[0][1]]).tolist())

    # Initialize lists to store matched width and depth values
    matched_width = []
    matched_depth = []

    # Iterate over rows in the discharge data
    for idx, row in discharge_data.iterrows():
        # Find the nearest point in the width_depth_data DataFrame
        _, ind = tree.query([[row['geometry'].x, row['geometry'].y]], k=nn)

        if len(ind) > 0:
            # Get the width and depth values from the nearest point
            nearest_width = width_depth_data.iloc[ind[0][0]]['width']
            nearest_depth = width_depth_data.iloc[ind[0][0]]['depth']
            matched_width.append(nearest_width)
            matched_depth.append(nearest_depth)

    # Add matched width and depth values to the discharge data DataFrame
    discharge_data['matched_width'] = matched_width
    discharge_data['matched_depth'] = matched_depth

    return discharge_data

In [69]:
from shapely.wkt import loads

# Assuming your DataFrame is named df and the geometry column is named 'geometry'
dis['geometry'] = dis['geometry'].apply(loads)

TypeError: Expected bytes or string, got Point

In [70]:
type(dis['geometry'][0])

shapely.geometry.point.Point

In [71]:
# river points should be a list of tuples of coords to match i.e. from the cleaned data 
matched = get_matched_width_depth(dis, filtered_gdf, 3)

In [72]:
matched.head()

Unnamed: 0,time,lon,lat,dis24,geometry,matched_width,matched_depth
0,2015-01-01,83.75,29.85,31.251619,POINT (83.75 29.85),300.57,4.96
1,2015-01-01,84.05,29.55,40.950394,POINT (84.05000000000001 29.549999999999997),478.6,7.13
2,2015-01-01,84.35,29.55,44.086777,POINT (84.35000000000002 29.549999999999997),489.12,7.25
3,2015-01-01,84.75,29.25,56.088715,POINT (84.75 29.25),538.02,7.81
4,2015-01-01,85.45,29.25,78.481102,POINT (85.45000000000005 29.25),611.42,8.63


In [75]:
matched.to_csv('../clean_data/width_depth_data.csv', index=False)