In [25]:
# imports 
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from shapely.geometry import LineString, Point

from processing import *

## Read the shapefile width and depth data
A simple near-global database of bankfull widths and depths (along with confidence intervals) was developed based on hydraulic geometry equations and the HydroSHEDS hydrography data set. The bankfull width estimates were evaluated with widths derived from Landsat imagery for reaches of nine major rivers, showing errors ranging from 8 to 62% (correlation of 0.88), although it was difficult to verify whether the satellite observations corresponded to bankfull conditions. Bankfull depth estimates were compared with in situ measurements at sites in the Ohio and Willamette rivers, producing a mean error of 24%. 

In [2]:
# note: this takes long to read in
shapefile_dir = "../data/wd_data/asia"
gdf = gpd.read_file(shapefile_dir)

gdf.head()

Unnamed: 0,cat,a_cat,a_ARCID,a_UP_CELLS,a_AREA,a_WIDTH,a_WIDTH5,a_WIDTH95,a_DEPTH,a_DEPTH5,a_DEPTH95,b_cat,b_value,b_label,geometry
0,1,255,255,103,54.92,8.8,3.15,24.9,0.32,0.14,0.74,1,1,,"LINESTRING (93.77500 59.99375, 93.78125 59.99375)"
1,2,257,257,108,57.18,4.47,1.64,12.3,0.19,0.08,0.43,1,1,,"LINESTRING (99.39167 60.00000, 99.39792 59.993..."
2,3,263,263,107,35.49,7.08,2.56,19.84,0.27,0.12,0.62,1,1,,"LINESTRING (114.46250 59.99167, 114.45625 59.9..."
3,4,266,266,108,2179.32,75.47,24.81,232.6,1.69,0.72,4.13,1,1,,"LINESTRING (161.53333 59.99375, 161.55208 59.9..."
4,5,269,269,3363,442.76,25.02,8.6,73.78,0.71,0.31,1.71,1,1,,"LINESTRING (91.80208 59.99375, 91.80625 59.98958)"


In [12]:
gdf.drop(columns=['b_value', 'b_label'], inplace=True)
gdf.columns

Index(['cat', 'a_cat', 'a_ARCID', 'a_UP_CELLS', 'a_AREA', 'a_WIDTH',
       'a_WIDTH5', 'a_WIDTH95', 'a_DEPTH', 'a_DEPTH5', 'a_DEPTH95', 'b_cat',
       'geometry'],
      dtype='object')

In [15]:
# get midpoints of geometries column LINESTRINGS to use for matching 

gdf['points'] = None
for index, row in gdf.iterrows():
    # Compute the midpoint of the LINESTRING
    geometry = row['geometry']
    midpoint = geometry.interpolate(0.5, normalized=True)
    gdf.at[index, 'points'] = midpoint

Unnamed: 0,cat,a_cat,a_ARCID,a_UP_CELLS,a_AREA,a_WIDTH,a_WIDTH5,a_WIDTH95,a_DEPTH,a_DEPTH5,a_DEPTH95,b_cat,geometry,points
0,1,255,255,103,54.92,8.8,3.15,24.9,0.32,0.14,0.74,1,"LINESTRING (93.77500 59.99375, 93.78125 59.99375)",POINT (93.7781249999994 59.99374999999885)
1,2,257,257,108,57.18,4.47,1.64,12.3,0.19,0.08,0.43,1,"LINESTRING (99.39167 60.00000, 99.39792 59.993...",POINT (99.39773794492093 59.993928721743906)
2,3,263,263,107,35.49,7.08,2.56,19.84,0.27,0.12,0.62,1,"LINESTRING (114.46250 59.99167, 114.45625 59.9...",POINT (114.4579018608716 59.99626480579299)
3,4,266,266,108,2179.32,75.47,24.81,232.6,1.69,0.72,4.13,1,"LINESTRING (161.53333 59.99375, 161.55208 59.9...",POINT (161.54270833333166 59.99374999999885)
4,5,269,269,3363,442.76,25.02,8.6,73.78,0.71,0.31,1.71,1,"LINESTRING (91.80208 59.99375, 91.80625 59.98958)",POINT (91.8041666666661 59.991666666665516)


In [17]:
gdf.shape

(928305, 14)

In [18]:
# Function to filter points based on longitude and latitude values
def filter_points(geometry):
    # Extract longitude and latitude from the Point geometry
    longitude, latitude = geometry.x, geometry.y
    
    # Check if longitude is between 82 and 96 and latitude is between 22 and 31
    if 82 <= longitude <= 96 and 22 <= latitude <= 31:
        return True
    else:
        return False

In [24]:
# Filter gdf to keep only the points in the right area
gdf['keep'] = gdf['points'].apply(filter_points)
filtered_gdf = gdf[gdf['keep']]
filtered_gdf.drop(columns=['keep', 'geometry'])
filtered_gdf = filtered_gdf.rename(columns={'points': 'geometry'})
filtered_gdf.shape

(38064, 15)

In [None]:
def get_closest(df, river_points, nn=5):
    """Find the closest point in the wd data to each point in the existing river points."""
    # river points should be a column of points
    
    # Create an empty DataFrame to store the closest points
    closest_points = pd.DataFrame(columns=df.columns)

    # Create a BallTree for data
    tree = BallTree(df['geometry'].apply(lambda x: [x.coords[0][0], x.coords[0][1]]).tolist())
    
    # Find the closest points in the data to each point in river_points
    for true_point in river_points:
        _, ind = tree.query([[true_point.x, true_point.y]], k=nn)  # Find NNs --> 1 is too few

        if len(ind) > 0:
            # Choose closest point among 5 NNs
            min_distance = float('inf')
            closest_row = None

            for i in ind[0]:
                distance = calculate_distance(df.iloc[i]['geometry'], true_point)
                if distance < min_distance:
                    min_distance = distance
                    closest_row = df.iloc[i]
            
            closest_points.loc[len(closest_points.index)] = closest_row

    print('Finished finding closest points, converting to geo.')
    

    ## fix from here 
    geo_points = gpd.GeoDataFrame(closest_points, 
                                  crs={'init':'epsg:4326'}, 
                                  geometry=closest_points['geometry'])
    
    geo_points['time'] = pd.to_datetime(geo_points['time'])
    geo_points['time'] = geo_points['time'].dt.date

    return geo_points


In [None]:
get_closest(df, river_points, nn=5)

matched = get_closest(filtered_df_2015, river_points, 3)


In [3]:
dis = pd.read_csv('../clean_data/concatenated_data.csv')