In [25]:
# section 1 load all the necessary modules and packages
import glob
import time
import geopandas          as      gpd
import netCDF4            as      nc4
import numpy              as      np
import pandas             as      pd
import xarray             as      xr
from   shapely.geometry   import  Polygon
import matplotlib.pyplot  as      plt
import warnings
import sys
from   shapely.geometry   import  Polygon
import os
import shutil
import platform

def copy_files (source, destination):
    file_list = glob.glob(source)
    for file_path in file_list:
        shutil.copy(file_path, destination)


In [26]:
folder_to_save = './shp_output/'
extension = ''

if platform.system().lower() == "darwin": # on personal computer, shold be changed accodingly
    location_of_HydroLakes = '/Volumes/F:/hydrography/hydrolakes/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10.*'
    location_of_HDMA_riv = '/Volumes/F:/hydrography/HDMA/processed/hdma_global_stream.*'
    location_of_HDMA_cat = '/Volumes/F:/hydrography/HDMA/processed/hdma_global_catch_v2_*.gpkg/*'
elif platform.system().lower() == "linux": # on remote computer, should be changed accordigly
    location_of_HydroLakes = '/home/shg096/data/hydrolakes/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10_shp/HydroLAKES_polys_v10.*'
    location_of_HDMA_riv = '/home/shg096/data/HDMA/processed/hdma_global_stream.*'
    location_of_HDMA_cat = '/home/shg096/data/HDMA/processed/hdma_global_catch_v2_*.gpkg/*'

# if os.path.isdir(folder_to_save):
#     shutil.rmtree(folder_to_save)
#     os.makedirs(folder_to_save)

# copy the dataset
copy_files (location_of_HDMA_riv,\
            folder_to_save)
copy_files (location_of_HDMA_cat,\
            folder_to_save)
copy_files (location_of_HydroLakes,\
            folder_to_save)


# Loading the hydro-lakes version v1.0
## Subsetting and adding extension if desired

In [27]:
# read the hydrolakes shapefile
shp = gpd.read_file(folder_to_save+'HydroLAKES_polys_v10.shp')

# if lakes with specific characters are needed
#shp = shp [shp['Lake_area']>100]; extension = '_100km2' # select the lakes with area more than 100 km2
#shp = shp.reset_index()



## To resolve the Lake Huron there is slight change in the lake shape
### lake Huron is very close to lake superior and therefore it wont be resolved given the coarsness of HDMA river network topology
## Lake Michigan and Lake Huron are merged to one lake with new ID
## Lake id of 847 prolematic lake is also removed (which resulted in circular network topology)


In [28]:
shp_sub = shp

# manupulation of greate lakes to make then resolvabale
# create a test.shp
# get the shp of lake Huron and correct
box = Polygon([[-84.3885, 46.5672],[-84.0244,46.5672],[-84.0244,46.2540],[-84.3885,46.2540]])
box = gpd.GeoDataFrame(pd.DataFrame(['p1'], columns = ['geometry']),
                       crs = {'init':'epsg:4326'},
                       geometry = [box])
shp_sub_Huron = shp_sub[shp_sub['Hylak_id']==8] # lake Huron
indx = shp_sub_Huron.index # get the index of lake Huron
shp_sub_Huron_corrected = gpd.overlay(shp_sub_Huron, box, how = 'difference') # remove the box from lake Huron
shp_sub['geometry'].loc[indx] = shp_sub_Huron_corrected['geometry'].iloc[0] # update lake Huron in shp_sub

# get lake Huron and Michigan and merge them
lake_id = np.array([6,8]) # lake Huron 8, lake Michigan 6
shp_slice = shp_sub[shp_sub['Hylak_id'].isin(lake_id)]
shp_slice.geometry = shp_slice.geometry.buffer(0.00001)
shp_slice ['Hylak_id'] = 1 # unified id and dissolve on that
shp_slice_dissolve = shp_slice.dissolve(by='Hylak_id')
shp_slice_dissolve = shp_slice_dissolve.reset_index(drop=True)

# repopulate the aggregation of the result
columns_list = list(shp_slice_dissolve.columns)
columns_list.remove('geometry')
shp_slice_dissolve [columns_list] = None # put everything else except gemetry none
shp_slice_dissolve ['Hylak_id']   = shp['Hylak_id'].max()+1 # new lake id
shp_slice_dissolve ['Lake_name']  = 'Michigan+Huron'
shp_slice_dissolve ['Lake_area']  = shp_slice['Lake_area'].sum()
shp_slice_dissolve ['Vol_total']  = shp_slice['Vol_total'].sum()
shp_slice_dissolve ['Country']    = 'United States of America'
shp_slice_dissolve ['Continent']  = 'North America'
shp_slice_dissolve ['Poly_src']   = 'SWBD'
shp_slice_dissolve ['Lake_type']  = 1
shp_slice_dissolve ['Grand_id']   = 0

# remove lake Huron and Michigan from the shapefile and add their merger
shp_sub = shp_sub.drop(index=shp_slice.index) # remove lake michigan and huron
shp_sub = shp_sub.append(shp_slice_dissolve) # add merger of lake michigan and huron

# remove the problematic lake Hylak_id = 847
temp = shp_sub[shp_sub['Hylak_id']==847] # find the idx of the hydrolake_id 847
shp_sub = shp_sub.drop(index=temp.index) # remove the lake 847 from the lake diefenbaker

# shift the lake_id by 7000000 not to mix with ids of HDMA river network topology 
shp_sub ['Hylak_id'] = shp_sub ['Hylak_id'] + 7000000

# save the file
shp_sub.to_file(folder_to_save+'HydroLAKES_polys_v10_file_1'+extension+'.shp') # save the subset of the shapefile as a new shapefile



  in_crs_string = _prepare_from_proj_string(in_crs_string)
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: EPSG:4326
Right CRS: +init=epsg:4326 +type=crs

  shp_sub_Huron_corrected = gpd.overlay(shp_sub_Huron, box, how = 'difference') # remove the box from lake Huron

  shp_slice.geometry = shp_slice.geometry.buffer(0.00001)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)
