In [5]:
import pandas as pd
from shapely.ops import unary_union
import shapely
import geopandas as gpd
from shapely.geometry import Polygon
from shapely.geometry import Point
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import requests
import re
import glob
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt


def shp_std_light(name_of_source_file,
                  name_of_result_file,
                  name_of_result_file_fixed_shapes,
                  ID_field,
                  epsilon,
                  list_id):
    """
    @ author:                  Shervan Gharari
    @ Github:                  https://github.com/ShervanGharari/shapefile_standardization
    @ author's email id:       sh.gharari@gmail.com
    @license:                  MIT
    This function gets name of a shapefile and remove inernal holes
    Arguments
    ---------
    name_of_source_file: string, the name of the source file including path and extension
    name_of_result_file: string, the name of the final file including path and extension
    name_of_result_file_fixed_shapes: string, the name of the file that includes fixed shapes
        including path and extension
    ID_field: string, the name of the field in the original shapefile that is used for keeping
        track of holes
    epsilon: real, the minimum distance for buffer operation
    list_id: list of shape IDs that should be corrected
    Returns
    -------
    Saves Files
    -------
    a shp file that includes corrected polygones
    a possible shapefile that includes the fixed shapes
    """
    # load the shapefile
    shp = gpd.read_file(name_of_source_file)
    shp_new = shp # pass the shape to a new shape
    shp_new['flag'] = 0 # add flag for the shapefile ids that are resolved
    for ID in list_id:
        for index, _ in shp.iterrows():
            if shp[ID_field][index] == ID:
                shp_temp = shp.geometry.iloc[index]
                shp_temp = shp_temp.buffer(epsilon,cap_style=2, join_style=2) # to amalgamate tmultipolygons into a polygon
                shp_temp = gpd.GeoSeries(shp_temp) # to geoseries
                shp_temp = gpd.GeoDataFrame(shp_temp) # to geoframe
                shp_temp.columns = ['geometry'] # call the colomn geometry
                poly = shp_temp.geometry.iloc[0] # get the polygone from the shapefile
                A = extract_poly_coords(poly) # extract the exterior
                outer = A['exterior_coords'] # pass the exterior
                poly_new = Polygon (outer) # make a polygone out of the 
                shp_new.geometry.iloc[index] = poly_new # pass the geometry to the new shapefile
                shp_new['flag'].iloc[index] = 1 # put flag as 1
                shp_temp = shp_new.geometry.iloc[index] # get the shape
                shp_temp = shp_temp.buffer(-2*epsilon,cap_style=2, join_style=2) # redo the buffer
                shp_temp = shp_temp.buffer(epsilon,cap_style=2, join_style=2) # redo the buffer
                shp_temp = gpd.GeoSeries(shp_temp) # to geoseries
                shp_temp = gpd.GeoDataFrame(shp_temp) # geo dataframe
                shp_temp.columns = ['geometry'] # name the column as geometry
                shp_new.geometry.iloc[index] = shp_temp.geometry.iloc[0] # pass that to the new shape
    shp_new.to_file(name_of_result_file) 
    shp_new = shp_new [shp_new.flag ==1]
    if not shp_new.empty:
        shp_new.to_file(name_of_result_file_fixed_shapes)
    

def shp_std_hard(name_of_source_file,
                 name_of_result_file,
                 name_of_result_file_holes,
                 name_of_log_file,
                 ID_field,
                 area_tolerance):
    """
    @ author:                  Shervan Gharari
    @ Github:                  https://github.com/ShervanGharari/shapefile_standardization
    @ author's email id:       sh.gharari@gmail.com
    @ license:                 MIT
    This function gets name of a shapefile, its directory, and its extensions (such as gpkg or shp) and
    save a stadard shapefile. if presence it also save the holes of a shapefile
    Arguments
    ---------
    name_of_source_file: string, the name of the source file including path and extension
    name_of_result_file: string, the name of the final file including path and extension
    name_of_result_file_holes: string, the name of the file that includes holes including path
        and extension
    name_of_log_file: string, the name of the text log file with path and txt extension
    ID_field: string, the name of the field in the original shapefile that is used for keeping
        track of holes
    area_tolerance: float; the tolerance to compare area before and after correction and report
        differences
    Returns
    -------
    Saves Files
    -------
    a shp file that includes corrected polygones
    a possible shapefile that includes the removed problematice holes
    a log file in the same folder descringin the invalid shapefiles
    """
    shp_original = gpd.read_file(name_of_source_file)
    shp_poly     = shp_original
    shp_hole     = None
    logfile = open(name_of_log_file,"w") # preparing the log file to write
    number_invalid = 0 # counter for invalid shapes
    number_resolved = 0 # counter for resolved invalid shapes
    number_not_resolved = 0 # counter for not resolved invalid shapes
    for index, _ in shp_original.iterrows():
        # initialization
        polys = shp_original.geometry.iloc[index] # get the shape
        area_before = polys.area # area before changes
        invalid = False # initializing invalid as false
        # check if the shapefile is valid
        if polys.is_valid is False: # check if the geometry is invalid
            number_invalid = number_invalid + 1
            invalid = True
            str_temp = str(number_invalid)+". shape with ID "+str(shp_original[ID_field].iloc[index])+\
            " is not valid"
            logfile.write(str_temp)
        # put the shape into a Polygon or MultiPolygon
        if polys.type == 'Polygon':
            #print(polys.type)
            shp_temp = gpd.GeoSeries(polys) # convert multipolygon to a shapefile with polygons only
            #shp_temp.columns = ['geometry'] # naming geometry column
            shp_temp = gpd.GeoDataFrame(shp_temp) # convert multipolygon to a shapefile with polygons
            shp_temp.columns = ['geometry'] # naming geometry column
            #print(shp_temp)
        if polys.type == 'MultiPolygon':
            #print(polys.type)
            shp_temp = gpd.GeoDataFrame(polys) # convert multipolygon to a shapefile with polygons only
            shp_temp.columns = ['geometry'] # naming geometry column
            #print(shp_temp)
        has_holes = False # initializing hole as false
        shp_temp['CCW'] = 0 # initialize check for couterclockwise (holes)
        for index1, _ in shp_temp.iterrows(): #loop over polygone of one element
            poly = shp_temp.geometry.iloc[index1] # get the geometry of polygon
            if poly.exterior.is_ccw is True: # then the polgone is a hole
                shp_temp['CCW'].iloc[index1] = 1 # set the hole flag to 1
                shp_temp['geometry'].iloc[index1] = shapely.geometry.polygon.orient(poly, sign = +1) 
                # +1 CCW
                #print(shp_temp['geometry'].iloc[index1])
                has_holes = True
        shp_temp_polys = shp_temp[shp_temp.CCW ==0] # get the polyons that are not couter clockwise
        shp_temp_polys['dis'] = 0 # add a field for desolve
        shp_temp_polys = shp_temp_polys.dissolve(by='dis') # to one multipolygon
        polys_temp = shp_temp_polys.geometry.iloc[0] # update the shapefile on that
        polys_temp = unary_union(polys_temp) # unify all the polygons into a multipolygons
        shp_poly.geometry.iloc[index] = polys_temp.buffer(0) # fix the issue by buffer(0)
        shp_poly.geometry.iloc[index] = polys_temp.buffer(0) # fix the issue by buffer(0)
        area_after = shp_poly.geometry.iloc[index].area # area after changes
        # check if the shapefile becomes valid
        # check if the geometry #is invalid
        if shp_poly.geometry.iloc[index].is_valid is True and invalid is True: 
            str_temp = " and becomes valid \n"
            logfile.write(str_temp)
            number_resolved = number_resolved + 1
        # check if the geometry is invalid
        if shp_poly.geometry.iloc[index].is_valid is False and invalid is True:
            str_temp = " and does not become valid; please check the shape \n"
            logfile.write(str_temp)
            number_not_resolved = number_not_resolved + 1
        if has_holes is True:
            shp_temp_holes = shp_temp[shp_temp.CCW ==1]
            shp_temp_holes['dis'] = 0
            shp_temp_holes = shp_temp_holes.dissolve(by='dis') # to one multipolyno
            shp_temp_holes[ID_field] = shp_original[ID_field].iloc[index]
            if shp_hole is None:
                shp_hole = shp_temp_holes
            else:
                shp_hole = gpd.GeoDataFrame( pd.concat([shp_hole, shp_temp_holes], ignore_index=True) )
            str_temp = "Shape has a hole \n"
            logfile.write(str_temp)
        if abs(area_before-area_after)>area_tolerance: # tolernace can be different based on projection
            str_temp = "shape area changes abs("+str(area_before)+"-"+str(area_after)+") = "+\
            str(area_before-area_after)+" \n"
            logfile.write(str_temp)
    shp_poly.to_file(name_of_result_file)
    if shp_hole is not None:
        shp_hole.to_file(name_of_result_file_holes) #save any hole to check
    str_temp = "Total number of shapes = "+str(shp_original.shape[0])+" \n"
    logfile.write(str_temp)
    str_temp = "Total number of invalid shapes = "+str(number_invalid)+" \n"
    logfile.write(str_temp)
    str_temp = "Total number of resolved invalid shapes = "+str(number_resolved)+" \n"
    logfile.write(str_temp)
    str_temp = "Total number of not resolved invalid shapes = "+str(number_not_resolved)+" \n"
    logfile.write(str_temp)
    logfile.close() # close the log gile

def extract_poly_coords(geom):
    if geom.type == 'Polygon':
        exterior_coords = geom.exterior.coords[:]
        interior_coords = []
        for interior in geom.interiors:
            interior_coords += interior.coords[:]
    elif geom.type == 'MultiPolygon':
        exterior_coords = []
        interior_coords = []
        for part in geom:
            epc = extract_poly_coords(part)  # Recursive call
            exterior_coords += epc['exterior_coords']
            interior_coords += epc['interior_coords']
    else:
        raise ValueError('Unhandled geometry type: ' + repr(geom.type))
    return {'exterior_coords': exterior_coords,
            'interior_coords': interior_coords}


In [2]:
# the 2 digit pfaf code for the shapefile to be processed
# list of IDs for downloading the processing
IDs = ['11', '12', '13', '14', '15', '16', '17', '18',
       '21', '22', '23', '24', '25', '26', '27', '28', '29',
       '31', '32', '33', '34', '35', '36',
       '41', '42', '43', '44', '45', '46', '47', '48', '49',
       '51', '52', '53', '54', '55', '56', '57',
       '61', '62', '63', '64', '65', '66', '67',
       '71', '72', '73', '74', '75', '76', '77', '78',
       '81', '82', '83', '84', '85', '86',
       '91']
# IDs = ['11']

# in this folder create subfolders cat, riv, hill, cat_step_1,cat_step_2
path = '/Users/shg096/Desktop/MERIT_Hydro/'


In [3]:
# correct the two problemative shapefiles in the entire catchemtns
list_id = [11040208,56045327]  # the COMID IDs that result in shp_std_hard to crash hole outside shell
for ID in IDs:
    shp_std_light(path+'cat/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1.shp',
                  path+'cat_step_0/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_test.shp',
                  path+'cat_step_0/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_fixed_test.shp',
                 'COMID',
                  0.0000001,
                  list_id)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [6]:
# correct the shapefiles for the catchemtns
for ID in IDs:
    shp_std_hard(path+'cat_step_0/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_test.shp',
                 path+'cat_step_1/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr1_test.shp',
                 path+'cat_step_1/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr1_hole_test.shp',
                 path+'cat_step_1/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr1_log_test.txt',
                 'COMID',
                 0.0000000001)
    

In [7]:
# check the corrected shapefiles from the pervious block with different rules in QGIS with checkvalidity
# is needed provide the COMID here to be recorrected
list_id = [11038670,11040208,11035758,
           12073970,
           16008278,16009917,16012413,
           17008507,
           25000050,
           28045843,28046799,28047182,28050769,28059551,28064206,
           29020703,29028261,29034407,29048575,29050425,29071345,29092185,
           31042597,31079633,32029132,
           43031330,43050399,43063824,
           45074597,
           47014085,
           48001834,48021477,48026003,
           49010729,
           52010007,
           56141996,56158704,
           61008413,61026810,
           62038658,
           64009361,64074730,
           65027306,
           66009603,
           72055397,72055872,72058490,
           74006369,74071283,
           75027926,
           77014997,
           78012325,
           81033705,
           82004214,82041566,82002087,
           91025753,91035154,91035236,91035911]  # list of COMID that are still not valid based on QGIS
for ID in IDs:
    shp_std_light(path+'cat_step_1/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr1_test.shp',
                  path+ 'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr2_test.shp',
                  path+ 'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr2_fixedshp_test.shp',
                 'COMID',
                 0.0000001,
                 list_id)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

In [8]:
# define projection for the corrected subbasins
for ID in IDs:
    shp = gpd.read_file(path+'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_corr2_test.shp')
    shp.crs = {'init': 'epsg:4326', 'no_defs': True}
    shp.to_file(path+'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_fixed_test.shp')



  return _prepare_from_string(" ".join(pjargs))


In [9]:
# define projection for the corrected subbasins
for ID in IDs:
    shp = gpd.read_file(path+'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_fixed_test.shp')
    shp = shp.drop(columns=['flag'])
    shp.to_file(path+'cat_fixed/cat_pfaf_'+ID+'_MERIT_Hydro_v07_Basins_v01_bugfix1_fixed_test.shp')

