# Functions

These are the functions needed in order to run the extraction notebook. **Please do not edit this file as it will break the code**

In [1]:
import geopandas as gpd   # Note that you require geopandas version >= 0.7 that incluse clip see here for installation (https://gis.stackexchange.com/questions/360127/geopandas-0-6-1-installed-instead-of-0-7-0-in-conda-windows-10#)
import os
import fiona
import ipywidgets as widgets
from IPython.display import display
from rasterstats import zonal_stats
import rasterio
from geojson import Feature, Point, FeatureCollection
import rasterio.fill
import shapely
from shapely.geometry import shape, mapping
import pyproj
import json
#from earthpy import clip    clip has been deprecated to geopandas
#import earthpy.spatial as es
import numpy as np
import tkinter as tk
from tkinter import filedialog, messagebox
import gdal
import datetime
import warnings
import pandas as pd
import scipy.spatial
from scipy.spatial import Voronoi
warnings.filterwarnings('ignore')

root = tk.Tk()
root.withdraw()
root.attributes("-topmost", True)

''

## Getting admin 1 boundary name to clusters

In [2]:
def get_admin1_name(clusters, admin_col_name, crs):
    """Ask the user for the admin 1 boundary layer and tag every cluster with its admin name.

    Interactive front-end for ``get_admin1_name_bulk``: it shows the OnSSET
    prompt, opens a file dialog and passes the selected path on, so the join
    logic lives in a single place.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    admin_col_name : name of the column in the boundary layer that holds the admin name.
    crs : forwarded unchanged to ``get_admin1_name_bulk``.

    Returns
    -------
    Whatever ``get_admin1_name_bulk`` returns (clusters with an 'Admin_1' column).
    """
    messagebox.showinfo('OnSSET', 'Select the admin 1 boundaries')
    file_name = filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*")))

    return get_admin1_name_bulk(clusters, file_name, admin_col_name, crs)

In [None]:
def get_admin1_name_bulk(clusters, file_name, admin_col_name, crs):
    """Tag every cluster with the admin 1 name(s) read from a boundary file.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    file_name : path of the admin 1 boundary layer (anything ``gpd.read_file`` accepts).
    admin_col_name : name of the column in the boundary layer that holds the admin name.
    crs : not used by this function; kept so existing calls keep working.

    Returns
    -------
    ``clusters`` left-merged on 'id' with a new 'Admin_1' column
    (missing where no boundary intersects the cluster).
    """
    admin_1 = gpd.read_file(file_name)

    # Clusters are reprojected to EPSG:4326 for the join; the boundary layer is
    # used as read.  NOTE(review): this presumes the boundary file is already in
    # EPSG:4326 - confirm against the input data.
    clusters_support = clusters[['id', 'geometry']].to_crs({'init': "EPSG:4326"})

    joined = gpd.sjoin(clusters_support, admin_1[["geometry", admin_col_name]], op='intersects').drop(['index_right'], axis=1)

    # Aggregate the admin column only.  Summing the whole frame also tried to
    # "sum" the geometry column and relied on pandas silently discarding it.
    # NOTE(review): for text columns the sum concatenates the names of all
    # boundaries a cluster intersects - confirm this is intended.
    group_by_id = joined.groupby("id")[admin_col_name].sum().reset_index()

    clusters = pd.merge(clusters, group_by_id, on='id', how = 'left')
    clusters.rename(columns = {admin_col_name:'Admin_1'}, inplace = True)

    print(datetime.datetime.now())

    return clusters

In [None]:
def get_admin_name(clusters, admin, admin_col_name):
    """Tag every cluster with the admin name found at the cluster centroid.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    admin : GeoDataFrame of admin boundaries containing ``admin_col_name``.
    admin_col_name : name of the column in ``admin`` that holds the admin name.

    Returns
    -------
    ``clusters`` left-merged on 'id' with the ``admin_col_name`` column
    (missing where the centroid falls in no boundary).
    """
    clusters_support = clusters[['id', 'geometry']].to_crs({'init': "EPSG:4326"})

    # Reduce each cluster to its centroid before the spatial join.
    clusters_support_centroid = clusters_support.copy()
    clusters_support_centroid.geometry = clusters_support_centroid.centroid

    joined = gpd.sjoin(clusters_support_centroid, admin[["geometry", admin_col_name]], op='intersects').drop(['index_right'], axis=1)

    # Aggregate the admin column only; summing the whole frame also tried to
    # "sum" the geometry column and relied on pandas silently discarding it.
    group_by_id = joined.groupby("id")[admin_col_name].sum().reset_index()

    clusters = pd.merge(clusters, group_by_id, on='id', how = 'left')

    print(datetime.datetime.now())

    return clusters

## Getting IDP & Refugee camps characteristics

In [3]:
def get_IDPs_RefugeeCamps_status(clusters, col_name, crs):
    """Flag clusters that contain at least one feature of a user-selected IDP layer.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    col_name : name of the 0/1 flag column to add.
    crs : not used by this function; kept so existing calls keep working.

    Returns
    -------
    ``clusters`` left-merged on 'id' with ``col_name`` set to 1 where at least
    one feature of the selected layer intersects the cluster, else 0.
    """
    messagebox.showinfo('OnSSET', 'Select the layer of IDP')
    idp_gdf = gpd.read_file(filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*"))))

    clusters_support = clusters[['id', 'geometry']].to_crs({'init': "EPSG:4326"})

    # One row per (feature, cluster) intersection.
    pointsInPolygon = gpd.sjoin(idp_gdf, clusters_support, how="inner", op='intersects')

    # Count the hits per cluster directly instead of summing every attribute
    # column of the joined layer just to learn which ids are present.
    hits_per_cluster = pointsInPolygon.groupby("id").size().reset_index(name=col_name)

    clusters = pd.merge(clusters, hits_per_cluster, on='id', how = 'left')

    # Clusters without any hit are NaN after the left merge and become 0 here.
    clusters[col_name] = np.where(clusters[col_name] > 0, 1, 0)

    print(datetime.datetime.now())

    return clusters

## Getting the number of buildings per cluster

In [4]:
def get_buildings_in_clusters(clusters, col_name, crs):
    """Count the building footprints whose centroid falls inside each cluster.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    col_name : name of the count column to add.
    crs : not used by this function; kept so existing calls keep working.

    Returns
    -------
    ``clusters`` left-merged on 'id' with ``col_name`` holding the number of
    building centroids intersecting the cluster (0 where there are none).
    """
    messagebox.showinfo('OnSSET', 'Select the layer of building footprints')
    gdf = gpd.read_file(filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*"))))

    # Replace each footprint by its centroid (computed once, no per-row Point objects).
    gdf_centroids = gdf.set_geometry(gdf.centroid)
    if gdf_centroids.crs is None:
        # No CRS stored with the layer: keep the previous assumption of EPSG:4326.
        gdf_centroids.crs = "EPSG:4326"
    else:
        # Bring the layer to EPSG:4326 instead of merely labelling it as such,
        # so layers delivered in another CRS still line up with the clusters.
        gdf_centroids = gdf_centroids.to_crs("EPSG:4326")

    clusters_support = clusters[['id', 'geometry']].to_crs({'init': "EPSG:4326"})

    # One row per (centroid, cluster) intersection.
    pointsInPolygon = gpd.sjoin(gdf_centroids, clusters_support, how="inner", op='intersects')

    # Row count per cluster; avoids summing every attribute column of the layer.
    count_per_cluster = pointsInPolygon.groupby("id").size().reset_index(name=col_name)

    clusters = pd.merge(clusters, count_per_cluster, on='id', how = 'left')

    # Clusters without any building are NaN after the left merge.
    clusters[col_name] = clusters[col_name].fillna(0)

    print(datetime.datetime.now())

    return clusters

## Getting the number of water points per cluster

In [5]:
def get_waterpoints_in_clusters(clusters, col_name, crs):
    """Count the features of a user-selected water point layer inside each cluster.

    Parameters
    ----------
    clusters : GeoDataFrame holding at least the columns 'id' and 'geometry'.
    col_name : name of the count column to add.
    crs : not used by this function; kept so existing calls keep working.

    Returns
    -------
    ``clusters`` left-merged on 'id' with ``col_name`` holding the number of
    features intersecting the cluster (0 where there are none).
    """
    messagebox.showinfo('OnSSET', 'Select the layer of water points')
    gdf = gpd.read_file(filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*"))))

    # The layer is used as read; only the clusters are reprojected.
    # NOTE(review): presumes the water point layer is in EPSG:4326 - confirm.
    clusters_support = clusters[['id', 'geometry']].to_crs({'init': "EPSG:4326"})

    # One row per (feature, cluster) intersection.
    pointsInPolygon = gpd.sjoin(gdf, clusters_support, how="inner", op='intersects')

    # Row count per cluster; avoids summing every attribute column of the layer.
    count_per_cluster = pointsInPolygon.groupby("id").size().reset_index(name=col_name)

    clusters = pd.merge(clusters, count_per_cluster, on='id', how = 'left')

    # Clusters without any water point are NaN after the left merge.
    clusters[col_name] = clusters[col_name].fillna(0)

    print(datetime.datetime.now())

    return clusters

## Processing Rasters

In [6]:
def processing_raster(name, method, clusters):
    """Ask the user for a raster and compute one zonal statistic per cluster.

    Interactive front-end for ``processing_raster_bulk``: it shows the OnSSET
    prompt, opens a file dialog and passes the selected path on.

    Parameters
    ----------
    name : label shown in the prompt and used as prefix of the output attribute.
    method : zonal statistic to compute (passed on as the single entry of ``stats``).
    clusters : vector features to summarise over.

    Returns
    -------
    Whatever ``processing_raster_bulk`` returns.
    """
    messagebox.showinfo('OnSSET', 'Select the ' + name + ' map')
    file_name = filedialog.askopenfilename(filetypes = (("rasters","*.tif"),("all files","*.*")))

    return processing_raster_bulk(file_name, name, method, clusters)

In [15]:
def processing_raster_bulk(file_name, name, method, clusters):
    """Compute one zonal statistic of a raster for every cluster.

    Parameters
    ----------
    file_name : path of the raster.
    name : prefix of the attribute added to each feature.
    method : zonal statistic to compute (passed as the single entry of ``stats``).
    clusters : vector features handed to ``zonal_stats``.

    Returns
    -------
    The output of ``zonal_stats`` called with ``geojson_out=True`` and
    ``all_touched=True``.
    """
    # Open the raster only to validate it and resolve its name, then release
    # the handle; zonal_stats works from the path.
    with rasterio.open(file_name) as raster:
        raster_path = raster.name

    clusters = zonal_stats(
        clusters,
        raster_path,
        stats=[method],
        prefix=name, geojson_out=True, all_touched=True)

    print(datetime.datetime.now())
    return clusters

In [None]:
## Processing Categorical/Discrete Rasters
def processing_raster_cat(path, raster, prefix, polys):
    """
    This function calculates stats for categorical rasters and attributes them to the given vector features.

    INPUT:
    path: the directory where the raster layer is stored
    raster: the name and extension of the raster layer
    prefix: string used as prefix when assigning features to the vectors
    polys: the vector features to summarise over

    OUTPUT:
    the output of zonal_stats called with categorical=True, geojson_out=True
    and all_touched=True (the vector features including the new attributes)
    """
    # os.path.join uses the separator of the running platform instead of a
    # hard-coded backslash.
    raster_file = os.path.join(path, raster)

    # Open the raster only to validate it and resolve its name, then release
    # the handle; zonal_stats works from the path.
    with rasterio.open(raster_file) as dataset:
        raster_path = dataset.name

    polys = zonal_stats(
        polys,
        raster_path,
        categorical=True,
        prefix=prefix, geojson_out=True, all_touched=True)

    print("{} processing completed at".format(prefix), datetime.datetime.now())
    return polys

In [None]:
# Land cover area estimator
def calc_Crop_sqkm(df, col_list):
    """
    Convert per-class pixel counts into per-class areas for every row.

    For each row the pixel counts in ``col_list`` are summed, each class is
    expressed as a share of that sum, and the share is multiplied by the row's
    "Vor_area_ha" value.  The resulting areas are therefore in the unit of the
    "Vor_area_ha" column.

    INPUT:
    df -> Pandas dataframe with one pixel-count column per class and a "Vor_area_ha" column
    col_list -> list of the class columns to convert (e.g. LC0-LC1)

    OUTPUT: a new dataframe in which the columns of col_list hold areas instead
    of pixel counts.  The dataframe passed in is not modified.  Rows whose
    pixel counts sum to zero get NaN in the converted columns.
    """
    # Work on a copy so the caller's dataframe keeps its original pixel counts
    # and does not receive a helper column.
    result = df.copy()

    # Total pixel count per row, taken before any column is overwritten.
    pixel_total = result[col_list].sum(axis=1)

    for col in col_list:
        result[col] = result[col] / pixel_total * result["Vor_area_ha"]

    return result

## Processing Elevation and Slope

In [7]:
def processing_elevation_and_slope(name, method, clusters, workspace,crs):
    """Ask the user for an elevation raster and derive elevation and slope attributes.

    Interactive front-end for ``processing_elevation_and_slope_bulk``: it shows
    the OnSSET prompt, opens a file dialog and passes the selected path on.

    Parameters
    ----------
    name : label shown in the prompt and used as prefix of the elevation attribute.
    method : zonal statistic computed on the elevation raster.
    clusters : vector features to summarise over.
    workspace : directory that receives the intermediate rasters.
    crs : target CRS used when warping the elevation raster.

    Returns
    -------
    Whatever ``processing_elevation_and_slope_bulk`` returns.
    """
    messagebox.showinfo('OnSSET', 'Select the ' + name + ' map')
    file_name = filedialog.askopenfilename(filetypes = (("rasters","*.tif"),("all files","*.*")))

    return processing_elevation_and_slope_bulk(file_name, name, method, clusters, workspace, crs)

In [None]:
def processing_elevation_and_slope_bulk(file_name, name, method, clusters, workspace,crs):
    """Add an elevation statistic and the majority slope value to every cluster.

    Steps: zonal statistic on the elevation raster; warp the raster to ``crs``
    (written to ``dem.tif``); derive slope with GDAL (``slope.tif``); warp the
    slope back to EPSG:4326 (``slope_4326.tif``); zonal "majority" on that.
    The three intermediate rasters are written into ``workspace`` and kept.

    Parameters
    ----------
    file_name : path of the elevation raster.
    name : prefix of the elevation attribute.
    method : zonal statistic computed on the elevation raster.
    clusters : vector features handed to ``zonal_stats``.
    workspace : directory that receives the intermediate rasters.
    crs : target CRS for the warped elevation raster.

    Returns
    -------
    The output of the second ``zonal_stats`` call (``geojson_out=True``), i.e.
    the features carrying both the ``name``-prefixed and the ``sl_``-prefixed
    attribute.
    """
    dem_path = workspace + r"\dem.tif"
    slope_path = workspace + r'\slope.tif'
    slope_4326_path = workspace + r'\slope_4326.tif'

    # Each raster is opened only to validate it and resolve its name; the
    # handle is released right away instead of staying open until garbage
    # collection.
    with rasterio.open(file_name) as raster:
        raster_path = raster.name

    clusters = zonal_stats(
        clusters,
        raster_path,
        stats=[method],
        prefix=name, geojson_out=True, all_touched=True)

    gdal.Warp(dem_path, raster_path, dstSRS=crs)

    # The slope raster is only needed on disk; it is no longer read into
    # memory, since the array was never used.
    gdal.DEMProcessing(slope_path, dem_path, 'slope')

    with rasterio.open(slope_path) as slope:
        slope_source = slope.name
    gdal.Warp(slope_4326_path, slope_source, dstSRS='EPSG:4326')

    with rasterio.open(slope_4326_path) as slope_4326:
        slope_4326_source = slope_4326.name

    clusters = zonal_stats(
        clusters,
        slope_4326_source,
        stats=["majority"],
        prefix="sl_", all_touched = True, geojson_out=True)

    print(datetime.datetime.now())
    return clusters

## Finalizing rasters

In [8]:
def finalizing_rasters(workspace, clusters, crs):
    """Turn an iterable of GeoJSON features into a GeoDataFrame.

    The features are written as a FeatureCollection to a temporary
    ``placeholder.geojson`` inside ``workspace``, read back with geopandas,
    and the temporary file is deleted.

    Parameters
    ----------
    workspace : directory in which the temporary file is created.
    clusters : iterable of JSON-serialisable GeoJSON features.
    crs : not used by this function; kept so existing calls keep working.

    Returns
    -------
    GeoDataFrame read from the temporary file.
    """
    output = workspace + r'\placeholder.geojson'

    collection = {
        "type": "FeatureCollection",
        "features": list(clusters)}
    with open(output, "w") as dst:
        dst.write(json.dumps(collection))

    try:
        clusters = gpd.read_file(output)
    finally:
        # Remove the placeholder even when reading it back fails, so a failed
        # run does not leave the file behind in the workspace.
        os.remove(output)

    print(datetime.datetime.now())
    return clusters

## Preparing for vectors

In [9]:
def preparing_for_vectors(workspace, clusters, crs):
    """Project the clusters to ``crs`` and save their centroids as a shapefile.

    The incoming frame is first labelled as EPSG:4326 (this assignment is made
    on the object passed in), then reprojected.  The centroids of the
    reprojected clusters are written to ``clusters_cp.shp`` in ``workspace``.

    Returns the reprojected clusters (polygons, not centroids).
    """
    centroid_file = workspace + r'\clusters_cp.shp'

    clusters.crs = {'init' :'epsg:4326'}
    projected = clusters.to_crs({ 'init': crs})

    # Centroids go into a separate copy so the returned frame keeps its polygons.
    centroids = projected.copy()
    centroids["geometry"] = centroids["geometry"].centroid
    centroids.to_file(centroid_file, driver='ESRI Shapefile')

    print(datetime.datetime.now())
    return projected

In [1]:
def preparing_for_vectors_updated(workspace, clusters, crs_proj):
    """Save the centroids of the clusters, projected to ``crs_proj``, as a shapefile.

    The centroids are written to ``clusters_cp.shp`` in ``workspace``.  The
    clusters themselves are returned exactly as they were passed in.
    """
    centroid_file = workspace + r'\clusters_cp.shp'

    # to_crs hands back a new frame, so the input clusters stay untouched.
    projected_centroids = clusters.to_crs({ 'init': crs_proj})
    projected_centroids["geometry"] = projected_centroids["geometry"].centroid
    projected_centroids.to_file(centroid_file, driver='ESRI Shapefile')

    print(datetime.datetime.now())
    return clusters

## Processing Lines

In [10]:
def processing_lines(name, admin, crs, workspace, clusters):
    """Ask the user for a line layer and add the distance from every cluster to it.

    Interactive front-end for ``processing_lines_bulk``: it shows the OnSSET
    prompt, opens a file dialog and passes the selected path on, so the
    distance logic lives in a single place.

    Parameters
    ----------
    name : label shown in the prompt; also used for file and column names downstream.
    admin : geometry/GeoDataFrame the lines are clipped to.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving intermediate files.
    clusters : GeoDataFrame of clusters with an 'id' column.

    Returns
    -------
    Whatever ``processing_lines_bulk`` returns (clusters with a ``name + 'Dist'`` column).
    """
    messagebox.showinfo('OnSSET', 'Select the ' + name + ' map')
    file_name = filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*")))

    return processing_lines_bulk(file_name, name, admin, crs, workspace, clusters)

In [None]:
def processing_lines_bulk(file_name, name, admin, crs, workspace, clusters):
    """Read a line layer, clip it to ``admin`` and add the distance from every cluster to it.

    After reading and clipping, the remaining steps (label as EPSG:4326,
    reproject, sample points along the lines, nearest-neighbour distance,
    zero distance for intersecting clusters) are exactly those of
    ``processing_shorelines``, so the clipped layer is handed to it rather
    than repeating the code here.

    Parameters
    ----------
    file_name : path of the line layer.
    name : used for the intermediate file names and the ``name + 'Dist'`` column.
    admin : geometry/GeoDataFrame the lines are clipped to.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving intermediate files.
    clusters : GeoDataFrame of clusters with an 'id' column.

    Returns
    -------
    Whatever ``processing_shorelines`` returns (clusters with a ``name + 'Dist'`` column).
    """
    lines = gpd.read_file(file_name)
    lines_clip = gpd.clip(lines, admin)

    return processing_shorelines(name, lines_clip, crs, workspace, clusters)

In [None]:
def processing_shorelines(name, lines, crs, workspace, clusters):
    """Add the distance from every cluster centroid to the nearest line feature.

    The lines are labelled as EPSG:4326 (on the object passed in), reprojected
    to ``crs`` and saved; points are sampled every 100 units along each line;
    a KD-tree gives, for each point stored in ``clusters_cp.shp``, the distance
    to the nearest sampled point.  Clusters whose geometry intersects a line
    get distance 0.

    Parameters
    ----------
    name : used for the intermediate file names and the ``name + 'Dist'`` column.
    lines : GeoDataFrame of line features.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving
        ``<name>_proj.shp`` and ``<name>_proj_points.shp``.
    clusters : GeoDataFrame of clusters with an 'id' column.  The distance
        column is also added to the object passed in.

    Returns
    -------
    New GeoDataFrame: clusters with the column ``name + 'Dist'`` (distance
    divided by 1000).

    Notes
    -----
    Distances are assigned by row position, so ``clusters`` must have the same
    row order as ``clusters_cp.shp``.  NOTE(review): presumably guaranteed by
    running ``preparing_for_vectors`` first - confirm.
    """
    proj_path = workspace + r"\ " + name + "_proj.shp"
    sampled_path = workspace + r"\ " + name + "_proj_points.shp"
    centroids_path = workspace + r'\clusters_cp.shp'
    dist_col = name + 'Dist'
    flag_col = name + 'Dist2'

    lines.crs = {'init' :'epsg:4326'}
    lines_proj = lines.to_crs({ 'init': crs})
    lines_proj.to_file(proj_path, driver='ESRI Shapefile')

    # Sample a point every 100 CRS units along each line.  Both files are
    # closed when the block ends, which also flushes the sampled points.
    schema = {'geometry' : 'Point', 'properties' : {'id' : 'int'},}
    with fiona.open(proj_path) as source:
        with fiona.open(sampled_path, "w", "ESRI Shapefile", schema) as output:
            for feature in source:
                if feature["geometry"] is not None:
                    geom = shape(feature['geometry'])
                    for distance in range(0, int(geom.length), 100):
                        sample = geom.interpolate(distance)
                        output.write({'geometry' :mapping(sample), 'properties' : {'id':1}})

    # Coordinates of the sampled line points and of the cluster centroids.
    with fiona.open(sampled_path) as sampled:
        line_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in sampled)])
    with fiona.open(centroids_path) as centroids:
        cluster_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in centroids)])

    # Nearest sampled point for every centroid; only the distance is needed.
    nearest_dist, _ = scipy.spatial.cKDTree(line_xy).query(cluster_xy)
    clusters[dist_col] = nearest_dist / 1000

    # Clusters that intersect a line are flagged and get distance 0.
    line_geoms = gpd.read_file(proj_path)[["geometry"]]
    overlaps = gpd.sjoin(clusters, line_geoms, op = 'intersects')
    overlaps[flag_col] = 0

    clusters = pd.merge(left = clusters, right = overlaps[['id', flag_col]], on='id', how = 'left')
    # A cluster intersecting several lines appears once per line after the merge.
    clusters.drop_duplicates(subset ="id", keep = "first", inplace = True)

    clusters.loc[clusters[flag_col] == 0, dist_col] = 0

    del clusters[flag_col]
    print(datetime.datetime.now())
    return clusters

## Processing points


In [11]:
def processing_points(name, admin, crs, workspace, clusters, mg_filter):
    """Ask the user for a point layer and add the distance from every cluster to it.

    Interactive front-end for ``processing_points_bulk``: it shows the OnSSET
    prompt, opens a file dialog and passes the selected path on, so the
    distance logic lives in a single place.

    Parameters
    ----------
    name : label shown in the prompt; also used for file and column names downstream.
    admin : geometry/GeoDataFrame the points are clipped to.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving intermediate files.
    clusters : GeoDataFrame of clusters with an 'id' column.
    mg_filter : when truthy, attributes of the nearest point are attached as well.

    Returns
    -------
    Whatever ``processing_points_bulk`` returns.
    """
    messagebox.showinfo('OnSSET', 'Select the ' + name + ' map')
    file_name = filedialog.askopenfilename(filetypes = (("shapefile","*.shp"),("all files","*.*")))

    return processing_points_bulk(file_name, name, admin, crs, workspace, clusters, mg_filter)

In [None]:
def processing_points_bulk(file_name, name, admin, crs, workspace, clusters, mg_filter):
    """Add the distance from every cluster centroid to the nearest point feature.

    The points are clipped to ``admin``, labelled as EPSG:4326, reprojected to
    ``crs`` and saved; a KD-tree gives, for each point stored in
    ``clusters_cp.shp``, the distance to the nearest feature.  Clusters whose
    geometry intersects a feature get distance 0.

    Parameters
    ----------
    file_name : path of the point layer.
    name : used for the intermediate file name and the ``name + 'Dist'`` column.
    admin : geometry/GeoDataFrame the points are clipped to.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving ``<name>_proj.shp``.
    clusters : GeoDataFrame of clusters with an 'id' column.  The distance
        column is also added to the object passed in.
    mg_filter : when truthy, the columns 'name', 'MV_network' and 'MG_type' of
        the nearest point are attached as 'MGName', 'MGMVstatus' and 'MGType'.

    Returns
    -------
    New GeoDataFrame: clusters with the column ``name + 'Dist'`` (distance
    divided by 1000) and, with ``mg_filter``, the three mini-grid columns.

    Notes
    -----
    Distances are assigned by row position, so ``clusters`` must have the same
    row order as ``clusters_cp.shp``.  NOTE(review): presumably guaranteed by
    running ``preparing_for_vectors`` first - confirm.
    """
    proj_path = workspace + r"\ " + name + "_proj.shp"
    centroids_path = workspace + r'\clusters_cp.shp'
    dist_col = name + 'Dist'
    flag_col = name + 'Dist2'

    points = gpd.read_file(file_name)
    if mg_filter:
        # Stable identifier given before clipping; it travels through the
        # shapefile and is used to fetch the attributes of the nearest point.
        points['umgid'] = range(0, len(points))
        points_post = points

    points_clip = gpd.clip(points, admin)
    points_clip.crs = {'init' :'epsg:4326'}
    points_proj = points_clip.to_crs({ 'init': crs})
    points_proj.to_file(proj_path, driver='ESRI Shapefile')

    # Coordinates of the projected points and of the cluster centroids; the
    # files are closed as soon as the coordinates have been read.
    with fiona.open(proj_path) as projected:
        point_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in projected)])
    with fiona.open(centroids_path) as centroids:
        cluster_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in centroids)])

    points = gpd.read_file(proj_path)

    nearest_dist, nearest_row = scipy.spatial.cKDTree(point_xy).query(cluster_xy)

    clusters[dist_col] = nearest_dist / 1000.
    if mg_filter:
        # nearest_row is a row position in the clipped, projected layer.
        # Translate it to the umgid stored with that row: clipping can drop or
        # reorder points, so the position itself is not a valid umgid.
        clusters['umgid'] = points['umgid'].values[nearest_row]

    # Clusters that intersect a point are flagged and get distance 0.
    overlaps = gpd.sjoin(clusters, points[["geometry"]], op = 'intersects')
    overlaps[flag_col] = 0

    clusters = pd.merge(left = clusters, right = overlaps[['id', flag_col]], on='id', how = 'left')
    # A cluster containing several points appears once per point after the merge.
    clusters.drop_duplicates(subset ="id", keep = "first", inplace = True)

    clusters.loc[clusters[flag_col] == 0, dist_col] = 0

    if mg_filter:
        clusters = pd.merge(clusters, points_post[['umgid', 'name', "MV_network", "MG_type"]], on='umgid', how = 'left')
        clusters.rename(columns = {'name':'MGName',
                                   'MV_network':'MGMVstatus',
                                   'MG_type':'MGType'}, inplace = True)

    del clusters[flag_col]
    if mg_filter:
        del clusters['umgid']
    print(datetime.datetime.now())
    return clusters

## Processing hydro

In [12]:
def processing_hydro(admin, crs, workspace, clusters, points, hydropowervalue, 
                     hydropowerunit):
    """Attach the nearest hydropower site (distance, row id, capacity) to every cluster.

    The points are clipped to ``admin``, labelled as EPSG:4326, reprojected to
    ``crs`` and saved to ``HydropowerDist_proj.shp``; a KD-tree gives, for each
    point stored in ``clusters_cp.shp``, the nearest site.  Clusters whose
    geometry intersects a site get distance 0.

    Parameters
    ----------
    admin : geometry/GeoDataFrame the points are clipped to.
    crs : projected CRS used for the distance calculation.
    workspace : directory holding ``clusters_cp.shp`` and receiving the intermediate file.
    clusters : GeoDataFrame of clusters with an 'id' column.  The new columns
        are also added to the object passed in.
    points : GeoDataFrame of hydropower sites.
    hydropowervalue : name of the column holding the site capacity.  It is
        looked up after a round trip through a shapefile.
        NOTE(review): shapefile field names are limited in length - confirm
        the column name survives unchanged.
    hydropowerunit : 'MW' multiplies the capacity by 1000, 'kW' leaves it
        unchanged, anything else divides it by 1000.

    Returns
    -------
    New GeoDataFrame: clusters with the columns 'HydropowerDist' (distance
    divided by 1000), 'HydropowerFID' (row position of the nearest site in the
    projected layer) and 'Hydropower' (converted capacity of that site).

    Notes
    -----
    Values are assigned by row position, so ``clusters`` must have the same
    row order as ``clusters_cp.shp``.  NOTE(review): presumably guaranteed by
    running ``preparing_for_vectors`` first - confirm.
    """
    proj_path = workspace + r"\HydropowerDist_proj.shp"
    centroids_path = workspace + r'\clusters_cp.shp'

    points_clip = gpd.clip(points, admin)
    points_clip.crs = {'init' :'epsg:4326'}
    points_proj = points_clip.to_crs({ 'init': crs})
    points_proj.to_file(proj_path, driver='ESRI Shapefile')

    # Coordinates of the projected sites and of the cluster centroids; the
    # files are closed as soon as the coordinates have been read.
    with fiona.open(proj_path) as projected:
        site_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in projected)])
    with fiona.open(centroids_path) as centroids:
        cluster_xy = np.array([(pt.x, pt.y) for pt in (shape(feat["geometry"]) for feat in centroids)])

    points = gpd.read_file(proj_path)

    dist, indexes = scipy.spatial.cKDTree(site_xy).query(cluster_xy)

    clusters['HydropowerDist'] = dist / 1000
    clusters['HydropowerFID'] = indexes.tolist()

    # Capacity of the nearest site, fetched by row position.
    clusters['Hydropower'] = points[hydropowervalue].values[indexes]

    # Compare the unit by value: `is` tests object identity and is only true
    # for strings that happen to be the very same object as the literal.
    if hydropowerunit == 'MW':
        clusters['Hydropower'] = clusters['Hydropower']*1000
    elif hydropowerunit == 'kW':
        pass
    else:
        clusters['Hydropower'] = clusters['Hydropower']/1000

    # Clusters that intersect a site are flagged and get distance 0.
    overlaps = gpd.sjoin(clusters, points[["geometry"]], op = 'intersects')
    overlaps['HydropowerDist2'] = 0

    clusters = pd.merge(left = clusters, right = overlaps[['id','HydropowerDist2']], on='id', how = 'left')
    # A cluster containing several sites appears once per site after the merge.
    clusters.drop_duplicates(subset ="id", keep = "first", inplace = True)

    clusters.loc[clusters['HydropowerDist2'] == 0, 'HydropowerDist'] = 0

    del clusters['HydropowerDist2']
    print(datetime.datetime.now())
    return clusters

## Creating Voronoi polygons for polygon settlements

In [None]:
def createVoronoi(admin, settlements, crs_projected, crs):
    """Build one Voronoi catchment polygon per settlement, clipped to the admin boundary.

    Voronoi cells are generated from the exterior vertices of every settlement
    polygon (plus helper points on a rectangle around the admin area), clipped
    to the first admin geometry, tagged with the settlement 'id' of the
    vertices they contain and dissolved per settlement.

    Parameters
    ----------
    admin : GeoDataFrame; the geometry of its first row is the clipping boundary.
    settlements : GeoDataFrame of (Multi)Polygon settlements with an 'id' column.
    crs_projected : projected CRS used for all geometric work (buffers of
        50000 and 15 units and point spacing of 100 units are applied in it).
    crs : CRS of the returned layer.

    Returns
    -------
    GeoDataFrame indexed by 'uid' (the settlement 'id') with the dissolved
    Voronoi geometry in ``crs`` and the columns 'Vor_area_sq.km' and
    'Vor_area_ha', both computed in ``crs_projected``.

    Raises
    ------
    ValueError : if no Polygon/MultiPolygon vertices are found in ``settlements``.

    Side effects: prints progress messages and draws a matplotlib figure.
    """
    # Imported here because only the diagnostic plot below needs it and the
    # notebook's import cell does not provide `plt`.
    import matplotlib.pyplot as plt

    ##=================================================##
    ## Generating boundaries based on the admin unit
    ##=================================================##
    # Use the `admin` argument throughout (not a notebook-level variable).
    admin_prj = admin.to_crs(crs_projected)
    admin_geom = admin_prj.geometry.iloc[0]

    # Large rectangle surrounding the admin boundary, with one helper point
    # every 100 units so that the outer Voronoi cells are closed.
    bound = admin_geom.buffer(50000).envelope.boundary
    boundarypoints = [bound.interpolate(distance=d) for d in range(0, int(np.ceil(bound.length)), 100)]
    boundarycoords = np.array([[p.x, p.y] for p in boundarypoints])

    print("Boundary area defined..")

    ##===============================================================================================##
    ## Get all points from polygon perimeter (excluding interior points in case of complex geometries)
    ##===============================================================================================##
    settles_prj = settlements.to_crs(crs_projected)

    # Collect the vertices in plain lists and build the frame once; appending
    # to a DataFrame row by row is quadratic and unavailable in pandas 2.
    row_ids, uids, xs, ys = [], [], [], []
    for index, row in settles_prj.iterrows():
        geometry = row['geometry']
        if geometry is None:
            continue
        if geometry.geom_type == 'Polygon':
            parts = [geometry]
        elif geometry.geom_type == 'MultiPolygon':
            parts = list(geometry.geoms)
        else:
            # Other geometry types have no exterior ring; skip them instead of
            # re-using the polygons of the previous row.
            continue

        for polygon in parts:
            for coord in polygon.exterior.coords:
                row_ids.append(index)
                uids.append(row['id'])
                xs.append(coord[0])
                ys.append(coord[1])

    if not xs:
        raise ValueError("settlements contains no Polygon or MultiPolygon geometries")

    points_df = gpd.GeoDataFrame({'id': row_ids, 'uid': uids},
                                 geometry=gpd.points_from_xy(xs, ys),
                                 crs=crs_projected)

    print("Perimeter vertices generated..")

    ##===============================================================================##
    ## Generating the Voronoi polygons associated with all points & clip to boundaries
    ##===============================================================================##
    coords = np.column_stack((xs, ys))

    # Helper points on the rectangle first, then the settlement vertices.
    all_coords = np.concatenate((boundarycoords, coords))

    vor = Voronoi(points=all_coords)
    # Ridges with a -1 vertex extend to infinity and are left out.
    lines = [shapely.geometry.LineString(vor.vertices[line]) for line in vor.ridge_vertices if -1 not in line]

    polys = list(shapely.ops.polygonize(lines))
    voronois = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polys), crs=crs_projected)

    polydf = gpd.GeoDataFrame(geometry=[admin_geom], crs=crs_projected)

    result = gpd.overlay(df1=voronois, df2=polydf, how="intersection")

    # Adding an index column
    result['uniqueID'] = range(1, len(result)+1)

    print("Voronoi polygons generated..")

    ##==============================================================##
    ##Getting all IDs of points within the same polygon to voronoi
    ##==============================================================##
    # Small buffer so that the generating vertices fall inside the cells.
    buffered_result = result.copy()
    buffered_result['geometry'] = buffered_result.buffer(15)

    # Settlement id (uid) of the vertices found in each buffered cell.
    buffered_result_withID = gpd.sjoin(buffered_result, points_df[["geometry", "uid"]],
                                       how='left').drop(['index_right'], axis=1)

    result = result.merge(buffered_result_withID[["uniqueID", "uid"]], how="left", on='uniqueID')

    # One (multi)polygon per settlement.
    result_dissolved = result.dissolve(by='uid')

    print("Dissolved Voronoi polygons completed..")

    ###==============================================================##
    ### Visualization for testing
    ###==============================================================##
    print ('Vizualizing...')
    fig, ax = plt.subplots(figsize=(15, 15))
    polydf.boundary.plot(ax=ax, edgecolor="blue", linewidth=6)
    result_dissolved.plot(ax=ax, color="red", alpha=0.3, edgecolor="black")
    admin_prj.plot(ax=ax, color="green", alpha=0.3, edgecolor="black")
    settles_prj.plot(ax=ax, color="maroon")

    ##==============================================================##
    ## Estimating area
    ##==============================================================##
    # Area is taken while the layer is still in the projected CRS.
    result_dissolved["Vor_area_sq.km"] = result_dissolved.geometry.area/10**6
    result_dissolved["Vor_area_ha"] = result_dissolved["Vor_area_sq.km"]*10**2

    #Revert to original crs
    result_dissolved = result_dissolved.to_crs(crs)

    return result_dissolved

In [1]:
## Based on centroids -- not used
def createVoronoi_old(admin, settlements, crs):
    """Build Voronoi polygons around settlement centroids, clipped to the admin boundary.

    Legacy, centroid-based variant that is kept for reference only.

    NOTE(review): the parameters ``admin``, ``settlements`` and ``crs`` are never
    referenced in the body. The code instead reads ``admin_gdf_buf``,
    ``settles_gdf`` and ``crs_proj``, none of which is defined inside this
    function, so it only runs if those names exist as notebook globals
    (otherwise it raises NameError). Presumably they were meant to be the
    parameters -- confirm the mapping before reviving this function.

    Returns:
        The GeoDataFrame produced by intersecting the Voronoi cells with the
        first admin geometry and left-joining the settlement centroids' "id"
        column onto the cells.
    """
    # Create a large rectangle surrounding the admin boundaries: buffer the first
    # admin geometry by 50000 CRS units and take the outline of its envelope.
    admin_gdf_buf_prj = admin_gdf_buf.to_crs(crs_proj)
    bound = admin_gdf_buf_prj.geometry[admin_gdf_buf.geometry.index[0]].buffer(50000).envelope.boundary 

    # Create many points along the rectangle boundary, one every 100 CRS units
    # (100 m according to the original author, i.e. assuming crs_proj is metric).
    # These outer points are fed to the Voronoi diagram together with the settlements.
    boundarypoints = [bound.interpolate(distance=d) for d in range(0, np.ceil(bound.length).astype(int), 100)]
    boundarycoords = np.array([[p.x, p.y] for p in boundarypoints])

    # Get the points inside the polygon: centroids of the reprojected settlements
    settles_gdf_prj = settles_gdf.to_crs(crs_proj)
    x = settles_gdf_prj.centroid.geometry.x.values
    y = settles_gdf_prj.centroid.geometry.y.values
    coords = np.vstack((x, y)).T

    all_coords = np.concatenate((boundarycoords, coords)) # Array of all points on the boundary and inside the polygon

    # Build the diagram and keep only ridges whose vertex indices are all valid
    # (scipy uses -1 for a vertex lying outside the diagram).
    vor = Voronoi(points=all_coords)
    lines = [shapely.geometry.LineString(vor.vertices[line]) for line in 
             vor.ridge_vertices if -1 not in line]

    # Turn the ridge lines into polygons
    polys = shapely.ops.polygonize(lines)
    voronois = gpd.GeoDataFrame(geometry=gpd.GeoSeries(polys), crs=crs_proj)

    # Single-row frame holding the first admin geometry, used to clip the cells.
    polydf = gpd.GeoDataFrame(geometry=[admin_gdf_buf_prj.geometry[admin_gdf_buf.geometry.index[0]]], crs=crs_proj)
    # `points` is only referenced by the disabled plotting code further down.
    points = gpd.GeoDataFrame(geometry=gpd.points_from_xy(x=coords[:,0], y=coords[:,1], crs=crs_proj))

    # Clip the Voronoi cells to the admin polygon
    result = gpd.overlay(df1=voronois, df2=polydf, how="intersection")

    # Attach the settlement "id" to each cell via a left spatial join against
    # the settlement centroids.
    settles_gdf_prj_cen = settles_gdf_prj.copy()
    settles_gdf_prj_cen.geometry = settles_gdf_prj_cen.geometry.centroid
    result = gpd.sjoin(result, settles_gdf_prj_cen[["geometry", "id"]], how='left').drop(['index_right'], axis=1)

    # Disabled visual check of the generated polygons:
    #fig, ax = plt.subplots(figsize=(15, 15))
    #polydf.boundary.plot(ax=ax, edgecolor="blue", linewidth=6)
    #voronois.plot(ax=ax, color="red", alpha=0.3, edgecolor="black")
    #result.plot(ax=ax, color="red", alpha=0.3, edgecolor="black")
    #admin_gdf_buf_prj.plot(ax=ax, color="green", alpha=0.3, edgecolor="black")
    #points.plot(ax=ax, color="maroon")
    
    return result

## Creating the prioritization columns for filter visualization

In [13]:
def create_prio_columns(clusters):
    """Add 0/1 flag columns used to filter clusters by priority facilities.

    For every source column that is present, a flag column is written that is
    1 where the source value is greater than zero and 0 otherwise (missing
    values compare as not greater than zero and therefore give 0).

    The source columns follow the convention used by ``conditioning`` in this
    notebook, where ``HF_kWh`` feeds the health demand and ``EF_kWh`` feeds the
    education demand. The previous version had the two swapped, flagging
    schools from the health column and health facilities from the education one.

    Args:
        clusters: DataFrame/GeoDataFrame of clusters. It is modified in place.

    Returns:
        The same ``clusters`` object with the available flag columns added.
    """
    # (flag column to create, source column it is derived from)
    flag_sources = (
        ("School", "EF_kWh"),
        ("Health_facility", "HF_kWh"),
        ("Water_point", "waterpoints_count"),
    )

    for flag_name, source_name in flag_sources:
        if source_name in clusters:
            clusters[flag_name] = np.where(clusters[source_name] > 0, 1, 0)

    print(datetime.datetime.now())

    return clusters

## Conditioning

In [None]:
def cleaning_string_attributes(df, column_name):
    """Normalise the text in one column to plain, separator-free strings.

    Accented characters are replaced by their unaccented counterpart,
    separators (``-``, space, ``/``, ``'``) become underscores and values that
    are exactly the empty string become the text ``'NaN'``. Non-string cells
    (including missing values) are left untouched.

    Fixes compared to the previous version:
      * The empty-string replacement used an empty *regex*, which matches at
        every position of every string and injected ``NaN`` between all
        characters. It is now an exact, non-regex match.
      * The column is reassigned instead of being modified through chained
        ``df[col].replace(..., inplace=True)``, which does not update ``df``
        when pandas Copy-on-Write is active.

    Args:
        df: DataFrame holding the column. It is modified in place.
        column_name: Name of the column to clean.

    Returns:
        The same ``df`` object with ``column_name`` cleaned.
    """
    # Ordered (pattern, replacement) pairs, applied one after the other.
    # "\ufffd" is the Unicode replacement character that the original literal
    # contained (most likely a mis-decoded accented "u").
    substitutions = (
        ("\ufffd", "u"),
        ("-", "_"),
        (" ", "_"),
        ("/", "_"),
        ("'", "_"),
        ("é", "e"),
        ("î", "i"),
        ("ï", "i"),
        ("ô", "o"),
        ("ã", "a"),
        ("ç", "c"),
        ("Ö", "o"),
        ("è", "e"),
        ("à", "a"),
        ("á", "a"),
        ("ú", "u"),
        ("ª", "a"),
    )

    cleaned = df[column_name]
    for pattern, replacement in substitutions:
        cleaned = cleaned.replace(pattern, replacement, regex=True)

    # Exact match only: whole-cell empty strings become the text 'NaN'.
    cleaned = cleaned.replace("", "NaN")

    df[column_name] = cleaned
    return df

In [14]:
# Placeholder distance written when a distance layer was not extracted.
_NO_DATA_DISTANCE = 99999


def _rename_if_present(clusters, old_name, new_name):
    """Rename column ``old_name`` to ``new_name`` when it exists."""
    if old_name in clusters:
        clusters = clusters.rename(columns={old_name: new_name})
    return clusters


def _rename_or_default(clusters, old_name, new_name, default, keep_existing=False):
    """Rename ``old_name`` to ``new_name`` or fall back to a constant column.

    When ``old_name`` is absent, ``new_name`` is filled with ``default``. With
    ``keep_existing=True`` an already present ``new_name`` column is preserved
    instead of being overwritten by the default.
    """
    if old_name in clusters:
        return clusters.rename(columns={old_name: new_name})
    if not (keep_existing and new_name in clusters):
        clusters[new_name] = default
    return clusters


def _condition_line_distances(clusters, level):
    """Produce ``Current<level>LineDist`` / ``Planned<level>LineDist`` columns.

    ``level`` is "HV" or "MV". Inputs are the optional ``Existing_<level>Dist``
    and ``Planned_<level>Dist`` columns. The planned distance is capped at the
    current distance, and missing inputs fall back to the placeholder distance.
    """
    existing_in = "Existing_{}Dist".format(level)
    planned_in = "Planned_{}Dist".format(level)
    current_out = "Current{}LineDist".format(level)
    planned_out = "Planned{}LineDist".format(level)

    has_existing = existing_in in clusters
    has_planned = planned_in in clusters

    if has_existing:
        clusters = clusters.rename(columns={existing_in: current_out})
        if has_planned:
            # Planned distance can never exceed the current one. Assigning the
            # whole column avoids chained assignment (df[col][mask] = ...).
            farther = clusters[planned_in] > clusters[current_out]
            clusters[planned_in] = clusters[planned_in].where(~farther, clusters[current_out])
            clusters = clusters.rename(columns={planned_in: planned_out})
        else:
            clusters[planned_out] = clusters[current_out]
    elif has_planned:
        clusters[current_out] = _NO_DATA_DISTANCE
        clusters = clusters.rename(columns={planned_in: planned_out})
    else:
        clusters[planned_out] = _NO_DATA_DISTANCE
        clusters[current_out] = _NO_DATA_DISTANCE
    return clusters


def conditioning(clusters, workspace, popunit):
    """Rename/complete the extracted columns and write the OnSSET input CSV.

    The clusters are reprojected to EPSG:4326, extraction column names are
    mapped to the names written to the CSV, columns that were not extracted
    are filled with defaults, centroid coordinates are added, the geometry is
    dropped and the table is saved as ``GEP-OnSSET_InputFile.csv`` inside
    ``workspace``.

    Fixes compared to the previous version:
      * ``Residentia`` was never renamed because of a typo in the rename key.
      * The ``Substation`` rename was assigned to a misspelled variable and lost.
      * ``RoadsDist`` had an inverted condition: real data was replaced by the
        placeholder, and ``RoadDist`` was missing when there was no data.
      * ``ClusterCellscount`` / ``CoreCellscount`` were guarded by the wrong
        column names; each is now renamed whenever it is present.
        NOTE(review): ``Pop_cellscount`` was only used as a guard and is left
        untouched -- confirm whether it should map to a column as well.
      * Planned line distances were capped through chained assignment.
      * The output path used a hard-coded backslash separator.

    Args:
        clusters: GeoDataFrame of clusters (needs ``to_crs`` and ``geometry``).
        workspace: Directory in which the CSV is written.
        popunit: Name of the population column; it is renamed to ``Pop``.

    Returns:
        The conditioned table without the geometry column.
    """
    clusters = clusters.to_crs({ 'init': 'epsg:4326'})

    clusters = clusters.rename(columns={"NightLight": "NightLights", popunit: "Pop"})

    # Straight renames of extraction outputs.
    for old_name, new_name in (
        ("Area", "GridCellArea"),
        ("ClusterCellscount", "ClusterCells"),
        ("CoreCellscount", "CoreCells"),
        ("landcovermajority", "LandCover"),
        ("elevationmean", "Elevation"),
        ("sl_majority", "Slope"),
        ("ghimean", "GHI"),
    ):
        clusters = _rename_if_present(clusters, old_name, new_name)

    if "traveltimemean" in clusters:
        # Divided by 60 before being stored as hours (presumably minutes -> hours).
        clusters["traveltimemean"] = clusters["traveltimemean"] / 60
        clusters = clusters.rename(columns={"traveltimemean": "TravelHours"})
    elif "TravelHour" in clusters:
        clusters = clusters.rename(columns={"TravelHour": "TravelHours"})

    clusters = _rename_if_present(clusters, "windmean", "WindVel")

    # Custom residential demand: first available source wins, otherwise 0.
    if "Residentia" in clusters:
        clusters = clusters.rename(columns={"Residentia": "ResidentialDemandTierCustom"})
    else:
        clusters = _rename_or_default(clusters, "customdemandmean", "ResidentialDemandTierCustom", 0)

    clusters = _rename_or_default(clusters, "Urban_Demand_Indexmean", "ResidentialDemandTierCustomUrban", 0)
    clusters = _rename_or_default(clusters, "Rural_Demand_Indexmean", "ResidentialDemandTierCustomRural", 0)

    clusters = _rename_or_default(clusters, "Substation", "SubstationDist",
                                  _NO_DATA_DISTANCE, keep_existing=True)

    # Alternative (short) names of the line-distance inputs.
    for old_name, new_name in (
        ("CurrentHVL", "Existing_HVDist"),
        ("CurrentMVL", "Existing_MVDist"),
        ("PlannedHVL", "Planned_HVDist"),
        ("PlannedMVL", "Planned_MVDist"),
    ):
        clusters = _rename_if_present(clusters, old_name, new_name)

    clusters = _condition_line_distances(clusters, "HV")
    clusters = _condition_line_distances(clusters, "MV")

    clusters = _rename_or_default(clusters, "RoadsDist", "RoadDist",
                                  _NO_DATA_DISTANCE, keep_existing=True)
    clusters = _rename_or_default(clusters, "Transforme", "TransformerDist",
                                  _NO_DATA_DISTANCE, keep_existing=True)

    if "Hydropower" not in clusters:
        clusters["Hydropower"] = 0

    clusters = _rename_or_default(clusters, "Hydropow_1", "HydropowerDist",
                                  _NO_DATA_DISTANCE, keep_existing=True)
    clusters = _rename_or_default(clusters, "Hydropow_2", "HydropowerFID", 0, keep_existing=True)

    if "IsUrban" not in clusters:
        clusters["IsUrban"] = 0

    clusters = _rename_or_default(clusters, "PerCapitaD", "PerCapitaDemand", 0)

    clusters = _rename_or_default(clusters, "HealthDema", "HealthDemand", 0)
    if "HF_kWh" in clusters:
        clusters["HealthDemand"] = clusters["HF_kWh"]

    clusters = _rename_or_default(clusters, "EducationD", "EducationDemand", 0)
    if "EF_kWh" in clusters:
        clusters["EducationDemand"] = clusters["EF_kWh"]

    if "AgriDemand" not in clusters:
        clusters["AgriDemand"] = 0

    clusters = _rename_or_default(clusters, "Commercial", "CommercialDemand", 0)

    if "Conflict" not in clusters:
        clusters["Conflict"] = 0

    clusters = _rename_or_default(clusters, "Electrific", "ElectrificationOrder", 0)

    # Residential demand tiers with their default values.
    for tier, tier_default in enumerate((7.74, 43.8, 160.6, 423.4, 598.6), start=1):
        clusters = _rename_or_default(clusters,
                                      "Resident_{}".format(tier),
                                      "ResidentialDemandTier{}".format(tier),
                                      tier_default)

    # Mini-grid attributes default to "no mini-grid information".
    for column, column_default in (
        ("MGDist", _NO_DATA_DISTANCE),
        ("MGName", None),
        ("MGMVstatus", None),
        ("MGType", None),
    ):
        if column not in clusters:
            clusters[column] = column_default

    clusters = _rename_if_present(clusters, "waterpoints_count", "waterpoints")

    clusters["X_deg"] = clusters.geometry.centroid.x

    clusters["Y_deg"] = clusters.geometry.centroid.y

    clusters["Commercial_Multiplier"] = 0

    del clusters["geometry"]
    #clusters.to_file(workspace + r"\GEP-OnSSET_InputFile.shp", driver='ESRI Shapefile')
    # os.path.join keeps the file inside `workspace` on every operating system.
    clusters.to_csv(os.path.join(workspace, "GEP-OnSSET_InputFile.csv"), index=False)

    print(datetime.datetime.now())
    print("The extraction file is now ready for review & use in the workspace directory as 'GEP-OnSSET_InputFile.csv'!")

    return clusters