# Ward hierarchical

https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn.cluster.AgglomerativeClustering

**inputs used**
* **n_clusters**: Number of clusters (set to None)
* **distance_threshold**: The linkage distance at or above which two clusters are no longer merged. Looping from 200 to 500 in steps of 50.
* **linkage**: Between which points the distance is measured (see https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html). Ward, Single and Complete have been tested.
* **affinity**: How the distance is measured. Using euclidean.

In [None]:
import numpy as np
import geopandas as gpd
from osgeo import gdal, ogr, osr
import os
import pandas as pd
import math
import rasterio as rio
from rasterio import features
from sklearn.cluster import AgglomerativeClustering

# --- Load inputs ------------------------------------------------------------
# Point locations to cluster, and the 100 m population raster whose grid is
# reused further down when the cluster labels are rasterised.
gdf = gpd.read_file(r"C:\PhD\Papers\2. MAUP\Namibia\points.shp")
raster = rio.open(r"C:\PhD\Papers\2. MAUP\Namibia\100m\100mPop.tif")

# Reproject to EPSG:3395 (World Mercator, a projected CRS) so that X/Y are
# planar coordinates and euclidean distances between them are meaningful.
# `epsg=` replaces the deprecated `{'init': 'EPSG:3395'}` dict, which makes
# pyproj emit a FutureWarning.
pt = gdf.to_crs(epsg=3395)
pt["X"] = pt["geometry"].x
pt["Y"] = pt["geometry"].y
pt["id"] = np.arange(len(pt.index))

# --- Chunking ---------------------------------------------------------------
# Split the points into the smallest number of equally sized chunks that keeps
# every chunk at or below 50 000 rows.
numpis = pt.to_numpy()
steps = len(numpis) / 50000
steps_int = math.ceil(steps)
actual_steps = len(numpis) / steps_int

# Building the frame from the bare array discards the column names: columns
# are addressed by integer position (0, 1, 2, ...) from here on.
df = pd.DataFrame(numpis)

# Distance thresholds to evaluate.
x = [200, 250, 300, 350, 400, 450, 500]
# Chunk boundaries: consecutive values are the lower/upper bound of one chunk.
y = np.arange(start=0, stop=len(numpis) + 1, step=actual_steps)

In [None]:
# Cluster the points chunk by chunk for every distance threshold in `x` and
# write one shapefile of labelled points per (threshold, chunk) pair.

# `df` is created from a bare NumPy array in the setup cell, so its columns
# are integer positions and dropping a 'geometry' label is normally a no-op.
# The call does not depend on the loop variables, so it runs once up front.
df = df.drop(columns=['geometry'], errors='ignore')

for val in x:
    # Consecutive entries of `y` are the lower/upper bound of one chunk.
    # (The original wrapped this in an extra `for step in y` loop whose first
    # pass already consumed every chunk; a single loop is equivalent.)
    for i, (minimum, maximum) in enumerate(zip(y[:-1], y[1:])):
        # NOTE(review): rows are selected on column 0 with BOTH bounds
        # inclusive, so a row whose value equals a chunk boundary lands in two
        # chunks. Confirm that column 0 is the intended row identifier (the
        # "id" column added in the setup cell is the last column, not the
        # first) before tightening this.
        in_chunk = (df[0] >= minimum) & (df[0] <= maximum)

        # Columns 6 and 7 hold the projected X and Y coordinates. `astype`
        # returns a new frame, so adding the label column below does not
        # write into a slice of `df` (no SettingWithCopyWarning).
        chunk = df.loc[in_chunk, [6, 7]].astype(float)

        # Euclidean distance is the default, so the `affinity=` keyword
        # (deprecated and later removed in scikit-learn) is simply omitted.
        # NOTE(review): the notebook title says "Ward" but the linkage used
        # here is 'single'.
        model = AgglomerativeClustering(n_clusters=None,
                                        distance_threshold=val,
                                        linkage='single').fit(chunk)
        chunk["clusters"] = model.labels_

        # Build the labelled points in the projected CRS, then convert to
        # WGS84 for output. A separate name is used so the source points in
        # `gdf` are not overwritten.
        chunk_gdf = gpd.GeoDataFrame(
            chunk[["clusters"]],
            geometry=gpd.points_from_xy(chunk[6], chunk[7]),
            crs="EPSG:3395",
        )
        chunk_gdf = chunk_gdf.to_crs(epsg=4326)

        chunk_gdf.to_file("clusters_" + str(val) + "_" + str(i) + "_.shp")

In [None]:
# Rasterise every cluster-point shapefile found in `path` onto the grid of the
# reference population raster and write each result as an LZW-compressed
# GeoTIFF into the current working directory.
path = r"C:\PhD\Papers\2. MAUP\Test\Hierarchy\Complete"

# The reference grid is identical for every shapefile, so read it once
# instead of reopening the raster inside the loop.
with rio.open(raster.name) as src:
    grid_shape = src.shape
    grid_transform = src.transform
    out_meta = src.meta.copy()
    out_meta.update({"driver": "GTiff",
                     "height": src.height,
                     "width": src.width,
                     "transform": src.transform,
                     "count": 1,  # exactly one band is written below
                     'dtype': rio.float64,
                     "crs": src.crs,
                     "compress": 'LZW',
                     "nodata": 0})

# NOTE(review): the shapefiles are burned using the raster's transform as-is;
# this assumes they are stored in the same CRS as the raster - confirm.
for file in os.listdir(path):
    filename = os.fsdecode(file)
    if not filename.endswith(".shp"):
        continue

    gdf = gpd.read_file(os.path.join(path, filename))
    dff = gdf[['clusters', 'geometry']]

    # Cluster labels start at 0, which is also the background fill and the
    # declared nodata value. Burning the raw label would make cluster 0
    # indistinguishable from "no data", so every label is shifted by one:
    # pixel value k + 1 corresponds to cluster k.
    shapes = ((geom, label + 1)
              for label, geom in zip(dff['clusters'].values,
                                     dff['geometry'].values))

    image = features.rasterize(
                shapes,
                out_shape=grid_shape,
                transform=grid_transform,
                fill=0,
                all_touched=False)
    image = image.astype('float64')

    # Same base name as the shapefile, with a .tif extension.
    with rio.open(filename[:-3] + "tif", 'w', **out_meta) as dst:
        dst.write(image, indexes=1)

In [None]:
def toPolygon(Raster, output):
    """Polygonise band 1 of a raster into an ESRI Shapefile.

    Connected regions of equal pixel value become polygons whose value is
    stored in an integer field named ``cluster``. The band is also used as its
    own mask, so pixels with value 0 are not polygonised.

    Parameters
    ----------
    Raster : str
        Path of the raster file to open with GDAL.
    output : str
        Path of the shapefile to create, without the ``.shp`` extension. An
        existing file of that name is deleted first.

    Returns
    -------
    The pixel values of band 1 as returned by ``ReadAsArray``.

    Raises
    ------
    RuntimeError
        If the raster cannot be opened or the shapefile cannot be created.
    """
    # gdal.Open returns None on failure (unless GDAL exceptions are enabled);
    # fail here with a clear message rather than with an AttributeError below.
    dataset = gdal.Open(Raster)
    if dataset is None:
        raise RuntimeError("Could not open raster: " + str(Raster))

    band = dataset.GetRasterBand(1)
    bandArray = band.ReadAsArray()

    outShapefile = output + ".shp"

    driver = ogr.GetDriverByName("ESRI Shapefile")
    if os.path.exists(outShapefile):
        driver.DeleteDataSource(outShapefile)
    outDatasource = driver.CreateDataSource(outShapefile)
    if outDatasource is None:
        raise RuntimeError("Could not create shapefile: " + outShapefile)

    # Give the output layer the same spatial reference as the raster.
    spat_ref = osr.SpatialReference()
    spat_ref.ImportFromWkt(dataset.GetProjectionRef())

    outLayer = outDatasource.CreateLayer(outShapefile, srs=spat_ref)
    outLayer.CreateField(ogr.FieldDefn('cluster', ogr.OFTInteger))

    # Arguments: source band, mask band (the band itself), target layer,
    # index of the field receiving the pixel value (0 = 'cluster').
    gdal.Polygonize(band, band, outLayer, 0, ["8CONNECTED=8"], callback=None)

    # Flush the shapefile to disk and drop the GDAL handles. (The original
    # cleared an unrelated, unused name here, which released nothing.)
    outLayer = None
    outDatasource.Destroy()
    band = None
    dataset = None
    return bandArray

# Polygonise every GeoTIFF in the test directory, writing each shapefile next
# to its source raster under the same base name.
tif_dir = r"C:\PhD\Papers\2. MAUP\Test"
for file in os.listdir(tif_dir):
    filename = os.fsdecode(file)
    if filename.endswith(".tif"):
        # os.listdir yields bare file names; join them with the directory so
        # the loop also works when the working directory is somewhere else.
        tif_path = os.path.join(tif_dir, filename)
        toPolygon(tif_path, tif_path[:-4])