# DBScan

https://scikit-learn.org/stable/modules/clustering.html#dbscan

Inputs that are used: <br/>
**eps** = Negihbourhood size will do in intervalls of 50 from 100 to 500<br/>
**min_sample** = Minimum points in radius (including centerpoint) in order for centerpoint to be considered a core point. In this notebook I loop through 1, 3 and 5.

In [1]:
import numpy as np
import geopandas as gpd
from sklearn.cluster import DBSCAN
from osgeo import gdal, ogr, osr
import os
import pandas as pd
import rasterio as rio
from rasterio import features

In [None]:
gdf = gpd.read_file(r"C:\PhD\Papers\2. MAUP\Namibia\Points_for_input.shp")
#raster = rio.open(r"C:\PhD\Papers\2. MAUP\Benin\clusters\pop\pop100m.tif")

In [None]:
pt = gdf.to_crs({ 'init': 'EPSG:3395'})
pt["X"] = pt["geometry"].x
pt["Y"] = pt["geometry"].y
pt = pt[['X', 'Y']]

In [None]:
numpis=pt.to_numpy()
df = pd.DataFrame(numpis)

In [None]:
#DBScan with 7 different buffers and 3 different cores
#Converts points to raster
x = [500, 450, 400, 350, 300, 250, 200]
y = [1, 3, 5]
for core in y:
    for val in x: 
        df = df.drop(columns=['geometry'], errors = 'ignore')
        db = DBSCAN(eps=val, min_samples=core).fit(numpis)
        core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
        core_samples_mask[db.core_sample_indices_] = True
        labels = db.labels_
        df["clusters"] = db.labels_
        df["clusters"].replace({-1: df["clusters"].max()+1}, inplace=True)
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)

        df = df.rename(columns={0: "X",1:"Y"})

        print('Estimated number of clusters: %d' % n_clusters_)

        gdf = gpd.GeoDataFrame(
            df, geometry=gpd.points_from_xy(df.X, df.Y))

        gdf = gdf[["clusters","geometry"]]
        gdf = gdf.rename(columns={'geom': 'geometry'})
        gdf.crs = {'init' :'epsg:3395'}
        gdf = gdf.to_crs({'init': 'EPSG:4326'})
        gdf = gdf.rename(columns={'geometry': 'geom'})

        dff = gdf[['clusters', 'geom']]
        shapes = ((g, v) for v, g in zip(dff['clusters'].values, dff['geom'].values))

        with rio.open(raster.name) as src:
            image = features.rasterize(
                        shapes,
                        out_shape=src.shape,
                        transform=src.transform,
                        all_touched=False)
            image = image.astype('float64')

            out_meta = src.meta

            out_meta.update({"driver": "GTiff",
                             "height": src.height,
                             "width": src.width,
                             "transform": src.transform,
                             'dtype': rio.float64,
                             "crs": src.crs,
                             "compress":'LZW',
                             "nodata": 0})

        with rio.open("clusters_" +str(core) +'_'+ str(val) + "_2.tif", 'w', **out_meta) as dst:
            dst.write(image, indexes=1)      

In [None]:
def toPolygon(Raster, output):
   
    Raster = gdal.Open(Raster)
    
    band = Raster.GetRasterBand(1)
    bandArray = band.ReadAsArray()

    outShapefile = output
    
    driver = ogr.GetDriverByName("ESRI Shapefile")
    if os.path.exists(outShapefile+".shp"):
        driver.DeleteDataSource(outShapefile+".shp")
    outDatasource = driver.CreateDataSource(outShapefile+ ".shp")
    
    spat_ref = osr.SpatialReference()
    proj = Raster.GetProjectionRef()
    spat_ref.ImportFromWkt(proj)
    
    outLayer = outDatasource.CreateLayer(outShapefile+ ".shp", srs=spat_ref)
    newField = ogr.FieldDefn("clusters", ogr.OFTInteger)
    outLayer.CreateField(newField)
    
    gdal.Polygonize(band, band, outLayer, 0, ["8CONNECTED=8","GROUPBY=clusters"], callback=None)
    outDatasource.Destroy()
    sourceRaster = None

In [None]:
#Polygonize rasters
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".tif"):
        toPolygon(filename, filename[:-4])

In [None]:
#Collecting geometries and adds id
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile = gpd.read_file(filename)
        
        maximum = inFile["clusters"].max()
        multi = inFile.loc[inFile["clusters"] < maximum]
        dissolved = multi.dissolve(by="clusters")
        single = inFile.loc[inFile["clusters"] == maximum]
        
        combined = gpd.GeoDataFrame(pd.concat([dissolved, single], ignore_index=True))
        combined["id"] = np.arange(len(combined))+1
        
        combined.to_file(filename)

In [None]:
#Calculates area (convex hull)
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile = gpd.read_file(filename)
        
        hull = inFile.dissolve("id").convex_hull.reset_index().set_geometry(0)
        reproj = hull.to_crs({ 'init': 'EPSG:3395'})
        reproj["Area"] = reproj.area/1000000
        
        
        inFile = inFile[['id', 'geometry']]
        joined = inFile.merge(reproj, on='id')
        
        joined_clean = joined[["id","geometry","Area"]]
        joined_clean["Country"] = 'Benin'
        joined_clean.to_file(filename)

In [None]:
#Adding population, ElecPop and NTL

gdf = gpd.read_file(r"C:\PhD\Papers\2. MAUP\Benin\Points.shp")
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile = gpd.read_file(filename)
        
        points_polys = gpd.sjoin(inFile, gdf, how="left")
        stats_pt  = points_polys.groupby('id_left').agg(
        Pop  = ('Pop','sum'),
        ElecPop  = ('ElecPop','sum'),
        Nightlight  = ('NightLight','max'))
        stats_pt.reset_index(inplace=True)
        to_merge = stats_pt.rename(columns={'id_left': "id"})
        
        joined = inFile.merge(to_merge, on='id')
        joined.to_file(filename)

In [None]:
#Summarizing
df = pd.DataFrame(columns=['Country', 'Buffer', 'Core','Area','PopDensity','MaxPop','AveragePop', 'Nrs', 'nonClustered'])
buffer = 200
core = 1
i = 0
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile = gpd.read_file(filename)
        
        subset_df = inFile[inFile["Area"] < 0.01]

        column_count = subset_df.count()
        
        df.loc[i] = ['Namibia'] + [buffer] + [core] + [inFile["Area"].sum()] + [inFile["Pop"].sum()/inFile["Area"].sum()] + [inFile["Pop"].max()] + [inFile["Pop"].mean()] + [len(inFile.index)] + [column_count[1]]

        if buffer < 500:
            buffer = buffer + 50
            core = core
        else:
            buffer = 200
            core = core + 2
        
        i = i + 1
        

In [None]:
#Adding inputs
for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile = gpd.read_file(filename)
        
        hull = inFile.dissolve("id").convex_hull.reset_index().set_geometry(0)
        hull = hull.rename(columns={0: "geometry"})
        convexhull=gpd.GeoDataFrame(data=hull, columns=["id","geometry"])
        
        points_polys = gpd.sjoin(convexhull, gdf, how="left")
        
        stats_pt  = points_polys.groupby('id').agg(
        WindVel  = ('WindVel','mean'),
        GHI  = ('GHI','mean'),
        TravelHours  = ('TravelHour','min'),
        Elevation  = ('Elevation','mean'),
        ResidentialDemandTierCustom  = ('Residentia','mean'),
        SubstationDist  = ('Substation','min'),
        CurrentHVLineDist  = ('CurrentHVL','min'),
        CurrentMVLineDist  = ('CurrentMVL','min'),
        RoadDist  = ('RoadDist','min'),
        TransformerDist  = ('Transforme','min'),
        PlannedHVLineDist  = ('CurrentMVL','min'),
        PlannedMVLineDist  = ('PlannedMVL','min'),
        HydropowerDist  = ('Hydropower','min'),
        Hydropower  = ('Hydropow_1','min'),
        HydropowerFID  = ('Hydropow_2','min'))
        
        stats_pt.reset_index(inplace=True)
        joined = inFile.merge(stats_pt, on='id')
        
        points_polys = gpd.sjoin(gdf, convexhull, how="left")
        stats_pt = points_polys[points_polys["id"] >= 0]
        stats_pt = stats_pt.groupby(['id', 'LandCover']).size().sort_values(ascending=False).reset_index(name='count')
        stats_pt = stats_pt.loc[stats_pt.groupby('id')['count'].idxmax()]
        joined = joined.merge(stats_pt, on='id')
        
        points_polys = gpd.sjoin(gdf, convexhull, how="left")
        stats_pt = points_polys[points_polys["id"] >= 0]
        stats_pt = stats_pt.groupby(['id', 'Slope']).size().sort_values(ascending=False).reset_index(name='count')
        stats_pt = stats_pt.loc[stats_pt.groupby('id')['count'].idxmax()]
        clusters = joined.merge(stats_pt, on='id')
        
        clusters["IsUrban"] = 0  
        clusters["PerCapitaDemand"] = 0
        clusters["PerCapitaDemand"] = 0
        clusters["HealthDemand"] = 0     
        clusters["EducationDemand"] = 0     
        clusters["AgriDemand"] = 0  
        clusters["CommercialDemand"] = 0
        clusters["Conflict"] = 0       
        clusters["ElectrificationOrder"] = 0
        clusters["ResidentialDemandTier1"] = 7.74
        clusters["ResidentialDemandTier2"] = 43.8
        clusters["ResidentialDemandTier3"] = 160.6
        clusters["ResidentialDemandTier4"] = 423.4
        clusters["ResidentialDemandTier5"] = 598.6
 
        clusters["X_deg"] = clusters.geometry.centroid.x
        clusters["Y_deg"] = clusters.geometry.centroid.y
        
        del clusters['count_x']
        del clusters['count_y']
        
        df1 = pd.DataFrame(clusters.drop(columns='geometry'))
        df1.to_csv(filename[:-3]+'csv', index = False)

In [2]:
urb_clus = gpd.read_file(r"C:\PhD\Papers\2. MAUP\Malawi\urb_clus.gpkg")


  for feature in features_lst:


In [6]:
urb_clus

Unnamed: 0,Area,Country,id,Population,NightLight,ElecPop,WindVel,GHI,TravelHours,Elevation,...,RoadDist,X_deg,Y_deg,TransformerDist,PlannedMVLineDist,PlannedHVLineDist,HydropowerDist,Hydropower,HydropowerFID,geometry
0,7.787302,Malawi,155337,25262.826562,4.449424,23586.237137,3.625362,1777.359717,0.822727,653.363636,...,0.0,35.710547,-16.079255,0,0.0,15.136,15.454628,168.383,3,"MULTIPOLYGON (((35.69426 -16.07356, 35.69426 -..."
1,8.890651,Malawi,159970,15128.845075,4.367579,11734.043818,4.258345,1865.818083,0.915,106.8,...,0.0,34.891935,-16.463018,0,0.0,4.098,36.20983,266.645,1,"MULTIPOLYGON (((34.86508 -16.47856, 34.86508 -..."
2,5.307499,Malawi,76595,19371.397235,8.041087,19371.397235,4.371353,1994.539797,0.443333,1224.0,...,0.0,33.73988,-13.538252,0,0.0,13.283,24.323265,3862.43,57,"MULTIPOLYGON (((33.73340 -13.54277, 33.73190 -..."
3,9.358802,Malawi,160924,24820.474811,2.894187,24791.135241,4.08614,1889.138062,1.710606,57.272727,...,0.0,35.25516,-16.921471,0,0.0,0.0,19.52876,110.36,0,"MULTIPOLYGON (((35.24525 -16.91432, 35.24676 -..."
4,7.597147,Malawi,85820,27954.533718,9.687197,27525.293424,4.752832,1961.192094,1.066667,1196.25,...,0.0,32.893601,-13.796276,0,0.0,0.0,40.861145,220.88,18,"MULTIPOLYGON (((32.87089 -13.79512, 32.87089 -..."
5,8.923768,Malawi,33044,15494.086483,3.819453,14368.592993,4.973541,1872.245255,0.44,545.4,...,0.0,34.280338,-11.609852,0,0.0,0.0,19.115214,413.489,30,"MULTIPOLYGON (((34.25440 -11.62191, 34.25440 -..."
6,29.887225,Malawi,130635,113486.004344,7.408401,113066.337602,3.593625,1759.965001,0.007292,898.90625,...,0.0,35.33955,-15.388472,0,0.0,0.0,24.908108,2533.68,55,"MULTIPOLYGON (((35.30259 -15.40272, 35.30259 -..."
7,6.929465,Malawi,140772,20486.453449,3.663053,20486.453449,3.787512,1832.334766,0.918519,680.333333,...,0.0,34.517111,-15.600348,0,0.0,5.792,40.066536,193.422,6,"MULTIPOLYGON (((34.50175 -15.57272, 34.50175 -..."
8,61.827398,Malawi,150012,518479.410919,37.058022,518479.410919,3.874509,1793.540014,0.000735,1025.264706,...,0.0,35.012315,-15.784584,0,0.0,0.0,32.893927,193.422,6,"MULTIPOLYGON (((34.96842 -15.77689, 34.96842 -..."
9,4.05568,Malawi,143206,15637.712168,7.552449,15637.712168,4.04879,1833.886642,0.083333,731.666667,...,0.0,35.017613,-15.65219,0,0.0,6.264,26.374508,193.422,6,"MULTIPOLYGON (((35.01009 -15.64197, 35.00941 -..."


In [7]:
#Adding Urban
#urb_clus = gpd.read_file(r"C:\PhD\Papers\2. MAUP\Benin\Urb_clus.shp")

urb_clus_csv = urb_clus.rename(columns={'Population': "Pop"})
urb_clus_csv = urb_clus_csv.rename(columns={'NightLight': "Nightlight", 'TravelHour': "TravelHours", 'Residentia': "ResidentialDemandTierCustom", 
                                            'Substation': "SubstationDist", 'CurrentHVL': "CurrentHVLineDist", 'CurrentMVL': "CurrentMVLineDist",
                                            'PlannedHVL': "PlannedHVLineDist",'PlannedMVL': "PlannedMVLineDist",'Transforme': "TransformerDist", 
                                            'Hydropower': "HydropowerDist", 'Hydropow_1': "Hydropower", 'Hydropow_2': "HydropowerFID", 
                                            'PerCapitaD': "PerCapitaDemand", 'HealthDema': "HealthDemand", 'EducationD': "EducationDemand", 
                                            'Electrific': "ElectrificationOrder", 'Commercial': "CommercialDemand", 
                                            'Resident_1': "ResidentialDemandTier1", 'Resident_2': "ResidentialDemandTier2", 
                                            'Resident_3': "ResidentialDemandTier3", 'Resident_4': "ResidentialDemandTier4",
                                            'Resident_5': "ResidentialDemandTier5"})
urb_cluster = urb_clus[["id", "Area", "Country", "Population", "ElecPop", "NightLight", "geometry"]]
urb_cluster = urb_cluster.rename(columns={'Population': "Pop", "NightLight":"Nightlight"})

for file in os.listdir(r"C:\OnSSET\OnSSET_GIS_Extraction_notebook\OnSTOVE"):
    filename = os.fsdecode(file)
    if filename.endswith(".shp"):
        inFile_shp = gpd.read_file(filename)
        inFile_csv = gpd.read_file(filename[:-3]+"csv")
        
        max_id = inFile_shp["id"].max()
        urb_cluster["id2"] = np.arange(len(urb_cluster))+max_id+1
        id_column = urb_cluster[["id","id2"]]
        urb_clus_csv = urb_clus_csv.merge(id_column, on='id')
        urb_clus_csv["id"] = urb_clus_csv["id2"]
        urb_cluster["id"] = urb_cluster["id2"]
        del urb_cluster["id2"]
        del urb_clus_csv["id2"]

        urb_file = pd.DataFrame(urb_clus_csv.drop(columns='geometry'))
        
        rdf_shp = gpd.GeoDataFrame(pd.concat([inFile_shp, urb_cluster], ignore_index=True))
        
        rdf_csv = pd.concat([inFile_csv, urb_clus_csv])
        del rdf_csv["geometry"]
        
        rdf_shp.to_file(filename)
        rdf_csv.to_csv(filename[:-3]+"csv", index = False)

  for feature in features_lst:


InvalidIndexError: Reindexing only valid with uniquely valued Index objects