# DBScan

https://scikit-learn.org/stable/modules/clustering.html#dbscan

Inputs that are used: <br/>
**eps** = Neighbourhood size (search radius around each point); looped in intervals of 50 from 200 to 500<br/>
**min_samples** = Minimum number of points within the radius (including the centre point) for that centre point to be considered a core point. In this notebook I loop through 1, 3 and 5.

In [None]:
import numpy as np
import geopandas as gpd
from sklearn.cluster import DBSCAN
from osgeo import gdal, ogr, osr
import os
import pandas as pd
import rasterio as rio
from rasterio import features

In [None]:
# Input data: the point layer that gets clustered, and the raster whose
# grid (shape / transform / CRS) is reused as the template for the cluster
# rasters written further down.
# NOTE(review): judging by the file name this is a 100 m population grid
# for Namibia -- confirm.
points_path = r"C:\PhD\Papers\2. MAUP\Namibia\points.shp"
pop_raster_path = r"C:\PhD\Papers\2. MAUP\Namibia\100m\100mPop.tif"

raster = rio.open(pop_raster_path)
gdf = gpd.read_file(points_path)

In [None]:
# Reproject the points to EPSG:3395 (World Mercator) and keep only the
# projected X/Y coordinates, so that the DBSCAN `eps` radius is expressed
# in the units of that projection.
# The CRS is given as an authority string: the {'init': 'EPSG:xxxx'} dict
# form relies on the deprecated PROJ "+init=" syntax and triggers a
# FutureWarning in current pyproj/geopandas.
pt = gdf.to_crs("EPSG:3395")
pt["X"] = pt.geometry.x
pt["Y"] = pt.geometry.y
pt = pt[['X', 'Y']]

In [None]:
# Coordinate matrix handed to DBSCAN (one row per point), plus a DataFrame
# copy of it that later receives the cluster labels. The DataFrame gets the
# default integer column labels.
numpis = pt.to_numpy()
df = pd.DataFrame(data=numpis)

In [None]:
# Notebook display of the cluster count from the most recent DBSCAN run.
# NOTE(review): `n_clusters_` is only assigned inside the clustering loop in
# a later cell, so this cell raises NameError when the notebook is run from
# top to bottom -- run it after the loop (or move it below).
n_clusters_

In [None]:
# DBSCAN parameter sweep: for every (min_samples, eps) combination, cluster
# the projected points, burn the cluster id of each point into a raster on
# the template grid and write it to "clusters_<min_samples>_<eps>_2.tif"
# in the current working directory.
x = [200, 250, 300, 350, 400, 450, 500]  # eps values (units of the X/Y coordinates)
y = [1, 3, 5]                            # min_samples values

# --- loop-invariant preparation -------------------------------------------
# Give the coordinate columns their names once; the drop only matters when
# the cell is re-run on a `df` that picked up a 'geometry' column earlier.
df = df.drop(columns=['geometry'], errors='ignore')
df = df.rename(columns={0: "X", 1: "Y"})

# The point locations do not depend on the DBSCAN parameters, so build and
# reproject them a single time instead of once per iteration.
# NOTE(review): the target CRS is hard-coded to EPSG:4326, i.e. the template
# raster is assumed to be in EPSG:4326 -- confirm.
points_wgs84 = gpd.GeoSeries(
    gpd.points_from_xy(df["X"], df["Y"]), crs="EPSG:3395"
).to_crs("EPSG:4326")

# Read the template grid once; only its metadata is needed.
with rio.open(raster.name) as src:
    out_shape = src.shape
    out_transform = src.transform
    out_meta = src.meta
    out_meta.update({"driver": "GTiff",
                     "height": src.height,
                     "width": src.width,
                     "transform": src.transform,
                     'dtype': rio.float64,
                     "crs": src.crs,
                     "count": 1,  # exactly one band is written below
                     "compress": 'LZW',
                     "nodata": 0})

# --- parameter sweep ------------------------------------------------------
for core in y:
    for val in x:
        db = DBSCAN(eps=val, min_samples=core).fit(numpis)
        labels = db.labels_
        df["clusters"] = labels

        # Noise points carry the label -1 and are not a cluster.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)

        # DBSCAN numbers clusters from 0, but 0 is also the raster's fill
        # and nodata value, so burning the raw labels makes the first
        # cluster indistinguishable from "no data". Shift by one instead:
        # noise (-1) -> 0 (nodata), cluster k -> k + 1.
        burn_values = (labels + 1).tolist()
        shapes = zip(points_wgs84, burn_values)

        image = features.rasterize(
            shapes,
            out_shape=out_shape,
            transform=out_transform,
            fill=0,
            all_touched=False,
            dtype='float64')

        out_name = "clusters_%d_%d_2.tif" % (core, val)
        with rio.open(out_name, 'w', **out_meta) as dst:
            dst.write(image, indexes=1)

In [None]:
def toPolygon(Raster, output, fieldName="clusters"):
    """Polygonize band 1 of a raster into an ESRI Shapefile.

    Connected regions of equal pixel value (8-connectedness) become
    polygons; each polygon's pixel value is stored in an integer attribute.
    The band is also used as its own mask, so pixels with value 0 produce
    no polygons.

    Parameters
    ----------
    Raster : str
        Path of the raster file to read.
    output : str
        Output path WITHOUT extension; ".shp" is appended. An existing
        shapefile at that path is deleted first.
    fieldName : str, optional
        Name of the integer attribute field that receives the pixel value.

    Returns
    -------
    numpy.ndarray
        The pixel values of band 1.

    Raises
    ------
    RuntimeError
        If the raster cannot be opened or the shapefile cannot be created.
    """
    dataset = gdal.Open(Raster)
    if dataset is None:
        raise RuntimeError("GDAL could not open raster: %s" % Raster)

    band = dataset.GetRasterBand(1)
    bandArray = band.ReadAsArray()

    outShapefile = output + ".shp"

    driver = ogr.GetDriverByName("ESRI Shapefile")
    if os.path.exists(outShapefile):
        driver.DeleteDataSource(outShapefile)
    outDatasource = driver.CreateDataSource(outShapefile)
    if outDatasource is None:
        raise RuntimeError("Could not create shapefile: %s" % outShapefile)

    # The output layer inherits the raster's spatial reference.
    spat_ref = osr.SpatialReference()
    spat_ref.ImportFromWkt(dataset.GetProjectionRef())

    outLayer = outDatasource.CreateLayer(outShapefile, srs=spat_ref)
    outLayer.CreateField(ogr.FieldDefn(fieldName, ogr.OFTInteger))

    # Field index 0 is the field created just above. "8CONNECTED=8" is the
    # only option passed: the former "GROUPBY=fieldName" entry was a literal
    # string and is not a Polygonize option.
    gdal.Polygonize(band, band, outLayer, 0, ["8CONNECTED=8"], callback=None)

    # Release the GDAL/OGR handles explicitly (layer before its data source)
    # so the shapefile is flushed to disk and the raster is closed.
    outLayer = None
    outDatasource.Destroy()
    band = None
    dataset = None
    return bandArray

In [None]:
# Polygonize every GeoTIFF in the folder; each shapefile is written next to
# its raster, with the same base name.
raster_dir = r"C:\PhD\Papers\2. MAUP\Test"
for filename in os.listdir(raster_dir):
    if filename.endswith(".tif"):
        # os.listdir returns bare file names, so join them with the folder;
        # otherwise this only works when the folder is the working directory.
        tif_path = os.path.join(raster_dir, filename)
        # toPolygon needs the attribute field name as its third argument;
        # "clusters" matches the column name used when the rasters were made.
        toPolygon(tif_path, os.path.splitext(tif_path)[0], "clusters")