BI or DIE - Level-up Session II 27.09.2022

Creating fast but convincing Geo-Analytics-Visualization

# Part I - preparation

## Importing modules

In [None]:
%load_ext lab_black

In [None]:
import pandas as pd
import numpy as np

from shapely.geometry import Point, MultiPoint, Polygon, LineString
import shapefile

import folium as fl # folium for the creating of interactive maps

from pyproj import Transformer

from sklearn.cluster import DBSCAN

## Getting open data, cleansing and merging

In [None]:
def import_plz(
    shp_path="plz-5stellig/plz-5stellig.shp",
    csv_path="plz-5stellig/plz_einwohner.csv",
):
    """Load the zip-code (PLZ) area shapes and the per-PLZ point table.

    The default files were downloaded from
    https://www.suche-postleitzahl.org/downloads at the beginning of August 2022.

    Parameters
    ----------
    shp_path : str
        Path of the shapefile with the PLZ areas.
    csv_path : str
        Path of the comma-separated file with one row per PLZ; it must contain
        a ``plz`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df_plz``: one row per shape with the shapefile's attribute fields
        plus a ``coords`` column holding the shape's list of points.
        ``df_plz2``: the CSV content with ``plz`` read as string.
    """
    # the context manager closes the underlying shapefile handles again
    with shapefile.Reader(shp_path) as sf:
        # the first entry of sf.fields is skipped so that the remaining names
        # line up with the values of each record
        fields = [field[0] for field in sf.fields][1:]
        records = sf.records()
        # NOTE(review): `points` is the flat vertex list of a shape; if an area
        # consists of several parts their boundaries are not kept - confirm
        # that this is acceptable for the areas that get plotted
        shps = [shape.points for shape in sf.shapes()]

    # fill dataframe with data
    df_plz = pd.DataFrame(columns=fields, data=records)
    df_plz = df_plz.assign(coords=shps)

    # read csv for point-data; plz is read as text right away because an
    # integer round trip would strip leading zeros (e.g. "01067" -> "1067")
    df_plz2 = pd.read_csv(csv_path, sep=",", decimal=".", dtype={"plz": str})

    return df_plz, df_plz2


df_plz, df_plz2 = import_plz()

# data cleansing and selection
# shrink to Dortmund; na=False treats a missing note as "no match" instead of
# producing a boolean mask with NaN entries that cannot be used for indexing
plz_do = df_plz[df_plz.note.str.contains("Dortmund", na=False)].reset_index(drop=True)

# population density: inhabitants per square kilometre
plz_do["einw_pro_qkm"] = plz_do["einwohner"] / plz_do["qkm"]

# shrink second dataframe to the selected zip codes from first dataframe;
# drop=True avoids carrying the old row numbers along as an extra column
df_plz22 = df_plz2[df_plz2.plz.isin(plz_do["plz"])].reset_index(drop=True)

# creating buckets with suitable colors for visualization
# the last bucket is open-ended and the first one includes 0, so every density
# gets a color instead of NaN when it falls outside a fixed range
buckets = [0, 1600, 2300, 3000, float("inf")]
buckets_name = ["red", "orange", "yellow", "green"]
plz_do["col_einw"] = pd.cut(
    plz_do.einw_pro_qkm, buckets, labels=buckets_name, include_lowest=True
)

## creating first map

This map shows the zip code areas of Dortmund. The color code symbolizes the population density.

In [None]:
# base map for the Dortmund zip-code view
m = fl.Map(location=[51.5149, 7.4650], zoom_start=11, tiles="cartodbpositron")

# layer 1: one polygon per zip-code area, colored by its density bucket
fg = fl.FeatureGroup(name="PLZ_Gebiete")
for outline, bucket_color in zip(plz_do.coords, plz_do.col_einw):
    # the color is bound as a default argument so that each polygon keeps its
    # own value rather than the one from the last loop iteration
    fl.GeoJson(
        Polygon(outline),
        style_function=lambda feature, col=bucket_color: {
            "fillColor": col,
            "color": col,
        },
    ).add_to(fg)
fg.add_to(m)

# layer 2: a marker per selected zip code, the note is shown as popup
fg = fl.FeatureGroup(name="Mittelpunkte")
for lat, lon, note in zip(df_plz22.lat, df_plz22.lon, df_plz22.note):
    fl.Marker(location=[lat, lon], popup=note, icon=fl.Icon()).add_to(fg)
fg.add_to(m)

# layer switcher, then write the interactive map to disk
fl.LayerControl().add_to(m)
m.save("visualization_01.html")
# alternatively, end the cell with `m` for a view inside of the notebook

## Geo Analytics part

Transform the latitude/longitude coordinates into a metric (projected) representation so that the DBSCAN algorithm can identify clusters based on distances in meters. The clusters reveal areas with a high density of zip codes - which corresponds to a high density of people living there.

In [None]:
# EPSG:4326 (WGS84 latitude/longitude) -> EPSG:25832 (ETRS89 / UTM zone 32N),
# so that distances between points are expressed in meters
transformer = Transformer.from_crs(4326, 25832)

projected_x, projected_y = transformer.transform(df_plz2.lat, df_plz2.lon)
df_plz2["X"] = projected_x
df_plz2["Y"] = projected_y

# coordinate matrix with one row per zip code
X = df_plz2[["X", "Y"]].to_numpy()

# eps is given in the units of the projected coordinates
clustering = DBSCAN(eps=1500, min_samples=2)
clustering.fit(X)
df_plz2["cluster"] = clustering.labels_

In [None]:
# number of points per cluster for eps = 1,500 meters and min_samples = 2;
# the label -1 collects the points that DBSCAN classified as noise
df_plz2["cluster"].value_counts(dropna=False)

In [None]:
# overview map (low zoom level) for all zip codes in df_plz2
m = fl.Map(location=[51.5149, 7.4650], zoom_start=5, tiles="cartodbpositron")

# two toggleable layers of circle markers: first only the points DBSCAN
# assigned to a cluster (label != -1), then every point
clustered = df_plz2[df_plz2.cluster != -1]
for layer_name, points in (
    ("Mittelpunkte cluster", clustered),
    ("Mittelpunkte all", df_plz2),
):
    fg = fl.FeatureGroup(name=layer_name)
    for lat, lon, note in zip(points.lat, points.lon, points.note):
        # CircleMarker is drawn as a vector circle and has no icon option, so
        # no fl.Icon() is created or passed here
        fl.CircleMarker(
            location=[lat, lon],
            radius=5,
            color="blue",
            popup=note,
        ).add_to(fg)
    fg.add_to(m)

fl.LayerControl().add_to(m)
m.save("visualization_02.html")
# alternatively, end the cell with `m` for a view inside of the notebook