In [1]:
import geopandas as gpd
import os
import pandas as pd

# Switch to the project root: the relative "data/..." paths used in later cells are resolved against it.
# NOTE(review): hard-coded, machine-specific absolute path; presumably this is also what makes the
# local `src` package importable below — confirm before making it configurable.
os.chdir("/home/staszek/mgr/gradient/gradient/")

from src.embedders.osm_data_embedder import OSMDataEmbedder
from srai.regionalizers import geocode_to_region_gdf
from srai.embedders import CountEmbedder
from srai.regionalizers import H3Regionalizer
from srai.loaders.osm_loaders.filters import OsmTagsFilter
from srai.joiners import IntersectionJoiner
from typing import Tuple, Literal, Union, cast
import matplotlib.pyplot as plt

import seaborn as sns
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    f1_score,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

from IPython.display import display
from sklearn.manifold import TSNE
import plotly.express as px

pd.set_option("display.max_columns", None)

In [2]:
# Parameters cell for Papermill: the values below are the defaults used when the notebook is run directly.
city_name = "Warszawa"  # matched against the `gmi_nazwa` column of the accidents CSV; also used in the cache file name and plot titles
nominatim_city_name = "Warsaw, Poland"  # passed to `geocode_to_region_gdf` to obtain the city area
year = 2022  # accident year to keep; `get_accidents_gdf` also accepts "all" to skip the year filter

# Prepare data

The following cells prepare a single GeoDataFrame, `X_and_y_df`, that will be fed to the model.


In [3]:
# H3 resolution handed to `H3Regionalizer` when the city area is split into hexes.
h3_resolution = 10

# OSM tag filter handed to `OSMDataEmbedder`; the displayed embeddings contain one count column
# per key/value pair of these keys (e.g. `highway_bus_stop`, `railway_tram`, `route_bicycle`).
query: OsmTagsFilter = {"highway": True, "railway": True, "route": True}


def create_hex_gds() -> Tuple[gpd.GeoDataFrame, gpd.GeoDataFrame]:
    """Return the hex grid with OSM count features and a geometry-only view of it.

    The embeddings are cached as a shapefile under ``data/baseline-datasets/in/``
    (one file per ``city_name``). If the cache file exists it is read back and
    indexed by ``region_id``; otherwise the embeddings are computed with
    ``OSMDataEmbedder`` and written to the cache.

    Returns:
        A tuple ``(hex_and_features_gdf, hex_gdf)`` where ``hex_gdf`` holds only
        the ``geometry`` column of ``hex_and_features_gdf``.
    """
    filename = f"data/baseline-datasets/in/{city_name}-hex-and-features-gdf.shp"

    if os.path.exists(filename):
        # NOTE(review): ESRI Shapefile field names are limited to 10 characters, so long
        # feature column names may come back truncated from the cache — verify that a
        # cached run yields the same columns as a fresh one.
        hex_and_features_gdf: gpd.GeoDataFrame = gpd.read_file(filename).set_index(
            "region_id"
        )
    else:
        # Build the embedder only on a cache miss, so that `geocode_to_region_gdf`
        # is not called at all when the cached file is used.
        data_embedder = OSMDataEmbedder(
            geocode_to_region_gdf(nominatim_city_name),
            CountEmbedder(),
            H3Regionalizer(h3_resolution),
            query,
        )
        hex_and_features_gdf = data_embedder.make_embeddings()  # type: ignore
        hex_and_features_gdf.to_file(filename, index=True)

    hex_gdf: gpd.GeoDataFrame = hex_and_features_gdf[["geometry"]]  # type: ignore

    return hex_and_features_gdf, hex_gdf


# Build (or load from the cache) the hex grid, then show the feature table followed by the bare geometries.
hex_and_features_gdf, hex_gdf = create_hex_gds()
display(hex_and_features_gdf, hex_gdf)

Unnamed: 0_level_0,highway_bridleway,highway_bus_stop,highway_construction,highway_corridor,highway_crossing,highway_cycleway,highway_cyclist_waiting_aid,highway_elevator,highway_emergency_access_point,highway_footway,highway_give_way,highway_living_street,highway_milestone,highway_mini_roundabout,highway_motorway_junction,highway_passing_place,highway_path,highway_pedestrian,highway_planned,highway_platform,highway_primary,highway_primary_link,highway_proposed,highway_raceway,highway_residential,highway_secondary,highway_secondary_link,highway_service,highway_speed_camera,highway_steps,highway_stop,highway_street_lamp,highway_tertiary,highway_tertiary_link,highway_track,highway_traffic_mirror,highway_traffic_signals,highway_trunk,highway_trunk_link,highway_turning_circle,highway_turning_loop,highway_unclassified,railway_abandoned,railway_buffer_stop,railway_construction,railway_crossing,railway_crossing_box,railway_defect_detector,railway_derail,railway_dismantled,railway_disused,railway_engine_shed,railway_fuel,railway_halt,railway_hump_yard,railway_junction,railway_level_crossing,railway_loading_gauge,railway_loading_ramp,railway_milestone,railway_pit,railway_platform,railway_platform_edge,railway_preheating,railway_proposed,railway_radio,railway_rail,railway_rail_brake,railway_railway_crossing,railway_razed,railway_service_station,railway_signal,railway_signal_box,railway_site,railway_spur_junction,railway_station,railway_stop,railway_subway,railway_subway_entrance,railway_switch,railway_track_scale,railway_tram,railway_tram_crossing,railway_tram_level_crossing,railway_tram_level_crossing;tram_crossing,railway_tram_stop,railway_traverser,railway_turntable,railway_ventilation_shaft,railway_wash,railway_water_crane,railway_workshop,railway_yard,route_bicycle,route_ferry,route_historical,route_inline_skates,route_piste,geometry
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1
8a1f53530867fff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.08869 52.17905, 21.08831 52.17846..."
8a1f53cd648ffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.18528 52.23191, 21.18489 52.23132..."
8a1f52369517fff,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.89724 52.30729, 20.89685 52.30670..."
8a1f53d99b1ffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.99315 52.35134, 20.99276 52.35075..."
8a1f53c9c4affff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.04127 52.23364, 21.04088 52.23305..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8a1f52265b4ffff,0,1,0,0,2,0,0,0,0,11,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.94146 52.23045, 20.94107 52.22986..."
8a1f52348027fff,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.89153 52.25937, 20.89114 52.25878..."
8a1f5234b79ffff,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.90671 52.23233, 20.90632 52.23174..."
8a1f53522977fff,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.13924 52.19802, 21.13885 52.19743..."


Unnamed: 0_level_0,geometry
region_id,Unnamed: 1_level_1
8a1f53530867fff,"POLYGON ((21.08869 52.17905, 21.08831 52.17846..."
8a1f53cd648ffff,"POLYGON ((21.18528 52.23191, 21.18489 52.23132..."
8a1f52369517fff,"POLYGON ((20.89724 52.30729, 20.89685 52.30670..."
8a1f53d99b1ffff,"POLYGON ((20.99315 52.35134, 20.99276 52.35075..."
8a1f53c9c4affff,"POLYGON ((21.04127 52.23364, 21.04088 52.23305..."
...,...
8a1f52265b4ffff,"POLYGON ((20.94146 52.23045, 20.94107 52.22986..."
8a1f52348027fff,"POLYGON ((20.89153 52.25937, 20.89114 52.25878..."
8a1f5234b79ffff,"POLYGON ((20.90671 52.23233, 20.90632 52.23174..."
8a1f53522977fff,"POLYGON ((21.13924 52.19802, 21.13885 52.19743..."


In [4]:
def get_accidents_gdf(
    city_name: str, year: Union[Literal["all"], int] = "all"
) -> gpd.GeoDataFrame:
    """Load the accidents of one municipality as a point GeoDataFrame.

    Args:
        city_name: Value matched against the ``gmi_nazwa`` column of the CSV.
        year: Year matched against the ``year`` column, or ``"all"`` to keep
            every year.

    Returns:
        A GeoDataFrame in EPSG:4326, indexed by ``feature_id`` (the row label
        of the accident in the CSV), with point geometry built from the
        ``wsp_gps_x`` / ``wsp_gps_y`` columns.
    """
    all_accidents = pd.read_csv("data/wypadki-pl/accidents.csv")

    city_rows = all_accidents.loc[all_accidents["gmi_nazwa"] == city_name]
    # Turn the original row label into an explicit index named ``feature_id``.
    city_rows = city_rows.reset_index().rename(columns={"index": "feature_id"})
    city_rows = city_rows.set_index("feature_id")

    points = gpd.points_from_xy(city_rows.wsp_gps_x, city_rows.wsp_gps_y)
    accidents_gdf = gpd.GeoDataFrame(city_rows, geometry=points, crs="EPSG:4326")  # type: ignore

    if year == "all":
        return accidents_gdf

    return cast(gpd.GeoDataFrame, accidents_gdf[accidents_gdf.year == year])


# Accidents for the configured city and year.
accidents_gdf = get_accidents_gdf(city_name, year=year)
display(accidents_gdf)

# Pair every accident with the hex it intersects, then count the pairs per hex.
joiner = IntersectionJoiner()
hex_accident_pairs = joiner.transform(hex_gdf, accidents_gdf).reset_index()
accidents_joint_df = hex_accident_pairs.groupby("region_id").count()
display(accidents_joint_df)

Unnamed: 0_level_0,wsp_gps_x,wsp_gps_y,id_w_czas,czas_zdarzenia,woj_nazwa,pow_nazwa,gmi_nazwa,mie_nazwa,opis_zdarzenia,uczestnicy,zdarzenie_id,id_systemu_zr,year,month,day,geometry
feature_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
9169,20.995194,52.239333,2022-01-03,11:25,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'244142036': {'ofiary': {'Kierujący': {'obraz...,207053003,116809821,2022,1,3,POINT (20.99519 52.23933)
9170,20.932610,52.257611,2022-01-22,14:46,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'243947322': {'ofiary': {'Kierujący': {'obraz...,207061606,119808822,2022,1,22,POINT (20.93261 52.25761)
9172,20.962028,52.246389,2022-01-26,07:10,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'244398334': {'ofiary': {'Kierujący': {'obraz...,207063496,120302129,2022,1,26,POINT (20.96203 52.24639)
9173,20.995555,52.233055,2022-02-12,16:10,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Zderzenie pojazdów boczne,{'243903748': {'ofiary': {'Kierujący': {'obraz...,207076956,120340846,2022,2,12,POINT (20.99556 52.23305)
9174,20.930360,52.195416,2022-02-21,06:15,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'': {'ofiary': {'Pieszy': {'obrazenia': ['Cię...,207085814,120355522,2022,2,21,POINT (20.93036 52.19542)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11213,20.960444,52.193333,2022-08-12,11:50,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Zderzenie pojazdów boczne,{'244477071': {'ofiary': {'Kierujący': {'obraz...,207227713,121740537,2022,8,12,POINT (20.96044 52.19333)
11214,20.949444,52.305833,2022-09-21,23:30,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'': {'ofiary': {'Pieszy': {'obrazenia': ['30 ...,207261379,121815633,2022,9,21,POINT (20.94944 52.30583)
11215,20.992500,52.176610,2022-11-18,15:18,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,Najechanie na pieszego,{'244029812': {'ofiary': {'Kierujący': {'obraz...,207308587,124224824,2022,11,18,POINT (20.99250 52.17661)
11216,21.056972,52.264667,2022-05-16,01:00,MAZOWIECKIE,Warszawa,Warszawa,Warszawa,"Najechanie na słup, znak",{'244250310': {'ofiary': {'Kierujący': {'obraz...,207365991,120516061,2022,5,16,POINT (21.05697 52.26467)


Unnamed: 0_level_0,feature_id
region_id,Unnamed: 1_level_1
8a1f522418b7fff,1
8a1f52241d07fff,1
8a1f52244377fff,1
8a1f52244a0ffff,1
8a1f52244a5ffff,1
...,...
8a1f53d9b687fff,1
8a1f53d9b747fff,1
8a1f53d9b977fff,1
8a1f53d9b98ffff,1


In [5]:
# Attach the per-hex accident count (`feature_id`) to the feature table.
# Hexes without any accident are missing from `accidents_joint_df`, so the join leaves NaN there.
X_and_y_df = hex_and_features_gdf.join(accidents_joint_df)
# Assign the filled column back instead of calling `fillna(..., inplace=True)` on
# `X_and_y_df.feature_id`: the chained in-place form works on a temporary Series and
# does not update the frame once pandas Copy-on-Write is active.
X_and_y_df["feature_id"] = X_and_y_df["feature_id"].fillna(0)
display(X_and_y_df)

Unnamed: 0_level_0,highway_bridleway,highway_bus_stop,highway_construction,highway_corridor,highway_crossing,highway_cycleway,highway_cyclist_waiting_aid,highway_elevator,highway_emergency_access_point,highway_footway,highway_give_way,highway_living_street,highway_milestone,highway_mini_roundabout,highway_motorway_junction,highway_passing_place,highway_path,highway_pedestrian,highway_planned,highway_platform,highway_primary,highway_primary_link,highway_proposed,highway_raceway,highway_residential,highway_secondary,highway_secondary_link,highway_service,highway_speed_camera,highway_steps,highway_stop,highway_street_lamp,highway_tertiary,highway_tertiary_link,highway_track,highway_traffic_mirror,highway_traffic_signals,highway_trunk,highway_trunk_link,highway_turning_circle,highway_turning_loop,highway_unclassified,railway_abandoned,railway_buffer_stop,railway_construction,railway_crossing,railway_crossing_box,railway_defect_detector,railway_derail,railway_dismantled,railway_disused,railway_engine_shed,railway_fuel,railway_halt,railway_hump_yard,railway_junction,railway_level_crossing,railway_loading_gauge,railway_loading_ramp,railway_milestone,railway_pit,railway_platform,railway_platform_edge,railway_preheating,railway_proposed,railway_radio,railway_rail,railway_rail_brake,railway_railway_crossing,railway_razed,railway_service_station,railway_signal,railway_signal_box,railway_site,railway_spur_junction,railway_station,railway_stop,railway_subway,railway_subway_entrance,railway_switch,railway_track_scale,railway_tram,railway_tram_crossing,railway_tram_level_crossing,railway_tram_level_crossing;tram_crossing,railway_tram_stop,railway_traverser,railway_turntable,railway_ventilation_shaft,railway_wash,railway_water_crane,railway_workshop,railway_yard,route_bicycle,route_ferry,route_historical,route_inline_skates,route_piste,geometry,feature_id
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1
8a1f53530867fff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.08869 52.17905, 21.08831 52.17846...",0.0
8a1f53cd648ffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.18528 52.23191, 21.18489 52.23132...",0.0
8a1f52369517fff,0,0,0,0,0,0,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.89724 52.30729, 20.89685 52.30670...",0.0
8a1f53d99b1ffff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.99315 52.35134, 20.99276 52.35075...",0.0
8a1f53c9c4affff,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.04127 52.23364, 21.04088 52.23305...",0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8a1f52265b4ffff,0,1,0,0,2,0,0,0,0,11,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,3,0,0,0,5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.94146 52.23045, 20.94107 52.22986...",0.0
8a1f52348027fff,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,4,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.89153 52.25937, 20.89114 52.25878...",0.0
8a1f5234b79ffff,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((20.90671 52.23233, 20.90632 52.23174...",0.0
8a1f53522977fff,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,"POLYGON ((21.13924 52.19802, 21.13885 52.19743...",0.0


Let's see each hex with Y (`feature_id`, i.e. the number of accidents counted in the hex) and some of the X features:


In [None]:
# Take two feature columns (positions 1 and 2 among the columns that are neither
# the geometry nor the target) to show on the map next to the target.
is_geometry_or_target = (X_and_y_df.columns == "geometry") | (
    X_and_y_df.columns == "feature_id"
)
sample_cols = X_and_y_df.columns[~is_geometry_or_target][1:3]

map_columns = ["geometry", *sample_cols, "feature_id"]
X_and_y_df[map_columns].explore(style_kwds={"fillOpacity": 0.01})  # type: ignore

# Analysis

Before applying the model, let's look at some statistics.


In [None]:
# Distribution of the number of accidents per hex.
accident_counts = X_and_y_df["feature_id"]
hexes_per_count = accident_counts.value_counts().sort_index()

# One unit-wide bin centred on every integer count, so each bar sits exactly on its value
# (the default 10 equal-width bins do not line up with the integer counts).
bin_edges = [edge - 0.5 for edge in range(int(accident_counts.max()) + 2)]
accident_counts.plot(kind="hist", bins=bin_edges)

# Add values above each bar. `value_counts()` is ordered by frequency, so the x position
# must be the accident count itself, not the enumeration rank of the entry.
for n_accidents, n_hexes in hexes_per_count.items():
    plt.annotate(str(n_hexes), xy=(n_accidents, n_hexes), ha="center", va="bottom")

plt.xlabel("Num of accidents in hex")
plt.ylabel("Hex count")
plt.title(f"{city_name} - Number of accidents per hex")

plt.show()

In [None]:
# Summary statistics of the per-hex accident count.
X_and_y_df.feature_id.describe()

Usually there is a large imbalance towards places with no accidents, so let's turn this problem into a binary classification task: detecting whether a hex had an accident or not.


In [None]:
# Collapse the accident count into a binary label: every hex with at least one
# accident gets 1.0, all other values are left as they are.
has_accident = X_and_y_df["feature_id"] > 0
X_and_y_df.loc[has_accident, "feature_id"] = 1.0

In [None]:
# Distribution of the binary label (0 = no accident, 1 = at least one accident).
binary_labels = X_and_y_df["feature_id"]
hexes_per_label = binary_labels.value_counts().sort_index()

# One unit-wide bin centred on every label value, so each bar sits exactly on its label.
bin_edges = [edge - 0.5 for edge in range(int(binary_labels.max()) + 2)]
binary_labels.plot(kind="hist", bins=bin_edges)

# Add values above each bar. `value_counts()` is ordered by frequency, so the x position
# must be the label value itself, not the enumeration rank of the entry.
for label, n_hexes in hexes_per_label.items():
    plt.annotate(str(n_hexes), xy=(label, n_hexes), ha="center", va="bottom")

plt.xticks([0, 1])
# After binarisation the x axis is a class label, no longer a number of accidents.
plt.xlabel("Accident in hex (0 = no, 1 = yes)")
plt.ylabel("Hex count")
plt.title(f"{city_name} - Binary classification of hexes")

plt.show()

In [None]:
# Perform t-SNE dimensionality reduction on the feature columns (geometry and target excluded).
# t-SNE is stochastic: a fixed `random_state` makes the embedding reproducible between runs,
# in line with the seeded train/test split and classifier further down.
tsne = TSNE(n_components=2, n_jobs=-1, random_state=42)
X_tsne = tsne.fit_transform(X_and_y_df.drop(columns=["geometry", "feature_id"]))

In [None]:
# Scatter the two t-SNE components, coloured by the target column.
tsne_x, tsne_y = X_tsne[:, 0], X_tsne[:, 1]
fig = px.scatter(X_and_y_df, x=tsne_x, y=tsne_y, color=X_and_y_df["feature_id"])

layout_labels = {
    "title": "t-SNE Visualization",
    "xaxis_title": "t-SNE Component 1",
    "yaxis_title": "t-SNE Component 2",
}
fig.update_layout(**layout_labels)
fig.show()

In [None]:
# Model inputs: every column except the geometry and the target; the target is `feature_id`.
model_features = X_and_y_df.drop(columns=["geometry", "feature_id"])
model_target = X_and_y_df["feature_id"]

# 70/30 train/test split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    model_features,
    model_target,
    test_size=0.3,
    random_state=42,
    stratify=model_target,
)

# Linear SVM with balanced class weights.
svc = LinearSVC(dual=False, class_weight="balanced", random_state=11)
svc.fit(X_train, y_train)

In [None]:
# Score the classifier on the held-out split.
preds = svc.predict(X_test)

test_f1 = f1_score(y_test, preds)
test_accuracy = accuracy_score(y_test, preds)

print(f"F1 Score: {test_f1}")
print(f"Accuracy: {test_accuracy}")
print("Classification report:", classification_report(y_test, preds), sep="\n")

In [None]:
# Confusion matrix of the test split, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, preds)

heatmap_ax = sns.heatmap(cm, annot=True, fmt="d")
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix")
plt.show()

Legend for the map below:

1. Blue: correct predictions
2. Orange: False positive (expected no accident, got accident)
3. Red: False negative (expected accident, got no accident)


In [None]:
# Actual vs. predicted label for every test hex, with the hex geometry attached.
confusion_gdf = pd.DataFrame(y_test)
confusion_gdf["predicted"] = preds
confusion_gdf["is_correct"] = confusion_gdf.feature_id == preds
confusion_gdf = gpd.GeoDataFrame(
    data=confusion_gdf.join(hex_gdf, how="left"), geometry="geometry", crs="EPSG:4326"
)  # type: ignore

# The three groups drawn on the map.
correct_rows = confusion_gdf[confusion_gdf.is_correct]
false_negative_rows = confusion_gdf[
    (confusion_gdf.feature_id == 1.0) & (confusion_gdf.predicted == 0.0)
]
false_positive_rows = confusion_gdf[
    (confusion_gdf.feature_id == 0.0) & (confusion_gdf.predicted == 1.0)
]

# `correct_hexes` holds the map returned by `explore`; the later layers are added onto it.
correct_hexes = correct_rows.explore()  # type: ignore
correct_hexes = false_negative_rows.explore(  # type: ignore
    m=correct_hexes, style_kwds={"fillColor": "red", "color": "red"}
)  # false negative

false_positive_rows.explore(  # type: ignore
    m=correct_hexes, style_kwds={"fillColor": "orange", "color": "orange"}
)  # false positive