In [None]:
import osmnx as ox
import numpy as np
from sklearn.cluster import DBSCAN
from shapely.geometry import Point
from pathlib import Path
import os
import geopandas as gpd
from geopy.geocoders import Nominatim
from loguru import logger

from mclights.utils.get_data import get_data_osmx_or_local
from mclights.utils.coordinate import convert_to_points, get_city

# Project layout, resolved relative to the parent of the current working
# directory.
project_dir = Path.cwd().parent
data_dir = project_dir.joinpath("data")
temp_data_dir = project_dir.joinpath("temp_data")
raw_data_dir = project_dir.joinpath("raw_data")

In [None]:
# Spellings of the brand name to match; they differ only in the apostrophe
# character (ASCII apostrophe, acute accent, right single quotation mark).
names_combination = [
    "McDonald's",
    "McDonald´s",
    "McDonald’s",
]

# Tag filter passed to the loader together with the raw-data file path.
# NOTE(review): presumably the helper reads the .gpkg when it exists and
# otherwise queries OSM via osmnx -- confirm against mclights.utils.get_data.
mcdonalds_tags = {"amenity": "fast_food", "name": names_combination}
mcdonalds = get_data_osmx_or_local(raw_data_dir / "mcdonalds.gpkg", mcdonalds_tags)

In [None]:
# Keep only rows whose name is one of the accepted spellings and that have a
# geometry, then reduce the frame to the geometry and the city tag.
mcdonalds = mcdonalds[mcdonalds["name"].isin(names_combination)]
mcdonalds = mcdonalds[~mcdonalds["geometry"].isna()]
mcdonalds = mcdonalds[["geometry", "addr:city"]]
mcdonalds = mcdonalds.rename(columns={"addr:city": "city_name"})

# NOTE(review): presumably reduces every geometry to a single point (a later
# cell reads `.geometry.x` / `.geometry.y`) -- confirm in mclights.utils.coordinate.
mcdonalds["geometry"] = mcdonalds["geometry"].apply(convert_to_points)

geolocator = Nominatim(user_agent="mcdonalds_locator")
mask_missing_cities = mcdonalds["city_name"].isna()
logger.info(
    f"There are {mask_missing_cities.sum()} missing cities out of {len(mcdonalds)} locations, "
    f"which is {mask_missing_cities.sum()/len(mcdonalds):.1%}."
)

# Fill the missing city names through the geocoder. The lookup is done in
# EPSG:4326; afterwards the frame is projected to EPSG:2180, which the
# distance check below and the saved file use.
# NOTE(review): assumes get_city expects lon/lat input and returns NaN/None
# when no city is found -- confirm in mclights.utils.coordinate.
mcdonalds = mcdonalds.to_crs(epsg=4326)
mcdonalds.loc[mask_missing_cities, "city_name"] = mcdonalds.loc[mask_missing_cities, "geometry"].apply(
    lambda geom: get_city(geom, geolocator)
)
mcdonalds = mcdonalds.to_crs(epsg=2180)

# Manual fallback: the lookup above does not yield a city for the McDonald's
# in Lubieszyn, so any still-unnamed location within 500 CRS units of the
# hard-coded EPSG:2180 point is labelled by hand.
# The comparison must be parenthesized: `&` binds tighter than `<`, so without
# the parentheses the distance is compared against `500 & isna()` and the
# mask never selects the intended row.
lubieszyn_missing_mask = (
    (mcdonalds.geometry.distance(Point(194277.344, 630278.733)) < 500)
    & mcdonalds["city_name"].isna()
)
mcdonalds.loc[lubieszyn_missing_mask, "city_name"] = "Lubieszyn"

# Persist the cleaned locations; the layer name is the file stem ("mcdonalds").
mcdonalds_path = data_dir / "mcdonalds.gpkg"
mcdonalds.to_file(mcdonalds_path, layer=mcdonalds_path.stem, driver="GPKG")

In [None]:
import matplotlib.pyplot as plt

# Sanity check for near-duplicate locations: cluster the points and plot only
# those that share a cluster with at least one other point.
temp = mcdonalds.copy()

# eps is expressed in the units of the frame's CRS. With min_samples=1 every
# point is assigned to a cluster, so a cluster of size one is simply a
# location with no neighbour within eps.
coords = list(zip(temp.geometry.x, temp.geometry.y))
db = DBSCAN(eps=50, min_samples=1)
labels = db.fit_predict(coords)
temp["cluster_id"] = labels

# Keep only rows whose cluster id occurs more than once, ordered by city.
shares_cluster = temp["cluster_id"].duplicated(keep=False)
temp = temp.loc[shares_cluster].sort_values(by="city_name")

fig, ax = plt.subplots(figsize=(10, 10))
temp.plot(ax=ax)
plt.show()