In [None]:
import json
from io import StringIO
from pathlib import Path
from urllib.parse import urlparse

import geckodriver_autoinstaller
import geopandas as gpd
import h3
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from srai.regionalizers import VoronoiRegionalizer, geocode_to_region_gdf
from tqdm import tqdm

In [None]:
# Short city code: selects the input JSON file, the matching row in
# nec_urls.csv and the prefix of every file written to ../output_data.
CITY = 'wro'

In [None]:
# Read the raw point export for the selected city.
city_json_path = Path(f"../input_data/{CITY}.json")
map_points_raw_data = json.loads(city_json_path.read_text())

# NOTE: despite its name, `data_wkt` holds the "latestWkid" value of the
# spatial reference; it is passed as the CRS of the raw x/y coordinates below.
spatial_reference = map_points_raw_data["spatialReference"]
data_wkt = spatial_reference["latestWkid"]
points = map_points_raw_data["features"]
data_wkt, len(points)

In [None]:
# Flatten the nested feature records into a table, then keep only the second
# dot-separated component of every flattened column name.
df = pd.json_normalize(points)
split_column_names = [column.split(".") for column in df.columns]
df.columns = [name_parts[1] for name_parts in split_column_names]
df

In [None]:
# Build point geometries in the source CRS and reproject them to EPSG:4326.
source_points = gpd.GeoSeries.from_xy(df.x, df.y, crs=data_wkt)
gdf = gpd.GeoDataFrame(data=df, geometry=source_points.to_crs(4326))
gdf.explore()

In [None]:
# Assign every polling-station point its resolution-15 H3 cell; the cell id is
# used later as the grouping key for stations that share a location.
# h3.latlng_to_cell expects (latitude, longitude). For a point in EPSG:4326
# `pt.y` is the latitude and `pt.x` the longitude, so `y` must come first.
gdf["h3"] = gdf.geometry.apply(lambda pt: h3.latlng_to_cell(pt.y, pt.x, 15))
gdf

In [None]:
# Pick the first row of the URL registry that matches the selected city.
nec_urls = pd.read_csv("../input_data/nec_urls.csv", sep=",")
city_data = nec_urls[nec_urls["city"] == CITY].iloc[0]
city_data

In [None]:
# Page opened first by the Selenium driver; taken from the city's registry row.
scraping_url = city_data.url

In [None]:
# Runs the geckodriver auto-installer before webdriver.Firefox() is created below.
geckodriver_autoinstaller.install()

In [None]:
delay = 5  # seconds

# Open the city results page in a fresh Firefox session.
driver = webdriver.Firefox()
driver.get(scraping_url)

# Block until an element with class "obkw" is present in the DOM.
districts_container_locator = (By.CLASS_NAME, "obkw")
WebDriverWait(driver, delay).until(
    EC.presence_of_element_located(districts_container_locator)
)

In [None]:
def get_polling_districts_with_urls(tr) -> tuple[int, str]:
    """Extract the polling district number and its link from one table row.

    Args:
        tr: A parsed ``<tr>`` element (BeautifulSoup tag) containing a link
            whose text is the district number.

    Returns:
        ``(district_number, href)`` where ``href`` is the link target exactly
        as written in the page.

    Raises:
        ValueError: If the row has no ``<a>`` element with an ``href``
            attribute, or the link text is not an integer.

    Note:
        A ``<div>`` nested in the link (if any) is removed from the parsed
        tree so that only the district number remains in the link text.
    """
    # `select_one` takes a CSS selector; the attribute requirement has to be
    # part of the selector (a `href=True` keyword is not applied as a filter).
    anchor = tr.select_one("a[href]")
    if anchor is None:
        raise ValueError("Table row does not contain a link with an href attribute")

    nested_div = anchor.find("div")
    if nested_div is not None:
        nested_div.decompose()

    return int(anchor.text), anchor["href"]

In [None]:
# Parse the currently loaded page and locate the table inside div.obkw.
page_soup = BeautifulSoup(driver.page_source, "html.parser")
districts_table = page_soup.select_one("div.obkw").select_one("table")

# The first <tr> is skipped (presumably the header row); every remaining row
# yields a (district number, relative URL) pair.
district_rows = districts_table.find_all("tr")[1:]
polling_districts = dict(map(get_polling_districts_with_urls, district_rows))

polling_districts

In [None]:
# Keep only scheme and host of the current page; the district links collected
# above are appended to this prefix when scraping.
parse_url = urlparse(driver.current_url)
base_url = parse_url.scheme + "://" + parse_url.netloc
base_url

In [None]:
def _read_turnout_value(turnout_tbl, label: str) -> int:
    """Return the integer in column 1 of the first turnout row whose column-0 text contains both ``label`` and "Prezydenta"."""
    description = turnout_tbl[0]
    # na=False: rows with a missing description simply do not match.
    matching_rows = turnout_tbl[
        description.str.contains(label, na=False)
        & description.str.contains("Prezydenta", na=False)
    ]
    return int(matching_rows.iloc[0][1])


def get_votes_from_polling_district(url) -> tuple[int, int, dict]:
    """Scrape the presidential results of a single polling district.

    Loads ``url`` in the module-level Selenium ``driver``, waits up to five
    seconds for the element with id ``obkw_can_cont_4_1`` and parses the
    turnout and candidates tables from the rendered page.

    Args:
        url: Absolute URL of the polling district results page.

    Returns:
        ``(voters_possible, voters_voted, votes_per_candidate)`` where the
        first value comes from the "Liczba wyborców" row, the second from the
        "Liczba kart" row, and the third maps each value of the
        "Nazwisko i imiona" column to its "Liczba głosów na kandydata" value.

    Raises:
        selenium.common.exceptions.TimeoutException: If the candidates
            container does not appear within the delay.
        IndexError: If an expected turnout row is missing.
    """
    # president
    driver.get(url)
    delay = 5  # seconds

    WebDriverWait(driver, delay).until(
        EC.presence_of_element_located((By.ID, "obkw_can_cont_4_1"))
    )

    soup = BeautifulSoup(driver.page_source, "html.parser")

    turnout_div = soup.select_one("div.turnout").select_one("table")
    turnout_tbl = pd.read_html(StringIO(str(turnout_div)))[0]
    voters_possible = _read_turnout_value(turnout_tbl, "Liczba wyborców")
    voters_voted = _read_turnout_value(turnout_tbl, "Liczba kart")

    candidates_div = soup.select_one("div#obkw_can_cont_4_1").select_one("table")
    candidates_tbl = pd.read_html(StringIO(str(candidates_div)))[0]
    votes_per_candidate = (
        candidates_tbl[["Nazwisko i imiona", "Liczba głosów na kandydata"]]
        .set_index("Nazwisko i imiona")["Liczba głosów na kandydata"]
        .to_dict()
    )
    return voters_possible, voters_voted, votes_per_candidate

In [None]:
# Per-district results keyed by polling district number.
votes_per_district = {}
votes_possible_per_district = {}
votes_voted_per_district = {}

for district_no in tqdm(gdf["Numer_obwodu"].values):
    district_url = base_url + polling_districts[district_no]
    # The scraper returns (eligible voters, ballots, votes per candidate).
    (
        votes_possible_per_district[district_no],
        votes_voted_per_district[district_no],
        votes_per_district[district_no],
    ) = get_votes_from_polling_district(district_url)

In [None]:
# One record per district: its number, one column per candidate, and turnout counts.
df_votes_per_district = pd.DataFrame(
    [
        {
            "idx": district_no,
            **candidate_votes,
            "voters_total": votes_possible_per_district[district_no],
            "voters_voted": votes_voted_per_district[district_no],
        }
        for district_no, candidate_votes in votes_per_district.items()
    ]
)
df_votes_per_district

In [None]:
# Attach the scraped results to the station points via the district number.
joined_full_data = gdf.merge(
    df_votes_per_district,
    how="inner",
    left_on="Numer_obwodu",
    right_on="idx",
)
joined_full_data

In [None]:
# Candidate names are taken from the first scraped district's results.
first_district_votes = list(votes_per_district.values())[0]
candidates_names = list(first_district_votes)
candidates_names

In [None]:
# Per-district table: location key, station details, raw votes and turnout counts.
district_columns = [
    "h3",
    "Numer_obwodu",
    "okw_nazwa",
    "okw_siedziba_okw",
    "okw_adres",
    *candidates_names,
    "voters_total",
    "voters_voted",
]

# .copy() makes this an independent frame: the columns added below are then
# written to it unambiguously instead of to a slice of `joined_full_data`
# (which raises SettingWithCopyWarning).
polling_districts_data = joined_full_data[district_columns].copy()

# Sum of the candidates' votes; denominator of the shares below.
polling_districts_data["total_votes"] = polling_districts_data[candidates_names].sum(
    axis=1
)
for candidate_name in candidates_names:
    polling_districts_data[f"{candidate_name}_%"] = (
        polling_districts_data[candidate_name] / polling_districts_data["total_votes"]
    )

# Stored as a fraction (0-1), like the candidate shares.
polling_districts_data["turnout_%"] = (
    polling_districts_data["voters_voted"] / polling_districts_data["voters_total"]
)

polling_districts_data

In [None]:
# Sum votes and turnout counts over all districts that share an H3 cell.
vote_count_columns = ["h3", *candidates_names, "voters_voted", "voters_total"]
h3_votes_data = joined_full_data[vote_count_columns].groupby("h3").sum().reset_index()

# Shares are fractions of the summed candidate votes within each cell.
h3_votes_data["total_votes"] = h3_votes_data[candidates_names].sum(axis=1)
for name in candidates_names:
    share_column = f"{name}_%"
    h3_votes_data[share_column] = h3_votes_data[name] / h3_votes_data["total_votes"]

voted, eligible = h3_votes_data["voters_voted"], h3_votes_data["voters_total"]
h3_votes_data["turnout_%"] = voted / eligible

h3_votes_data

In [None]:
# One point per H3 cell: the first geometry encountered in each group.
h3_points = joined_full_data[["h3", "geometry"]]
first_point_per_h3 = h3_points.groupby("h3").first()

geo_distinct_data = gpd.GeoDataFrame(first_point_per_h3)
geo_distinct_data = geo_distinct_data.set_crs(4326, allow_override=True)
geo_distinct_data

In [None]:
# Region obtained by geocoding the city's `geocode` value from the registry row;
# used as the area passed to the Voronoi regionalizer and as the map contour layer.
area = geocode_to_region_gdf(city_data.geocode)
area

In [None]:
# Voronoi regions seeded with the distinct station points and applied to `area`.
# The result is later joined back on `region_id` == h3 (see the merge below).
voronoi_regions = VoronoiRegionalizer(seeds=geo_distinct_data).transform(area)
voronoi_regions

In [None]:
# All outputs share the "../output_data/<CITY>_" prefix.
output_prefix = f"../output_data/{CITY}"

# Tabular results as JSON records.
polling_districts_data.to_json(
    f"{output_prefix}_polling_districts_data.json", orient="records"
)
h3_votes_data.to_json(f"{output_prefix}_h3_votes_data.json", orient="records")

# Geometries as GeoJSON files.
geo_distinct_data.to_file(f"{output_prefix}_geo_data.geojson")
voronoi_regions.to_file(f"{output_prefix}_voronoi_data.geojson")

In [None]:
# List of district numbers located in each H3 cell.
district_numbers_per_h3 = polling_districts_data.groupby("h3")["Numer_obwodu"].apply(
    list
)

# Combine aggregated votes with the station point, its Voronoi region and the
# district numbers; both geometry columns get suffixed by the second merge.
full_data_geo = h3_votes_data.merge(geo_distinct_data.reset_index(), on="h3")
full_data_geo = full_data_geo.merge(
    voronoi_regions.reset_index(), left_on="h3", right_on="region_id"
)
full_data_geo = full_data_geo.merge(district_numbers_per_h3, on="h3")
full_data_geo = full_data_geo.rename(
    columns={"geometry_x": "point", "geometry_y": "voronoi"}
)
full_data_geo

In [None]:
import pydeck as pdk
from branca.colormap import LinearColormap

In [None]:
# Seven-step colour ramp used for the candidate-difference maps.
geyser_hexes = "#008080 #70a494 #b4c8a8 #f6edbd #edbb8a #de8a5a #ca562c".split()

# Seven-step colour ramp used for the turnout map.
sunset_hexes = "#f3e79b #fac484 #f8a07e #eb7f86 #ce6693 #a059a0 #5c53a5".split()

# URL of the marker image used by the icon layers (empty here).
ICON_URL = ""

In [None]:
def generate_map_for_candidates(pair):
    """Render a head-to-head map for two candidates and save it as HTML.

    Every Voronoi region is coloured by the difference between the two
    candidates' vote shares (first minus second) and its opacity grows with
    the absolute difference. The map is written to
    ``../output_data/<CITY>_<initials 0>_<initials 1>.html`` and the file is
    then patched with extra CSS.

    Args:
        pair: Two candidate names; each must have a ``"<name>_%"`` column in
            the module-level ``full_data_geo`` frame.

    Returns:
        None. The result is the HTML file on disk.
    """
    percent_pair = [f"{col}_%" for col in pair]

    # Work on a copy: the share columns are overwritten with formatted strings
    # below, which must never leak into the shared `full_data_geo` frame that
    # the other pairs (and the turnout map) read their numbers from.
    sub_gdf = gpd.GeoDataFrame(full_data_geo.copy(), geometry="voronoi")

    sub_gdf["diff"] = sub_gdf[percent_pair[0]] - sub_gdf[percent_pair[1]]
    sub_gdf["diff_str"] = sub_gdf["diff"].apply(lambda x: f"{(x * 100.0):.2f}")

    # Opacity scale in [0, 1]; guard against a zero maximum (all regions tied).
    max_abs_diff = sub_gdf["diff"].abs().max()
    if max_abs_diff > 0:
        sub_gdf["diff_abs_scale"] = sub_gdf["diff"].abs() / max_abs_diff
    else:
        sub_gdf["diff_abs_scale"] = 0.0

    # The tooltip shows the shares as percentages with two decimals.
    for col in percent_pair:
        sub_gdf[col] = sub_gdf[col].apply(lambda x: f"{(x * 100.0):.2f}")

    sub_gdf["turnout_str"] = sub_gdf["turnout_%"].apply(lambda x: f"{(x * 100.0):.2f}")

    # Keep the colour scale at least +/-0.01 wide on each side of zero so the
    # colormap index stays strictly increasing.
    min_diff = min(sub_gdf["diff"].min(), -0.01)
    max_diff = max(sub_gdf["diff"].max(), 0.01)

    # Seven anchors (three negative, zero, three positive) for the seven colours.
    min_steps = [min_diff, min_diff * 2 / 3, min_diff / 3]
    max_steps = [max_diff / 3, max_diff * 2 / 3, max_diff]
    full_steps = [*min_steps, 0, *max_steps]

    cmap = LinearColormap(
        geyser_hexes,
        index=[round(x, 3) for x in full_steps],
        vmin=round(min_diff, 3),
        vmax=round(max_diff, 3),
    )

    sub_gdf["rgb"] = sub_gdf["diff"].apply(cmap.rgb_bytes_tuple)
    sub_gdf["okw_html"] = sub_gdf["Numer_obwodu"].apply(
        lambda x: ", ".join(map(str, x))
    )

    pt_data = pd.DataFrame(
        dict(
            lon=[pt.x for pt in sub_gdf["point"]],
            lat=[pt.y for pt in sub_gdf["point"]],
        )
    )

    view = pdk.data_utils.compute_view(pt_data)

    area_contour = pdk.Layer(
        "GeoJsonLayer",
        area,
        opacity=0.8,
        pickable=True,
        stroked=True,
        filled=False,
        get_line_width=15,
        line_width_min_pixels=1,
        get_line_color=[0, 0, 0, 128],
    )

    geojson = pdk.Layer(
        "GeoJsonLayer",
        sub_gdf[
            [
                "voronoi",
                *percent_pair,
                "okw_html",
                "diff_str",
                "rgb",
                "diff_abs_scale",
                "turnout_str",
            ]
        ],
        opacity=0.8,
        pickable=True,
        stroked=True,
        filled=True,
        get_line_width=5,
        line_width_min_pixels=1,
        get_fill_color="[rgb[0], rgb[1], rgb[2], 25 + (230 * diff_abs_scale)]",
        get_line_color=[0, 0, 0, 25],
    )

    icon_data = {
        "url": ICON_URL,
        "width": 242,
        "height": 242,
        "anchorY": 242,
    }

    pt_data["icon_data"] = [icon_data for _ in pt_data.index]

    icon_layer = pdk.Layer(
        opacity=0.8,
        type="IconLayer",
        pickable=False,
        data=pt_data,
        get_icon="icon_data",
        get_size=1,
        size_scale=10,
        size_min_pixels=10,
        size_max_pixels=120,
        get_position=["lon", "lat"],
    )

    # Same scale as `cmap`, expressed in percentage points for the legend.
    legend_cmap = LinearColormap(
        geyser_hexes,
        index=[round(x * 100, 2) for x in full_steps],
        vmin=round(min_diff * 100, 2),
        vmax=round(max_diff * 100, 2),
    )

    r = pdk.Deck(
        layers=[area_contour, geojson, icon_layer],
        initial_view_state=view,
        map_style="road",
        tooltip={
            "html": f"""
            <b>Obwodowa komisja wyborcza nr: {{okw_html}}</b> <br/>
            <b>{pair[0]}:</b> {{{percent_pair[0]}}}% <br/>
            <b>{pair[1]}:</b> {{{percent_pair[1]}}}% <br/>
            <b>Różnica:</b> {{diff_str}}% <br/>
            <b>Frekwencja:</b> {{turnout_str}}% <br/>
            """,
            "style": {"color": "white", "font-family": '"Open Sans", sans-serif'},
        },
        description=f"""
        <div style='font-family: "Open Sans", sans-serif;'>
        <h3>{city_data.city_name}: {pair[0]} vs {pair[1]}</h3>
        Różnica w % <br/>
        <span style="font-size: 0.8em">Wartości ujemne - przewaga {pair[1]} <br/> Wartości dodatnie - przewaga {pair[0]}</span> <br/>
        {legend_cmap._repr_html_()} <br/>
        <span style="font-size: 0.8em">(Obszary obwodów są przybliżone z wykorzystaniem diagramu Voronoia)</span>
        </div>
        """,
    )

    # File name uses the upper-cased initials of both candidates.
    candidate_0 = "".join([x.upper()[0] for x in pair[0].split()])
    candidate_1 = "".join([x.upper()[0] for x in pair[1].split()])
    file_name = f"../output_data/{CITY}_{candidate_0}_{candidate_1}.html"

    r.to_html(file_name)

    # Post-process the generated page. The page contains Polish characters, so
    # the encoding is stated explicitly instead of relying on the locale default.
    with open(file_name, "r", encoding="utf-8") as in_file:
        buf = in_file.readlines()

    with open(file_name, mode="w", encoding="utf-8") as f:
        for line in buf:
            if "</style>" in line:
                # The description-box rule is appended to the existing stylesheet.
                # The font @import goes into its own <style> element: CSS only
                # honours @import when it precedes every other rule, so placing
                # it at the end of the existing stylesheet would be ignored.
                line = (
                    """
                    .deck-json-description-box {
                        z-index: 2 !important;
                    }
                    """
                    + line
                    + """
                    <style>
                    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:ital@0;1&display=swap');
                    </style>
                    """
                )
            f.write(line)

In [None]:
# Generate one map for every unordered pair of candidates, in list order.
for first_position, first_candidate in enumerate(candidates_names):
    for second_candidate in candidates_names[first_position + 1 :]:
        generate_map_for_candidates([first_candidate, second_candidate])

In [None]:
def generate_map_for_turnout():
    """Render the turnout map and save it as ``../output_data/<CITY>_turnout.html``.

    Every Voronoi region is coloured by its turnout (fraction 0-1 on the
    sunset palette) and its opacity grows with turnout relative to the highest
    turnout found. The written file is then patched with extra CSS.

    Returns:
        None. The result is the HTML file on disk.
    """
    # Work on a copy so nothing done here can alter the shared `full_data_geo`.
    sub_gdf = gpd.GeoDataFrame(full_data_geo.copy(), geometry="voronoi")

    sub_gdf["turnout_str"] = sub_gdf["turnout_%"].apply(lambda x: f"{(x * 100.0):.2f}")
    cmap = LinearColormap(
        sunset_hexes,
        vmin=0,
        vmax=1,
    )

    sub_gdf["rgb"] = sub_gdf["turnout_%"].apply(cmap.rgb_bytes_tuple)
    sub_gdf["okw_html"] = sub_gdf["Numer_obwodu"].apply(
        lambda x: ", ".join(map(str, x))
    )

    max_turnout = sub_gdf["turnout_%"].max()

    # Opacity scale relative to the best turnout. Only `turnout_scale` is
    # written: the real "turnout_%" values must stay untouched. Guard against
    # a zero maximum to avoid dividing by zero.
    if max_turnout > 0:
        sub_gdf["turnout_scale"] = sub_gdf["turnout_%"] / max_turnout
    else:
        sub_gdf["turnout_scale"] = 0.0

    # Overall turnout: total ballots over total eligible voters.
    avg_turnout = sub_gdf['voters_voted'].sum() / sub_gdf['voters_total'].sum()

    pt_data = pd.DataFrame(
        dict(
            lon=[pt.x for pt in sub_gdf["point"]],
            lat=[pt.y for pt in sub_gdf["point"]],
        )
    )

    view = pdk.data_utils.compute_view(pt_data)

    area_contour = pdk.Layer(
        "GeoJsonLayer",
        area,
        opacity=0.8,
        pickable=True,
        stroked=True,
        filled=False,
        get_line_width=15,
        line_width_min_pixels=1,
        get_line_color=[0, 0, 0, 128],
    )

    geojson = pdk.Layer(
        "GeoJsonLayer",
        sub_gdf[
            [
                "voronoi",
                "okw_html",
                "rgb",
                "turnout_scale",
                "turnout_str",
            ]
        ],
        opacity=0.8,
        pickable=True,
        stroked=True,
        filled=True,
        get_line_width=5,
        line_width_min_pixels=1,
        get_fill_color="[rgb[0], rgb[1], rgb[2], 25 + (230 * turnout_scale)]",
        get_line_color=[0, 0, 0, 25],
    )

    icon_data = {
        "url": ICON_URL,
        "width": 242,
        "height": 242,
        "anchorY": 242,
    }

    pt_data["icon_data"] = [icon_data for _ in pt_data.index]

    icon_layer = pdk.Layer(
        opacity=0.8,
        type="IconLayer",
        pickable=False,
        data=pt_data,
        get_icon="icon_data",
        get_size=1,
        size_scale=10,
        size_min_pixels=10,
        size_max_pixels=120,
        get_position=["lon", "lat"],
    )

    # Same palette as `cmap`, expressed in percent for the legend.
    legend_cmap = LinearColormap(
        sunset_hexes,
        vmin=0,
        vmax=100,
    )

    r = pdk.Deck(
        layers=[area_contour, geojson, icon_layer],
        initial_view_state=view,
        map_style="road",
        tooltip={
            "html": """
            <b>Obwodowa komisja wyborcza nr: {okw_html}</b> <br/>
            <b>Frekwencja:</b> {turnout_str}% <br/>
            """,
            "style": {"color": "white", "font-family": '"Open Sans", sans-serif'},
        },
        description=f"""
        <div style='font-family: "Open Sans", sans-serif;'>
        <h3>{city_data.city_name}: frekwencja</h3>
        Wartość w % <br/>
        <span style="font-size: 0.8em">Maksymalna frekwencja: {100*max_turnout:.2f}% <br/>
        Średnia frekwencja {100*avg_turnout:.2f}%</span> <br/>
        {legend_cmap._repr_html_()} <br/>
        <span style="font-size: 0.8em">(Obszary obwodów są przybliżone z wykorzystaniem diagramu Voronoia)</span>
        </div>
        """,
    )

    file_name = f"../output_data/{CITY}_turnout.html"

    r.to_html(file_name)

    # Post-process the generated page. The page contains Polish characters, so
    # the encoding is stated explicitly instead of relying on the locale default.
    with open(file_name, "r", encoding="utf-8") as in_file:
        buf = in_file.readlines()

    with open(file_name, mode="w", encoding="utf-8") as f:
        for line in buf:
            if "</style>" in line:
                # The description-box rule is appended to the existing stylesheet.
                # The font @import goes into its own <style> element: CSS only
                # honours @import when it precedes every other rule, so placing
                # it at the end of the existing stylesheet would be ignored.
                line = (
                    """
                    .deck-json-description-box {
                        z-index: 2 !important;
                    }
                    """
                    + line
                    + """
                    <style>
                    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:ital@0;1&display=swap');
                    </style>
                    """
                )
            f.write(line)

In [None]:
# Write the turnout map to ../output_data/<CITY>_turnout.html.
generate_map_for_turnout()