In [None]:
from collections import Counter
from datetime import datetime, timedelta, date
from functools import reduce

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from IPython.display import display

from util.config import versioned
from util.polish import TERYT
from scrapers.pkw.sources import election_date
from analysis.utils import drop_duplicates, filter_local_good, extract_companies
from analysis.people import people_merged

In [None]:
# Region selector for the whole notebook: passed to filter_local_good() and
# interpolated into the names of the exported CSV files.
# NOTE(review): presumably a TERYT voivodeship code; what None selects depends on
# filter_local_good — confirm.
teryt = None

In [None]:
# Load the merged people dataset; it is the source frame for the analysis below.
all_people = people_merged()

In [None]:
# Sanity check of the raw input: available columns and number of rows.
print(all_people.columns)
print(len(all_people))
# Narrow the dataset with filter_local_good for the configured `teryt`;
# the following cells work on `local_good`.
local_good = filter_local_good(all_people, teryt)
# Message reads "Found N people".
print(f"Znaleziono {len(local_good)} osób")

In [None]:
# Spot check: show every row of the unfiltered dataset whose last_name is "rozenek".
all_people[all_people["last_name"] == "rozenek"]

In [None]:
# Histogram of overall_score after rounding every value with the built-in round().
rounded_scores = local_good["overall_score"].map(round)
rounded_scores.value_counts()

In [None]:
# Companies connected to the selected people; the column names are Polish for
# "name" and "number of interesting people".
company_rows = extract_companies(local_good)
local_companies = pd.DataFrame(company_rows, columns=["KRS", "Nazwa", "Liczba ciekawych osób"])
display(local_companies)
# KRS numbers of the companies whose name consists of digits only.
numeric_name_mask = local_companies["Nazwa"].str.isnumeric()
local_companies.loc[numeric_name_mask, "KRS"].to_list()

In [None]:
# Count how often each election committee ("party" field) appears across all
# election entries; names are lower-cased and stripped, a missing party counts as "".
komitet_counter = Counter()
for person_elections in local_good["elections"].to_list():
    if person_elections is None:
        continue
    for candidacy in person_elections:
        committee = (candidacy["party"] or "").lower().strip()
        komitet_counter[committee] += 1
komitet_counter.most_common(5)

In [None]:
# Every party label that occurs in parties_simplified for the selected people.
all_parties = {
    party
    for person_parties in local_good["parties_simplified"].to_list()
    for party in person_parties
}
# Labels excluded from the party set below.
local = {
    'Blok Samorządowy Razem',
    'Edward Pietrzyk',
    'Razem dla Radomska',
    'Razem dla Skierniewic',
    'Zbigniew Burzyński',
    'Ziemia Bełchatowska',
    'KWW Plus',
}
parties = all_parties.difference(local)

def party_day_score(row, people=None):
    """Count, per party column, the employment spells that cover the date of ``row``.

    Meant to be used with ``DataFrame.apply(..., axis=1)`` on a frame indexed by
    timestamps whose columns are party names, the last column being the
    catch-all bucket.

    Args:
        row: one row of that frame. Only ``row.name`` (the timestamp) and
            ``row.index`` (the party columns) are read; the cell values are ignored.
        people: frame with ``employment`` (list of dicts holding ``employed_end``
            and ``employed_for``) and ``parties_simplified`` columns. Defaults to
            the notebook-level ``local_good``.

    Returns:
        ``pd.Series`` of integer counts, positionally aligned with ``row.index``.
        Every employment spell that covers the date adds one to its person's bucket.
    """
    if people is None:
        people = local_good
    day = row.name.date()
    columns = list(row.index)
    counts = [0] * len(columns)
    for _, person in people.iterrows():
        affiliations = person["parties_simplified"]
        # The first matching column wins, so column order decides the attribution
        # for people with several parties; no match falls into the last column.
        bucket = next(
            (idx for idx, col in enumerate(columns) if col in affiliations),
            len(columns) - 1,
        )
        for emp in person["employment"]:
            employed_end = emp["employed_end"]
            # "employed_for" is a number of years; a year is approximated as 365 days.
            employed_start = employed_end - timedelta(days=365 * float(emp["employed_for"]))
            if employed_start <= day <= employed_end:
                counts[bucket] += 1
    return pd.Series(counts)

# Per-party totals of the most recent show_graph() run; show_graph assigns it
# through a `global` statement and the pie-chart cell reads it.
parties_aggregated = None

def show_graph(unit, unit_name, start='2010/01/01', end='2025/10/01',
               party_order=("PiS", "KO", "PO", "PSL", "SLD", "AWS", "UW", "Inne")):
    """Draw a stacked area chart of positions held per party over time.

    The date range is sampled at frequency ``unit``; for every sampling date
    ``party_day_score`` counts the employment spells covering it, per party.

    Side effect: stores the per-party totals (sorted descending) in the
    module-level ``parties_aggregated``.

    Args:
        unit: pandas frequency alias for the sampling grid (e.g. "ME", "QE").
        unit_name: human-readable name of the unit, interpolated into the title.
        start: first date of the range.
        end: last date of the range.
        party_order: column order of the sampling frame. ``party_day_score``
            attributes a person to the first matching column, so earlier entries
            take precedence; the last entry is the catch-all bucket that also
            receives every party not listed here.
    """
    global parties_aggregated

    dates = pd.date_range(start=start, end=end, freq=unit)
    columns = list(party_order)
    # party_day_score reads only the index and the column labels, so a zero-filled
    # frame is enough; no random placeholder data is needed.
    sampling_grid = pd.DataFrame(0, index=dates, columns=columns)

    party_stats = sampling_grid.apply(party_day_score, axis=1)
    # party_day_score returns positionally indexed Series; restore the party labels.
    party_stats.columns = columns

    parties_aggregated = party_stats.sum(0).sort_values(ascending=False)
    # Stack the areas from the most to the least represented party.
    party_stats = party_stats[parties_aggregated.index]

    ax = party_stats.plot.area(figsize=(20, 4))
    ax.set_title(f"Liczba stanowisk kontrolowanych przez partię [próba na {unit_name}]")
    ax.legend(loc="center left", bbox_to_anchor=(1.0,0.5))

# Month-end sampling ("miesiąc" = month) over the default date range.
show_graph("ME", "miesiąc")

In [None]:
# Quarter-end sampling ("kwartał" = quarter) over the default date range.
show_graph("QE", "kwartał")

In [None]:
# Quarter-end sampling with the range extended back to 2001. When the notebook
# runs top to bottom, parties_aggregated is left holding the totals of this run.
show_graph("QE", "kwartał", start="2001/01/01")

In [None]:
# Pie chart of the per-party totals stored by the most recent show_graph() call,
# so the result depends on which show_graph cell ran last.
ax = parties_aggregated.plot.pie()
# Title reads "Total time of control by party".
ax.set_title("Łączny czas kontroli przez partie")

In [None]:
# Convert the employed_total durations to fractional years (365-day years) and
# store them on local_good; the ranking cell further down reads this column.
local_good["employed_total_years"] = local_good[("employed_total")].apply(lambda d: d.days / 365)
# Scatter plot: date of last employment (x) against total tenure in years (y).
ax = local_good[["last_employed", "employed_total_years"]].plot.scatter(x="last_employed", y="employed_total_years", style=".", figsize=(20, 6))
# Title reads "Date of last employment vs total tenure in public companies".
ax.set_title("Data ostatniego zatrudnienia vs Totalny staż w publicznych spółkach")
ax.set_ylabel("Długość stażu")
ax.set_xlabel("Data ostatniego zatrudnienia")

In [None]:
# Gap between the election and the start of work, in 365-day years, drawn
# against the index of local_good.
fig, ax = plt.subplots(figsize=(12, 8))
local_good["election_before_work"].map(lambda gap: gap.days / 365).plot(ax=ax)
ax.set_title("Od wyborów do rozpoczęcia pracy w latach")
ax.set_xlabel("Identyfikator osoby")
ax.set_ylabel("Lata")
ax.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Zero-length duration.
zero_delta = timedelta(0)

# Election-to-work gap expressed in 365-day years.
local_good["election_before_work_years"] = local_good["election_before_work"].map(
    lambda gap: gap.days / 365
)
# Largest gap; used as an offset so that negative gaps sort after every positive one.
longest_dur = local_good["election_before_work_years"].max()
local_good["election_before_work_normalized"] = local_good["election_before_work_years"].map(
    lambda years: years if not years < 0 else -years + longest_dur
)

def calculate_position(column, smallest_good, frame=None):
    """Rank the rows of ``frame`` by ``column``; position 0 is the best row.

    Args:
        column: name of the column to rank by.
        smallest_good: when True the smallest value gets position 0, otherwise
            the largest value does.
        frame: DataFrame to rank. Defaults to the notebook-level ``local_good``.

    Returns:
        ``pd.Series`` of integer positions, sorted by the frame's index labels so
        that results for different columns can be added together.
        Missing values are sorted last by ``sort_values`` and therefore receive
        the highest positions.
    """
    if frame is None:
        frame = local_good
    # A stable sort keeps tied rows in their original order, which makes the
    # ranking reproducible between runs.
    ordered = frame[column].sort_values(ascending=smallest_good, kind="stable")
    positions = pd.Series(range(len(ordered)), index=ordered.index)
    return positions.sort_index()

# Rank every person on five criteria. calculate_position gives 0 to the favoured
# end of each column: the smallest normalised election-to-work gap, and the
# largest mistake_odds, total tenure, first and last employment date.
a = calculate_position("election_before_work_normalized", smallest_good=True)
b = calculate_position("mistake_odds", smallest_good=False)
c = calculate_position("employed_total_years", smallest_good=False)
d = calculate_position("first_employed", smallest_good=False)
e = calculate_position("last_employed", smallest_good=False)

# Combined score: the sum of the five per-criterion positions (aligned on the index).
local_good["position"] = a + b + c + d + e
# TODO sort by position
# local_good.sort_values(by="position", inplace=True)
# Until the TODO is done the table is ordered by first employment date, newest first.
# Note that this reorders local_good in place.
local_good.sort_values(by="first_employed", inplace=True, ascending=False)

# Polish headers of the exported table, listed in the same order as the source
# columns selected from local_good below.
polish_headers = ["Pozycja", "Imię i nazwisko", "Data urodzenia", "Szansa na błąd - jeden na ...", "Od wyborów do pracy [lata]", "Łączny staż [lata]", "Pierwsze zatrudnienie", "Ostatnie zatrudnienie", "Historia"]
cleaned = pd.DataFrame({}, columns=polish_headers)
cleaned[polish_headers] = local_good[["position", "krs_name", "birth_date", "mistake_odds", "election_before_work_years", "employed_total_years", "first_employed", "last_employed", "history"]]

# Left-align the cells and keep line breaks inside them when rendering the table.
display(cleaned.style.set_properties(**{
    'text-align': 'left',
    'white-space': 'pre-wrap',
}))

# Set after the table above has been displayed, so these options only affect
# output rendered afterwards.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_colwidth', None)

In [None]:
# Export the people table. The file name embeds `teryt`, so with teryt = None the
# requested name is "people_woj_None.csv"; the final location is resolved by
# versioned.get_path.
local_output = versioned.get_path(f"people_woj_{teryt}.csv")
cleaned.to_csv(local_output,index=False)

# Export the companies table the same way.
companies_output = versioned.get_path(f"companies_woj_{teryt}.csv")
local_companies.to_csv(companies_output,index=False)