In [None]:
import pandas as pd

from analysis.people import find_all_matches, con
from util.config import versioned

In [None]:
# Load every candidate match and keep only the rows whose overall score is above 10.5.
matched_all = find_all_matches(con).loc[
    lambda matches: matches["overall_score"] > 10.5
]
def drop_duplicates(df, *cols):
    """Drop repeated values column by column while leaving missing values alone.

    The columns are processed in the order given. For each one, only the first
    row carrying a given non-null value survives; rows where that column is
    null are always kept.

    Args:
        df: Frame to filter.
        *cols: Names of the columns to de-duplicate on, one after another.

    Returns:
        The filtered frame with a fresh 0..n-1 index.
    """
    for column in cols:
        values = df[column]
        is_first_occurrence = ~values.duplicated()
        df = df.loc[is_first_occurrence | values.isna()]
    return df.reset_index(drop=True)
# De-duplicate on each name column in turn; rows with a null name in a column
# are kept by drop_duplicates, and the index is reset to 0..n-1.
matched_all = drop_duplicates(matched_all, "krs_name", "pkw_name", "wiki_name")
matched_all

In [None]:
# Read the versioned "people_krs.jsonl" records into a DataFrame and display it.
people_krs = pd.DataFrame(versioned.read_jsonl("people_krs.jsonl"))
people_krs

In [None]:
# Read the versioned "people_pkw.jsonl" records into a DataFrame and display it.
people_pkw = pd.DataFrame(versioned.read_jsonl("people_pkw.jsonl"))
people_pkw

In [None]:
import matplotlib.pyplot as plt

# NOTE(review): this cell used to read from `df`, which no visible cell defines,
# so it raised NameError on Restart & Run All. `people_krs` is assumed to be the
# frame that holds 'employed_end' — confirm before relying on this chart.
# The dates are parsed into a local Series rather than written back, so the
# frame displayed earlier in the notebook is not modified by this cell.
employed_end = pd.to_datetime(people_krs['employed_end'])

# Count rows per end year and order the counts chronologically (by year).
yearly_counts = employed_end.dt.year.value_counts().sort_index()

# Bar chart of the yearly counts, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 5))
yearly_counts.plot(
    kind='bar',
    color='coral',
    title='Number of Events per Year',
    ax=ax,
)

ax.set_xlabel("Year")
ax.set_ylabel("Number of Occurrences")
plt.xticks(rotation=0) # Keep year labels horizontal
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Count how many rows carry each 'employed_krs' value, smallest count first.
company_counts = people_krs['employed_krs'].value_counts().sort_values()

# The bars are 'employed_krs' values (presumably company identifiers, going by
# the variable name), not years — the title and x label used to be copied from
# the per-year chart and mislabelled this figure.
fig, ax = plt.subplots(figsize=(8, 5))
company_counts.plot(
    kind='bar',
    color='coral',
    title='Number of Records per Company',
    ax=ax,
)

ax.set_xlabel("Company (employed_krs)")
ax.set_ylabel("Number of Occurrences")
plt.xticks(rotation=0) # Keep tick labels horizontal
plt.tight_layout()
plt.show()

company_counts

In [None]:
import matplotlib.pyplot as plt

# From here on `unique_chance` holds the complement of the matched
# "unique_chance" column, i.e. the value the charts label as the chance of an error.
unique_chance = 1 - matched_all["unique_chance"]
fig, ax = plt.subplots(figsize=(12, 7))
ax.grid(True)
ax.set_title('Osoby posortowane według szansy na błąd', fontsize=16)
# Axis labels so the figure can be read on its own; the x values are the Series index.
ax.set_xlabel('Indeks osoby', fontsize=12)
ax.set_ylabel('Szansa na błąd', fontsize=12)
ax.set_yscale('log')
ax.plot(unique_chance)
plt.show()  # also keeps the Line2D repr out of the cell output

In [None]:
import numpy as np

def calculate_poisson_binomial_history(probabilities: pd.Series) -> pd.DataFrame:
    """
    Build the running Poisson-binomial distribution, adding one trial at a time.

    Args:
        probabilities: A pandas Series of per-trial success (e.g. error)
                       probabilities. Values are read positionally, in order.

    Returns:
        A pandas DataFrame of shape (n + 1, n + 1) where:
        - The row 'i' is the number of trials (people) included so far, 0..n.
        - The column 'k' is the number of successes (errors).
        - The value at [i, k] is the probability of exactly k successes among
          the first i trials; row 0 is [1, 0, ..., 0].
    """
    trial_count = len(probabilities)
    table = np.zeros((trial_count + 1, trial_count + 1))
    table[0, 0] = 1.0  # with no trials, zero successes is certain

    for step, p in enumerate(probabilities.to_numpy(), start=1):
        before = table[step - 1]
        after = table[step]  # view into the table: writes land in row `step`

        # k successes now = k before and a failure, or k - 1 before and a success.
        after[:] = before * (1 - p)
        after[1:] += before[:-1] * p

    return pd.DataFrame(table).rename_axis(
        index="People Considered (i)",
        columns="Number of Errors (k)",
    )

# Distribution of the number of errors after each successive person, computed
# from the per-person error chances held in `unique_chance`.
binom_2d = calculate_poisson_binomial_history(unique_chance)

In [None]:
def show_distribution(df, label, num_errors_to_plot=10):
    """Plot one line per error count k across the rows of ``df``.

    Args:
        df: Frame whose index is used as the x values and whose columns are
            labelled by error count (0, 1, 2, ...).
        label: Legend template, formatted with ``k`` for each plotted column,
            e.g. ``'P(<={k} błędów)'``.
        num_errors_to_plot: Upper bound on the error counts drawn
            (k = 0 .. num_errors_to_plot - 1).

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    fig, ax = plt.subplots(figsize=(12, 7))

    # Skip error counts the frame has no column for, so a frame with fewer
    # columns than num_errors_to_plot plots what it has instead of raising KeyError.
    for k in range(num_errors_to_plot):
        if k not in df.columns:
            continue
        ax.plot(df.index, df[k], label=label.format(k=k))

    ax.set_title('Szansa na liczbę błędów przy większej liczbie osób', fontsize=16)
    ax.set_xlabel('Liczba dodanych osób', fontsize=12)
    ax.set_ylabel('Prawdopodobieństwo', fontsize=12)
    ax.legend(title='Szansa na liczbę błędów')
    ax.grid(True)
    plt.show()
# Cumulative sum across the error-count columns: column k now holds
# P(at most k errors) for each number of people considered.
aggregated_error = binom_2d.cumsum(axis=1)
show_distribution(aggregated_error, 'P(<={k} błędów)', num_errors_to_plot=5)

In [None]:
# Column 0 of aggregated_error is P(no errors at all) after the first i people.
no_error_likely = aggregated_error[0].gt(0.95)
# Row i = 0 (nobody added yet, probability 1.0) always passes the filter, so it
# is subtracted to get the number of people that keeps P(no errors) above 95%.
treshold_count = max(int(no_error_likely.sum()) - 1, 0)
# Error odds ("1 in N") of the last person inside that limit. The positional
# lookup also avoids the out-of-range label the old `unique_chance[count]`
# produced when every row passed the filter.
# NOTE(review): confirm that the last included person (rather than the first
# excluded one) is the intended reference point.
treshold_odds = 1 / unique_chance.iloc[treshold_count - 1] if treshold_count else None
treshold_count, treshold_odds

In [None]:
from matplotlib import ticker
import math

def reverse(x, pos):
    """Format a probability as "1 na N" odds, e.g. 0.5 -> "1 na 2".

    The (value, position) signature is the one matplotlib's ``FuncFormatter``
    calls tick formatters with.

    Args:
        x: Tick value to format (a probability).
        pos: Tick position; unused.

    Returns:
        ``"1 na N"`` with N = ceil(1 / x) and thousands separators, or an empty
        string when x is zero, negative or not finite.
    """
    # A zero or non-finite tick would raise (division by zero, ceil of inf/nan);
    # an empty label is returned for those instead of breaking the whole figure.
    if not math.isfinite(x) or x <= 0:
        return ""
    return f"1 na {math.ceil(1 / x):,}"

def display_expected_error_count(unique_chance):
    """Plot the expected number of errors and the average error rate as people are added.

    Args:
        unique_chance: pandas Series of per-person error probabilities, in the
            order in which people are added.

    Returns:
        None; the figure is shown with ``plt.show()``.
    """
    # By linearity of expectation, the expected number of errors among the first
    # people is the running sum of their individual error probabilities.
    expected_error_count = unique_chance.cumsum()
    # Entry i covers the first i + 1 people, so the divisor starts at 1; starting
    # at 0 divided the first entry by zero and skewed every later ratio.
    number_of_people = np.arange(1, len(expected_error_count) + 1)
    expected_error_rate = expected_error_count / number_of_people

    df = pd.DataFrame({
        "Expected number of errors": expected_error_count,
        "Expected error ratio": expected_error_rate,
    })

    # ax2 is the base axes (error ratio, log scale, left side); ax1 is its twin
    # sharing the x-axis (expected error count, right side).
    fig, ax2 = plt.subplots(figsize=(12, 6))
    ax1 = ax2.twinx()
    ax1.plot(df.index, df["Expected number of errors"],
            label='Szacowana liczba błędnych osób', color='red', linewidth=2)
    ax1.set_title('Rosnąca liczba błędnych osób przy podwyższonym progu akceptacji ryzyka', fontsize=16)
    # The x label belongs on the base axes: twinx() creates the twin with an
    # invisible x-axis, so a label set on ax1 is never drawn.
    ax2.set_xlabel('Number of People Considered', fontsize=12)

    ax2.semilogy(df.index, df["Expected error ratio"],
            label='Szansa na błędnie dodaną osobę', color='blue', linewidth=2)
    # Show ratio ticks as "1 na N" odds instead of raw probabilities.
    ax2.yaxis.set_major_formatter(ticker.FuncFormatter(reverse))
    # Colour each side like the line it measures: blue = ratio, red = count.
    ax1.spines['left'].set_color('blue')
    ax1.spines['right'].set_color('red')

    ax2.grid(True)
    fig.legend()
    plt.show()

# Draw the expected-error chart for the per-person error chances in `unique_chance`.
display_expected_error_count(unique_chance)