# Numerical Analysis
In this interactive notebook, both the simulation and the deterministic algorithms are assessed for their numerical precision.

In [4]:
# Libraries used in the interactive Notebook
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
from tqdm import tqdm

In [5]:
# These are the algorithms, which we are assessing
def simulation(p_is, n):
    """Estimate the distribution of the number of successes by repeated sampling.

    Every run draws one `random.random()` value per entry of `p_is` and counts
    how many draws fall below their probability. The counts are tallied over
    `n` runs and converted to relative frequencies.

    Args:
        p_is: Sized sequence of success probabilities, one per trial.
        n: Number of simulation runs.

    Returns:
        A list of length `len(p_is) + 1`; entry `k` is the fraction of runs
        with exactly `k` successes.

    Raises:
        ZeroDivisionError: If `n` is 0.
    """
    histogram = [0 for _ in range(len(p_is) + 1)]
    for _ in range(n):
        successes = 0
        # One uniform draw per trial, in the order the probabilities are given
        for p in p_is:
            if random.random() < p:
                successes += 1
        histogram[successes] += 1
    return [occurrences / n for occurrences in histogram]

def dp(p_is):
    """Compute the distribution of the number of successes by dynamic programming.

    Starting from "zero successes with probability 1", the trials are folded in
    one at a time. The table is updated in place from the highest count
    downwards, so each entry is read before the current trial overwrites it.

    Args:
        p_is: Indexable, sized sequence of success probabilities, one per trial.

    Returns:
        A list of length `len(p_is) + 1`; entry `k` is the probability of
        exactly `k` successes.
    """
    size = len(p_is)
    pmf = [1] + [0] * size
    for trial in range(1, size + 1):
        p_success = p_is[trial - 1]
        # Walk down so that pmf[k - 1] still holds the value from before this trial
        for k in range(trial, 0, -1):
            moved = p_success * pmf[k - 1]
            pmf[k - 1] -= moved
            pmf[k] += moved
    return pmf

from poibin.poibin import PoiBin

def fft(p_is):
    """Return the result of `PoiBin(p_is).get_pmf_xi()` for the given probabilities.

    Thin wrapper so that the third-party implementation can be called like the
    other algorithms in this notebook.
    NOTE(review): presumably the FFT-based probability mass function, as the
    function name suggests; confirm against the poibin package.
    """
    distribution = PoiBin(p_is)
    return distribution.get_pmf_xi()

In [7]:
# Read the Possession xG values and count them per (match, team) pair
# NOTE(review): unpickling can execute arbitrary code; only load
# 'possession_xGs.pkl' from a trusted source.
xg_df = pd.read_pickle('possession_xGs.pkl')
# One row per (match_id, team_id) with the number of Possession xG entries,
# sorted by that number. Grouping on both keys at once avoids the nested
# groupby().apply(), whose result shape depends on the data (it becomes a wide
# DataFrame when every match returns the same team index).
xg_counts = (
    xg_df.groupby(['match_id', 'team_id'])
    .size()
    .sort_values()
    .reset_index(name='xg_count')
)

In [8]:
# Get the results for each algorithm
results_dp = []
results_fft = []
results_simulation = []
SIMULATION_RUNS = 10000
# Fixed seed so that a fresh "Restart & Run All" selects the same matches and
# draws the same simulation samples every time.
SEED = 42
random.seed(SEED)  # seeds the random.random() calls made inside simulation()
selection_rng = np.random.default_rng(SEED)  # drives the DataFrame.sample() draw below
# Let's look at some metrics of numerical precision for each unique number of Possession xGs per team and match in our dataset
for count in tqdm(np.sort(xg_counts['xg_count'].unique())):
    # Randomly choose a (match, team) pair with 'count' number of Possession xG.
    # A single uniform draw over the matching rows fixes both ids at once.
    i_counts = xg_counts[xg_counts['xg_count'] == count]
    picked = i_counts.sample(random_state=selection_rng)
    match_id = picked['match_id'].iat[0]
    team_id = picked['team_id'].iat[0]
    # Possession xG values of the chosen team in the chosen match
    is_selected = (xg_df['match_id'] == match_id) & (xg_df['team_id'] == team_id)
    p_is = xg_df.loc[is_selected, 'possession_xg'].to_list()
    results_dp.append((count, dp(p_is)))
    results_fft.append((count, fft(p_is)))
    results_simulation.append((count, simulation(p_is, SIMULATION_RUNS)))

100%|██████████| 37/37 [00:00<00:00, 39.17it/s]


One metric to assess is the sum of the probabilities: the entries of a discrete probability distribution vector must sum to 1. In this part, we analyze the sum of the output distribution vector of each algorithm.

In [9]:
# First, let's look at the maximum absolute difference to one of the output vector sums for each algorithm
def _max_abs_deviation_from_one(results):
    """Return the largest |1 - sum(vector)| over the (count, vector) pairs in `results`."""
    vector_sums = np.array([np.sum(vector) for _, vector in results])
    return np.max(np.absolute(np.ones(len(results)) - vector_sums))

max_abs_diff_dp = _max_abs_deviation_from_one(results_dp)
max_abs_diff_fft = _max_abs_deviation_from_one(results_fft)
max_abs_diff_sim = _max_abs_deviation_from_one(results_simulation)
print(f'Maximum absolute difference to 1 of the sum of the output vector: {max_abs_diff_dp=}, {max_abs_diff_fft=}, {max_abs_diff_sim=}')

Maximum absolute difference to 1 of the sum of the output vector: max_abs_diff_dp=4.440892098500626e-16, max_abs_diff_fft=9.992007221626409e-15, max_abs_diff_sim=2.220446049250313e-16


All the algorithms show no relevant signs of inaccuracy when assessing the sum of their output probability vectors.