In [1]:
import pandas as pd
import os
import numpy as np
from collections import defaultdict as dd
from tqdm import tqdm

# Sample game

In [3]:
# Collect the per-game prediction files.
# os.listdir returns entries in arbitrary (filesystem-dependent) order, so the
# listing is sorted: the sampling cell seeds the RNG once and then consumes
# random numbers game by game, which is only reproducible when the game order
# is fixed. Hidden entries (names starting with '.') are skipped because they
# are not game files and would break the filename parsing further down.
games_list = sorted(
    entry
    for entry in os.listdir('../presentables/raw_predicted_scores/')
    if not entry.startswith('.')
)

In [4]:
# sqrt_mean_squared_error scales the standard-normal noise that is added to each
# predicted score when games are sampled below.
# n is not referenced anywhere else in the visible notebook.
# NOTE(review): presumably the number of observations behind the error
# estimate - confirm.
n, sqrt_mean_squared_error = 11801, 0.087452

In [5]:
# Maps '<round>_<team1>_<team2>' -> {player: {finishing bucket 0..3: probability}};
# filled in by the per-game sampling loop.
all_games_player_distributions = {}

In [13]:
np.random.seed(0)
n_samples = 2500

# For every game, perturb each player's predicted score with Gaussian noise
# n_samples times and record how often each player finishes 1st, 2nd, 3rd or
# lower (buckets 0, 1, 2 and 3). The bucket frequencies are stored as
# probabilities in all_games_player_distributions.
for game_filename in tqdm(games_list):

    predicted_game = pd.read_csv(
        f'../presentables/raw_predicted_scores/{game_filename}')

    # Filename layout: '<prefix>_<round>_<team1> v <team2>.<ext>'.
    # Only the matchup part is split on 'v', so a 'v' in an earlier field
    # cannot corrupt team2. A matchup without any 'v' raises here.
    round_number = game_filename.split('_')[1].split('.')[0]
    matchup = game_filename.split('_', 2)[2]
    team1, team2 = matchup.split('v', 1)
    team1 = team1.strip()
    team2 = team2.split('.')[0].strip()

    n_players = len(predicted_game)
    predicted_scores = predicted_game['predicted_score'].to_numpy()

    # One row of noise per simulated game, one column per player. Drawing the
    # flat vector first and reshaping keeps the random stream identical to
    # slicing it n_players values at a time.
    epsilon = np.random.normal(
        0, 1, n_players*n_samples).reshape(n_samples, n_players)
    sampled_scores = predicted_scores + epsilon * sqrt_mean_squared_error

    # finishing_order[s, r] is the row position of the player with the
    # (r+1)-th highest sampled score in simulation s (negation -> descending).
    finishing_order = np.argsort(-sampled_scores, axis=1)

    # rank_counts[p, b] = number of simulations in which player p landed in
    # bucket b; bucket 3 collects every finish outside the top three.
    rank_counts = np.zeros((n_players, 4), dtype=np.int64)
    for rank in range(min(3, n_players)):
        rank_counts[:, rank] = np.bincount(
            finishing_order[:, rank], minlength=n_players)
    rank_counts[:, 3] = n_samples - rank_counts[:, :3].sum(axis=1)

    # turn into probabilities
    # NOTE(review): assumes player names are unique within a game file; with
    # duplicates the last row of that name wins - confirm against the data.
    player_distribution = {
        player: {rank: float(rank_counts[row, rank] / n_samples)
                 for rank in range(4)}
        for row, player in enumerate(predicted_game['player'])
    }

    all_games_player_distributions[f'{round_number}_{team1}_{team2}'] = player_distribution

100%|██████████| 207/207 [05:10<00:00,  1.50s/it]


# Simulate season

In [20]:
# Votes that correspond to finishing buckets 0, 1, 2 and 3, in the same order
# as the bucket probabilities stored for each player.
vote_values = [3, 2, 1, 0]

# The (game, player) pairs worth sampling do not change between simulations, so
# collect them once. Entries whose bucket-3 probability is 0.99 or more are
# skipped entirely.
eligible_entries = []
for game_distributions in all_games_player_distributions.values():
    for player_name, bucket_probabilities in game_distributions.items():
        if bucket_probabilities[3] < 0.99:
            eligible_entries.append(
                (player_name, list(bucket_probabilities.values())))

season_distribution = dict()

for simulation in tqdm(range(n_samples)):

    # Each player's votes in each game are drawn independently from that
    # player's own categorical distribution and summed over the season.
    player_tally = dd(int)
    for player_name, probabilities in eligible_entries:
        player_tally[player_name] += np.random.choice(
            vote_values, p=probabilities)

    # sort by votes, highest first
    ranked_tally = sorted(player_tally.items(),
                          key=lambda entry: entry[1], reverse=True)
    season_distribution[simulation] = dict(ranked_tally)

100%|██████████| 2500/2500 [03:28<00:00, 12.01it/s]


In [22]:
# Order every simulated season's tally from most to fewest votes.
for simulation in range(n_samples):
    votes_by_player = season_distribution[simulation]
    ranked_pairs = sorted(votes_by_player.items(),
                          key=lambda pair: pair[1], reverse=True)
    season_distribution[simulation] = dict(ranked_pairs)

In [28]:
# One column per simulation; row r holds the player ranked r-th (0 = most votes)
# in that simulated season.
ranked_players_by_simulation = {}
for simulation in range(n_samples):
    ranked_players_by_simulation[simulation] = season_distribution[simulation].keys()

season_distribution_df = pd.DataFrame(ranked_players_by_simulation)

# Odds

In [32]:
# Share of simulations in which each player occupies the first row
# (i.e. tops the sorted tally).
season_winners = season_distribution_df.iloc[0]
winner_prob = season_winners.value_counts(normalize=True)

In [43]:
# Share of simulations in which each player appears in one of the first two rows.
# Stacking the two rows yields two names per simulation, so the normalised
# counts are doubled to express them per simulation.
first_two_rows = season_distribution_df.iloc[0:2]
top_2 = first_two_rows.stack().value_counts(normalize=True)*2

In [54]:
# Convert each win probability p into odds against, (1 - p) / p, rounded to
# two decimals.
winner_odds = {}
for player in winner_prob.index:
    probability = winner_prob[player]
    winner_odds[player] = np.round((1-probability)/probability, 2)
winner_odds

{'Marcus Bontempelli': 3.74,
 'Christian Petracca': 4.33,
 'Nick Daicos': 7.22,
 'Jordan Dawson': 7.28,
 'Tim Taranto': 18.69,
 'Rory Laird': 18.84,
 'Zachary Merrett': 25.32,
 'Caleb Serong': 29.49,
 'Andrew Brayshaw': 34.71,
 'Zak Butters': 34.71,
 'Errol Gulden': 34.71,
 'Lachie Neale': 51.08,
 'Connor Rozee': 55.82,
 'Rowan Marshall': 68.44,
 'Thomas Stewart': 91.59,
 'Timothy English': 103.17,
 'James Sicily': 155.25,
 'Noah Anderson': 191.31,
 'Tom Green': 226.27,
 'Josh Dunkley': 276.78,
 'Jack Sinclair': 415.67,
 'Thomas Liberatore': 624.0,
 'Charlie Curnow': 624.0,
 'Dan Houston': 832.33,
 'Patrick Cripps': 1249.0,
 'Stephen Coniglio': 1249.0,
 'Clayton Oliver': 1249.0,
 'Adam Cerra': 1249.0,
 'Darcy Parish': 2499.0,
 'Josh Daicos': 2499.0,
 'Tom Mitchell': 2499.0,
 'Toby Greene': 2499.0,
 'Max Gawn': 2499.0}

In [55]:
# Convert each top-two probability p into odds against, (1 - p) / p, rounded to
# two decimals.
top2_odds = {}
for player in top_2.index:
    probability = top_2[player]
    top2_odds[player] = np.round((1-probability)/probability, 2)
top2_odds

{'Marcus Bontempelli': 1.88,
 'Christian Petracca': 2.04,
 'Nick Daicos': 3.31,
 'Jordan Dawson': 3.41,
 'Tim Taranto': 7.5,
 'Rory Laird': 8.03,
 'Zachary Merrett': 11.32,
 'Caleb Serong': 11.5,
 'Zak Butters': 13.04,
 'Errol Gulden': 13.62,
 'Andrew Brayshaw': 13.62,
 'Lachie Neale': 22.36,
 'Connor Rozee': 25.6,
 'Rowan Marshall': 28.41,
 'Thomas Stewart': 37.46,
 'Timothy English': 45.3,
 'James Sicily': 46.17,
 'Tom Green': 74.76,
 'Josh Dunkley': 82.33,
 'Noah Anderson': 107.7,
 'Jack Sinclair': 130.58,
 'Thomas Liberatore': 177.57,
 'Clayton Oliver': 177.57,
 'Charlie Curnow': 191.31,
 'Adam Cerra': 226.27,
 'Dan Houston': 249.0,
 'Stephen Coniglio': 276.78,
 'Toby Greene': 356.14,
 'Tom Mitchell': 624.0,
 'Darcy Parish': 624.0,
 'Patrick Cripps': 832.33,
 'Brad Crouch': 832.33,
 'Luke Ryan': 832.33,
 'Max Gawn': 1249.0,
 'Matt Rowell': 1249.0,
 'Nic Newman': 1249.0,
 'Jack Viney': 1249.0,
 'Josh Daicos': 2499.0,
 'Luke Jackson': 2499.0,
 'Toby Nankervis': 2499.0}

****
- joint distributions for season simulation (how many players?)

- how many simulations - round 1 and round 2

- dynamic programming

- live input

**ODDS**
- top 2-10 odds

- number of votes 