In [13]:
import sqlite3
import numpy as np
import pandas as pd
import math
from tqdm.auto import trange, tqdm

con = sqlite3.connect("database.sqlite")

In [14]:
lookup_player = pd.read_sql_query('SELECT player_api_id, MAX(overall_rating) '
                       'FROM Player_Attributes '
                       'GROUP BY player_api_id', con)
df = pd.read_sql_query('SELECT id, home_team_goal, away_team_goal, '
                       'home_team_api_id, away_team_api_id, '
                       'B365H, B365A, '
                       'BWH, BWA, '
                       'IWH, IWA, '
                       'LBH, LBA, '
                       'PSH, PSA, '
                       'WHH, WHA, '
                       'SJH, SJA, '
                       'VCH, VCA, '
                       'GBH, GBA, '
                       'BSH, BSA, '
                       'home_player_1, home_player_2, home_player_3, '
                       'home_player_4, home_player_5, home_player_6, '
                       'home_player_7, home_player_8, home_player_9, '
                       'home_player_10, home_player_11, '
                       'away_player_1, away_player_2, away_player_3, '
                       'away_player_4, away_player_5, away_player_6, '
                       'away_player_7, away_player_8, away_player_9, '
                       'away_player_10, away_player_11 '
                       'FROM Match '
                       'ORDER BY season, stage ASC', con)
df.dropna(subset=['id', 'home_team_goal', 'away_team_goal',
                  'home_team_api_id', 'away_team_api_id'], inplace=True)
df_comb = pd.DataFrame(index=np.arange(df.shape[0] * 2), columns=np.arange(0))


In [15]:
betting_provider = ['B365', 'BW', 'IW', 'LB', 'PS', 'WH', 'SJ', 'VC', 'GB', 'BS']

In [16]:
def prepareDF(dataframe, fillframe):
    for i in range(1, 12):
        dataframe = pd.merge(left=dataframe, right=lookup_player, left_on='home_player_' + str(i), right_on='player_api_id', how='left')
        dataframe.drop(columns=['player_api_id'], inplace=True)
        dataframe.rename(columns={'MAX(overall_rating)': 'home_player_' + str(i) + '_overall_rating'}, inplace=True)
    for i in range(1, 12):
        dataframe = pd.merge(left=dataframe, right=lookup_player, left_on='away_player_' + str(i), right_on='player_api_id', how='left')
        dataframe.drop(columns=['player_api_id'], inplace=True)
        dataframe.rename(columns={'MAX(overall_rating)': 'away_player_' + str(i) + '_overall_rating'}, inplace=True)
    teams = dataframe['home_team_api_id'].unique()
    points = {}
    bets = {}
    points_against_teams = {}
    bets_against_teams = {}
    for team_id in teams:
        points[team_id] = {}
        bets[team_id] = {}
        points_against_teams[team_id] = {}
        bets_against_teams[team_id] = {}
        for team_id_inner in teams:
                if team_id != team_id_inner:
                    points_against_teams[team_id][team_id_inner] = {}
                    bets_against_teams[team_id][team_id_inner] = {}
    for i, value in enumerate(dataframe.values):
        home_team_api_id = dataframe['home_team_api_id'].values[i]
        away_team_api_id = dataframe['away_team_api_id'].values[i]
        home_team_goal = dataframe['home_team_goal'].values[i]
        away_team_goal = dataframe['away_team_goal'].values[i]
        id_of_match = dataframe['id'].values[i]
        home_bet = get_bets(dataframe, i, 'H')[0]
        away_bet = get_bets(dataframe, i, 'A')[0]
        bets[home_team_api_id][id_of_match] = home_bet
        bets[away_team_api_id][id_of_match] = away_bet
        bets_against_teams[home_team_api_id][away_team_api_id][id_of_match] = home_bet
        bets_against_teams[away_team_api_id][home_team_api_id][id_of_match] = away_bet
        if home_team_goal > away_team_goal:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 3
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 0
            points[home_team_api_id][id_of_match] = 3
            points[away_team_api_id][id_of_match] = 0
        elif home_team_goal < away_team_goal:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 0
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 3
            points[home_team_api_id][id_of_match] = 0
            points[away_team_api_id][id_of_match] = 3
        else:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 1
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 1
            points[home_team_api_id][id_of_match] = 1
            points[away_team_api_id][id_of_match] = 1

    bet_mean = []
    bet_min = []
    bet_max = []
    home = []
    points_itl = []
    points_avg = []
    points_itl_against = []
    points_against_avg = []
    bet_team_avg = []
    bet_against_avg = []
    bet_against_last = []
    rating = []
    min_rating = []
    max_rating = []
    ids = []

    for i, value in enumerate(tqdm(dataframe.values)):
        home_team_api_id = dataframe['home_team_api_id'].values[i]
        away_team_api_id = dataframe['away_team_api_id'].values[i]
        id_of_match = dataframe['id'].values[i]

        # home
        bet, min, max = get_bets(dataframe, i, 'H')
        bet_mean.append(bet)
        bet_min.append(min)
        bet_max.append(max)
        pitlf, avgpoints = stats_general(id_of_match,points[home_team_api_id])
        points_itl.append(pitlf)
        points_avg.append(avgpoints)
        pitlf, avgpoints = stats_general(id_of_match,points_against_teams[home_team_api_id][away_team_api_id])
        points_itl_against.append(pitlf)
        points_against_avg.append(avgpoints)
        bet_avg, bet_last = stats_bets(id_of_match, bets[home_team_api_id])
        bet_team_avg.append(bet_avg)
        bet_avg, bet_last = stats_bets(id_of_match, bets_against_teams[home_team_api_id][away_team_api_id])
        bet_against_avg.append(bet_avg)
        bet_against_last.append(bet_last)
        home.append(1)
        avgrating, player_min, player_max = stats_players(dataframe, i, 'home')
        rating.append(avgrating)
        min_rating.append(player_min)
        max_rating.append(player_max)
        ids.append(str(id_of_match) + 'h')

        # away
        bet, min, max = get_bets(dataframe, i, 'A')
        bet_mean.append(bet)
        bet_min.append(min)
        bet_max.append(max)
        pitlf, avgpoints = stats_general(id_of_match,points[away_team_api_id])
        points_itl.append(pitlf)
        points_avg.append(avgpoints)
        pitlf, avgpoints = stats_general(id_of_match,points_against_teams[away_team_api_id][home_team_api_id])
        points_itl_against.append(pitlf)
        points_against_avg.append(avgpoints)
        bet_avg, bet_last = stats_bets(id_of_match, bets[away_team_api_id])
        bet_team_avg.append(bet_avg)
        bet_avg, bet_last = stats_bets(id_of_match, bets_against_teams[away_team_api_id][home_team_api_id])
        bet_against_avg.append(bet_avg)
        bet_against_last.append(bet_last)
        home.append(0)
        avgrating, player_min, player_max = stats_players(dataframe, i, 'away')
        rating.append(avgrating)
        min_rating.append(player_min)
        max_rating.append(player_max)
        ids.append(str(id_of_match) + 'a')

    fillframe['points_itl'] = points_itl
    fillframe['home'] = home
    fillframe['bet_mean'] = bet_mean
    fillframe['bet_min'] = bet_min
    fillframe['bet_max'] = bet_max
    fillframe['points_itl_against'] = points_itl_against
    fillframe['points_avg'] = points_avg
    fillframe['points_against_avg'] = points_against_avg
    fillframe['bet_team_avg'] = bet_team_avg
    fillframe['bet_against_avg'] = bet_against_avg
    fillframe['bet_against_last'] = bet_against_last
    fillframe['rating'] = rating
    fillframe['min_rating'] = min_rating
    fillframe['max_rating'] = max_rating
    fillframe['id'] = ids
    fillframe.dropna(inplace=True)

def get_bets(dataframe, i, home='H'):
    count = 0
    result = 0
    min = np.nan
    max = np.nan
    for provider in betting_provider:
        bet = dataframe[provider + home].values[i]
        if not math.isnan(bet):
            result += bet
            count += 1
            if bet < min or math.isnan(min):
                min = bet
            if bet > max or math.isnan(max):
                max = bet
    if count > 0:
        return result / count, min, max
    else:
        return np.nan, min, max

def stats_players(dataframe, i, home='home'):
    player_min = np.nan
    player_max = np.nan
    player_ratings = []
    for j in range(1, 12):
        player_rating = dataframe[home + '_player_' + str(j) + '_overall_rating'].values[i]
        if not math.isnan(player_rating):
            if player_rating < player_min or math.isnan(player_min):
                player_min = player_rating
            if player_rating > player_max or math.isnan(player_max):
                player_max = player_rating
            player_ratings.append(player_rating)
    if len(player_ratings) > 3:
        avg_rating = 0
        for player_rating in player_ratings:
            avg_rating =+ player_rating
        avg_rating = avg_rating / len(player_ratings)
        return avg_rating, player_min, player_max
    else:
        return np.nan, player_min, player_max

def stats_general(id_of_match, points):
    pitlf = 0
    avgpoints = 0
    for j, key in enumerate(points.keys()):
        if key == id_of_match:
            if j-3 >= 0:
                for k, point in enumerate(points.values()):
                    if k < j:
                        avgpoints += point
                        if j - 3 < k:
                            pitlf += point
                avgpoints = (avgpoints / (j +1))
                return pitlf, avgpoints
            else:
                return np.nan, np.nan

def stats_bets(id_of_match, bets):
    avgbets = 0
    lastbet = 0
    for j, key in enumerate(bets.keys()):
        if key == id_of_match:
            if j-3 >= 0:
                for k, bet in enumerate(bets.values()):
                    if k < j:
                        if j - 3 < k:
                            avgbets += bet
                            lastbet = bet
                avgbets = (avgbets / 3)
                return avgbets, lastbet
            else:
                return np.nan, np.nan

In [17]:
prepareDF(df, df_comb)

  0%|          | 0/25979 [00:00<?, ?it/s]

In [18]:
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule, PyroSample
import torch
import torch.nn as nn
from pyro.infer.autoguide import AutoDiagonalNormal, AutoDelta, AutoNormal
from pyro.infer import SVI, Trace_ELBO, Predictive


In [19]:
class BayesianRegression(PyroModule):
    def __init__(self, in_features):
        super().__init__()
        self.linear = PyroModule[nn.Linear](in_features, 1)
        self.linear.weight = PyroSample(dist.Normal(0., 1.).expand([1, in_features]).to_event(2))
        self.linear.bias = PyroSample(dist.Normal(0., 10.).expand([1]).to_event(1))

    def forward(self, x, y=None):
        sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
        mean = self.linear(x).squeeze(-1)
        with pyro.plate("data", x.shape[0]):
            obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y)
        return mean

class BayesianNN(PyroModule):
    def __init__(self, in_features, h1=16, h2=32):
        super().__init__()
        self.input_features=in_features
        self.fc1 = PyroModule[nn.Linear](in_features, h1)
        self.fc1.weight = PyroSample(dist.Normal(0., 1.).expand([h1, in_features]).to_event(2))
        self.fc1.bias = PyroSample(dist.Normal(0., 1.).expand([h1]).to_event(1))
        self.fc2 = PyroModule[nn.Linear](h1, h2)
        self.fc2.weight = PyroSample(dist.Normal(0., 1.).expand([h2, h1]).to_event(2))
        self.fc2.bias = PyroSample(dist.Normal(0., 1.).expand([h2]).to_event(1))
        self.fc3 = PyroModule[nn.Linear](h2, 1)
        self.fc3.weight = PyroSample(dist.Normal(0., 1.).expand([1, h2]).to_event(2))
        self.fc3.bias = PyroSample(dist.Normal(0., 1.).expand([1]).to_event(1))
        self.relu = nn.ReLU()

    def forward(self, x, y=None):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        mu = self.fc3(x).squeeze(-1)
        sigma = pyro.sample("sigma", dist.Uniform(0., 1.))
        with pyro.plate("data", x.shape[0]):
            obs = pyro.sample("obs", dist.Normal(mu, sigma), obs=y)
        return mu

In [20]:
used_parameters = ['points_itl', 'points_itl_against', 'points_avg', 'points_against_avg',
                   'bet_team_avg', 'bet_against_avg', 'bet_against_last',
                   'home', 'rating']
predict_parameters = ['bet_mean']

In [21]:
df_test = df_comb.sample(frac=0.2, random_state=42)
df_train = df_comb[~df_comb['id'].isin(df_test['id'])]

In [22]:
def evaluate_param(predict_param, nn=True, lr=1e-2):
    x_data = torch.tensor(df_train[used_parameters].values, dtype=torch.float)
    y_data = torch.tensor([df_train[predict_param].values], dtype=torch.float)
    if nn:
        model = BayesianNN(in_features=len(used_parameters))
    else:
        model = BayesianRegression(in_features=len(used_parameters))
    guide = AutoDiagonalNormal(model)
    adam = pyro.optim.Adam({"lr": lr})
    svi = SVI(model, guide, adam, loss=Trace_ELBO())
    num_iterations = 5000
    bar = trange(num_iterations)
    pyro.clear_param_store()
    for j, epoch in enumerate(bar):
        # calculate the loss and take a gradient step
        loss = svi.step(x_data, y_data)
        if j % 50 == 0:
            bar.set_postfix(loss=f'{loss / x_data.shape[0]:.3f}')
    guide.requires_grad_(False)
    return model, guide

def evaluate_model(predict_param, model, guide):
    predictive = Predictive(model, guide=guide, num_samples=1000, return_sites=("linear.weight", "obs", "_RETURN"))
    x_data_test = torch.tensor(df_test[used_parameters].values, dtype=torch.float)
    preds = predictive(x_data_test)
    y_pred = preds['obs'].T.detach().numpy().mean(axis=1)
    y_std = preds['obs'].T.detach().numpy().std(axis=1)
    print('Mean Bet Value: ' + str(np.round(np.mean(y_pred).item(), 2)))
    print('Mean Bet STD: ' + str(np.round(np.mean(y_std).item(), 2)))
    print()
    if 'linear.weight' in preds:
        weights = {}
        weight = preds["linear.weight"]
        for i, param in enumerate(used_parameters):
            weights[param] = str(np.round(abs(torch.mean(weight[:,0, 0, i]).item()), 2))
        weights = {k: v for k, v in sorted(weights.items(), key=lambda item: item[1], reverse=True)}
        for w in weights.keys():
            print(str(weights[w]) + '    -    ' + w)
    error = 0
    erroravg = 0
    within_05 = 0
    within_1 = 0
    for i, bet in enumerate(y_pred):
        current_error_min = abs(bet - df_test['bet_min'].values[i])
        current_error_max = abs(bet - df_test['bet_max'].values[i])
        erroravg += abs(bet - df_test['bet_mean'].values[i])
        if current_error_min < current_error_max:
            error += current_error_min
            if current_error_min < 0.5:
                within_05 += 1
            if current_error_min < 1:
                within_1 += 1
        else:
            error += current_error_max
            if current_error_max < 0.5:
                within_05 += 1
            if current_error_max < 1:
                within_1 += 1
    avg_error_close = error / len(y_pred)
    avg_error_mean = erroravg / len(y_pred)
    within_05 = within_05 / len(y_pred)
    within_1 = within_1 / len(y_pred)
    print()
    print('distance to closest bet: ' + str(np.round(avg_error_close, 2)))
    print('distance to average bet: ' + str(np.round(avg_error_mean, 2)))
    print('within_0.5: ' + str(np.round(within_05, 2)))
    print('within_1: ' + str(np.round(within_1, 2)))

In [23]:
model, guide = evaluate_param('bet_mean', nn=True)

  0%|          | 0/5000 [00:00<?, ?it/s]

In [24]:
evaluate_model('bet_mean',model, guide)


Mean Bet Value: 3.78
Mean Bet STD: 1.77

2.39    -    home
1.23    -    bet_against_avg
0.36    -    points_against_avg
0.08    -    points_itl
0.07    -    rating
0.04    -    bet_team_avg
0.03    -    points_avg
0.01    -    points_itl_against
0.0    -    bet_against_last

distance to closest bet: 0.82
distance to average bet: 1.1
within_0.5: 0.43
within_1: 0.73
