In [92]:
import sqlite3
import numpy as np
import pandas as pd
import math
from tqdm.auto import trange, tqdm

con = sqlite3.connect("database.sqlite")

In [93]:
# country_id="21518" and league_id="21518" define La Liga
# id="43040" or team_api_id="8633" define Real Madrid
# id="43039" or team_api_id="9783" define RC Deportivo de La Coruña (opponent of Real Madrid in stage 38)
lookup_match = pd.read_sql_query('SELECT id, home_team_goal, away_team_goal, '
                       'home_team_api_id, away_team_api_id, '
                       'B365H, B365D, B365A, '
                       'home_player_1, home_player_2, home_player_3, '
                       'home_player_4, home_player_5, home_player_6, '
                       'home_player_7, home_player_8, home_player_9, '
                       'home_player_10, home_player_11, '
                       'away_player_1, away_player_2, away_player_3, '
                       'away_player_4, away_player_5, away_player_6, '
                       'away_player_7, away_player_8, away_player_9, '
                       'away_player_10, away_player_11, date '
                       'FROM Match '
                       'ORDER BY season, stage ASC', con)
lookup_player = pd.read_sql_query('SELECT player_api_id, date, MAX(overall_rating) '
                       'FROM Player_Attributes '
                       'GROUP BY player_api_id', con)
df = pd.read_sql_query('SELECT id, home_team_goal, away_team_goal, '
                       'home_team_api_id, away_team_api_id, '
                       'B365H, B365D, B365A '
                       'FROM Match '
                       'WHERE country_id!="21518" '
                       'ORDER BY season, stage ASC', con)
df_test = pd.read_sql_query('SELECT id, home_team_goal, away_team_goal, '
                       'home_team_api_id, away_team_api_id, '
                       'B365H, B365D, B365A '
                       'FROM Match '
                       'WHERE country_id="21518" '
                       'ORDER BY season, stage ASC', con)
df.dropna(inplace=True)
df_test.dropna(inplace=True)
df_comb = pd.DataFrame(np.random.randint(0,10, size=(df.shape[0] * 2,1)))
df_comb_test = pd.DataFrame(np.random.randint(0,10, size=(df_test.shape[0] * 2,1)))

In [94]:
def prepareDF(dataframe, fillframe):
    teams = dataframe['home_team_api_id'].unique()
    points = {}
    bets = {}
    points_against_teams = {}
    bets_against_teams = {}
    for team_id in teams:
        points[team_id] = {}
        bets[team_id] = {}
        points_against_teams[team_id] = {}
        bets_against_teams[team_id] = {}
        for team_id_inner in teams:
                if team_id != team_id_inner:
                    points_against_teams[team_id][team_id_inner] = {}
                    bets_against_teams[team_id][team_id_inner] = {}
    for i, value in enumerate(dataframe.values):
        home_team_api_id = dataframe['home_team_api_id'].values[i]
        away_team_api_id = dataframe['away_team_api_id'].values[i]
        home_team_goal = dataframe['home_team_goal'].values[i]
        away_team_goal = dataframe['away_team_goal'].values[i]
        id_of_match = dataframe['id'].values[i]
        bets[home_team_api_id][id_of_match] = dataframe['B365H'].values[i]
        bets[away_team_api_id][id_of_match] = dataframe['B365A'].values[i]
        bets_against_teams[home_team_api_id][away_team_api_id][id_of_match] = dataframe['B365H'].values[i]
        bets_against_teams[away_team_api_id][home_team_api_id][id_of_match] = dataframe['B365A'].values[i]
        if home_team_goal > away_team_goal:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 3
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 0
            points[home_team_api_id][id_of_match] = 3
            points[away_team_api_id][id_of_match] = 0
        elif home_team_goal < away_team_goal:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 0
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 3
            points[home_team_api_id][id_of_match] = 0
            points[away_team_api_id][id_of_match] = 3
        else:
            points_against_teams[home_team_api_id][away_team_api_id][id_of_match] = 1
            points_against_teams[away_team_api_id][home_team_api_id][id_of_match] = 1
            points[home_team_api_id][id_of_match] = 1
            points[away_team_api_id][id_of_match] = 1

    b365 = []
    home = []
    points_itl = []
    points_avg = []
    points_itl_against = []
    points_against_avg = []
    bet_team_avg = []
    bet_against_avg = []
    bet_against_last = []
    rating = []

    for i, value in enumerate(tqdm(dataframe.values)):
        home_team_api_id = dataframe['home_team_api_id'].values[i]
        away_team_api_id = dataframe['away_team_api_id'].values[i]
        id_of_match = dataframe['id'].values[i]

        # home
        b365.append(dataframe['B365H'].values[i])
        pitlf, avgpoints = stats_general(id_of_match,points[home_team_api_id])
        points_itl.append(pitlf)
        points_avg.append(avgpoints)
        pitlf, avgpoints = stats_general(id_of_match,points_against_teams[home_team_api_id][away_team_api_id])
        points_itl_against.append(pitlf)
        points_against_avg.append(avgpoints)
        bet_avg, bet_last = stats_bets(id_of_match, bets[home_team_api_id])
        bet_team_avg.append(bet_avg)
        bet_avg, bet_last = stats_bets(id_of_match, bets_against_teams[home_team_api_id][away_team_api_id])
        bet_against_avg.append(bet_avg)
        bet_against_last.append(bet_last)
        home.append(1)
        avgrating = stats_players(id_of_match, 'home')
        rating.append(avgrating)

        # away
        b365.append(dataframe['B365A'].values[i])
        pitlf, avgpoints = stats_general(id_of_match,points[away_team_api_id])
        points_itl.append(pitlf)
        points_avg.append(avgpoints)
        pitlf, avgpoints = stats_general(id_of_match,points_against_teams[away_team_api_id][home_team_api_id])
        points_itl_against.append(pitlf)
        points_against_avg.append(avgpoints)
        bet_avg, bet_last = stats_bets(id_of_match, bets[away_team_api_id])
        bet_team_avg.append(bet_avg)
        bet_avg, bet_last = stats_bets(id_of_match, bets_against_teams[away_team_api_id][home_team_api_id])
        bet_against_avg.append(bet_avg)
        bet_against_last.append(bet_last)
        home.append(0)
        avgrating = stats_players(id_of_match, 'away')
        rating.append(avgrating)

    fillframe['points_itl'] = points_itl
    fillframe['home'] = home
    fillframe['B365'] = b365
    fillframe['points_itl_against'] = points_itl_against
    fillframe['points_avg'] = points_avg
    fillframe['points_against_avg'] = points_against_avg
    fillframe['bet_team_avg'] = bet_team_avg
    fillframe['bet_against_avg'] = bet_against_avg
    fillframe['bet_against_last'] = bet_against_last
    fillframe['rating'] = rating
    fillframe.dropna(inplace=True)

def stats_players(id_of_match, home='home'):
    home_players = []
    for j in range(1, 12):
            home_player = lookup_match.loc[lookup_match['id'] == id_of_match][home + '_player_' + str(j)].item()
            if not math.isnan(home_player):
                rating = lookup_player.loc[lookup_player['player_api_id'] == home_player]['MAX(overall_rating)'].item()
                home_players.append(rating)
    if len(home_players) > 3:
        avg_rating = 0
        for player in home_players:
            avg_rating =+ player
        avg_rating = avg_rating / len(home_players)
        return avg_rating
    else:
        return np.nan

def stats_general(id_of_match, points):
    pitlf = 0
    avgpoints = 0
    for j, key in enumerate(points.keys()):
        if key == id_of_match:
            if j-3 >= 0:
                for k, point in enumerate(points.values()):
                    if k < j:
                        avgpoints += point
                        if j - 3 < k:
                            pitlf += point
                avgpoints = (avgpoints / (j +1))
                return pitlf, avgpoints
            else:
                return np.nan, np.nan

def stats_bets(id_of_match, bets):
    avgbets = 0
    lastbet = 0
    for j, key in enumerate(bets.keys()):
        if key == id_of_match:
            if j-3 >= 0:
                for k, bet in enumerate(bets.values()):
                    if k < j:
                        if j - 3 < k:
                            avgbets += bet
                            lastbet = bet
                avgbets = (avgbets / 3)
                return avgbets, lastbet
            else:
                return np.nan, np.nan

In [95]:
prepareDF(df, df_comb)
prepareDF(df_test, df_comb_test)

  0%|          | 0/19553 [00:00<?, ?it/s]

  0%|          | 0/3039 [00:00<?, ?it/s]

In [96]:
import pyro
import pyro.distributions as dist
from pyro.nn import PyroModule, PyroSample
import torch
import torch.nn as nn
from pyro.infer.autoguide import AutoDiagonalNormal, AutoDelta
from pyro.infer import SVI, Trace_ELBO, Predictive


In [97]:
class BayesianRegression(PyroModule):
    def __init__(self, in_features):
        super().__init__()
        self.linear = PyroModule[nn.Linear](in_features, 1)
        self.linear.weight = PyroSample(dist.Normal(1., 2.).expand([1, in_features]).to_event(2))
        self.linear.bias = PyroSample(dist.Normal(0., 10.).expand([1]).to_event(1))

    def forward(self, x, y=None):
        sigma = pyro.sample("sigma", dist.Uniform(0., 10.))
        mean = self.linear(x).squeeze(-1)
        with pyro.plate("data", x.shape[0]):
            obs = pyro.sample("obs", dist.Normal(mean, sigma), obs=y)
        return mean

class BayesianNN(PyroModule):
    def __init__(self, in_features, h1=20, h2=20):
        super().__init__()
        self.input_features=in_features
        self.fc1 = PyroModule[nn.Linear](in_features, h1)
        self.fc1.weight = PyroSample(dist.Normal(0., 1.).expand([h1, in_features]).to_event(2))
        self.fc1.bias = PyroSample(dist.Normal(0., 1.).expand([h1]).to_event(1))
        self.fc2 = PyroModule[nn.Linear](h1, h2)
        self.fc2.weight = PyroSample(dist.Normal(0., 1.).expand([h2, h1]).to_event(2))
        self.fc2.bias = PyroSample(dist.Normal(0., 1.).expand([h2]).to_event(1))
        self.fc3 = PyroModule[nn.Linear](h2, 1)
        self.fc3.weight = PyroSample(dist.Normal(0., 1.).expand([1, h2]).to_event(2))
        self.fc3.bias = PyroSample(dist.Normal(0., 1.).expand([1]).to_event(1))
        self.relu = nn.ReLU()

    def forward(self, x, y=None):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        mu = self.fc3(x).squeeze(-1)
        sigma = pyro.sample("sigma", dist.Uniform(0., 1.))
        with pyro.plate("data", x.shape[0]):
            obs = pyro.sample("obs", dist.Normal(mu, sigma), obs=y)
        return mu

In [98]:
# 'points_itl'
# 'B365'
# 'points_itl_against'
# 'points_avg'
# 'points_against_avg'
# 'bet_team_avg'
# 'bet_against_avg'
# 'bet_against_last'

used_parameters = ['points_itl', 'points_itl_against', 'points_avg', 'points_against_avg',
                   'bet_team_avg', 'bet_against_avg', 'bet_against_avg', 'bet_against_last',
                   'home', 'rating']
predict_parameters = ['B365']

In [99]:
def evaluate_param(predict_param):
    x_data = torch.tensor(df_comb[used_parameters].values, dtype=torch.float)
    x_data_test = torch.tensor(df_comb_test[used_parameters].values, dtype=torch.float)
    y_data = torch.tensor([df_comb[predict_param].values], dtype=torch.float)
    model = BayesianNN(in_features=len(used_parameters))
    guide = AutoDiagonalNormal(model)
    adam = pyro.optim.Adam({"lr": 1e-3})
    svi = SVI(model, guide, adam, loss=Trace_ELBO())
    num_iterations = 5000
    bar = trange(num_iterations)
    pyro.clear_param_store()
    for j, epoch in enumerate(bar):
        # calculate the loss and take a gradient step
        loss = svi.step(x_data, y_data)
        if j % 50 == 0:
            bar.set_postfix(loss=f'{loss / x_data_test.shape[0]:.3f}')
    guide.requires_grad_(False)
    predictive = Predictive(model, guide=guide, num_samples=1000, return_sites=("linear.weight", "obs", "_RETURN"))
    preds = predictive(x_data_test)
    y_pred = preds['obs'].T.detach().numpy().mean(axis=1)
    y_std = preds['obs'].T.detach().numpy().std(axis=1)
    if 'linear.weight' in preds:
        weight = preds["linear.weight"]
        for i, param in enumerate(used_parameters):
            print(param + ': ' + str(np.round(torch.mean(weight[:,0, 0, i]).item(), 2)))
    error = 0
    within = 0
    for i, bet in enumerate(y_pred):
        error += abs(bet - df_comb_test[predict_param].values[i])
        if abs(bet - df_comb_test[predict_param].values[i]) < 0.5:
            within += 1
    avg_error = error / len(y_pred)
    within_percent = within / len(y_pred)
    print()
    print('avgerror: ' + str(np.round(avg_error, 2)))
    print('within_percent: ' + str(np.round(within_percent, 2)))

evaluate_param('B365')


  0%|          | 0/5000 [00:00<?, ?it/s]


avgerror: 1.21
within_percent: 0.52
