In [None]:
# default_exp team_strength

# Team strength

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import os
import dotenv

In [None]:
#export
import abc
import collections
import datetime as dt

import mezzala
import numpy as np

import wingback.db

In [None]:
# Presumably populates os.environ from a local `.env` file so that the DB_*
# variables read below are available -- confirm against python-dotenv docs.
dotenv.load_dotenv()

# Each placeholder of the connection URL is filled from one environment
# variable; a missing variable raises KeyError before any connection attempt.
wingback.db.queries.connect(
    'postgresql://{user}:{password}@{host}:{port}/{database}'.format(**{
        placeholder: os.environ[variable]
        for placeholder, variable in (
            ('host', 'DB_HOST'),
            ('user', 'DB_USER'),
            ('password', 'DB_PASS'),
            ('database', 'DB_NAME'),
            ('port', 'DB_PORT'),
        )
    })
)

In [None]:
#export


class ModelABC(abc.ABC):
    """
    Interface that every team-strength model in this module implements.

    Deriving from `abc.ABC` is what makes the `@abc.abstractmethod` markers
    effective: a subclass that leaves any of the four methods unimplemented
    cannot be instantiated (`TypeError`).
    """

    @abc.abstractmethod
    def fetch_data(self, league_ids, date):
        """Return the training data for `league_ids` relative to `date`."""
        training_data = ...  # e.g. matches up-to, not including `date`
        return training_data

    @abc.abstractmethod
    def fit(self, data):
        """Fit the model to `data` and return `self`."""
        return self

    @abc.abstractmethod
    def predict(self, data):
        """Return predictions for `data`."""
        predictions = ...
        return predictions

    @abc.abstractmethod
    def to_dict(self):
        """Return a description of the model (its settings/parameters)."""
        return ...

In [None]:
#export


class Benchmark(ModelABC):
    """
    Baseline model: every match receives the identical prediction.

    The prediction is the relative frequency of each (home goals, away goals)
    scoreline observed in the training data.

    `time_window` is the length of the training window in days.
    """

    def __init__(self, time_window=360):
        # Filled in by `fit`; stays None until then.
        self._data = None
        self._time_window = time_window

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        return dt.timedelta(days=self._time_window)

    def fetch_data(self, league_ids, date):
        """
        Fetch the matches of `league_ids` between `date - time_window` and
        `date`, returned as a list.
        """
        window_start = date - self.time_window
        matches = wingback.db.queries.fetch_matches(
            start=window_start,
            end=date,
            league_ids=league_ids,
            # NOTE(review): meaning of `[None]` is defined by the query -- confirm
            season_ids=[None],
        )
        return list(matches)

    def fit(self, data):
        """Tabulate scoreline frequencies in `data`; returns `self`."""
        tally = collections.Counter()
        for match in data:
            tally[(match['home_goals'], match['away_goals'])] += 1

        scoreline_predictions = []
        for (home_goals, away_goals), n_seen in tally.items():
            scoreline_predictions.append(
                mezzala.ScorelinePrediction(
                    home_goals=home_goals,
                    away_goals=away_goals,
                    probability=n_seen / len(data),
                )
            )

        self._data = scoreline_predictions
        return self

    def predict(self, data):
        """Return one entry per element of `data`, each the same fitted list."""
        # Every entry is the very same list object, not a copy
        return [self._data for _ in range(len(data))]

    def to_dict(self):
        """Return the model settings as a plain dict."""
        return dict(time_window=self._time_window)

In [None]:
# Worked example: fit the benchmark on league 1 and predict the matches
# played on the target date.
# NOTE: `target_date` and `test` are reused by the example cells further down.
benchmark_model = Benchmark()
target_date = dt.datetime(2021, 5, 12).date()

# Training data: the model's default window ending at the target date
train = benchmark_model.fetch_data(
    league_ids=[1],
    date=target_date
)
benchmark_model.fit(train)

# Matches to predict: the one-day window starting on the target date
test = list(wingback.db.queries.fetch_matches(
    start=target_date,
    end=target_date+dt.timedelta(days=1), 
    league_ids=[1],
    season_ids=[None]
))

predictions = benchmark_model.predict(test)

# Show the outcome probabilities for the first match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.38
Draw     : 0.23
Away win : 0.39


In [None]:
#exporti


def encode_parameter_key(key):
    """
    Convert a model parameter key into plain Python values.

    Offence/defence keys become a `(kind, label)` tuple, any other
    `mezzala.ParameterKey` becomes its bare label, and everything else is
    returned untouched.
    """
    # Checked in this order, before the generic ParameterKey test below
    tagged_key_types = (
        (mezzala.OffenceParameterKey, 'Offence'),
        (mezzala.DefenceParameterKey, 'Defence'),
    )
    for key_type, tag in tagged_key_types:
        if isinstance(key, key_type):
            return (tag, key.label)

    return key.label if isinstance(key, mezzala.ParameterKey) else key

In [None]:
#export


class DCGoals(ModelABC):
    """
    Dixon-Coles model trained on the observed scorelines of matches.

    Matches are weighted through `mezzala.weights.ExponentialWeight`, which is
    given `epsilon` and reads each match's `days_ago` value.

    `time_window` is the length of the training window in days and `params`
    is handed to `mezzala.DixonColes` unchanged.
    """

    def __init__(self, time_window=360, epsilon=-0.0065, params=None):
        self._time_window = time_window
        self._epsilon = epsilon

        # Teams are identified by their IDs.
        # Might be nicer to do a tuple of (ID, name)?
        team_adapter = mezzala.LumpedAdapter(
            mezzala.KeyAdapter(
                home_goals='home_goals',
                away_goals='away_goals',
                home_team='home_team_id',
                away_team='away_team_id',
            ),
            # presumably (placeholder label, data-point cutoff for lumping)
            # -- confirm against mezzala's LumpedAdapter
            home_team=('Other team', 5),
            away_team=('Other team', 5),
        )
        model_blocks = [
            mezzala.blocks.BaseRate(),
            mezzala.blocks.TeamStrength(),
            mezzala.blocks.HomeAdvantage(),
            mezzala.blocks.ConstantBlock(
                mezzala.OffenceParameterKey('Other team'),
                mezzala.DefenceParameterKey('Other team')
            ),
        ]
        time_weight = mezzala.weights.ExponentialWeight(
            epsilon=epsilon,
            key=lambda match: match['days_ago']
        )

        # Create the model
        self._model = mezzala.DixonColes(
            adapter=team_adapter,
            blocks=model_blocks,
            weight=time_weight,
            params=params
        )

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        return dt.timedelta(days=self._time_window)

    def fetch_data(self, league_ids, date):
        """
        Fetch the matches of `league_ids` between `date - time_window` and
        `date`, returned as a list.
        """
        window_start = date - self.time_window
        matches = wingback.db.queries.fetch_matches(
            start=window_start,
            end=date,
            league_ids=league_ids,
            season_ids=[None],
        )
        return list(matches)

    def fit(self, data):
        """Fit the adapter first, then the model itself; returns `self`."""
        model = self._model
        model.adapter.fit(data)
        model.fit(data)
        return self

    def predict(self, data):
        """Return the underlying model's predictions for `data`."""
        return self._model.predict(data)

    def to_dict(self):
        """
        Return the settings and fitted parameters as plain values.

        Parameters are listed as `(encoded key, value)` pairs, with NaN values
        replaced by None.
        """
        encoded_params = []
        for key, value in self._model.params.items():
            encoded_params.append(
                (encode_parameter_key(key), None if np.isnan(value) else value)
            )

        return {
            'time_window': self._time_window,
            'epsilon': self._epsilon,
            'params': encoded_params,
        }

In [None]:
# Worked example: same league and target date as the benchmark example, now
# with the goals-based Dixon-Coles model.
# NOTE: relies on `target_date` and `test` defined in the benchmark example cell.
dc_model = DCGoals()

train = dc_model.fetch_data(
    league_ids=[1],
    date=target_date
)

dc_model.fit(train)

predictions = dc_model.predict(test)

# Show the outcome probabilities for the first match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.43
Draw     : 0.28
Away win : 0.28


In [None]:
#export


class DCxG(ModelABC):
    """
    Dixon-Coles model trained on resimulated scorelines instead of the
    observed results.

    Every training row is a match merged with one of its resimulated
    scorelines. A row's weight is the resimulation's `probability` multiplied
    by `exp(epsilon * days_ago)`.

    Parameters:
        min_probability: passed to `fetch_resimulations` as its
            `min_probability` argument.
        time_window: length of the training window in days.
        epsilon: time-decay rate used in the row weight above; 0 disables
            time-weighting.
    """

    def __init__(self, min_probability=0.1, time_window=360, epsilon=-0.0065):
        self._time_window = time_window
        self._epsilon = epsilon
        self.min_probability = min_probability

        base_adapter = mezzala.KeyAdapter(
            home_goals='home_goals',
            away_goals='away_goals',
            home_team='home_team_id',
            away_team='away_team_id',
        )
        self._model = mezzala.DixonColes(
            adapter=mezzala.LumpedAdapter(
                base_adapter,
                # On average, each match has about 6 resimulated scorelines
                # so we pick a cutoff of 30 (6*5) data points for lumping teams
                home_team=('Other team', 6*5),
                away_team=('Other team', 6*5),
            ),
            blocks=[
                mezzala.blocks.BaseRate(),
                mezzala.blocks.TeamStrength(),
                mezzala.blocks.HomeAdvantage(),
                mezzala.blocks.ConstantBlock(
                    mezzala.OffenceParameterKey('Other team'),
                    mezzala.DefenceParameterKey('Other team')
                ),
            ],
            # `self._epsilon` is read each time a weight is computed
            weight=mezzala.weights.KeyWeight(
                lambda x: x['probability']*np.exp(self._epsilon*x['days_ago'])
            )
        )

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        return dt.timedelta(days=self._time_window)

    def fetch_data(self, league_ids, date):
        """
        Build the training rows for `league_ids` relative to `date`.

        Fetches the matches between `date - time_window` and `date`, then the
        resimulations of those matches, and returns a list with one dict per
        (match, resimulation) pair. On a key clash the resimulation's value
        wins. Rows follow match order, then resimulation order within a match.
        Matches without resimulations contribute no rows.
        """
        training_matches = list(wingback.db.queries.fetch_matches(
            start=date-self.time_window,
            end=date,
            league_ids=league_ids,
            season_ids=[None]
        ))
        if not training_matches:
            # Nothing to merge, so there is no need to query resimulations
            return []

        training_resimulations = wingback.db.queries.fetch_resimulations(
            match_ids=[m['id'] for m in training_matches],
            min_probability=self.min_probability
        )

        # Index the resimulations by match once, rather than rescanning the
        # whole list for every match.
        # assumes match ids are hashable (e.g. integers) -- TODO confirm
        resimulations_by_match = collections.defaultdict(list)
        for resimulation in training_resimulations:
            resimulations_by_match[resimulation['match_id']].append(resimulation)

        # Merge matches and resimulations
        training_data = []
        for match in training_matches:
            training_data.extend(
                {**match, **resimulation}
                for resimulation in resimulations_by_match.get(match['id'], ())
            )

        return training_data

    def fit(self, data):
        """Fit the adapter first, then the model itself; returns `self`."""
        self._model.adapter.fit(data)
        self._model.fit(data)
        return self

    def predict(self, data):
        """Return the underlying model's predictions for `data`."""
        predictions = self._model.predict(data)
        return predictions

    def to_dict(self):
        """
        Return the settings and fitted parameters as plain values.

        Parameters are listed as `(encoded key, value)` pairs, with NaN values
        replaced by None.
        """
        return {
            'time_window': self._time_window,
            'min_probability': self.min_probability,
            'epsilon': self._epsilon,
            'params': [
                (encode_parameter_key(k), v if not np.isnan(v) else None)
                for k, v in self._model.params.items()
            ]
        }

In [None]:
# Worked example: the resimulation-based model with epsilon=0, so the
# exp(epsilon*days_ago) factor of every row weight is 1.
# NOTE: relies on `target_date` and `test` defined in the benchmark example cell.
xg_model = DCxG(time_window=360, epsilon=0)  # No time weighting

train = xg_model.fetch_data(
    league_ids=[1],
    date=target_date
)
xg_model.fit(train)

predictions = xg_model.predict(test)

# Show the outcome probabilities for the first match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

  tau = np.where((home_goals == 0) & (away_goals == 0), 1 - home_rate*away_rate*rho, tau)


Home team: Chelsea
Away team: Arsenal
Home win : 0.60
Draw     : 0.22
Away win : 0.17


In [None]:
# Worked example: the same model refitted with a strongly negative epsilon,
# for comparison with the unweighted fit.
# NOTE: rebinds `xg_model`, `train` and `predictions`; relies on `target_date`
# and `test` defined in the benchmark example cell.
xg_model = DCxG(time_window=360, epsilon=-0.01)  # Incredibly aggressive time-weighting

train = xg_model.fetch_data(
    league_ids=[1],
    date=target_date
)
xg_model.fit(train)

predictions = xg_model.predict(test)

# Show the outcome probabilities for the first match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.75
Draw     : 0.16
Away win : 0.09


## Model registry

Register models for use with the CLI

In [None]:
#export
# Grid of time-decay rates to register. Each eps satisfies exp(eps*365) == w
# for w in linspace(0.01, 0.95, 10), i.e. w is the decay factor applied to a
# match that is 365 days old. All values are negative because every w < 1.
eps_values = np.log(np.linspace(0.01, 0.95, 10))/365
eps_values

array([-0.0126169 , -0.00593881, -0.00416217, -0.00309335, -0.00232644,
       -0.00172793, -0.00123701, -0.00082083, -0.00045962, -0.00014053])

In [None]:
# For each decay rate, show the factor exp(eps*days) at 180 and 360 days
for eps in eps_values:
    e180, e360 = np.exp(eps*180), np.exp(eps*360)
    print(f'For eps={eps:0.6f},\t{e180:0.5f} @180d,\t{e360:0.5f} @360d')

For eps=-0.012617,	0.10320 @180d,	0.01065 @360d
For eps=-0.005939,	0.34336 @180d,	0.11789 @360d
For eps=-0.004162,	0.47275 @180d,	0.22349 @360d
For eps=-0.003093,	0.57304 @180d,	0.32837 @360d
For eps=-0.002326,	0.65786 @180d,	0.43278 @360d
For eps=-0.001728,	0.73269 @180d,	0.53684 @360d
For eps=-0.001237,	0.80039 @180d,	0.64062 @360d
For eps=-0.000821,	0.86265 @180d,	0.74416 @360d
For eps=-0.000460,	0.92060 @180d,	0.84750 @360d
For eps=-0.000141,	0.97502 @180d,	0.95067 @360d


In [None]:
#export

# Maps a model name to a (not yet fitted) model instance.
MODEL_REGISTRY = {}


MODEL_REGISTRY['benchmark'] = Benchmark(time_window=730)

# One goals-based and one resimulation-based model per decay rate.
# NOTE: the key templates contain no hyphen before the number; the "-" seen in
# names such as 'dixon-coles-0.012617' is the minus sign of the negative `eps`.
for eps in eps_values:
    MODEL_REGISTRY[f'dixon-coles{eps:0.6f}'] = DCGoals(time_window=730, epsilon=eps)
    MODEL_REGISTRY[f'dixon-coles-xg{eps:0.6f}'] = DCxG(time_window=730, epsilon=eps, min_probability=0.05)

In [None]:
# Display the registered model names and instances
MODEL_REGISTRY

{'benchmark': <__main__.Benchmark at 0x12841d278>,
 'dixon-coles-0.012617': <__main__.DCGoals at 0x12841d518>,
 'dixon-coles-xg-0.012617': <__main__.DCxG at 0x12841da90>,
 'dixon-coles-0.005939': <__main__.DCGoals at 0x12841dda0>,
 'dixon-coles-xg-0.005939': <__main__.DCxG at 0x1283d2128>,
 'dixon-coles-0.004162': <__main__.DCGoals at 0x1283d25f8>,
 'dixon-coles-xg-0.004162': <__main__.DCxG at 0x1283d2cf8>,
 'dixon-coles-0.003093': <__main__.DCGoals at 0x1283ef1d0>,
 'dixon-coles-xg-0.003093': <__main__.DCxG at 0x1283ef4e0>,
 'dixon-coles-0.002326': <__main__.DCGoals at 0x1283e8828>,
 'dixon-coles-xg-0.002326': <__main__.DCxG at 0x1283004a8>,
 'dixon-coles-0.001728': <__main__.DCGoals at 0x1283008d0>,
 'dixon-coles-xg-0.001728': <__main__.DCxG at 0x128300f98>,
 'dixon-coles-0.001237': <__main__.DCGoals at 0x1283032b0>,
 'dixon-coles-xg-0.001237': <__main__.DCxG at 0x128303588>,
 'dixon-coles-0.000821': <__main__.DCGoals at 0x128303828>,
 'dixon-coles-xg-0.000821': <__main__.DCxG at 0x1