In [None]:
# default_exp team_strength

# Team strength

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import os
import dotenv

In [None]:
#export
import abc
import collections
import datetime as dt
import functools
import itertools

import mezzala
import numpy as np

import wingback.db

In [None]:
# Read a local .env file (if present) so the DB_* variables below are available
dotenv.load_dotenv()

# Connect the query layer using credentials from the environment. A missing
# variable raises KeyError here, before any connection is attempted.
# NOTE(review): the values are interpolated into the URL verbatim; a password
# containing URL-reserved characters (e.g. '@' or '/') would presumably need
# percent-encoding first — confirm against what `connect` expects.
wingback.db.queries.connect('postgresql://{user}:{password}@{host}:{port}/{database}'.format(
    host=os.environ['DB_HOST'],
    user=os.environ['DB_USER'],
    password=os.environ['DB_PASS'],
    database=os.environ['DB_NAME'],
    port=os.environ['DB_PORT'],
))

In [None]:
#export


class ModelABC(abc.ABC):
    """
    Interface shared by every team-strength model.

    Deriving from `abc.ABC` is what makes the `@abc.abstractmethod`
    decorators below effective: a subclass that does not override all four
    methods cannot be instantiated (`TypeError`).
    """

    @abc.abstractmethod
    def fetch_data(self, league_ids, date):
        """Return the training data for `league_ids` as of `date`."""
        training_data = ...  # e.g. matches up-to, not including `date`
        return training_data

    @abc.abstractmethod
    def fit(self, data):
        """Fit the model on the output of `fetch_data` and return `self`."""
        return self

    @abc.abstractmethod
    def predict(self, data):
        """Return one prediction per element of `data`."""
        predictions = ...
        return predictions

    @abc.abstractmethod
    def to_dict(self):
        """Return a dictionary describing the model's settings/parameters."""
        return ...

In [None]:
#export


class Benchmark(ModelABC):
    """
    Baseline model: every match receives the identical prediction.

    That prediction is the relative frequency of each scoreline observed in
    the training data.
    """

    def __init__(self, time_window=360):
        # Scoreline predictions; populated by `fit`
        self._data = None
        # Training window length, in days
        self._time_window = time_window

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        days = self._time_window
        return dt.timedelta(days=days)

    def fetch_data(self, league_ids, date):
        """Fetch the matches for `league_ids` between `date - time_window` and `date`."""
        window_start = date - self.time_window
        matches = wingback.db.queries.fetch_matches(
            start=window_start,
            end=date,
            league_ids=league_ids,
            season_ids=[None],
        )
        return list(matches)

    def fit(self, data):
        """Count each (home_goals, away_goals) pair in `data`; returns `self`."""
        scoreline_counts = collections.Counter(
            (match['home_goals'], match['away_goals']) for match in data
        )

        scorelines = []
        for (home_goals, away_goals), n_seen in scoreline_counts.items():
            scorelines.append(mezzala.ScorelinePrediction(
                home_goals=home_goals,
                away_goals=away_goals,
                probability=n_seen/len(data),
            ))

        self._data = scorelines
        return self

    def predict(self, data):
        """Return the fitted scoreline distribution once per element of `data`."""
        # Every entry refers to the same fitted list of scorelines
        return [self._data for _ in range(len(data))]

    def to_dict(self):
        """Return the model's settings as a dictionary."""
        return dict(time_window=self._time_window)

In [None]:
# Example: fit the benchmark on league 1 and predict the matches on `target_date`.
# `target_date` and `test` are reused by the example cells further down.
benchmark_model = Benchmark()
target_date = dt.datetime(2021, 5, 12).date()

train = benchmark_model.fetch_data(
    league_ids=[1],
    date=target_date
)
benchmark_model.fit(train)

# Matches to predict: the window from `target_date` to the following day
test = list(wingback.db.queries.fetch_matches(
    start=target_date,
    end=target_date+dt.timedelta(days=1), 
    league_ids=[1],
    season_ids=[None]
))

predictions = benchmark_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.39
Draw     : 0.23
Away win : 0.38


In [None]:
#exporti


def encode_parameter_key(key):
    """
    Convert a mezzala parameter key into a plain value.

    Offence/defence keys become `('Offence', label)` / `('Defence', label)`,
    any other `ParameterKey` becomes its label, and anything else is
    returned unchanged.
    """
    # The offence/defence checks run before the generic ParameterKey check
    prefixed_types = (
        (mezzala.OffenceParameterKey, 'Offence'),
        (mezzala.DefenceParameterKey, 'Defence'),
    )
    for key_type, prefix in prefixed_types:
        if isinstance(key, key_type):
            return (prefix, key.label)

    return key.label if isinstance(key, mezzala.ParameterKey) else key


def decode_parameter_key(key):
    """
    Rebuild a mezzala parameter key from its encoded form.

    Accepted inputs:

    * a string, which becomes `mezzala.ParameterKey(key)`
    * a two-element list or tuple `('Offence' | 'Defence', label)`, which
      becomes the matching offence/defence key. Tuples are what
      `encode_parameter_key` emits; lists are what the same value looks
      like after a JSON round-trip.

    Raises `ValueError` for anything else, rather than silently returning
    `None` (which callers would otherwise end up using as a dictionary key).
    """
    if isinstance(key, str):
        return mezzala.ParameterKey(key)

    if isinstance(key, (list, tuple)):
        if len(key) != 2:
            raise ValueError(f'Cannot decode parameter key: {key!r}')
        off_def, label = key
        if off_def == 'Offence':
            return mezzala.OffenceParameterKey(label)
        if off_def == 'Defence':
            return mezzala.DefenceParameterKey(label)

    raise ValueError(f'Cannot decode parameter key: {key!r}')

In [None]:
#exporti


def init_model(weight, params=None):
    """
    Build the `mezzala.DixonColes` model shared by the Dixon-Coles variants.

    `weight` and `params` are passed straight through to `DixonColes`.
    Teams are identified by `home_team_id`/`away_team_id`, wrapped in a
    lumped adapter whose placeholder is 'Other team'.
    """
    other_team = 'Other team'

    # Might be nicer to identify teams by a tuple of (ID, name)?
    lumped_adapter = mezzala.LumpedAdapter(
        mezzala.KeyAdapter(
            home_goals='home_goals',
            away_goals='away_goals',
            home_team='home_team_id',
            away_team='away_team_id',
        ),
        # NOTE(review): 5 is presumably the lumping threshold — confirm
        # against the mezzala documentation
        home_team=(other_team, 5),
        away_team=(other_team, 5),
    )

    model_blocks = [
        mezzala.blocks.BaseRate(),
        mezzala.blocks.TeamStrength(),
        mezzala.blocks.HomeAdvantage(),
        mezzala.blocks.ConstantBlock(
            mezzala.OffenceParameterKey(other_team),
            mezzala.DefenceParameterKey(other_team)
        ),
    ]

    return mezzala.DixonColes(
        adapter=lumped_adapter,
        blocks=model_blocks,
        weight=weight,
        params=params
    )

In [None]:
#export


class DCGoals(ModelABC):
    """
    Dixon-Coles model fitted on match data, with each match weighted by
    `mezzala.weights.ExponentialWeight` keyed on its `days_ago` value.
    """

    def __init__(self, time_window=360, epsilon=-0.0065, params=None):
        # Training window length, in days
        self._time_window = time_window
        # Rate handed to the exponential weight
        self._epsilon = epsilon

        time_decay = mezzala.weights.ExponentialWeight(
            epsilon=epsilon,
            key=lambda match: match['days_ago']
        )
        self._model = init_model(weight=time_decay, params=params)

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        days = self._time_window
        return dt.timedelta(days=days)

    def fetch_data(self, league_ids, date):
        """Fetch the matches for `league_ids` between `date - time_window` and `date`."""
        window_start = date - self.time_window
        matches = wingback.db.queries.fetch_matches(
            start=window_start,
            end=date,
            league_ids=league_ids,
            season_ids=[None],
        )
        return list(matches)

    def fit(self, data):
        """Fit the adapter, then the model, on `data`; returns `self`."""
        dixon_coles = self._model
        dixon_coles.adapter.fit(data)
        dixon_coles.fit(data)
        return self

    def predict(self, data):
        """Return the underlying model's predictions for `data`."""
        return self._model.predict(data)

    def to_dict(self):
        """Return settings and fitted parameters; NaN values are stored as `None`."""
        encoded_params = []
        for key, value in self._model.params.items():
            encoded_params.append(
                (encode_parameter_key(key), None if np.isnan(value) else value)
            )

        return {
            'time_window': self._time_window,
            'epsilon': self._epsilon,
            'params': encoded_params,
        }

In [None]:
# Example: the goals-based Dixon-Coles model with its default settings.
# Reuses `target_date` and `test` from the benchmark example above.
dc_model = DCGoals()

train = dc_model.fetch_data(
    league_ids=[1],
    date=target_date
)

dc_model.fit(train)

predictions = dc_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

  np.log(self._tau(home_goals, away_goals, home_rate, away_rate, rho))


Home team: Chelsea
Away team: Arsenal
Home win : 0.43
Draw     : 0.28
Away win : 0.29


In [None]:
#export


class DCxG(ModelABC):
    """
    Dixon-Coles model fitted on xG resimulations instead of actual results.

    Each training row is a match merged with one of its resimulations, and
    is weighted by `probability * exp(epsilon * days_ago)`.
    """

    def __init__(self, min_probability=0.01, time_window=360, epsilon=-0.0065, params=None):
        # Training window length, in days
        self._time_window = time_window
        # Time-decay rate used in the row weight
        self._epsilon = epsilon
        # Passed to `fetch_resimulations` as its `min_probability` filter
        self.min_probability = min_probability

        self._model = init_model(
            weight=mezzala.weights.KeyWeight(
                lambda x: x['probability']*np.exp(self._epsilon*x['days_ago'])
            ),
            params=params
        )

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        return dt.timedelta(days=self._time_window)

    def fetch_data(self, league_ids, date):
        """
        Fetch the training matches and their xG resimulations.

        Returns a `(matches, resim_rows)` tuple. `resim_rows` holds one
        dictionary per (match, resimulation) pair, ordered by match and then
        by resimulation; resimulation fields override match fields that
        share a name.
        """
        training_matches = list(wingback.db.queries.fetch_matches(
            start=date-self.time_window,
            end=date,
            league_ids=league_ids,
            season_ids=[None]
        ))
        training_resimulations = wingback.db.queries.fetch_resimulations(
            match_ids=[m['id'] for m in training_matches],
            min_probability=self.min_probability
        )

        # Index the resimulations by match once, instead of rescanning the
        # full resimulation list for every match
        resims_by_match = collections.defaultdict(list)
        for resim in training_resimulations:
            resims_by_match[resim['match_id']].append(resim)

        # Merge matches and resimulations
        training_data = [
            {**match, **resim}
            for match in training_matches
            for resim in resims_by_match.get(match['id'], ())
        ]

        # We return both the match data and the resim data because
        # we want to fit the adapter on the *match data* while fitting
        # the actual model on the xG resims
        return (training_matches, training_data)

    def fit(self, data):
        """Fit on the `(matches, resim_rows)` tuple from `fetch_data`; returns `self`."""
        match_data, resim_data = data

        # Fit the adapter using the actual number of matches
        # (as opposed to the number of resimulations present...)
        self._model.adapter.fit(match_data)

        # And fit the model parameters on the xG resimulations
        self._model.fit(resim_data)

        return self

    def predict(self, data):
        """Return the underlying model's predictions for `data`."""
        return self._model.predict(data)

    def to_dict(self):
        """Return settings and fitted parameters; NaN values are stored as `None`."""
        return {
            'time_window': self._time_window,
            'min_probability': self.min_probability,
            'epsilon': self._epsilon,
            'params': [
                (encode_parameter_key(k), v if not np.isnan(v) else None)
                for k, v in self._model.params.items()
            ]
        }

In [None]:
# Example: the xG-based model with epsilon=0, so the time-decay factor
# exp(epsilon*days_ago) is 1 for every row.
# Reuses `target_date` and `test` from the benchmark example above.
xg_model = DCxG(time_window=360, epsilon=0)  # No time weighting

train = xg_model.fetch_data(
    league_ids=[1],
    date=target_date
)
xg_model.fit(train)

predictions = xg_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

  tau = np.where((home_goals == 0) & (away_goals == 0), 1 - home_rate*away_rate*rho, tau)


Home team: Chelsea
Away team: Arsenal
Home win : 0.57
Draw     : 0.24
Away win : 0.18


In [None]:
# Example: the same xG-based model, this time with a strongly negative epsilon.
# Reuses `target_date` and `test` from the benchmark example above.
xg_model = DCxG(time_window=360, epsilon=-0.01)  # Incredibly aggressive time-weighting

train = xg_model.fetch_data(
    league_ids=[1],
    date=target_date
)
xg_model.fit(train)

predictions = xg_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.58
Draw     : 0.25
Away win : 0.16


In [None]:
#export


class DCEnsemble(ModelABC):
    """
    Blend previously fitted Dixon-Coles models by averaging their parameters.

    `models` is a sequence of `(model_name, weight)` pairs. The component
    models are not refitted here: their stored parameters are fetched via
    `wingback.db.queries.fetch_backtest` and combined with a weighted
    average taken in real (exponentiated) space.
    """

    def __init__(self, models=None, time_window=360):
        # A `None` default (rather than `[]`) avoids sharing one list object
        # between every instance created without an explicit `models`
        self.models = [] if models is None else list(models)
        self._time_window = time_window

        # Weight is irrelevant since _model.fit
        # is never actually called
        self._model = init_model(weight=lambda x: 1)

    @property
    def time_window(self):
        """The training window as a `datetime.timedelta`."""
        return dt.timedelta(days=self._time_window)

    @staticmethod
    def _fetch_backtest_params(model, league_ids, date):
        """Fetch the stored parameters of `model` as a `{parameter key: value}` dict."""
        backtest = wingback.db.queries.fetch_backtest(
            model=model,
            date=date,
            league_ids=league_ids
        )
        params = backtest['json']['parameters']['params']
        # `to_dict` stores NaN parameters as None; restore them to NaN so the
        # numeric averaging in `fit` does not fail on a None value
        return {
            decode_parameter_key(k): (np.nan if v is None else v)
            for k, v in params
        }

    def fetch_data(self, league_ids, date):
        """
        Fetch the component models' parameters plus the match data.

        Returns `(model_params, matches)`, where `model_params` maps each
        `(model_name, weight)` pair to that model's parameter dictionary.
        """
        # Fetch models from database
        model_params = {
            (model, weight): self._fetch_backtest_params(model, league_ids, date)
            for model, weight in self.models
        }

        # We also need to fetch the "regular" data to fit the lumped adapter
        training_data = wingback.db.queries.fetch_matches(
            start=date-self.time_window,
            end=date,
            league_ids=league_ids,
            season_ids=[None]
        )

        return (model_params, list(training_data))

    def fit(self, data):
        """
        Combine the component parameters and fit the lumped adapter.

        `data` is the `(model_params, matches)` tuple from `fetch_data`.
        Returns `self`. Raises `ValueError` when there are no component
        models to combine.
        """
        model_params, match_data = data

        if not model_params:
            raise ValueError('DCEnsemble needs at least one component model to fit')

        # First, we need to get all the parameters used by the models in question.
        # We take the intersection of each model's parameters. Although, since each
        # model should have exactly the same parameters, it shouldn't matter whether
        # we take the intersection or superset of all parameters
        # NB: `model_params` is a dict of {(model_name, weight): params}
        component_params = list(model_params.values())
        param_keys = set.intersection(*(set(p) for p in component_params))

        # The weights line up with `component_params`: both follow the
        # dictionary's iteration order
        weights = [w for _, w in model_params.keys()]

        # To actually combine the parameters, we just take a weighted average
        # of the parameter values in real space (they are stored internally in
        # log space)
        params = {}
        for k in param_keys:
            param = np.average(
                # Shift parameters back from log-space into real values
                np.exp([p[k] for p in component_params]),
                # Use weights for *weighted* average
                weights=weights
            )

            # Finally, move parameter back into log-space
            params[k] = np.log(param)

        # Insert params into the model
        self._model.params = params

        # We also need to fit the lumped adapter
        self._model.adapter.fit(match_data)
        return self

    def predict(self, data):
        """Return the underlying model's predictions for `data`."""
        return self._model.predict(data)

    def to_dict(self):
        """Return settings and combined parameters; NaN values are stored as `None`."""
        return {
            # Reported for consistency with the other models' `to_dict`
            'time_window': self._time_window,
            'models': self.models,
            'params': [
                (encode_parameter_key(k), v if not np.isnan(v) else None)
                for k, v in self._model.params.items()
            ]
        }

In [None]:
# Example: an equal-weight blend of a goals-based and an xG-based model.
# Reuses `target_date` and `test` from the benchmark example above.
ensemble_model = DCEnsemble([
    # Models are supplied with a name and a weight (for a weighted average)
    ('dixon-coles-0.008207', 0.5), 
    ('dixon-coles-xg-0.008207', 0.5)
])

# For the ensemble, `fetch_data` looks up the component models' stored
# backtest parameters as well as the match data
train = ensemble_model.fetch_data(
    league_ids=[1],
    date=target_date
)
ensemble_model.fit(train)

predictions = ensemble_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.51
Draw     : 0.27
Away win : 0.22


In [None]:
# Example: the same two components, weighted 0.8 goals / 0.2 xG.
# Reuses `target_date` and `test` from the benchmark example above.
ensemble_model = DCEnsemble([
    ('dixon-coles-0.008207', 0.8), 
    ('dixon-coles-xg-0.008207', 0.2)
])

train = ensemble_model.fetch_data(
    league_ids=[1],
    date=target_date
)
ensemble_model.fit(train)

predictions = ensemble_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.47
Draw     : 0.28
Away win : 0.26


In [None]:
# Example: the same two components, weighted 0.2 goals / 0.8 xG.
# Reuses `target_date` and `test` from the benchmark example above.
ensemble_model = DCEnsemble([
    ('dixon-coles-0.008207', 0.2), 
    ('dixon-coles-xg-0.008207', 0.8)
])

train = ensemble_model.fetch_data(
    league_ids=[1],
    date=target_date
)
ensemble_model.fit(train)

predictions = ensemble_model.predict(test)

# Show the outcome probabilities for the first test match only
print(f'Home team: {test[0]["home_team"]}')
print(f'Away team: {test[0]["away_team"]}')
for outcome, prediction in mezzala.scorelines_to_outcomes(predictions[0]).items():
    print(f'{outcome.value.ljust(9)}: {prediction.probability:0.2f}')

Home team: Chelsea
Away team: Arsenal
Home win : 0.55
Draw     : 0.26
Away win : 0.19


## Model registry

Register models for use with the CLI

In [None]:
#export
# Candidate time-decay rates. Each eps solves exp(eps*365) = w for one of
# eight evenly spaced targets w between 0.05 and 0.95, so w is the decay
# factor exp(eps*days_ago) at days_ago=365. Every w is below 1, hence every
# eps is negative.
eps_values = np.log(np.linspace(0.05, 0.95, 8))/365
eps_values

array([-0.00820749, -0.00471991, -0.00323409, -0.00227608, -0.00156766,
       -0.00100529, -0.00053893, -0.00014053])

In [None]:
# Tabulate the decay factor exp(eps*days) that each rate gives at 180 and 360 days
decay_at_180d = np.exp(eps_values*180)
decay_at_360d = np.exp(eps_values*360)
for eps, e180, e360 in zip(eps_values, decay_at_180d, decay_at_360d):
    print(f'For eps={eps:0.6f},\t{e180:0.5f} @180d,\t{e360:0.5f} @360d')

For eps=-0.008207,	0.22824 @180d,	0.05209 @360d
For eps=-0.004720,	0.42759 @180d,	0.18284 @360d
For eps=-0.003234,	0.55870 @180d,	0.31215 @360d
For eps=-0.002276,	0.66385 @180d,	0.44070 @360d
For eps=-0.001568,	0.75414 @180d,	0.56873 @360d
For eps=-0.001005,	0.83448 @180d,	0.69635 @360d
For eps=-0.000539,	0.90755 @180d,	0.82365 @360d
For eps=-0.000141,	0.97502 @180d,	0.95067 @360d


In [None]:
#export

# Maps a model name to a configured (unfitted) model instance
MODEL_REGISTRY = {}


MODEL_REGISTRY['benchmark'] = Benchmark(time_window=730)

# One goals-based and one xG-based model per decay rate.
# NOTE: there is no explicit separator before `{eps:0.6f}`; every eps is
# negative, so its minus sign is what produces names such as
# 'dixon-coles-0.008207'. A zero or positive eps would give a name with no
# separator at all.
for eps in eps_values:
    MODEL_REGISTRY[f'dixon-coles{eps:0.6f}'] = DCGoals(time_window=730, epsilon=eps)
    MODEL_REGISTRY[f'dixon-coles-xg{eps:0.6f}'] = DCxG(time_window=730, epsilon=eps, min_probability=0.01)
    
# Ensembles mixing one goals-based and one xG-based model, with the xG share
# running from 0.05 to 0.95. The components are referenced by name, so these
# hard-coded strings must stay in step with the names generated above.
for xg_mix in np.linspace(0.05, 0.95, 8):
    MODEL_REGISTRY[f'ensemble-{xg_mix:0.5f}'] = DCEnsemble(
        [('dixon-coles-0.001568', 1-xg_mix),
         ('dixon-coles-xg-0.003234', xg_mix)], 
        time_window=730
    )

In [None]:
# Display the registered model names and their instances
MODEL_REGISTRY

{'benchmark': <__main__.Benchmark at 0x125cbee80>,
 'dixon-coles-0.008207': <__main__.DCGoals at 0x125cbef98>,
 'dixon-coles-xg-0.008207': <__main__.DCxG at 0x125d22240>,
 'dixon-coles-0.004720': <__main__.DCGoals at 0x125d22470>,
 'dixon-coles-xg-0.004720': <__main__.DCxG at 0x125d22710>,
 'dixon-coles-0.003234': <__main__.DCGoals at 0x125d22978>,
 'dixon-coles-xg-0.003234': <__main__.DCxG at 0x125cc06a0>,
 'dixon-coles-0.002276': <__main__.DCGoals at 0x125cc0c18>,
 'dixon-coles-xg-0.002276': <__main__.DCxG at 0x125cc0f28>,
 'dixon-coles-0.001568': <__main__.DCGoals at 0x125cc0b70>,
 'dixon-coles-xg-0.001568': <__main__.DCxG at 0x125c90c88>,
 'dixon-coles-0.001005': <__main__.DCGoals at 0x125c90d30>,
 'dixon-coles-xg-0.001005': <__main__.DCxG at 0x125c62630>,
 'dixon-coles-0.000539': <__main__.DCGoals at 0x125c62898>,
 'dixon-coles-xg-0.000539': <__main__.DCxG at 0x125c62b00>,
 'dixon-coles-0.000141': <__main__.DCGoals at 0x125afc048>,
 'dixon-coles-xg-0.000141': <__main__.DCxG at 0x1