In [None]:
# default_exp core

# Core

> Core Dixon-Coles model code. Everything lives in this one notebook to start with.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import pprint

In [None]:
#export
import abc
import collections
import dataclasses
import itertools
import typing
import functools
import json

import numpy as np
import scipy.stats
import scipy.optimize

## Parameter keys

In [None]:
#export

# Parameter keys

@dataclasses.dataclass(frozen=True)
class DixonColesParameterKey:
    """Frozen wrapper around a label, used to build keys for model parameters.

    Being a frozen dataclass, instances are immutable, compare by `label`
    and are hashable, so they can be used as dictionary keys.
    """

    # The wrapped label; any hashable value
    label: typing.Hashable


# Model constants: keys of the global (not team-specific) parameters
RHO_KEY = DixonColesParameterKey(label='Rho')
HFA_KEY = DixonColesParameterKey(label='Home-field advantage')
AVG_KEY = DixonColesParameterKey(label='Average rate')

# Markers that are paired with a team label to key per-team parameters
OFFENCE_KEY = DixonColesParameterKey(label='Offence')
DEFENCE_KEY = DixonColesParameterKey(label='Defence')


def offence_key(label):
    """Return the parameter key for the offence parameter of `label`.

    The key is the tuple ``(OFFENCE_KEY, label)``.
    """
    key = (OFFENCE_KEY, label)
    return key


def defence_key(label):
    """Return the parameter key for the defence parameter of `label`.

    The key is the tuple ``(DEFENCE_KEY, label)``.
    """
    key = (DEFENCE_KEY, label)
    return key

## Weights

In [None]:
#export

class UniformWeight:
    """Weighting scheme that gives every row the same weight."""

    @staticmethod
    def __call__(row):
        """Return 1.0 regardless of `row`."""
        uniform = 1.0
        return uniform


class ExponentialWeight:
    """Weight each row by ``exp(epsilon * key(row))``.

    `key` is a callable applied to the row; `epsilon` scales its result
    before exponentiation.
    """

    def __init__(self, epsilon, key):
        self.epsilon, self.key = epsilon, key

    def __call__(self, row):
        """Return the weight for `row`."""
        exponent = self.epsilon * self.key(row)
        return np.exp(exponent)

## Data Adapters

In [None]:
class KeyAdapter:
    """Adapter that reads match fields from a row by subscripting (``row[key]``).

    Each constructor argument is either a single key or a list of keys.
    A list is followed as a path of nested lookups, e.g. ``['score', 'ft', 0]``
    reads ``row['score']['ft'][0]``.
    """

    def __init__(self, home_team, away_team, home_goals, away_goals):
        self._home_team, self._away_team = home_team, away_team
        self._home_goals, self._away_goals = home_goals, away_goals

    def _get_in(self, row, item):
        """Look up `item` in `row`; a list `item` is walked as a nested path."""
        # Only lists are treated as paths: any other value (including a tuple)
        # is used directly as a single key.
        if not isinstance(item, list):
            return row[item]

        value = row
        for step in item:
            value = value[step]
        return value

    def home_team(self, row):
        """Return the home team of `row`."""
        return self._get_in(row, self._home_team)

    def away_team(self, row):
        """Return the away team of `row`."""
        return self._get_in(row, self._away_team)

    def home_goals(self, row):
        """Return the home goals of `row`."""
        return self._get_in(row, self._home_goals)

    def away_goals(self, row):
        """Return the away goals of `row`."""
        return self._get_in(row, self._away_goals)

In [None]:
class AttributeAdapter:
    """Adapter that reads match fields from a row by attribute name.

    Each constructor argument is the name of the attribute holding that field;
    it is read with ``getattr(row, name)``.
    """

    def __init__(self, home_team, away_team, home_goals, away_goals):
        self._home_team, self._away_team = home_team, away_team
        self._home_goals, self._away_goals = home_goals, away_goals

    def home_team(self, row):
        """Return the home team of `row`."""
        attr_name = self._home_team
        return getattr(row, attr_name)

    def away_team(self, row):
        """Return the away team of `row`."""
        attr_name = self._away_team
        return getattr(row, attr_name)

    def home_goals(self, row):
        """Return the home goals of `row`."""
        attr_name = self._home_goals
        return getattr(row, attr_name)

    def away_goals(self, row):
        """Return the away goals of `row`."""
        attr_name = self._away_goals
        return getattr(row, attr_name)

In [None]:
class LumpedAdapter:
    """Lump teams that appear in `min_matches` or fewer matches (default 10) into one team.

    Wraps another adapter. Team lookups are delegated to `base_adapter`, and any
    team whose total (home + away) appearance count in the fitted data is
    `<= min_matches` is replaced by `placeholder`. Teams never seen while
    counting have a count of 0 and are therefore lumped as well. Goal lookups
    are passed through unchanged.
    """

    def __init__(self, base_adapter, data, min_matches=10, placeholder=DixonColesParameterKey('Other team')):
        self.base_adapter = base_adapter
        self.min_matches = min_matches
        self.placeholder = placeholder

        self.match_count = None
        # Count appearances immediately so the team accessors are usable
        # straight after construction.
        self.fit(data)

    def _lump(self, team):
        """Return `team`, or the placeholder if it has too few matches."""
        # `match_count` is a Counter, so unseen teams count as 0
        if self.match_count[team] <= self.min_matches:
            return self.placeholder
        return team

    def home_team(self, row):
        """Return the (possibly lumped) home team of `row`."""
        return self._lump(self.base_adapter.home_team(row))

    def away_team(self, row):
        """Return the (possibly lumped) away team of `row`."""
        return self._lump(self.base_adapter.away_team(row))

    def home_goals(self, row):
        """Return the home goals of `row`, as read by the base adapter."""
        return self.base_adapter.home_goals(row)

    def away_goals(self, row):
        """Return the away goals of `row`, as read by the base adapter."""
        return self.base_adapter.away_goals(row)

    def fit(self, data):
        """Count how many matches each team appears in (home or away) in `data`.

        The counts are stored on `self.match_count`. `data` is traversed only
        once, so one-shot iterables such as generators are counted correctly.
        Returns `self`.
        """
        match_count = collections.Counter()
        for row in data:
            match_count[self.base_adapter.home_team(row)] += 1
            match_count[self.base_adapter.away_team(row)] += 1
        self.match_count = match_count
        return self

## Modelling

In [None]:
class DixonColesError(Exception):
    """Error raised by the Dixon-Coles model code, e.g. for a missing parameter key."""

In [None]:
class DixonColesABC(abc.ABC):
    """
    A mixin containing fitting mechanics for variations of Dixon-Coles models.

    Subclasses supply the parameterisation (`parse_params`), the goal-scoring
    rates (`home_rate`, `away_rate`) and the observed goals (`home_goals`,
    `away_goals`). This class provides the weighted log-likelihood, the
    optimisation in `fit` and score-line predictions.
    """
    # Fitted parameter map ({key: value}); None until set or fitted
    params = None
    # Callable mapping a row to its weight in the pseudo log-likelihood
    weight = UniformWeight()

    @abc.abstractmethod
    def parse_params(self, data=None):
        """
        Construct parameter keys and constraints from data.

        `fit` calls this as `self.parse_params(data)` and passes the returned
        constraints straight to `scipy.optimize.minimize`.
        """
        parameter_keys = []
        constraints = {}
        return parameter_keys, constraints

    @abc.abstractmethod
    def home_rate(self, params, row):
        """ Return the home goalscoring rate for `row` given `params`. """
        pass

    @abc.abstractmethod
    def away_rate(self, params, row):
        """ Return the away goalscoring rate for `row` given `params`. """
        pass

    @abc.abstractmethod
    def home_goals(self, row):
        """ Return the home goals observed in `row`. """
        pass

    @abc.abstractmethod
    def away_goals(self, row):
        """ Return the away goals observed in `row`. """
        pass

    # Core methods

    @staticmethod
    def _assign_params(param_keys, param_values):
        """ Pair parameter keys with values, returning a {key: value} dict. """
        return dict(zip(param_keys, param_values))

    @staticmethod
    def _tau(home_goals, away_goals, home_rate, away_rate, rho):
        """
        Return the low-score adjustment factor for each scoreline.

        The factor differs from 1 only for the scorelines 0-0, 0-1, 1-0
        and 1-1. Goals may be given as lists or arrays.
        """
        # Coerce to arrays: comparing a plain list with an int yields a single
        # `False`, which would silently disable every adjustment below.
        home_goals = np.asarray(home_goals)
        away_goals = np.asarray(away_goals)

        tau = np.ones(home_goals.shape)
        tau = np.where((home_goals == 0) & (away_goals == 0), 1 - home_rate*away_rate*rho, tau)
        tau = np.where((home_goals == 0) & (away_goals == 1), 1 + home_rate*rho, tau)
        tau = np.where((home_goals == 1) & (away_goals == 0), 1 + away_rate*rho, tau)
        tau = np.where((home_goals == 1) & (away_goals == 1), 1 - rho, tau)
        return tau

    def _log_like(self, home_goals, away_goals, home_rate, away_rate, params):
        """
        Return the log-likelihood of each scoreline.

        This is the sum of two Poisson log-pmfs (home and away) plus the log of
        the `_tau` adjustment. Raises `DixonColesError` if `params` has no
        `RHO_KEY` entry.
        """
        try:
            rho = params[RHO_KEY]
        except KeyError:
            raise DixonColesError(
                f'Unable to find {RHO_KEY} in parameter set. Is it set in `parse_params`?'
            )

        return (
            scipy.stats.poisson.logpmf(home_goals, home_rate) +
            scipy.stats.poisson.logpmf(away_goals, away_rate) +
            np.log(self._tau(home_goals, away_goals, home_rate, away_rate, rho))
        )

    def objective_fn(self, data, param_keys, xs):
        """
        Return the negative weighted log-likelihood of `data`.

        `xs` holds the parameter values in the same order as `param_keys`.
        `data` must support `len()` and iteration.
        """
        params = self._assign_params(param_keys, xs)

        home_goals, away_goals = np.empty(len(data)), np.empty(len(data))
        home_rate, away_rate = np.empty(len(data)), np.empty(len(data))
        weights = np.empty(len(data))

        # NOTE: Should data adapter define the iteration?
        # E.g. dataframe adapter?
        for i, row in enumerate(data):
            home_goals[i] = self.home_goals(row)
            away_goals[i] = self.away_goals(row)
            home_rate[i] = self.home_rate(params, row)
            away_rate[i] = self.away_rate(params, row)
            weights[i] = self.weight(row)

        log_like = self._log_like(home_goals, away_goals, home_rate, away_rate, params)

        pseudo_log_like = log_like * weights
        return -np.sum(pseudo_log_like)

    def fit(self, data, **kwargs):
        """
        Estimate the parameters from `data` and store them on `self.params`.

        Any existing `self.params` are used as starting values (0 for keys not
        present); otherwise the optimisation starts from all zeros. Extra
        keyword arguments are forwarded to `scipy.optimize.minimize`.
        Returns `self`.
        """
        param_keys, constraints = self.parse_params(data)

        init_params = (
            np.asarray([self.params.get(p, 0) for p in param_keys])
            if self.params
            else np.zeros(len(param_keys))
        )

        # Optimise!
        estimate = scipy.optimize.minimize(
            lambda xs: self.objective_fn(data, param_keys, xs),
            x0=init_params,
            constraints=constraints,
            **kwargs
        )

        # Parse the estimates into parameter map
        self.params = self._assign_params(param_keys, estimate.x)

        return self

    def predict_one(self, row, up_to=26):
        """
        Return the probability of every scoreline for the match in `row`.

        Scorelines cover 0 to `up_to - 1` goals for each side. The result is a
        list of dicts with keys 'home_goals', 'away_goals' and 'probability'.
        """
        scorelines = list(itertools.product(range(up_to), repeat=2))

        home_goals = [h for h, a in scorelines]
        away_goals = [a for h, a in scorelines]
        home_rate = self.home_rate(self.params, row)
        away_rate = self.away_rate(self.params, row)

        probs = np.exp(self._log_like(home_goals, away_goals, home_rate, away_rate, self.params))

        # TODO: add a mixin/adapter to customise the indexing in the results dicts?
        # OR just use a custom dataclass for these...
        return [dict(zip(['home_goals', 'away_goals', 'probability'], vals))
                for vals in zip(home_goals, away_goals, probs)]

    def predict(self, data, up_to=26):
        """ Return the `predict_one` result for each row in `data`, as a list. """
        scorelines = [self.predict_one(row, up_to=up_to) for row in data]
        return scorelines

In [None]:
class DixonColes(DixonColesABC):
    """
    Dixon-Coles model with one offence and one defence parameter per team,
    plus home-field advantage, average rate and rho.

    Match fields are read from each row through `adapter`, which must provide
    `home_team`, `away_team`, `home_goals` and `away_goals`.
    """

    # Log-scale fallbacks for teams without a fitted parameter.
    # 0.0 contributes a factor of exp(0) = 1 to the rate.
    DEFAULT_OFF = 0.0
    DEFAULT_DEF = 0.0

    def __init__(self, adapter, weight=UniformWeight(), params=None):
        self.params = params
        self.adapter = adapter
        self.weight = weight

    def home_team(self, row):
        """ Returns the home team """
        return self.adapter.home_team(row)

    def away_team(self, row):
        """ Returns the away team """
        return self.adapter.away_team(row)

    def home_goals(self, row):
        """ Returns home goals scored """
        return self.adapter.home_goals(row)

    def away_goals(self, row):
        """ Returns away goals scored """
        return self.adapter.away_goals(row)

    def parse_params(self, data):
        """ Returns a tuple of (parameter_names, [constraints]) """
        teams = set(self.home_team(r) for r in data) | set(self.away_team(r) for r in data)
        n_teams = len(teams)

        offence = [offence_key(t) for t in teams]
        defence = [defence_key(t) for t in teams]

        # Offence keys must come first: the constraint below slices x[0:n_teams].
        # AVG_KEY is included because both rate functions read params[AVG_KEY].
        # NOTE(review): only the offence parameters are constrained; confirm the
        # defence and average-rate parameters are identified well enough.
        return (
            offence + defence + [HFA_KEY, RHO_KEY, AVG_KEY],
            [
                # Force team offence parameters to average to 1
                {'fun': lambda x: 1 - np.mean(np.exp(x[0:n_teams])),
                 'type': 'eq'},
            ]
        )

    def home_rate(self, params, row):
        """
        Returns home goalscoring rate.

        Teams with no entry in `params` fall back to DEFAULT_OFF / DEFAULT_DEF.
        """
        return np.exp(
            params.get(offence_key(self.home_team(row)), self.DEFAULT_OFF) +
            params.get(defence_key(self.away_team(row)), self.DEFAULT_DEF) +
            params[HFA_KEY] +
            params[AVG_KEY]
        )

    def away_rate(self, params, row):
        """
        Returns away goalscoring rate.

        Teams with no entry in `params` fall back to DEFAULT_OFF / DEFAULT_DEF.
        """
        return np.exp(
            params.get(offence_key(self.away_team(row)), self.DEFAULT_OFF) +
            params.get(defence_key(self.home_team(row)), self.DEFAULT_DEF) +
            params[AVG_KEY]
        )

## Example model fit

In [None]:
# Load the match records used by the example fit below
with open('../data/premier-league-1516.json', 'r') as results_file:
    pl_1516 = json.load(results_file)

# Preview the first three records
pl_1516[:3]

[{'date': '2015-08-08',
  'team1': 'Manchester United FC',
  'team2': 'Tottenham Hotspur FC',
  'score': {'ft': [1, 0]}},
 {'date': '2015-08-08',
  'team1': 'AFC Bournemouth',
  'team2': 'Aston Villa FC',
  'score': {'ft': [0, 1]}},
 {'date': '2015-08-08',
  'team1': 'Leicester City FC',
  'team2': 'Sunderland AFC',
  'score': {'ft': [4, 2]}}]

In [None]:
# Build the model and fit it in one step (`fit` returns the model itself).
# The adapter says where each match field lives in the nested JSON records.
model = DixonColes(
    adapter=KeyAdapter(
        home_team='team1',
        away_team='team2',
        home_goals=['score', 'ft', 0],
        away_goals=['score', 'ft', 1],
    )
).fit(pl_1516)

# All estimates should be valid numbers
assert not any(np.isnan(value) for value in model.params.values())

# Home advantage should be positive
assert 1.0 < np.exp(model.params[HFA_KEY]) < 2.0

# Show every estimate exponentiated.
# NOTE(review): this also exponentiates RHO_KEY, which `_tau` uses as-is.
pprint.pprint({key: np.exp(value) for key, value in model.params.items()})



{DixonColesParameterKey(label='Rho'): 0.9390832263308485,
 DixonColesParameterKey(label='Home-field advantage'): 1.2345780660052648,
 DixonColesParameterKey(label='Average rate'): 1.1692026726380644,
 (DixonColesParameterKey(label='Defence'), 'Arsenal FC'): 0.7396062391427994,
 (DixonColesParameterKey(label='Defence'), 'Newcastle United FC'): 1.3002453428513896,
 (DixonColesParameterKey(label='Offence'), 'AFC Bournemouth'): 0.8917281557631663,
 (DixonColesParameterKey(label='Offence'), 'Arsenal FC'): 1.254543653711057,
 (DixonColesParameterKey(label='Offence'), 'Aston Villa FC'): 0.5400583977176606,
 (DixonColesParameterKey(label='Offence'), 'Chelsea FC'): 1.1527088359171551,
 (DixonColesParameterKey(label='Defence'), 'Aston Villa FC'): 1.4948496974790357,
 (DixonColesParameterKey(label='Defence'), 'Chelsea FC'): 1.0779599286097092,
 (DixonColesParameterKey(label='Defence'), 'Crystal Palace FC'): 1.0158359461858912,
 (DixonColesParameterKey(label='Defence'), 'Everton FC'): 1.1248730392

In [None]:
# Scoreline probabilities for a single fixture; only the team fields are needed
scorelines = model.predict_one({
    'team1': 'Manchester City FC',
    'team2': 'Swansea City FC',
})

# Collapse the scoreline probabilities into the three match outcomes
home_win = sum(line['probability'] for line in scorelines if line['away_goals'] < line['home_goals'])
draw = sum(line['probability'] for line in scorelines if line['away_goals'] == line['home_goals'])
away_win = sum(line['probability'] for line in scorelines if line['away_goals'] > line['home_goals'])

(home_win, draw, away_win)

(0.6645791886921023, 0.19833755250730678, 0.1370832588005913)