In [None]:
# default_exp adapters

# Data Adapters

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
import dataclasses
import typing

In [None]:
#export
import collections
import functools

import mezzala.parameters

## Basic adapters

In [None]:
#export


class KeyAdapter:
    """
    Get data from subscriptable objects.

    Every argument maps a term name (`home_goals`, `away_goals`, plus any
    extra keyword) to the key used to fetch that term from a row with
    `row[key]`. If the key is a list, its elements are applied one after
    another to reach nested data (`row[k0][k1]...`).

    Accessing a term as an attribute (e.g. `adapter.home_goals`) returns a
    function which takes a row and returns that term's value. The term name
    is only resolved when that function is called, so an unknown term raises
    `KeyError` at call time.
    """

    def __init__(self, home_goals, away_goals, **kwargs):
        # Term name -> key (or list of nested keys) used to subscript a row
        self._lookup = {
            'home_goals': home_goals,
            'away_goals': away_goals,
            **kwargs
        }

    def __repr__(self):
        args_repr = ', '.join(f'{k}={repr(v)}' for k, v in self._lookup.items())
        return f'KeyAdapter({args_repr})'

    def _get_in(self, row, item):
        """Subscript `row` with `item`, or with each element of `item` in turn if it is a list."""
        if isinstance(item, list):
            return functools.reduce(lambda d, i: d[i], item, row)
        return row[item]

    def __getattr__(self, key):
        # `__getattr__` is only consulted when normal attribute lookup fails.
        # Protocol probes made by `copy`, `pickle` and `hasattr` (e.g.
        # `__deepcopy__`, `__setstate__`) must see a missing attribute rather
        # than a term getter, otherwise copying/pickling an adapter fails.
        # `_lookup` only gets here when the instance is not initialised yet.
        if key == '_lookup' or (key.startswith('__') and key.endswith('__')):
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {key!r}'
            )

        def getter(row):
            return self._get_in(row, self._lookup[key])
        return getter

Anything subscriptable can be used with this type of adapter. For example,
you might have input data as a list of tuples (e.g. using Python's
built-in `csv` library)

In [None]:
# Rows are plain sequences here, so each term is looked up by position
index_adapter = KeyAdapter(home_goals=0, away_goals=1)

assert (
    index_adapter.home_goals([1, 2]),
    index_adapter.away_goals([1, 2]),
) == (1, 2)

Or, you might be using a list of dicts.

In [None]:
# Rows are dicts here, so each term maps onto a dictionary key
dict_adapter = KeyAdapter(
    home_goals='hg',
    away_goals='ag',
    home_team='home',
    away_team='away',
)

example_dict = dict(home='Team 1', away='Team 2', hg=4, ag=3)

assert dict_adapter.home_team(example_dict) == 'Team 1'
assert dict_adapter.away_team(example_dict) == 'Team 2'
assert dict_adapter.home_goals(example_dict) == 4
assert dict_adapter.away_goals(example_dict) == 3

Nested data can be supplied using a list

In [None]:
# Each list is a path of keys: row['scoreline'][0] and row['scoreline'][1]
nested_dict_adapter = KeyAdapter(['scoreline', 0], ['scoreline', 1])

example_nested_dict = dict(scoreline=[1, 1])

assert (
    nested_dict_adapter.home_goals(example_nested_dict),
    nested_dict_adapter.away_goals(example_nested_dict),
) == (1, 1)

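`KeyAdapter` can also be used with the rows produced by `pd.DataFrame.iterrows`; however, iterating with `pd.DataFrame.itertuples` is much faster, and the named tuples it yields expose columns as attributes rather than by column-name keys.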

Likewise, you can't use a `KeyAdapter` with custom objects (e.g. dataclasses).

In this case, you need an `AttributeAdapter`.

In [None]:
#export


class AttributeAdapter:
    """
    Get data from object attributes.

    Every argument maps a term name (`home_goals`, `away_goals`, plus any
    extra keyword) to the attribute name used to fetch that term from a row
    with `getattr(row, name)`. If the name is a list, its elements are
    applied one after another to reach nested attributes.

    Accessing a term as an attribute (e.g. `adapter.home_goals`) returns a
    function which takes a row and returns that term's value. The term name
    is only resolved when that function is called, so an unknown term raises
    `KeyError` at call time.
    """

    def __init__(self, home_goals, away_goals, **kwargs):
        # Term name -> attribute name (or list of nested attribute names)
        self._lookup = {
            'home_goals': home_goals,
            'away_goals': away_goals,
            **kwargs
        }

    def __repr__(self):
        args_repr = ', '.join(f'{k}={repr(v)}' for k, v in self._lookup.items())
        # Report this class's own name (this used to say `KeyAdapter`)
        return f'AttributeAdapter({args_repr})'

    def _get_in(self, row, item):
        """Read attribute `item` from `row`, or each element of `item` in turn if it is a list."""
        if isinstance(item, list):
            return functools.reduce(getattr, item, row)
        return getattr(row, item)

    def __getattr__(self, key):
        # `__getattr__` is only consulted when normal attribute lookup fails.
        # Protocol probes made by `copy`, `pickle` and `hasattr` (e.g.
        # `__deepcopy__`, `__setstate__`) must see a missing attribute rather
        # than a term getter, otherwise copying/pickling an adapter fails.
        # `_lookup` only gets here when the instance is not initialised yet.
        if key == '_lookup' or (key.startswith('__') and key.endswith('__')):
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {key!r}'
            )

        def getter(row):
            return self._get_in(row, self._lookup[key])
        return getter

In [None]:
@dataclasses.dataclass
class ExampleData:
    hg: int
    ag: int
    home: str
    away: str


# Each term maps onto the name of a dataclass field
attr_adapter = AttributeAdapter(
    home_goals='hg',
    away_goals='ag',
    home_team='home',
    away_team='away',
)

example_attr = ExampleData(
    hg=5,
    ag=1,
    home='Another home team',
    away='Another away team',
)

assert attr_adapter.home_team(example_attr) == 'Another home team'
assert attr_adapter.away_team(example_attr) == 'Another away team'
assert attr_adapter.home_goals(example_attr) == 5
assert attr_adapter.away_goals(example_attr) == 1

As with `KeyAdapter`, nested attributes can also be fetched using lists

In [None]:
@dataclasses.dataclass
class Scoreline:
    home: int
    away: int


@dataclasses.dataclass
class ExampleNestedData:
    scoreline: Scoreline
    home: str
    away: str


# Goals live one level down, on the `scoreline` attribute
nested_attr_adapter = AttributeAdapter(
    home_goals=['scoreline', 'home'],
    away_goals=['scoreline', 'away'],
    home_team='home',
    away_team='away',
)

example_nested_attr = ExampleNestedData(
    scoreline=Scoreline(home=2, away=5),
    home='Another home team',
    away='Another away team',
)

assert (
    nested_attr_adapter.home_goals(example_nested_attr),
    nested_attr_adapter.away_goals(example_nested_attr),
) == (2, 5)

In [None]:
#export


class LambdaAdapter:
    """
    Get data from objects with an arbitrary function.

    Every argument maps a term name (`home_goals`, `away_goals`, plus any
    extra keyword) to a callable which is given a row and returns that
    term's value.

    Accessing a term as an attribute (e.g. `adapter.home_goals`) returns a
    function which takes a row and calls the stored callable on it. The term
    name is only resolved when that function is called, so an unknown term
    raises `KeyError` at call time.
    """

    def __init__(self, home_goals, away_goals, **kwargs):
        # Term name -> callable taking a row
        self._lookup = {
            'home_goals': home_goals,
            'away_goals': away_goals,
            **kwargs
        }

    def __repr__(self):
        args_repr = ', '.join(f'{k}={repr(v)}' for k, v in self._lookup.items())
        return f'LambdaAdapter({args_repr})'

    def __getattr__(self, key):
        # `__getattr__` is only consulted when normal attribute lookup fails.
        # Protocol probes made by `copy`, `pickle` and `hasattr` (e.g.
        # `__deepcopy__`, `__setstate__`) must see a missing attribute rather
        # than a term getter, otherwise copying/pickling an adapter fails.
        # `_lookup` only gets here when the instance is not initialised yet.
        if key == '_lookup' or (key.startswith('__') and key.endswith('__')):
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {key!r}'
            )

        def getter(row):
            return self._lookup[key](row)
        return getter

In [None]:
# Each term is computed by its own function of the row
example_lambda_adapter = LambdaAdapter(
    home_goals=lambda row: row.scoreline.home,
    away_goals=lambda row: row.scoreline.away + 1,
    home_team=lambda row: row.home,
    away_team=lambda row: 'jbvklajvbs',
)

assert example_lambda_adapter.home_goals(example_nested_attr) == 2
assert example_lambda_adapter.away_goals(example_nested_attr) == 6
assert example_lambda_adapter.home_team(example_nested_attr) == 'Another home team'
assert example_lambda_adapter.away_team(example_nested_attr) == 'jbvklajvbs'

## Composite adapters

In [None]:
#export


class LumpedAdapter:
    """
    Lump terms which have appeared below a minimum number of times in
    the training data into a placeholder term

    `base_adapter` supplies the raw values. Each keyword argument names a
    term of the base adapter and maps it to a `(placeholder, min_obs)` pair.
    After `fit`, a value of that term which was counted fewer than `min_obs`
    times is reported as `placeholder`. Terms without a keyword argument are
    passed through from the base adapter unchanged.
    """

    def __init__(self, base_adapter, **kwargs):
        self.base_adapter = base_adapter

        # Match terms to placeholders
        # If multiple terms have the same placeholder (e.g. Home and Away
        # teams) they will share a counter
        self._term_lookup = kwargs

        # Placeholder -> Counter of observed values; None until `fit` is called
        self._counters = None

    def __repr__(self):
        args_repr = ', '.join(f'{k}={repr(v)}' for k, v in self._term_lookup.items())
        return f'LumpedAdapter(base_adapter={repr(self.base_adapter)}, {args_repr})'

    def fit(self, data):
        """
        Count how often each value of every lumped term occurs in `data`.

        `data` is traversed exactly once, so one-shot iterables (generators,
        iterators) are counted correctly for every term. Returns `self`.
        """
        counters = {}
        getters = []
        for term, (placeholder, _) in self._term_lookup.items():
            # Terms with the same placeholder (home and away teams) share
            # a single counter
            counter = counters.setdefault(placeholder, collections.Counter())
            getters.append((counter, getattr(self.base_adapter, term)))

        for row in data:
            for counter, getter in getters:
                counter[getter(row)] += 1

        self._counters = counters
        return self

    def __getattr__(self, key):
        # `__getattr__` is only consulted when normal attribute lookup fails.
        # Protocol probes made by `copy`, `pickle` and `hasattr`, and lookups
        # of this class's own state on a not-yet-initialised instance, must
        # fail with AttributeError. Otherwise reading `self._counters` below
        # would re-enter `__getattr__` and recurse without bound.
        if (
            key in ('_counters', '_term_lookup', 'base_adapter')
            or (key.startswith('__') and key.endswith('__'))
        ):
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {key!r}'
            )

        # Only `None` means "not fitted"; an empty dict is a fitted adapter
        # which simply has no lumped terms
        if self._counters is None:
            raise ValueError(
                'No counts found! You need to call `LumpedAdapter.fit` '
                'on the training data before you can use it!'
            )

        def getter(row):
            value = getattr(self.base_adapter, key)(row)
            placeholder, min_obs = self._term_lookup.get(key, (None, None))
            if placeholder and self._counters[placeholder][value] < min_obs:
                return placeholder
            return value
        return getter

In [None]:
# 'Team 1' (home) and 'Team 2' (away) appear in the first four rows.
# The final row adds a fifth appearance for 'Team 1', this time as the
# *away* team, and a single appearance for 'Team 3'.
example_lumped_data = [example_dict] * 4 + [
    {'away': 'Team 1', 'home': 'Team 3', 'hg': 4, 'ag': 3},
]

# `home_team` and `away_team` share the same placeholder ('Other team'), so
# they are counted together: a team has to appear at least 5 times as
# _either_ the home team or the away team to keep its own name.
lumped_team_terms = ('Other team', 5)

lumped_dict_adapter = LumpedAdapter(
    dict_adapter,
    home_team=lumped_team_terms,
    away_team=lumped_team_terms,
)
lumped_dict_adapter.fit(example_lumped_data)

lumped_dict_adapter

LumpedAdapter(base_adapter=KeyAdapter(home_goals='hg', away_goals='ag', home_team='home', away_team='away'), home_team=('Other team', 5), away_team=('Other team', 5))

In [None]:
example_lumped_1 = dict(home='Team 1', away='Team 3', hg=1, ag=2)

# 'Team 1' reached the minimum number of observations, so it keeps its name
assert lumped_dict_adapter.home_team(example_lumped_1) == 'Team 1'

# 'Team 3' has too few observations, so the placeholder is returned instead
assert lumped_dict_adapter.away_team(example_lumped_1) == 'Other team'

# Terms with no placeholder configured on the LumpedAdapter
# come through unchanged
assert (
    lumped_dict_adapter.home_goals(example_lumped_1),
    lumped_dict_adapter.away_goals(example_lumped_1),
) == (1, 2)

Using a lumped adapter can also allow you to handle items which didn't appear in the training set at all:

In [None]:
# 'Team 2' only appeared 4 times (below the threshold of 5), and
# 'Team 4' never appeared in the training data at all
example_lumped_2 = dict(home='Team 2', away='Team 4', hg=1, ag=2)

assert (
    lumped_dict_adapter.home_team(example_lumped_2),
    lumped_dict_adapter.away_team(example_lumped_2),
) == ('Other team', 'Other team')