In [6]:
import os
import sys
import warnings
import pandas as pd
import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import mean_absolute_error
import featuretools as ft
import featuretools.variable_types as vtypes

# Put the parent of the current working directory on the import path so the
# `server` and `src` packages imported below can be resolved.
# NOTE(review): this relies on the kernel's working directory being one level
# below the project root — confirm before running from elsewhere.
PROJECT_PATH = os.path.join(os.getcwd(), '../')

# Guard against appending the same entry again when the cell is re-run
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)
    
from server.ml_models.all_model import AllModelData
from server.ml_models.match_model import MatchModelData
from server.ml_models import EnsembleModel

from src.model.metrics import yearly_performance_scores
from src.model.charts import graph_yearly_model_performance

# Fixed seed shared by the notebook
SEED = 42

# Seed NumPy's global random number generator
np.random.seed(SEED)
# Suppress scikit-learn's DataConversionWarning messages for the whole session
warnings.simplefilter("ignore", DataConversionWarning)

## Baseline with existing features

In [2]:
# Load the project's combined model data set.
# NOTE(review): train_years=(None, 2016) presumably means "no lower bound, up to
# 2016" — confirm against AllModelData.
data = AllModelData(train_years=(None, 2016))
# Display the underlying data frame
data.data



  res = PandasDataFrame.from_items(items)


Unnamed: 0,Unnamed: 1,Unnamed: 2,team,oppo_team,round_type,venue,win_odds,line_odds,oppo_win_odds,oppo_line_odds,rolling_pred_win_rate,oppo_rolling_pred_win_rate,...,oppo_last_week_result,oppo_last_week_score,oppo_cum_win_points,oppo_rolling_last_week_win_rate,oppo_win_streak,oppo_elo_rating,cum_percent,ladder_position,oppo_cum_percent,oppo_ladder_position
Adelaide,1991,1.0,Adelaide,Hawthorn,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,0.0,64.0,0.0,0.608696,-2.0,995.596782,0.000000,7,0.000000,8
Adelaide,1991,2.0,Adelaide,Carlton,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,0.0,113.0,0.0,0.478261,-2.0,986.669176,0.875706,5,0.729032,11
Adelaide,1991,3.0,Adelaide,Sydney,Regular,S.C.G.,0.00,0.0,0.00,0.0,0.000000,0.000000,...,0.0,75.0,0.0,0.217391,-2.0,952.109256,0.936508,8,1.226667,13
Adelaide,1991,4.0,Adelaide,Essendon,Regular,Windy Hill,0.00,0.0,0.00,0.0,0.000000,0.000000,...,1.0,167.0,8.0,0.739130,2.0,1000.158658,0.878282,6,1.009434,3
Adelaide,1991,5.0,Adelaide,West Coast,Regular,Subiaco,0.00,0.0,0.00,0.0,0.000000,0.000000,...,1.0,127.0,12.0,0.695652,3.0,998.714698,0.760073,9,0.953488,3
Adelaide,1991,6.0,Adelaide,Western Bulldogs,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,1.0,118.0,12.0,0.565217,2.0,972.291459,0.722892,10,1.014644,6
Adelaide,1991,7.0,Adelaide,St Kilda,Regular,Moorabbin Oval,0.00,0.0,0.00,0.0,0.000000,0.000000,...,0.0,60.0,10.0,0.369565,-1.0,976.905181,0.839779,7,0.868085,9
Adelaide,1991,9.0,Adelaide,North Melbourne,Regular,M.C.G.,0.00,0.0,0.00,0.0,0.000000,0.000000,...,1.0,150.0,16.0,0.608696,3.0,975.410314,0.731121,12,1.164962,6
Adelaide,1991,10.0,Adelaide,Melbourne,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,1.0,121.0,24.0,0.695652,5.0,993.725636,0.760804,12,1.244009,2
Adelaide,1991,11.0,Adelaide,Geelong,Regular,Kardinia Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,0.0,106.0,20.0,0.391304,-1.0,974.011800,0.783833,12,1.156687,7


In [3]:
# Data with venue & date added

# Baseline: score the existing ensemble per year on the training data, before
# any featuretools-generated features are added.
ens_model = EnsembleModel()

scores = yearly_performance_scores(
    [('ensemble', ens_model, {})],
    *data.train_data(),
    data_frame=True
)
scores

Unnamed: 0,accuracy,error,model,year
0,0.785714,29.400674,ensemble,2011
1,0.792271,27.575879,ensemble,2012
2,0.753623,26.116556,ensemble,2013
3,0.731884,28.744106,ensemble,2014
4,0.713592,30.198007,ensemble,2015
5,0.736715,27.86601,ensemble,2016


## Add features via basic featuretools implementation

In [7]:
# Summarise the columns, their dtypes and non-null counts
data.data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 29950 entries, (Adelaide, 1991, 1.0) to (Western Bulldogs, 2016, 27.0)
Data columns (total 83 columns):
team                                               29950 non-null object
oppo_team                                          29950 non-null object
round_type                                         29950 non-null object
venue                                              29950 non-null object
win_odds                                           29950 non-null float64
line_odds                                          29950 non-null float64
oppo_win_odds                                      29950 non-null float64
oppo_line_odds                                     29950 non-null float64
rolling_pred_win_rate                              29950 non-null float64
oppo_rolling_pred_win_rate                         29950 non-null float64
rolling_prev_match_kicks                           29950 non-null float64
rolling_prev_match_marks            

In [45]:
def match_id(row):
    """Build the identifier of the match that a team-match row belongs to.

    The ID has the form ``<year>.<round_number>.<home team><away team>``, so the
    two rows describing the same match (one per team) produce the same value.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys 'team',
            'oppo_team', 'at_home', 'year' and 'round_number'.

    Returns:
        The match ID as a string.
    """
    # Order the two teams home-first regardless of whose row this is
    if row['at_home']:
        home_team, away_team = row['team'], row['oppo_team']
    else:
        home_team, away_team = row['oppo_team'], row['team']

    id_parts = [str(row['year']), str(row['round_number']), home_team + away_team]

    return '.'.join(id_parts)


def match_ids(data_frame):
    """Compute the match ID for every row of a team-match data frame.

    Args:
        data_frame: Data frame whose rows are accepted by ``match_id``.

    Returns:
        A pandas Series of match IDs that shares ``data_frame``'s index.
    """
    ids = []

    for _, team_match_row in data_frame.iterrows():
        ids.append(match_id(team_match_row))

    return pd.Series(ids, index=data_frame.index)


# Per round: the day before the earliest match date in that round
first_match_dates = data.data.groupby(['year', 'round_number'])['date'].min()
round_start_dates = (
    (first_match_dates - pd.Timedelta(days=1))
    .rename('round_start_date')
    .reset_index()
)

# Add the ID and date columns, then attach each row's round start date
team_match_rows = data.data.assign(
    team_match_id=lambda frame: (
        frame['team'] + '.' + frame['year'].astype(str) + '.' + frame['round_number'].astype(str)
    ),
    match_id=match_ids,
    day_after_match=lambda frame: frame['date'] + pd.Timedelta(days=1),
)

ft_df = team_match_rows.merge(round_start_dates, on=['year', 'round_number'], how='left')

ft_df

Unnamed: 0,team,oppo_team,round_type,venue,win_odds,line_odds,oppo_win_odds,oppo_line_odds,rolling_pred_win_rate,oppo_rolling_pred_win_rate,...,oppo_win_streak,oppo_elo_rating,cum_percent,ladder_position,oppo_cum_percent,oppo_ladder_position,team_match_id,match_id,day_after_match,round_start_date
0,Adelaide,Hawthorn,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,-2.0,995.596782,0.000000,7,0.000000,8,Adelaide.1991.1,1991.1.AdelaideHawthorn,1991-03-23,1991-03-21
1,Adelaide,Carlton,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,-2.0,986.669176,0.875706,5,0.729032,11,Adelaide.1991.2,1991.2.AdelaideCarlton,1991-04-01,1991-03-29
2,Adelaide,Sydney,Regular,S.C.G.,0.00,0.0,0.00,0.0,0.000000,0.000000,...,-2.0,952.109256,0.936508,8,1.226667,13,Adelaide.1991.3,1991.3.SydneyAdelaide,1991-04-08,1991-04-05
3,Adelaide,Essendon,Regular,Windy Hill,0.00,0.0,0.00,0.0,0.000000,0.000000,...,2.0,1000.158658,0.878282,6,1.009434,3,Adelaide.1991.4,1991.4.EssendonAdelaide,1991-04-14,1991-04-11
4,Adelaide,West Coast,Regular,Subiaco,0.00,0.0,0.00,0.0,0.000000,0.000000,...,3.0,998.714698,0.760073,9,0.953488,3,Adelaide.1991.5,1991.5.West CoastAdelaide,1991-04-22,1991-04-18
5,Adelaide,Western Bulldogs,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,2.0,972.291459,0.722892,10,1.014644,6,Adelaide.1991.6,1991.6.AdelaideWestern Bulldogs,1991-04-29,1991-04-24
6,Adelaide,St Kilda,Regular,Moorabbin Oval,0.00,0.0,0.00,0.0,0.000000,0.000000,...,-1.0,976.905181,0.839779,7,0.868085,9,Adelaide.1991.7,1991.7.St KildaAdelaide,1991-05-05,1991-05-02
7,Adelaide,North Melbourne,Regular,M.C.G.,0.00,0.0,0.00,0.0,0.000000,0.000000,...,3.0,975.410314,0.731121,12,1.164962,6,Adelaide.1991.9,1991.9.North MelbourneAdelaide,1991-05-18,1991-05-16
8,Adelaide,Melbourne,Regular,Football Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,5.0,993.725636,0.760804,12,1.244009,2,Adelaide.1991.10,1991.10.AdelaideMelbourne,1991-05-25,1991-05-23
9,Adelaide,Geelong,Regular,Kardinia Park,0.00,0.0,0.00,0.0,0.000000,0.000000,...,-1.0,974.011800,0.783833,12,1.156687,7,Adelaide.1991.11,1991.11.GeelongAdelaide,1991-06-02,1991-05-31


In [46]:
# List every column of the frame that will be loaded into featuretools
ft_df.columns

Index(['team', 'oppo_team', 'round_type', 'venue', 'win_odds', 'line_odds',
       'oppo_win_odds', 'oppo_line_odds', 'rolling_pred_win_rate',
       'oppo_rolling_pred_win_rate', 'rolling_prev_match_kicks',
       'rolling_prev_match_marks', 'rolling_prev_match_handballs',
       'rolling_prev_match_goals', 'rolling_prev_match_behinds',
       'rolling_prev_match_hit_outs', 'rolling_prev_match_tackles',
       'rolling_prev_match_rebounds', 'rolling_prev_match_inside_50s',
       'rolling_prev_match_clearances', 'rolling_prev_match_clangers',
       'rolling_prev_match_frees_for', 'rolling_prev_match_frees_against',
       'rolling_prev_match_contested_possessions',
       'rolling_prev_match_uncontested_possessions',
       'rolling_prev_match_contested_marks',
       'rolling_prev_match_marks_inside_50',
       'rolling_prev_match_one_percenters', 'rolling_prev_match_bounces',
       'rolling_prev_match_goal_assists', 'rolling_prev_match_time_on_ground',
       'last_year_brownlow_v

In [50]:
# Make team-match entity as base

# Explicit featuretools variable types for the columns listed here; columns not
# listed are left for featuretools to infer.
variable_types = {
    'team': vtypes.Categorical,
    'oppo_team': vtypes.Categorical,
    'round_type': vtypes.Categorical,
    'venue': vtypes.Categorical,
    'year': vtypes.Ordinal,
    'round_number': vtypes.Ordinal,
    'at_home': vtypes.Boolean,
    'out_of_state': vtypes.Boolean,
    # These really should be 'last_week_win', 'oppo_last_week_win'
    'last_week_result': vtypes.Boolean,
    'oppo_last_week_result': vtypes.Boolean,
    'ladder_position': vtypes.Ordinal,
    'oppo_ladder_position': vtypes.Ordinal,
}

# Base entity: one row per team per match, indexed by 'team_match_id' and
# time-indexed by the match 'date'. 'score' and 'oppo_score' are tied to the
# secondary time index 'day_after_match' rather than to 'date'.
es = ft.EntitySet('Team Matches')
es = es.entity_from_dataframe(
    entity_id='team_matches',
    dataframe=ft_df,
    index='team_match_id',
    time_index='date',
    secondary_time_index={'day_after_match': ['score', 'oppo_score']},
    variable_types=variable_types
)

# The normalize_entity calls below are order-dependent: 'venues' is split out of
# 'matches', so 'matches' (carrying 'venue') has to be created first.

# Add match entity
es.normalize_entity('team_matches', 'matches', 'match_id', additional_variables=['venue'])
# Add team entity
es.normalize_entity('team_matches', 'teams', 'team', make_time_index=False)
# Add venue entity
es.normalize_entity('matches', 'venues', 'venue', make_time_index=False)
# Add year entity
es.normalize_entity('team_matches', 'years', 'year', make_time_index=False)
# Add round_number entity
es.normalize_entity('team_matches', 'round_numbers', 'round_number',
                    additional_variables=['round_type'], make_time_index=False)

# Take the index and the day before the first match of the round to use as a cutoff time
# ('score' and 'oppo_score' are included as extra columns; rows are ordered by cutoff date)
cutoff_times = (es['team_matches']
                .df[['team_match_id', 'round_start_date', 'score', 'oppo_score']]
                .sort_values(by='round_start_date'))

# Display the entity set summary (entities and relationships)
es

Entityset: Team Matches
  Entities:
    team_matches [Rows: 29950, Columns: 85]
    matches [Rows: 14981, Columns: 3]
    teams [Rows: 20, Columns: 1]
    venues [Rows: 44, Columns: 1]
    years [Rows: 120, Columns: 1]
    round_numbers [Rows: 28, Columns: 2]
  Relationships:
    team_matches.match_id -> matches.match_id
    team_matches.team -> teams.team
    matches.venue -> venues.venue
    team_matches.year -> years.year
    team_matches.round_number -> round_numbers.round_number

In [None]:
# Generate features using the constructed entityset.
#
# Runs deep feature synthesis with 'team_matches' as the target entity, stacking
# primitives up to two levels deep (max_depth=2). Each row's features are computed
# as of its cutoff time in `cutoff_times`, and approximate='24h' is passed so
# featuretools may group nearby cutoff times when computing expensive features.
#
# Returns the feature matrix (`fm`) and the list of feature definitions (`features`).
#
# Fix: the original had two adjacent list literals after `trans_primitives=` with
# no comma between them, which Python parses as subscripting the first list with
# a tuple and raises "TypeError: list indices must be integers or slices, not
# tuple". The second list is a superset of the first, so it is kept as the single
# list of transform primitives.
# NOTE(review): primitive names vary across featuretools versions (e.g. 'weekend'
# vs 'is_weekend') — confirm against ft.list_primitives() for the installed version.
fm, features = ft.dfs(entityset=es,
                      target_entity='team_matches',
                      agg_primitives=["sum", "std", "max", "skew", "min", "mean", "count", "percent_true",
                                      "last", "trend", "time_since_last"],
                      trans_primitives=['weekend', 'weekday', 'day', 'month', 'year', 'time_since_previous'],
                      max_depth=2,
                      approximate='24h',
                      cutoff_time=cutoff_times,
                      verbose=True)