In [18]:
import os
import sys
import warnings
from functools import reduce, partial
import pandas as pd
import numpy as np
from sklearn.exceptions import DataConversionWarning
from sklearn.metrics import mean_absolute_error
import featuretools as ft
import featuretools.variable_types as vtypes

# Parent of the current working directory; it is put on sys.path before the
# `server` and `src` packages are imported below.
# NOTE(review): this depends on the directory the kernel was started from — confirm the
# notebook is always run from a direct sub-directory of the project root.
PROJECT_PATH = os.path.join(os.getcwd(), '../')

# Only append once, so re-running the cell does not add duplicate entries
if PROJECT_PATH not in sys.path:
    sys.path.append(PROJECT_PATH)
    
from server.ml_models.all_model import AllModelData
from server.ml_models.match_model import MatchModelData
from server.ml_models.player_model import PlayerModelData
from server.ml_models.betting_model import BettingModelData
from server.ml_models import EnsembleModel

from src.model.metrics import yearly_performance_scores
from src.model.charts import graph_yearly_model_performance

# Seed for numpy's global random number generator
SEED = 42

np.random.seed(SEED)
# Ignore scikit-learn's DataConversionWarning from here on
warnings.simplefilter("ignore", DataConversionWarning)

## Prepare raw data for featuretools
featuretools handles a lot of the data transformation that I was doing myself, and things got messy when I tried to use ft after doing all my own aggregations/transformations, so I'm taking a step back: I pass the raw data to ft and let it take it from there.

In [19]:
# Keyword arguments shared by all three data readers.
# NOTE(review): the empty `data_transformers` list presumably switches off the readers' own
# transformations so that raw data comes back — confirm against the ml_models classes.
data_kwargs = {'data_transformers': [], 'index_cols': ['home_team', 'year', 'round_number']}
# Aliases for the reader classes; they are instantiated further down
betting_data = BettingModelData
player_data = PlayerModelData
match_data = MatchModelData
# Disabled alternative: load everything through AllModelData instead of separate readers
# data = AllModelData(data_readers=[betting_data, player_data, match_data],
#                     data_reader_kwargs=[data_kwargs, data_kwargs, data_kwargs],
#                     category_cols=None)

# data.data

# One reader instance per data source; each exposes its frame as `.data` (used in later cells)
bd = betting_data(**data_kwargs)
pld = player_data(**data_kwargs)
md = match_data(**data_kwargs)

  res = PandasDataFrame.from_items(items)


In [20]:
# Columns used as the join key between the match data and the betting data
SHARED_COLS = ['away_score', 'away_team', 'home_score', 'home_team', 'round_number', 'year']

# Left join: every match row is kept, and matches without betting data get NaN odds.
# The result is ordered by year, round and home team and given a fresh 0..n-1 index.
raw_df = (md.data
          .merge(bd.data, how='left', on=SHARED_COLS)
          .sort_values(['year', 'round_number', 'home_team'])
          .reset_index(drop=True))

raw_df

Unnamed: 0,date,home_team,home_goals,home_behinds,home_score,away_team,away_goals,away_behinds,away_score,venue,home_margin,year,round_type,round_number,home_win_odds,home_line_odds,away_win_odds,away_line_odds
0,1897-05-08,Collingwood,5,11,41,St Kilda,2,4,16,Victoria Park,25,1897,Regular,1,,,,
1,1897-05-08,Fitzroy,6,13,49,Carlton,2,4,16,Brunswick St,33,1897,Regular,1,,,,
2,1897-05-08,Geelong,3,6,24,Essendon,7,5,47,Corio Oval,-23,1897,Regular,1,,,,
3,1897-05-08,Sydney,3,9,27,Melbourne,6,8,44,Lake Oval,-17,1897,Regular,1,,,,
4,1897-05-15,Essendon,4,6,30,Collingwood,8,2,50,East Melbourne,-20,1897,Regular,2,,,,
5,1897-05-15,Melbourne,9,10,64,Geelong,3,1,19,M.C.G.,45,1897,Regular,2,,,,
6,1897-05-15,St Kilda,3,8,26,Fitzroy,10,6,66,Junction Oval,-40,1897,Regular,2,,,,
7,1897-05-15,Sydney,6,4,40,Carlton,5,6,36,Lake Oval,4,1897,Regular,2,,,,
8,1897-05-22,Collingwood,6,5,41,Geelong,5,7,37,Victoria Park,4,1897,Regular,3,,,,
9,1897-05-24,Essendon,12,6,78,Carlton,6,5,41,East Melbourne,37,1897,Regular,3,,,,


In [21]:
# Column dtypes and non-null counts of the merged frame
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15395 entries, 0 to 15394
Data columns (total 18 columns):
date              15395 non-null datetime64[ns]
home_team         15395 non-null object
home_goals        15395 non-null int32
home_behinds      15395 non-null int32
home_score        15395 non-null int32
away_team         15395 non-null object
away_goals        15395 non-null int32
away_behinds      15395 non-null int32
away_score        15395 non-null int32
venue             15395 non-null object
home_margin       15395 non-null int32
year              15395 non-null int64
round_type        15395 non-null object
round_number      15395 non-null int32
home_win_odds     1801 non-null float64
home_line_odds    1801 non-null float64
away_win_odds     1801 non-null float64
away_line_odds    1801 non-null float64
dtypes: datetime64[ns](1), float64(4), int32(8), int64(1), object(4)
memory usage: 1.6+ MB


In [33]:
def match_id(data_frame):
    """Build a match identifier that does not depend on which team is listed as home.

    The ID has the form ``<year>.<round_number>.<team A>.<team B>`` where the two team
    names are in alphabetical order.

    Args:
        data_frame: Frame with ``home_team``, ``away_team``, ``year`` and
            ``round_number`` columns.

    Returns:
        pd.Series of ID strings sharing ``data_frame``'s index.
    """
    # Need to sort teams alphabetically, because some edge cases with draws & repeated matches
    # make consistent IDs difficult if based on home/away team names.
    # The pair is sorted directly (rather than joined with '.' and split again) so that a
    # team name that itself contains '.' is kept intact.
    sorted_teams = pd.Series(
        ['.'.join(sorted(team_pair))
         for team_pair in zip(data_frame['home_team'], data_frame['away_team'])],
        index=data_frame.index,
        dtype=object,
    )

    return (data_frame['year'].astype(str) + '.' +
            data_frame['round_number'].astype(str) + '.' +
            sorted_teams)

# Earliest match date of every (year, round_number)
round_start = (raw_df.groupby(['year', 'round_number'])['date']
                  .min()
                  .rename('round_start_date')
                  .reset_index())
# Latest match date of every (year, round_number), pushed to 23:59:59 of that day
end_of_round = (
    (raw_df.groupby(['year', 'round_number'])['date'].max() + pd.Timedelta(hours=23, minutes=59, seconds=59))
    .rename('end_of_round')
    .reset_index()
)
# Latest end_of_round timestamp of every year
end_of_season = end_of_round.groupby('year')['end_of_round'].max().rename('end_of_season').reset_index()

# NOTE(review): fillna(0) replaces every missing value with 0, including the betting odds,
# which raw_df.info() reports as non-null for only 1801 of 15395 rows — confirm that 0 is
# an acceptable placeholder for "no odds" downstream.
clean_df = (raw_df
             .fillna(0)
             .assign(
                 match_id=match_id,
                 # By default dates w/o time have 00:00:00 as their timestamp
                 end_of_day=lambda df: df['date'] + pd.Timedelta(hours=23, minutes=59, seconds=59),
             )
             .merge(round_start, on=['year', 'round_number'], how='left')
             .merge(end_of_round, on=['year', 'round_number'], how='left')
             .merge(end_of_season, on=['year'], how='left')
             # Sort by date and drop duplicates to get rid of finals replays due to draws
             # (keep="last" retains the row with the latest date for each match_id)
             .sort_values('date')
             .drop_duplicates(subset='match_id', keep="last"))

clean_df

Unnamed: 0,date,home_team,home_goals,home_behinds,home_score,away_team,away_goals,away_behinds,away_score,venue,...,round_number,home_win_odds,home_line_odds,away_win_odds,away_line_odds,match_id,end_of_day,round_start_date,end_of_round,end_of_season
0,1897-05-08,Collingwood,5,11,41,St Kilda,2,4,16,Victoria Park,...,1,0.00,0.0,0.00,0.0,1897.1.Collingwood.St Kilda,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59
1,1897-05-08,Fitzroy,6,13,49,Carlton,2,4,16,Brunswick St,...,1,0.00,0.0,0.00,0.0,1897.1.Carlton.Fitzroy,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59
2,1897-05-08,Geelong,3,6,24,Essendon,7,5,47,Corio Oval,...,1,0.00,0.0,0.00,0.0,1897.1.Essendon.Geelong,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59
3,1897-05-08,Sydney,3,9,27,Melbourne,6,8,44,Lake Oval,...,1,0.00,0.0,0.00,0.0,1897.1.Melbourne.Sydney,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59
4,1897-05-15,Essendon,4,6,30,Collingwood,8,2,50,East Melbourne,...,2,0.00,0.0,0.00,0.0,1897.2.Collingwood.Essendon,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59
5,1897-05-15,Melbourne,9,10,64,Geelong,3,1,19,M.C.G.,...,2,0.00,0.0,0.00,0.0,1897.2.Geelong.Melbourne,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59
6,1897-05-15,St Kilda,3,8,26,Fitzroy,10,6,66,Junction Oval,...,2,0.00,0.0,0.00,0.0,1897.2.Fitzroy.St Kilda,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59
7,1897-05-15,Sydney,6,4,40,Carlton,5,6,36,Lake Oval,...,2,0.00,0.0,0.00,0.0,1897.2.Carlton.Sydney,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59
8,1897-05-22,Collingwood,6,5,41,Geelong,5,7,37,Victoria Park,...,3,0.00,0.0,0.00,0.0,1897.3.Collingwood.Geelong,1897-05-22 23:59:59,1897-05-22,1897-05-24 23:59:59,1897-08-07 23:59:59
10,1897-05-22,Fitzroy,5,9,39,Melbourne,7,8,50,Brunswick St,...,3,0.00,0.0,0.00,0.0,1897.3.Fitzroy.Melbourne,1897-05-22 23:59:59,1897-05-22,1897-05-24 23:59:59,1897-08-07 23:59:59


In [131]:
# Multiplier applied to match_result (1 / 0.5 / 0) to get a team's match points
WIN_POINTS = 4
# Constants for ELO calculations
# Rating used when there is no earlier rating, and the value ratings are pulled towards
# between seasons
BASE_RATING = 1000
# Scales the difference between actual and expected outcome into a rating change
K = 35.6
# X and M shape the mapping from winning margin to "actual outcome" in _elo_formula
X = 0.49
M = 130
# Home Ground Advantage
# (added in the home team's favour and against the away team in the expected outcome)
HGA = 9
# Divisor of the rating difference in the exponent of the expected-outcome formula
S = 250
# Weight of a team's own rating when it crosses into a new year; the remainder
# (1 - CARRYOVER) is the weight of BASE_RATING
CARRYOVER = 0.575
# Index columns of the team-level frame and the level names of the ladder-position index
INDEX_COLS = ['team', 'year', 'round_number']
# Team name -> home city (each value is a key of CITIES)
TEAM_CITIES = {
    "Adelaide": "Adelaide",
    "Brisbane": "Brisbane",
    "Carlton": "Melbourne",
    "Collingwood": "Melbourne",
    "Essendon": "Melbourne",
    "Fitzroy": "Melbourne",
    "Western Bulldogs": "Melbourne",
    "Fremantle": "Perth",
    "GWS": "Sydney",
    "Geelong": "Geelong",
    "Gold Coast": "Gold Coast",
    "Hawthorn": "Melbourne",
    "Melbourne": "Melbourne",
    "North Melbourne": "Melbourne",
    "Port Adelaide": "Adelaide",
    "Richmond": "Melbourne",
    "St Kilda": "Melbourne",
    "Sydney": "Sydney",
    "University": "Melbourne",
    "West Coast": "Perth",
}

# City name -> state/country code plus latitude and longitude in decimal degrees
# (negative latitude = southern hemisphere).
CITIES = {
    "Adelaide": {"state": "SA", "lat": -34.9285, "long": 138.6007},
    "Sydney": {"state": "NSW", "lat": -33.8688, "long": 151.2093},
    "Melbourne": {"state": "VIC", "lat": -37.8136, "long": 144.9631},
    "Geelong": {"state": "VIC", "lat": -38.1499, "long": 144.3617},
    "Perth": {"state": "WA", "lat": -31.9505, "long": 115.8605},
    "Gold Coast": {"state": "QLD", "lat": -28.0167, "long": 153.4000},
    "Brisbane": {"state": "QLD", "lat": -27.4698, "long": 153.0251},
    "Launceston": {"state": "TAS", "lat": -41.4332, "long": 147.1441},
    "Canberra": {"state": "ACT", "lat": -35.2809, "long": 149.1300},
    "Hobart": {"state": "TAS", "lat": -42.8821, "long": 147.3272},
    "Darwin": {"state": "NT", "lat": -12.4634, "long": 130.8456},
    "Alice Springs": {"state": "NT", "lat": -23.6980, "long": 133.8807},
    "Wellington": {"state": "NZ", "lat": -41.2865, "long": 174.7762},
    "Euroa": {"state": "VIC", "lat": -36.7500, "long": 145.5667},
    "Yallourn": {"state": "VIC", "lat": -38.1803, "long": 146.3183},
    # Cairns is at 16.9186 S; the previous value of -6.9186 was missing a digit
    "Cairns": {"state": "QLD", "lat": -16.9186, "long": 145.7781},
    "Ballarat": {"state": "VIC", "lat": -37.5622, "long": 143.8503},
    "Shanghai": {"state": "CHN", "lat": 31.2304, "long": 121.4737},
    "Albury": {"state": "NSW", "lat": -36.0737, "long": 146.9135},
}


def city_lat_long(city):
    """Return the ``(lat, long)`` pair stored for ``city`` in ``CITIES``.

    Raises:
        KeyError: If ``city`` is not a key of ``CITIES``.
    """
    coordinates = CITIES[city]

    return coordinates['lat'], coordinates['long']


def team_match_id(df):
    """Build ``<year>.<round_number>.<team>`` ID strings, one per row of ``df``."""
    year_label = df['year'].astype(str)
    round_label = df['round_number'].astype(str)

    return year_label + '.' + round_label + '.' + df['team']


def match_result(margin):
    """Translate a margin into a result: 1 for a win, 0 for a loss, 0.5 otherwise."""
    return 1 if margin > 0 else (0 if margin < 0 else 0.5)


def home_away_df(at_home, df):
    """Build the per-team view of the matches for either the home or the away side.

    Args:
        at_home: True to extract the home teams' rows, False for the away teams'.
        df: Match-level frame with ``home_``/``away_`` prefixed columns and ``home_margin``.

    Returns:
        Frame with the side prefix stripped from column names, the added columns
        ``at_home``, ``team_match_id``, ``margin``, ``oppo_score``, ``match_result`` and
        ``match_points``, and ``INDEX_COLS`` as an unnamed index (also kept as columns).
    """
    prefix = 'home_' if at_home else 'away_'
    # Margin from this side's point of view: the home margin, negated for the away team
    signed_margin = df['home_margin'] if at_home else df['home_margin'] * -1

    # Keep columns that start with the side prefix or contain one of the shared names
    side_columns = (df
                    .filter(regex=f'^{prefix}|year|round_number|match_id|date')
                    .drop_duplicates()
                    .rename(columns=lambda col: col.replace(prefix, '')))

    with_results = side_columns.assign(
        at_home=at_home,
        team_match_id=team_match_id,
        margin=signed_margin,
        oppo_score=lambda frame: frame['score'] - signed_margin,
        match_result=signed_margin.map(match_result),
    )
    with_points = with_results.assign(
        match_points=lambda frame: frame['match_result'] * WIN_POINTS
    )

    return (with_points
            .set_index(INDEX_COLS, drop=False)
            .rename_axis([None, None, None]))

# Calculates the ladder position at the end of the round of the given match
def ladder_position(data_frame):
    """Rank the teams within every (year, round_number).

    Teams are ordered by cumulative match points for the year, then by cumulative
    percentage (cumulative score divided by cumulative opposition score), both descending.

    Args:
        data_frame: Team-match rows with ``team``, ``year``, ``round_number``,
            ``match_points``, ``score`` and ``oppo_score`` columns. It is sorted by its
            index before the running totals are taken, so the index is expected to put
            each team's matches in chronological order.

    Returns:
        pd.Series named ``ladder_position`` (1 = top) whose MultiIndex levels are named
        after ``INDEX_COLS``.

    NOTE(review): a team with no row for a round (e.g. the gap at round 8 in the Adelaide
    1991 output) has NaN totals in the pivot for that round and is sorted last, so the
    ranking effectively covers only the teams that played — confirm this is intended.
    """
    df = data_frame.sort_index()
    season_groups = df.groupby(['team', 'year'])

    cum_match_points = season_groups["match_points"].cumsum()
    cum_score = season_groups["score"].cumsum()
    cum_oppo_score = season_groups['oppo_score'].cumsum()

    # Pivot to get round-by-round match points and cumulative percent.
    # String aggregator names are used instead of np.sum / np.mean: they select the same
    # group-by aggregations without the FutureWarning newer pandas raises for the callables.
    ladder_pivot_table = (df
                          .assign(cum_match_points=cum_match_points, cum_percent=(cum_score / cum_oppo_score))
                          .loc[:, INDEX_COLS + ['cum_match_points', 'cum_percent']]
                          .pivot_table(index=["year", "round_number"],
                                       values=["cum_match_points", "cum_percent"],
                                       columns="team",
                                       aggfunc={"cum_match_points": "sum", "cum_percent": "mean"}))

    # To get round-by-round ladder ranks, we sort each round by win points & percent,
    # then save index numbers
    ladder_index = []
    ladder_values = []

    for year_round_idx, round_row in ladder_pivot_table.iterrows():
        sorted_row = round_row.unstack(level=0).sort_values(
            ["cum_match_points", "cum_percent"], ascending=False
        )

        # Iterate the index directly: Index.get_values() was deprecated in pandas 0.25
        # and removed in 1.0
        for position, team_name in enumerate(sorted_row.index, start=1):
            ladder_index.append((team_name, *year_round_idx))
            ladder_values.append(position)

    ladder_multi_index = pd.MultiIndex.from_tuples(
        ladder_index, names=tuple(INDEX_COLS)
    )
    ladder_position_col = pd.Series(
        ladder_values, index=ladder_multi_index, name="ladder_position"
    )

    return ladder_position_col


# Basing ELO calculations on:
# http://www.matterofstats.com/mafl-stats-journal/2013/10/13/building-your-own-team-rating-system.html
def _elo_formula(
    prev_elo_rating, prev_oppo_elo_rating, margin, at_home
):
    """Return the team's new ELO rating after a match.

    Args:
        prev_elo_rating: The team's rating going into the match.
        prev_oppo_elo_rating: The opponent's rating going into the match.
        margin: The team's margin (positive for a win).
        at_home: Whether the team played at home; decides the sign of ``HGA``.
    """
    home_ground_adjustment = HGA if at_home else HGA * -1

    rating_gap = prev_oppo_elo_rating - prev_elo_rating - home_ground_adjustment
    expected_outcome = 1 / (1 + 10 ** (rating_gap / S))
    actual_outcome = X + 0.5 - X ** (1 + (margin / M))

    rating_change = K * (actual_outcome - expected_outcome)

    return prev_elo_rating + rating_change


def _cross_year_elo(elo_rating):
    """Blend a rating with ``BASE_RATING``, weighting the rating by ``CARRYOVER``."""
    carried_over = elo_rating * CARRYOVER
    reset_share = BASE_RATING * (1 - CARRYOVER)

    return carried_over + reset_share


def _calculate_prev_elo_ratings(prev_match, prev_oppo_match, cum_elo_ratings, year):
    """Look up the ratings that the team and its opponent take into a match.

    Args:
        prev_match: Row of the team's previous match, or None if there is none.
        prev_oppo_match: Row of the opponent's previous match, or None if there is none.
        cum_elo_ratings: Series of the ratings calculated so far, or None before the
            first rating has been calculated.
        year: Year of the match being rated; a previous match from a different year has
            its rating passed through ``_cross_year_elo``.

    Returns:
        Tuple ``(prev_elo_rating, prev_oppo_elo_rating)``.

    Raises:
        TypeError: If a lookup returns a Series instead of a single value.
    """
    # Nothing has been rated yet, so both sides start from the base rating
    if cum_elo_ratings is None:
        return BASE_RATING, BASE_RATING

    def _rating_before(previous):
        if previous is None:
            return BASE_RATING

        rating = cum_elo_ratings.loc[previous.name]

        return _cross_year_elo(rating) if previous["year"] != year else rating

    prev_elo_rating = _rating_before(prev_match)
    prev_oppo_elo_rating = _rating_before(prev_oppo_match)

    # Both lookups run before either check, and the team's rating is checked first
    looked_up = (
        (prev_elo_rating, prev_match),
        (prev_oppo_elo_rating, prev_oppo_match),
    )

    for rating, previous in looked_up:
        if isinstance(rating, pd.Series):
            raise TypeError(
                f"ELO series returned a subsection of itself at index {previous.name} "
                "when a single value is expected. Check the data frame for duplicate "
                "index values."
            )

    return prev_elo_rating, prev_oppo_elo_rating


def _calculate_elo_rating(prev_match, prev_oppo_match, match_row, cum_elo_ratings):
    """Calculate the team's rating after the match described by ``match_row``.

    The pre-match ratings come from ``_calculate_prev_elo_ratings``; the row supplies
    ``year``, ``margin`` and ``at_home``.
    """
    ratings_before = _calculate_prev_elo_ratings(
        prev_match, prev_oppo_match, cum_elo_ratings, match_row['year']
    )
    team_rating, oppo_rating = ratings_before

    return _elo_formula(
        team_rating,
        oppo_rating,
        match_row["margin"],
        bool(match_row["at_home"]),
    )


def _get_previous_match(
    data_frame: pd.DataFrame, year: int, round_number: int, team: str
):
    prev_team_matches = data_frame.loc[
        (data_frame["team"] == team)
        & (data_frame["year"] == year)
        & (data_frame["round_number"] < round_number),
        :,
    ]

    # If we can't find any previous matches this season, filter by last season
    if not prev_team_matches.any().any():
        prev_team_matches = data_frame.loc[
            (data_frame["team"] == team) & (data_frame["year"] == year - 1), :
        ]

    if not prev_team_matches.any().any():
        return None

    return prev_team_matches.iloc[-1, :]


# Assumes df sorted by year & round_number, with ascending=True in order to find teams'
# previous matches
def _calculate_match_elo_rating(
    root_data_frame,
    cum_elo_ratings,
    items,
):
    """Reducer step: rate one team-match row and add the rating to the running Series.

    Args:
        root_data_frame: All team-match rows, indexed by (year, round_number, team) and
            carrying ``match_id`` and ``team`` columns. It is only read, never modified.
        cum_elo_ratings: Series of the ratings calculated so far, or None on the first call.
        items: ``(index, row)`` pair as produced by ``DataFrame.iterrows()``.

    Returns:
        pd.Series holding the previous ratings followed by the new one, indexed by
        (year, round_number, team).
    """
    index, match_row = items
    year, round_number, team = index

    # The opponent is the other team that shares this row's match_id
    is_opponent_row = (
        (root_data_frame['match_id'] == match_row['match_id'])
        & (root_data_frame['team'] != team)
    )
    oppo_team = root_data_frame.loc[is_opponent_row, 'team'].iloc[0]

    prev_match = _get_previous_match(root_data_frame, year, round_number, team)
    prev_oppo_match = _get_previous_match(root_data_frame, year, round_number, oppo_team)
    elo_rating = _calculate_elo_rating(prev_match, prev_oppo_match, match_row, cum_elo_ratings)

    elo_index = pd.MultiIndex.from_tuples([(year, round_number, team)])
    elo_ratings = pd.Series(data=[elo_rating], index=elo_index)

    if cum_elo_ratings is None:
        return elo_ratings

    # pd.concat replaces Series.append, which was removed in pandas 2.0
    return pd.concat([cum_elo_ratings, elo_ratings])


def add_elo_rating(data_frame):
    """Calculate an ELO rating for every row of a team-match frame.

    The frame's index levels are reordered from positions (0, 1, 2) to (1, 2, 0) and
    sorted ascending, the rows are rated one at a time in that order, and the resulting
    Series is reordered back to the original level order and sorted.

    Args:
        data_frame: Team-match rows with a three-level index.

    Returns:
        pd.Series of ratings indexed in ``data_frame``'s original level order.
    """
    elo_data_frame = data_frame.reorder_levels([1, 2, 0]).sort_index(ascending=True)

    # Each step receives the ratings accumulated so far and returns them with one more
    cum_elo_ratings = None

    for row_item in elo_data_frame.iterrows():
        cum_elo_ratings = _calculate_match_elo_rating(
            elo_data_frame, cum_elo_ratings, row_item
        )

    elo_column = cum_elo_ratings.reorder_levels([2, 0, 1]).sort_index()

    return elo_column

In [132]:
# One row per team per match: stack the home-team and away-team views of clean_df.
# `ladder_position` and `add_elo_rating` are passed to assign as callables, so each is
# called with the frame built so far and its returned Series is aligned on the index.
team_df = (pd
           .concat([home_away_df(True, clean_df), home_away_df(False, clean_df)], sort=True)
           .sort_index()
           .rename(columns={'goals': 'team_goals', 'behinds': 'team_behinds'})
           .assign(home_city=lambda df: df['team'].map(TEAM_CITIES),
                   ladder_position=ladder_position,
                   elo_rating=add_elo_rating)
           .assign(home_lat_long=lambda df: df['home_city'].map(city_lat_long))
           # Dropping shared columns with match data frame (except match_id)
           .drop(['date', 'year', 'round_number', 'round_start_date', 'oppo_score'], axis=1))

team_df

Unnamed: 0,Unnamed: 1,Unnamed: 2,at_home,team_behinds,team_goals,line_odds,margin,match_id,match_points,match_result,score,team,team_match_id,win_odds,home_city,ladder_position,elo_rating,home_lat_long
Adelaide,1991,1,True,11,24,0.0,86,1991.1.Adelaide.Hawthorn,4.0,1.0,155,Adelaide,1991.1.Adelaide,0.00,Adelaide,2,1005.464951,"(-34.9285, 138.6007)"
Adelaide,1991,2,True,9,12,0.0,-23,1991.2.Adelaide.Carlton,0.0,0.0,81,Adelaide,1991.2.Adelaide,0.00,Adelaide,3,1000.852303,"(-34.9285, 138.6007)"
Adelaide,1991,3,False,18,19,0.0,24,1991.3.Adelaide.Sydney,4.0,1.0,132,Adelaide,1991.3.Adelaide,0.00,Adelaide,5,999.782830,"(-34.9285, 138.6007)"
Adelaide,1991,4,False,11,6,0.0,-45,1991.4.Adelaide.Essendon,0.0,0.0,47,Adelaide,1991.4.Adelaide,0.00,Adelaide,8,995.684236,"(-34.9285, 138.6007)"
Adelaide,1991,5,False,11,9,0.0,-65,1991.5.Adelaide.West Coast,0.0,0.0,65,Adelaide,1991.5.Adelaide,0.00,Adelaide,8,989.203471,"(-34.9285, 138.6007)"
Adelaide,1991,6,True,14,19,0.0,31,1991.6.Adelaide.Western Bulldogs,4.0,1.0,128,Adelaide,1991.6.Adelaide,0.00,Adelaide,8,989.827156,"(-34.9285, 138.6007)"
Adelaide,1991,7,False,7,4,0.0,-131,1991.7.Adelaide.St Kilda,0.0,0.0,31,Adelaide,1991.7.Adelaide,0.00,Adelaide,10,971.161688,"(-34.9285, 138.6007)"
Adelaide,1991,9,False,10,18,0.0,-2,1991.9.Adelaide.North Melbourne,0.0,0.0,118,Adelaide,1991.9.Adelaide,0.00,Adelaide,11,972.056377,"(-34.9285, 138.6007)"
Adelaide,1991,10,True,16,15,0.0,34,1991.10.Adelaide.Melbourne,4.0,1.0,106,Adelaide,1991.10.Adelaide,0.00,Adelaide,8,976.070235,"(-34.9285, 138.6007)"
Adelaide,1991,11,False,9,14,0.0,-84,1991.11.Adelaide.Geelong,0.0,0.0,93,Adelaide,1991.11.Adelaide,0.00,Adelaide,10,966.432188,"(-34.9285, 138.6007)"


In [133]:
# Venue name (as found in the `venue` column) -> city; each value is a key of CITIES
VENUE_CITIES = {
    "Football Park": "Adelaide",
    "S.C.G.": "Sydney",
    "Windy Hill": "Melbourne",
    "Subiaco": "Perth",
    "Moorabbin Oval": "Melbourne",
    "M.C.G.": "Melbourne",
    "Kardinia Park": "Geelong",
    "Victoria Park": "Melbourne",
    "Waverley Park": "Melbourne",
    "Princes Park": "Melbourne",
    "Western Oval": "Melbourne",
    "W.A.C.A.": "Perth",
    "Carrara": "Gold Coast",
    "Gabba": "Brisbane",
    "Docklands": "Melbourne",
    "York Park": "Launceston",
    "Manuka Oval": "Canberra",
    "Sydney Showground": "Sydney",
    "Adelaide Oval": "Adelaide",
    "Bellerive Oval": "Hobart",
    "Marrara Oval": "Darwin",
    "Traeger Park": "Alice Springs",
    "Perth Stadium": "Perth",
    "Stadium Australia": "Sydney",
    "Wellington": "Wellington",
    "Lake Oval": "Melbourne",
    "East Melbourne": "Melbourne",
    "Corio Oval": "Geelong",
    "Junction Oval": "Melbourne",
    "Brunswick St": "Melbourne",
    "Punt Rd": "Melbourne",
    "Glenferrie Oval": "Melbourne",
    "Arden St": "Melbourne",
    "Olympic Park": "Melbourne",
    "Yarraville Oval": "Melbourne",
    "Toorak Park": "Melbourne",
    "Euroa": "Euroa",
    "Coburg Oval": "Melbourne",
    "Brisbane Exhibition": "Brisbane",
    "North Hobart": "Hobart",
    "Bruce Stadium": "Canberra",
    "Yallourn": "Yallourn",
    "Cazaly's Stadium": "Cairns",
    "Eureka Stadium": "Ballarat",
    "Blacktown": "Sydney",
    "Jiangwan Stadium": "Shanghai",
    "Albury": "Albury",
}


# Columns prefixed with home_ or away_ are team-specific and are removed from the
# match-level frame
team_cols = clean_df.filter(regex='^(home_|away_)').columns
# NOTE(review): a venue missing from VENUE_CITIES maps to NaN, and city_lat_long would
# then raise a KeyError — extend VENUE_CITIES when new venues appear in the data.
match_df = (clean_df
            .drop(team_cols, axis=1)
            .assign(venue_city=lambda df: df['venue'].map(VENUE_CITIES))
            .assign(venue_lat_long=lambda df: df['venue_city'].map(city_lat_long)))

match_df

Unnamed: 0,date,venue,year,round_type,round_number,match_id,end_of_day,round_start_date,end_of_round,end_of_season,venue_city,venue_lat_long
0,1897-05-08,Victoria Park,1897,Regular,1,1897.1.Collingwood.St Kilda,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
1,1897-05-08,Brunswick St,1897,Regular,1,1897.1.Carlton.Fitzroy,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
2,1897-05-08,Corio Oval,1897,Regular,1,1897.1.Essendon.Geelong,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,Geelong,"(-38.1499, 144.3617)"
3,1897-05-08,Lake Oval,1897,Regular,1,1897.1.Melbourne.Sydney,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
4,1897-05-15,East Melbourne,1897,Regular,2,1897.2.Collingwood.Essendon,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
5,1897-05-15,M.C.G.,1897,Regular,2,1897.2.Geelong.Melbourne,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
6,1897-05-15,Junction Oval,1897,Regular,2,1897.2.Fitzroy.St Kilda,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
7,1897-05-15,Lake Oval,1897,Regular,2,1897.2.Carlton.Sydney,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
8,1897-05-22,Victoria Park,1897,Regular,3,1897.3.Collingwood.Geelong,1897-05-22 23:59:59,1897-05-22,1897-05-24 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"
10,1897-05-22,Brunswick St,1897,Regular,3,1897.3.Fitzroy.Melbourne,1897-05-22 23:59:59,1897-05-22,1897-05-24 23:59:59,1897-08-07 23:59:59,Melbourne,"(-37.8136, 144.9631)"


In [134]:
# Attach the team-level rows to their match on match_id (left join from the match side)
team_match_df = match_df.merge(team_df, on='match_id', how='left')

team_match_df

Unnamed: 0,date,venue,year,round_type,round_number,match_id,end_of_day,round_start_date,end_of_round,end_of_season,...,match_points,match_result,score,team,team_match_id,win_odds,home_city,ladder_position,elo_rating,home_lat_long
0,1897-05-08,Victoria Park,1897,Regular,1,1897.1.Collingwood.St Kilda,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,4.0,1.0,41,Collingwood,1897.1.Collingwood,0.00,Melbourne,2,1001.498804,"(-37.8136, 144.9631)"
1,1897-05-08,Victoria Park,1897,Regular,1,1897.1.Collingwood.St Kilda,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,0.0,0.0,16,St Kilda,1897.1.St Kilda,0.00,Melbourne,7,998.172401,"(-37.8136, 144.9631)"
2,1897-05-08,Brunswick St,1897,Regular,1,1897.1.Carlton.Fitzroy,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,0.0,0.0,16,Carlton,1897.1.Carlton,0.00,Melbourne,8,997.274476,"(-37.8136, 144.9631)"
3,1897-05-08,Brunswick St,1897,Regular,1,1897.1.Carlton.Fitzroy,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,4.0,1.0,49,Fitzroy,1897.1.Fitzroy,0.00,Melbourne,1,1002.151964,"(-37.8136, 144.9631)"
4,1897-05-08,Corio Oval,1897,Regular,1,1897.1.Essendon.Geelong,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,4.0,1.0,47,Essendon,1897.1.Essendon,0.00,Melbourne,3,1002.805636,"(-37.8136, 144.9631)"
5,1897-05-08,Corio Oval,1897,Regular,1,1897.1.Essendon.Geelong,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,0.0,0.0,24,Geelong,1897.1.Geelong,0.00,Geelong,6,996.916139,"(-38.1499, 144.3617)"
6,1897-05-08,Lake Oval,1897,Regular,1,1897.1.Melbourne.Sydney,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,4.0,1.0,44,Melbourne,1897.1.Melbourne,0.00,Melbourne,4,1002.290984,"(-37.8136, 144.9631)"
7,1897-05-08,Lake Oval,1897,Regular,1,1897.1.Melbourne.Sydney,1897-05-08 23:59:59,1897-05-08,1897-05-08 23:59:59,1897-08-07 23:59:59,...,0.0,0.0,27,Sydney,1897.1.Sydney,0.00,Sydney,5,997.557110,"(-33.8688, 151.2093)"
8,1897-05-15,East Melbourne,1897,Regular,2,1897.2.Collingwood.Essendon,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,...,4.0,1.0,50,Collingwood,1897.2.Collingwood,0.00,Melbourne,3,1004.156143,"(-37.8136, 144.9631)"
9,1897-05-15,East Melbourne,1897,Regular,2,1897.2.Collingwood.Essendon,1897-05-15 23:59:59,1897-05-15,1897-05-15 23:59:59,1897-08-07 23:59:59,...,0.0,0.0,30,Essendon,1897.2.Essendon,0.00,Melbourne,4,999.937986,"(-37.8136, 144.9631)"


In [52]:
def player_team_match_id(df):
    """Build ``<team_match_id>.<player_id>`` ID strings, one per row of ``df``."""
    player_label = df['player_id'].astype(str)

    return df['team_match_id'] + '.' + player_label

def playing_for_team_match_id(df):
    """Build the team-match identifier for the team each player row played for.

    Args:
        df: DataFrame with ``year``, ``round_number`` and ``playing_for``
            columns.

    Returns:
        Series of strings shaped like ``<year>.<round_number>.<playing_for>``,
        computed element-wise for every row of ``df``.
    """
    # year and round_number are cast to str before concatenation.
    year_labels = df['year'].astype(str)
    round_labels = df['round_number'].astype(str)
    return year_labels + '.' + round_labels + '.' + df['playing_for']

# Date columns from the team-match frame, keyed by team_match_id, that get
# attached to every player row below.
player_dates = team_match_df[
    ['end_of_season', 'end_of_day', 'round_start_date', 'team_match_id']
]

# Columns removed once the IDs are built: the match-level columns shared with
# the other data sources, plus the player name and the match ID.
player_df = (
    pld.data
    # team_match_id is assigned first; player_team_match_id reads it.
    .assign(team_match_id=playing_for_team_match_id,
            player_team_match_id=player_team_match_id)
    # Left merge keeps every player row, matched on team_match_id.
    .merge(player_dates, how='left', on='team_match_id')
    .drop(columns=SHARED_COLS + ['player_name', 'match_id'])
)
player_df

Unnamed: 0,player_id,playing_for,kicks,marks,handballs,goals,behinds,hit_outs,tackles,rebounds,...,marks_inside_50,one_percenters,bounces,goal_assists,time_on_ground,team_match_id,player_team_match_id,end_of_season,end_of_day,round_start_date
0,1,Adelaide,15.0,9.0,14.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.1,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
1,11504,Adelaide,7.0,4.0,10.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11504,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
2,11505,Adelaide,3.0,2.0,5.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11505,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
3,11507,Adelaide,11.0,9.0,10.0,4.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11507,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
4,11509,Adelaide,10.0,4.0,4.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11509,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
5,11510,Adelaide,6.0,5.0,7.0,4.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11510,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
6,11512,Adelaide,13.0,10.0,3.0,2.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11512,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
7,11513,Adelaide,4.0,3.0,4.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.11513,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
8,1257,Hawthorn,12.0,2.0,6.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Hawthorn,1991.1.Hawthorn.1257,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22
9,1289,Adelaide,1.0,5.0,9.0,0.0,0.0,10.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0,1991.1.Adelaide,1991.1.Adelaide.1289,1991-09-28 23:59:59,1991-03-22 23:59:59,1991-03-22


In [130]:
# Inspect the column dtypes and non-null counts of the team-match frame
team_match_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30774 entries, 0 to 30773
Data columns (total 27 columns):
date                30774 non-null datetime64[ns]
venue               30774 non-null object
year                30774 non-null int64
round_type          30774 non-null object
round_number        30774 non-null int32
match_id            30774 non-null object
end_of_day          30774 non-null datetime64[ns]
round_start_date    30774 non-null datetime64[ns]
end_of_round        30774 non-null datetime64[ns]
end_of_season       30774 non-null datetime64[ns]
venue_city          30774 non-null object
venue_lat_long      30774 non-null object
at_home             30774 non-null bool
team_behinds        30774 non-null int32
team_goals          30774 non-null int32
line_odds           30774 non-null float64
margin              30774 non-null int32
match_points        30774 non-null float64
match_result        30774 non-null float64
score               30774 non-null int32
team            

In [54]:
# Inspect the column dtypes and non-null counts of the player-match frame
player_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 353247 entries, 0 to 353246
Data columns (total 29 columns):
player_id                  353247 non-null int32
playing_for                353247 non-null object
kicks                      353247 non-null float64
marks                      353247 non-null float64
handballs                  353247 non-null float64
goals                      353247 non-null float64
behinds                    353247 non-null float64
hit_outs                   353247 non-null float64
tackles                    353247 non-null float64
rebounds                   353247 non-null float64
inside_50s                 353247 non-null float64
clearances                 353247 non-null float64
clangers                   353247 non-null float64
frees_for                  353247 non-null float64
frees_against              353247 non-null float64
brownlow_votes             353247 non-null float64
contested_possessions      353247 non-null float64
uncontested_possessions  

In [135]:
# Column groups and variable types used to build the EntitySet, with the
# team-match entity as the base.

# Per-player match statistics. brownlow_votes is deliberately absent: it is
# listed under 'end_of_season' in player_secondary_times instead.
PLAYER_MATCH_COLS = [
    'kicks',
    'marks',
    'handballs',
    'goals',
    'behinds',
    'hit_outs',
    'tackles',
    'rebounds',
    'inside_50s',
    'clearances',
    'clangers',
    'frees_for',
    'frees_against',
    'contested_possessions',
    'uncontested_possessions',
    'contested_marks',
    'marks_inside_50',
    'one_percenters',
    'bounces',
    'goal_assists',
    'time_on_ground',
]

# Team-level match columns listed under 'end_of_day' in match_secondary_times.
MATCH_COLS = [
    'team_behinds',
    'team_goals',
    'margin',
    'match_points',
    'match_result',
    'score',
    'elo_rating',
]

# Explicit featuretools variable types for the team-match columns
# (key order kept as originally written).
match_variable_types = dict(
    date=vtypes.Datetime,
    venue=vtypes.Categorical,
    year=vtypes.Ordinal,
    round_type=vtypes.Categorical,
    round_number=vtypes.Ordinal,
    match_id=vtypes.Categorical,
    venue_city=vtypes.Categorical,
    venue_lat_long=vtypes.LatLong,
    at_home=vtypes.Boolean,
    match_result=vtypes.Categorical,
    team=vtypes.Categorical,
    team_match_id=vtypes.Categorical,
    home_city=vtypes.Categorical,
    ladder_position=vtypes.Ordinal,
    home_lat_long=vtypes.LatLong,
)

# Explicit featuretools variable types for the player-match columns
player_variable_types = dict(
    player_id=vtypes.Categorical,
    playing_for=vtypes.Categorical,
)

# Secondary time indexes: each key is a datetime column, each value the list
# of columns tied to that time (passed as `secondary_time_index` below).
match_secondary_times = dict(
    end_of_day=MATCH_COLS,
    end_of_round=['ladder_position'],
)

player_secondary_times = dict(
    end_of_day=PLAYER_MATCH_COLS,
    end_of_season=['brownlow_votes'],
)

# Base entities: 'team_matches' (indexed by team_match_id) and
# 'player_matches' (indexed by player_team_match_id).
# Most of the fixture data is known at the beginning of the season, but not all,
# so using the start of the round as the time index simplifies things.
es = (
    ft.EntitySet('Team Matches')
    .entity_from_dataframe(
        entity_id='team_matches',
        dataframe=team_match_df,
        index='team_match_id',
        time_index='round_start_date',
        variable_types=match_variable_types,
        secondary_time_index=match_secondary_times,
    )
    .entity_from_dataframe(
        entity_id='player_matches',
        dataframe=player_df,
        index='player_team_match_id',
        time_index='round_start_date',
        variable_types=player_variable_types,
        secondary_time_index=player_secondary_times,
    )
)

# Relationship between team matches (parent) and player matches (child)
rel_team_player_match = ft.Relationship(
    es['team_matches']['team_match_id'],
    es['player_matches']['team_match_id'],
)
es = es.add_relationship(rel_team_player_match)

# Entities split out of team_matches
# Match entity, carrying the match-level fixture columns
es.normalize_entity(
    base_entity_id='team_matches',
    new_entity_id='matches',
    index='match_id',
    additional_variables=[
        'date', 'venue', 'venue_city', 'venue_lat_long',
        'year', 'round_number', 'round_type',
    ],
    make_time_index=True,
)
# Team entity, carrying the team's home location columns
es.normalize_entity(
    base_entity_id='team_matches',
    new_entity_id='teams',
    index='team',
    additional_variables=['home_city', 'home_lat_long'],
    make_time_index=False,
)

# Player entity, split out of player_matches
es.normalize_entity(
    base_entity_id='player_matches',
    new_entity_id='players',
    index='player_id',
    make_time_index=True,
)

# Entities split out of matches
# Venue entity, carrying the venue location columns
es.normalize_entity(
    base_entity_id='matches',
    new_entity_id='venues',
    index='venue',
    additional_variables=['venue_city', 'venue_lat_long'],
    make_time_index=False,
)
# Year entity
es.normalize_entity(
    base_entity_id='matches',
    new_entity_id='years',
    index='year',
    make_time_index=False,
)
# Round number entity, carrying the round type
es.normalize_entity(
    base_entity_id='matches',
    new_entity_id='round_numbers',
    index='round_number',
    additional_variables=['round_type'],
    make_time_index=False,
)

# Cutoff times for feature calculation: one row per team match, holding the
# team_match_id, the cutoff time, and the margin as an extra column.
cutoff_times = (
    es['team_matches']
    .df[['team_match_id', 'round_start_date', 'margin']]
    .rename(columns={'round_start_date': 'cutoff_time'})
    # The cutoff is set one hour after the round start (not before it)
    # to make sure fixture data is available.
    .assign(cutoff_time=lambda df: df['cutoff_time'] + pd.Timedelta(hours=1))
    .sort_values('cutoff_time')
)

es

Entityset: Team Matches
  Entities:
    team_matches [Rows: 30774, Columns: 18]
    player_matches [Rows: 353247, Columns: 29]
    matches [Rows: 15387, Columns: 6]
    teams [Rows: 20, Columns: 3]
    players [Rows: 5174, Columns: 2]
    venues [Rows: 47, Columns: 3]
    years [Rows: 122, Columns: 1]
    round_numbers [Rows: 28, Columns: 2]
  Relationships:
    player_matches.team_match_id -> team_matches.team_match_id
    team_matches.match_id -> matches.match_id
    team_matches.team -> teams.team
    player_matches.player_id -> players.player_id
    matches.venue -> venues.venue
    matches.year -> years.year
    matches.round_number -> round_numbers.round_number

In [121]:
# Generate features using the constructed entityset.
# Depth-1 run: aggregates player_matches rows up to each team match and adds
# simple transform features, evaluated at the per-row cutoff times.
fm, features = ft.dfs(
    entityset=es,
    target_entity='team_matches',
    agg_primitives=['sum', 'mean', 'last'],
    trans_primitives=['haversine', 'subtract'],
    max_depth=1,
    cutoff_time=cutoff_times,
    # Keep the ID-like player columns out of the generated features.
    ignore_variables={
        'player_matches': ['player_id', 'playing_for'],
    },
    # NOTE(review): this was `chunks=20`, which is not a `ft.dfs` parameter
    # (the parameter is `chunk_size`). A fraction of 0.05 splits the cutoff
    # rows into roughly 20 chunks; confirm that this was the intent.
    chunk_size=0.05,
    verbose=True,
)

Built 79 features










Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 03:30 | Remaining: 35:02 | Progress:   9%|▉         | Calculated: 1/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 06:24 | Remaining: 29:55 | Progress:  18%|█▊        | Calculated: 2/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 09:54 | Remaining: 27:00 | Progress:  27%|██▋       | Calculated: 3/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 12:14 | Remaining: 21:26 | Progress:  36%|███▋      | Calculated: 4/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 13:50 | Remaining: 15:45 | Progress:  45%|████▌     | Calculated: 5/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 15:24 | Remaining: 11:32 | Progress:  55%|█████▍    | Calculated: 6/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 17:46 | Remaining: 09:17 | Progress:  64%|██████▎   | Calculated: 7/11 chunks[A[A[

In [136]:
# Display the feature matrix returned by ft.dfs
fm

Unnamed: 0_level_0,line_odds,win_odds,match_id,at_home,team,SUM(player_matches.kicks),SUM(player_matches.marks),SUM(player_matches.handballs),SUM(player_matches.goals),SUM(player_matches.behinds),...,LAST(player_matches.time_on_ground),LAST(player_matches.player_id),LAST(player_matches.playing_for),win_odds - line_odds,line_odds - win_odds,matches.venue,matches.year,matches.round_number,teams.home_city,label
team_match_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1897.1.Carlton,0.0,0.00,1897.1.Carlton.Fitzroy,False,Carlton,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Brunswick St,1897,1,Melbourne,-33
1897.1.Collingwood,0.0,0.00,1897.1.Collingwood.St Kilda,True,Collingwood,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Victoria Park,1897,1,Melbourne,25
1897.1.Essendon,0.0,0.00,1897.1.Essendon.Geelong,False,Essendon,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Corio Oval,1897,1,Melbourne,23
1897.1.Fitzroy,0.0,0.00,1897.1.Carlton.Fitzroy,True,Fitzroy,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Brunswick St,1897,1,Melbourne,33
1897.1.Geelong,0.0,0.00,1897.1.Essendon.Geelong,True,Geelong,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Corio Oval,1897,1,Geelong,-23
1897.1.Melbourne,0.0,0.00,1897.1.Melbourne.Sydney,False,Melbourne,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Lake Oval,1897,1,Melbourne,17
1897.1.St Kilda,0.0,0.00,1897.1.Collingwood.St Kilda,False,St Kilda,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Victoria Park,1897,1,Melbourne,-25
1897.1.Sydney,0.0,0.00,1897.1.Melbourne.Sydney,True,Sydney,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Lake Oval,1897,1,Sydney,-17
1897.2.Carlton,0.0,0.00,1897.2.Carlton.Sydney,False,Carlton,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,Lake Oval,1897,2,Melbourne,-4
1897.2.Collingwood,0.0,0.00,1897.2.Collingwood.Essendon,False,Collingwood,0.0,0.0,0.0,0.0,0.0,...,,,,0.00,0.00,East Melbourne,1897,2,Melbourne,20


In [137]:
# List the column labels of the feature matrix
fm.columns

Index(['line_odds', 'win_odds', 'match_id', 'at_home', 'team',
       'SUM(player_matches.kicks)', 'SUM(player_matches.marks)',
       'SUM(player_matches.handballs)', 'SUM(player_matches.goals)',
       'SUM(player_matches.behinds)', 'SUM(player_matches.hit_outs)',
       'SUM(player_matches.tackles)', 'SUM(player_matches.rebounds)',
       'SUM(player_matches.inside_50s)', 'SUM(player_matches.clearances)',
       'SUM(player_matches.clangers)', 'SUM(player_matches.frees_for)',
       'SUM(player_matches.frees_against)',
       'SUM(player_matches.brownlow_votes)',
       'SUM(player_matches.contested_possessions)',
       'SUM(player_matches.uncontested_possessions)',
       'SUM(player_matches.contested_marks)',
       'SUM(player_matches.marks_inside_50)',
       'SUM(player_matches.one_percenters)', 'SUM(player_matches.bounces)',
       'SUM(player_matches.goal_assists)',
       'SUM(player_matches.time_on_ground)', 'MEAN(player_matches.kicks)',
       'MEAN(player_matches.marks)'

In [119]:
# Generate features using the constructed entityset.
# Depth-2 run with a wider set of aggregation and transform primitives,
# evaluated at the per-row cutoff times and spread across all available cores.
fm2, features2 = ft.dfs(
    entityset=es,
    target_entity='team_matches',
    agg_primitives=['sum', 'max', 'min', 'mean', 'last', 'trend'],
    trans_primitives=[
        'month', 'year', 'haversine', 'subtract', 'divide', 'cum_sum', 'cum_mean',
    ],
    max_depth=2,
    cutoff_time=cutoff_times,
    # NOTE(review): this was `chunks=50`, which is not a `ft.dfs` parameter
    # (the parameter is `chunk_size`). A fraction of 0.02 splits the cutoff
    # rows into roughly 50 chunks; confirm that this was the intent.
    chunk_size=0.02,
    n_jobs=-1,
    verbose=True,
)

Built 79 features










Elapsed: 00:00 | Remaining: ? | Progress:   0%|          | Calculated: 0/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 00:22 | Remaining: 03:46 | Progress:   9%|▉         | Calculated: 1/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 00:46 | Remaining: 03:25 | Progress:  18%|█▊        | Calculated: 2/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 01:08 | Remaining: 03:00 | Progress:  27%|██▋       | Calculated: 3/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 01:33 | Remaining: 02:44 | Progress:  36%|███▋      | Calculated: 4/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 01:54 | Remaining: 02:17 | Progress:  45%|████▌     | Calculated: 5/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 02:15 | Remaining: 01:51 | Progress:  55%|█████▍    | Calculated: 6/11 chunks[A[A[A[A[A[A[A[A[A[A









Elapsed: 02:35 | Remaining: 01:26 | Progress:  64%|██████▎   | Calculated: 7/11 chunks[A[A[