In [373]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [374]:
# Load the raw match table; the two manager columns are read as strings.
data_matches = pd.read_csv("../../data/football_matches_a.csv", dtype={'manager_pt1': str, 'manager_pt2': str})
# Untouched snapshot of the raw rows, used by the row-count assertions further down.
data_matches_test_case = data_matches.copy(deep=True)
# Parse the day-first date strings and keep only the calendar date.
data_matches['date'] = pd.to_datetime(data_matches['date'], dayfirst=True).dt.date
# Order chronologically and renumber the rows 0..n-1.
data_matches = data_matches.sort_values(by='date').reset_index(drop=True)

  data_matches = pd.read_csv("../../data/football_matches_a.csv", dtype={'manager_pt1': str, 'manager_pt2': str})


In [375]:
# Sanity check: parsing and sorting the dates must not add or drop rows.
assert len(data_matches) == len(data_matches_test_case), "Length of new df not equal to original."

In [376]:
def duplicate_to_team_and_opponent(df_matches):
    """Reshape one-row-per-match data into one row per (match, side).

    Every input match yields two output rows: one seen from ``pt1``
    (flagged ``home = 1``) and a mirrored one seen from ``pt2``
    (flagged ``home = 0``).

    Parameters
    ----------
    df_matches : pandas.DataFrame
        Must contain ``match_id``, ``league``, ``date``, ``pt1``, ``pt2``,
        ``score_pt1`` and ``score_pt2``. Any other column is dropped.
        The frame itself is not modified.

    Returns
    -------
    pandas.DataFrame
        Twice as many rows as the input, sorted by ``date`` with a fresh
        0..n-1 index and the columns ``match_id``, ``league``, ``date``,
        ``team``, ``opponent``, ``team_goals_scored``,
        ``opponent_goals_scored``, ``team_goals_conceded``,
        ``opponent_goals_conceded`` and ``home``.
    """
    # Explicit copy: the columns added below must land on an independent
    # frame, not on a slice of the caller's data (the original relied on the
    # chained-assignment warning being silenced globally).
    base = df_matches[['match_id', 'league', 'date', 'pt1', 'pt2', 'score_pt1', 'score_pt2']].copy()
    # Goals conceded by one side are the goals scored by the other.
    base['conceded_pt1'] = base['score_pt2']
    base['conceded_pt2'] = base['score_pt1']

    output_columns = ['match_id', 'league', 'date', 'team', 'opponent',
                      'team_goals_scored', 'opponent_goals_scored',
                      'team_goals_conceded', 'opponent_goals_conceded']

    # Perspective of pt1.
    home_rows = base.rename(columns={'pt1': 'team', 'pt2': 'opponent',
                                     'score_pt1': 'team_goals_scored',
                                     'score_pt2': 'opponent_goals_scored',
                                     'conceded_pt1': 'team_goals_conceded',
                                     'conceded_pt2': 'opponent_goals_conceded'})
    # Perspective of pt2: the same match with every pt1/pt2 role swapped.
    away_rows = base.rename(columns={'pt2': 'team', 'pt1': 'opponent',
                                     'score_pt2': 'team_goals_scored',
                                     'score_pt1': 'opponent_goals_scored',
                                     'conceded_pt2': 'team_goals_conceded',
                                     'conceded_pt1': 'opponent_goals_conceded'})

    # Put both halves in the same column order before stacking them.
    home_rows = home_rows[output_columns].assign(home=1)
    away_rows = away_rows[output_columns].assign(home=0)

    stacked = pd.concat([home_rows, away_rows])
    return stacked.sort_values(by='date').reset_index(drop=True)

In [377]:
# Expand to two rows per match (one per side).
# NOTE: not idempotent - the reassigned frame no longer has the pt1/pt2 columns the function selects.
data_matches = duplicate_to_team_and_opponent(data_matches)

In [378]:
# Each raw match must now appear exactly twice.
assert len(data_matches) == len(data_matches_test_case)*2, "Length of duplicated df not equal to original."

In [379]:
# Preview the first rows of the expanded frame.
data_matches.head()

Unnamed: 0,match_id,league,date,team,opponent,team_goals_scored,opponent_goals_scored,team_goals_conceded,opponent_goals_conceded,home
0,l_r_vicenza_sampdoria_1997-08-31,Serie A,1997-08-31,sampdoria,l_r_vicenza,2.0,1.0,1.0,2.0,1
1,atalanta_bologna_1997-08-31,Serie A,1997-08-31,bologna,atalanta,2.0,4.0,4.0,2.0,0
2,juventus_lecce_1997-08-31,Serie A,1997-08-31,lecce,juventus,0.0,2.0,2.0,0.0,0
3,lazio_napoli_1997-08-31,Serie A,1997-08-31,napoli,lazio,0.0,2.0,2.0,0.0,0
4,l_r_vicenza_sampdoria_1997-08-31,Serie A,1997-08-31,l_r_vicenza,sampdoria,1.0,2.0,2.0,1.0,0


In [380]:
def split_home_and_away(df):
    """Split ``df`` on its ``home`` flag.

    Returns a ``(home_rows, away_rows)`` tuple: rows with ``home == 1`` and
    rows with ``home == 0``, each as an independent deep copy that keeps the
    original index labels. Rows with any other ``home`` value are in neither.
    """
    home_flag = df['home']
    home_rows = df[home_flag == 1].copy(deep=True)
    away_rows = df[home_flag == 0].copy(deep=True)
    return home_rows, away_rows

def calculate_rolling_mean_team(df, df_home, df_away, stat_list, window=19):
    """Add lagged rolling means of each stat, overall and split by venue.

    For every ``stat`` in ``stat_list`` the mean of the previous ``window``
    rows (the current row is excluded via ``shift(1)``) is stored as:

    * ``<stat>_avg`` on ``df`` (all rows),
    * ``<stat>_avg_home`` on ``df_home`` and ``<stat>_avg_away`` on
      ``df_away``, unless the stat name already contains ``home`` or ``away``.

    Parameters
    ----------
    df : pandas.DataFrame
        All rows for one team; must contain ``match_id``, ``team`` and the stats.
    df_home, df_away : pandas.DataFrame
        The home and away subsets of ``df``.
    stat_list : list of str
        Column names to average.
    window : int, default 19
        Number of preceding rows in each mean. A row has a value only once
        ``window`` earlier observations exist; before that it is NaN.

    Returns
    -------
    pandas.DataFrame
        ``df`` inner-joined on ``['match_id', 'team']`` with every column of
        the venue frames whose name contains ``'avg'``.

    Notes
    -----
    ``df``, ``df_home`` and ``df_away`` are modified in place (the new average
    columns are added to them). Rows are assumed to be in chronological order
    already - the function does not sort.
    """
    for stat in stat_list:
        df.loc[:, stat + '_avg'] = df[stat].shift(1).rolling(window).mean()
        # Venue-specific stats are not split by venue a second time.
        if not any(venue in stat for venue in ('home', 'away')):
            df_home.loc[:, stat + '_avg_home'] = df_home[stat].shift(1).rolling(window).mean()
            df_away.loc[:, stat + '_avg_away'] = df_away[stat].shift(1).rolling(window).mean()

    # Stack the venue frames; a home row has NaN in the *_avg_away columns and vice versa.
    venue_rows = pd.concat([df_home, df_away])
    avg_cols = [col for col in venue_rows.columns if 'avg' in col]
    venue_averages = venue_rows[avg_cols + ['match_id', 'team']]
    return df.merge(venue_averages, on=['match_id', 'team'], how='inner')

def get_avg_columns(df):
    """Return, in column order, the names of the columns of ``df`` that contain ``'avg'``."""
    return list(filter(lambda name: 'avg' in name, df.columns))

def calc_rolling_average(df, team, stat_list, for_team=True):
    """Compute the rolling averages of ``stat_list`` for a single team.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (match, side); must contain ``team``, ``opponent`` and
        ``home`` plus the columns in ``stat_list``.
    team : str
        The team whose rows are selected.
    stat_list : list of str
        Columns to average.
    for_team : bool, default True
        If true, select the rows where ``team`` is in the ``team`` column;
        otherwise the rows where it is in the ``opponent`` column.

    Returns
    -------
    pandas.DataFrame
        The selected rows with the average columns added by
        ``calculate_rolling_mean_team``.
    """
    side = 'team' if for_team else 'opponent'
    # Explicit copy: the averaging step adds columns to this frame in place,
    # which must not depend on the SettingWithCopy warning being silenced.
    df_team = df[df[side] == team].copy()
    df_home, df_away = split_home_and_away(df_team)
    return calculate_rolling_mean_team(df_team, df_home, df_away, stat_list)
    
def get_rolling_average(df, stats_list, for_team=True):
    """Compute rolling averages of ``stats_list`` for every team in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per (match, side); must contain a ``team`` column. The frame
        is left unmodified (the earlier version added a ``row_num`` helper
        column to it that was never used).
    stats_list : list of str
        Columns to average.
    for_team : bool, default True
        Forwarded to ``calc_rolling_average``.

    Returns
    -------
    pandas.DataFrame
        The per-team results of ``calc_rolling_average`` stacked in the order
        the teams first appear in ``df['team']``. The per-team index labels
        are kept, so the index is not unique. If ``df`` has no teams, an
        empty frame with the columns of ``df`` is returned.
    """
    teams_list = df['team'].unique().tolist()

    # Collect first and concatenate once: growing a frame inside the loop is
    # quadratic, and seeding it with an empty object-dtype frame can change
    # the dtypes of the result.
    per_team_frames = [
        calc_rolling_average(df, team, stats_list, for_team)
        for team in tqdm(teams_list)
    ]

    if not per_team_frames:
        return df.iloc[0:0].copy()

    return pd.concat(per_team_frames)

In [381]:
# Work on an independent copy of the expanded frame and split it by venue;
# the split is checked in the next cell.
df_test = data_matches.copy(deep=True)
df_home_test, df_away_test = split_home_and_away(df_test)

In [382]:
# Every raw match must contribute exactly one home row and one away row.
assert len(df_home_test) == len(data_matches_test_case), 'home matches missing.'
assert len(df_away_test) == len(data_matches_test_case), 'away matches missing.'

In [383]:
# Columns averaged from the team's own perspective.
team_stats = ['team_goals_scored', 'team_goals_conceded']
# The matching columns from the opponent's perspective.
opponent_stats = ['opponent_goals_scored', 'opponent_goals_conceded']

In [384]:
# Rolling averages per team: first over the rows where the team is in the `team` column,
# then over the rows where it is in the `opponent` column.
data_averages_team = get_rolling_average(df_test, team_stats)
data_averages_opponent = get_rolling_average(df_test, opponent_stats, for_team=False)

100%|█████████████████████████████████████████| 680/680 [00:07<00:00, 95.40it/s]
100%|█████████████████████████████████████████| 680/680 [00:07<00:00, 94.40it/s]


In [385]:
# Join the team-side and opponent-side rolling averages on every column the two
# frames share. Done inline because the merge_on_common_columns helper is only
# defined in a later cell, so calling it here fails on a freshly started kernel.
shared_columns = [col for col in data_averages_team.columns if col in data_averages_opponent.columns]
data_averages = pd.merge(data_averages_team, data_averages_opponent, on=shared_columns)

In [386]:
# The join must keep exactly two rows per raw match.
assert len(data_averages) == len(data_matches_test_case)*2, 'matches missing'

In [387]:
def pivot_df(df, scored_or_conceded):
    """Pivot ``team_goals_<scored_or_conceded>`` into a date x league table.

    Rows are indexed by ``date``, columns are the ``league`` values, and each
    cell is aggregated with ``pivot_table``'s default aggregation (the mean).
    """
    value_column = 'team_goals_' + scored_or_conceded
    return df.pivot_table(values=value_column, index='date', columns='league')

def check_columns(df_true, df_test):
    """Assert that two frames have the same column labels, ignoring order.

    Raises
    ------
    AssertionError
        If the sorted column labels differ.
    """
    # sorted() returns the ordered list; list.sort() returns None, so comparing
    # the results of two .sort() calls is always true and checks nothing.
    assert sorted(df_true.columns.tolist()) == sorted(df_test.columns.tolist()), "Column for home and away not identical."
    
def calculate_averages(df, column, home_away, window=19):
    """Add a lagged rolling mean of ``column`` as ``<column>_avg_<home_away>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose ``date`` either as its index or as a column, and contain
        ``column``.
    column : str
        Column to average.
    home_away : str
        Suffix for the new column name (the notebook passes ``'home'`` or
        ``'away'``).
    window : int, default 19
        Number of preceding non-null observations in each mean.

    Returns
    -------
    pandas.DataFrame
        ``df`` merged on ``date`` with the new average column.

    Notes
    -----
    Null values of ``column`` are skipped before shifting, so the mean covers
    the previous ``window`` rows that actually have a value. Rows where
    ``column`` is null get NaN in the new column.
    """
    avg_name = column + f'_avg_{home_away}'
    # reset_index() exposes `date` as a regular column when it is the index.
    lookup = df.reset_index()[['date', column]].copy()
    # shift(1) keeps the current row out of its own average.
    lookup[avg_name] = lookup[column].dropna().shift(1).rolling(window).mean()
    lookup = lookup.drop(columns=column)
    return df.merge(lookup, left_on='date', right_on='date')

def merge_on_common_columns(df1, df2):
    """Inner-join two frames on every column name they have in common.

    The join keys are taken in the column order of ``df1`` so the call is
    reproducible from run to run (iterating a ``set`` of names gives no
    guaranteed order).

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.merge(df1, df2, on=<shared columns>)``.
    """
    right_names = set(df2.columns)
    # dict.fromkeys de-duplicates while preserving df1's column order.
    common_columns = [name for name in dict.fromkeys(df1.columns) if name in right_names]
    return pd.merge(df1, df2, on=common_columns)

In [388]:
def get_league_average(df, scored_or_conceded):
    """Attach per-league home/away goal figures for each date to ``df``.

    ``scored_or_conceded`` selects the source column
    ``team_goals_<scored_or_conceded>``. Four columns are added:
    ``league_home_goals_<x>`` and ``league_away_goals_<x>`` (the pivoted
    per-date league values, forward-filled) and
    ``league_home_goals_<x>_avg`` and ``league_away_goals_<x>_avg`` (the
    rolling averages produced by ``calculate_averages``, forward-filled).

    Returns the merged frame sorted by ``date``.

    NOTE(review): both merges on ``['league', 'date']`` need the melted frames
    to carry a ``league`` column; presumably ``melt`` names its variable
    column after the pivot's column-index name - confirm against the pandas
    version in use.
    """
    df_home, df_away = split_home_and_away(df)
    # One row per date, one column per league (see pivot_df).
    df_league_home_avg = pivot_df(df_home, scored_or_conceded)
    df_league_away_avg = pivot_df(df_away, scored_or_conceded)
    check_columns(df_league_home_avg, df_league_away_avg)
    # The league names, captured before any average columns are appended.
    columns = df_league_home_avg.columns
    
    # NOTE(review): these two copies are not used again in this function.
    df_league_home_avg_test = df_league_home_avg.copy(deep=True)
    df_league_away_avg_test = df_league_away_avg.copy(deep=True)
    
    # Append a '<league>_avg_home' / '<league>_avg_away' column for every league.
    for column in columns:
        df_league_home_avg = calculate_averages(df_league_home_avg, column, 'home')
        df_league_away_avg = calculate_averages(df_league_away_avg, column, 'away')
        
    assert df_league_home_avg.shape == df_league_away_avg.shape, "Home and away are different shapes"

    # Carry the last known value forward over missing entries.
    df_league_home_avg = df_league_home_avg.ffill()
    df_league_away_avg = df_league_away_avg.ffill()

    df_league_home_avg.reset_index(inplace=True)
    df_league_away_avg.reset_index(inplace=True)

    # Long format (date, variable, value) joined back onto the match rows.
    df_home_avg_merge = pd.merge(df, df_league_home_avg.melt(id_vars='date')
                                    # .assign(league = lambda x: x['variable'])
                                    , on=['league','date'])
    df_away_avg_merge = pd.merge(df, df_league_away_avg.melt(id_vars='date')
                                    # .assign(league = lambda x: x['variable'])
                                    , on=['league','date'])

    df_home_avg_merge.rename(columns={'value': 'league_home_goals_'  + scored_or_conceded}, inplace=True)
    df_away_avg_merge.rename(columns={'value': 'league_away_goals_'  + scored_or_conceded}, inplace=True)

    df_merge = df_home_avg_merge.copy()
    # Assigned by index label - assumes both merges above return their rows in the same order (TODO confirm).
    df_merge['league_away_goals_' + scored_or_conceded] = df_away_avg_merge['league_away_goals_' + scored_or_conceded]

    # Drop the raw league columns and strip the 9-character '_avg_home' / '_avg_away'
    # suffix, so the remaining average columns are named after their league.
    columns = columns.values.tolist()
    df_league_home_avg.drop(columns, axis=1, inplace=True)
    df_league_away_avg.drop(columns, axis=1, inplace=True)
    for column in df_league_home_avg.columns:
        if column[-9:] == '_avg_home':
            df_league_home_avg.rename(columns={column: column[:-9]}, inplace=True)
    for column in df_league_away_avg.columns:
        if column[-9:] == '_avg_away':
            df_league_away_avg.rename(columns={column: column[:-9]}, inplace=True)
    
    # Second melt/merge pass: this time the values are the rolling averages.
    df_home_avg_merge = pd.merge(df_home_avg_merge, df_league_home_avg.melt(id_vars='date')
                                    # .assign(league = lambda x: x['variable'])
                                    , on=['league','date'])
    df_away_avg_merge = pd.merge(df_away_avg_merge, df_league_away_avg.melt(id_vars='date')
                                    # .assign(league = lambda x: x['variable'])
                                    , on=['league','date'])

    df_home_avg_merge.rename(columns={'value': 'league_home_goals_'  + scored_or_conceded + '_avg'}, inplace=True)
    df_away_avg_merge.rename(columns={'value': 'league_away_goals_'  + scored_or_conceded + '_avg'}, inplace=True)

    df_merge_avg = df_home_avg_merge.copy()
    # Same index-aligned assignment as above, with the same row-order assumption.
    df_merge_avg['league_away_goals_' + scored_or_conceded + '_avg'] = df_away_avg_merge['league_away_goals_' + scored_or_conceded + '_avg']
    # Combine the raw-value frame and the average frame on all columns they share.
    df_merge = merge_on_common_columns(df_merge, df_merge_avg)
    df_merge.sort_values(by='date', inplace=True)

    return df_merge

In [389]:
# Attach the league-level columns, first for goals scored and then for goals conceded.
df_test = get_league_average(df_test, 'scored')
df_test = get_league_average(df_test, 'conceded')

In [390]:
# The league merges must keep exactly two rows per raw match.
assert len(df_test) == len(data_matches_test_case)*2, "League averages are wrong length."

In [391]:
# Combine the team/opponent rolling averages with the league-level columns on all shared columns.
data_averages = merge_on_common_columns(data_averages, df_test)

In [392]:
# The combined frame must still have exactly two rows per raw match.
assert len(data_averages) == len(data_matches_test_case)*2, "League averages are wrong length."

In [393]:
def calc_strength(df):
    """Add attack and defense strength ratios for the team and its opponent.

    Each strength is a rolling team average divided by a league average. The
    home or away variant is chosen from the row's ``home`` flag:

    * ``home == 1``: the team uses its ``*_avg_home`` columns with the
      ``league_home_*`` averages; the opponent uses ``*_avg_away`` with
      ``league_away_*``.
    * ``home == 0``: the reverse.

    Adds ``team_attack_strength``, ``team_defense_strength``,
    ``opponent_attack_strength`` and ``opponent_defense_strength`` as floats.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``home`` plus the ``team_goals_*``/``opponent_goals_*``
        ``_avg_home``/``_avg_away`` columns and the four
        ``league_{home,away}_goals_{scored,conceded}_avg`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place.

    Notes
    -----
    After the strengths are computed, every NaN (and every literal ``'nan'``
    string) anywhere in the frame is replaced with 0. Rows whose ``home`` is
    neither 0 nor 1 get NaN strengths, which that replacement turns into 0.
    """
    conditions = [
        df['home'] == 1,
        df['home'] == 0,
    ]

    # Choices are listed in the same order as `conditions`: [home row, away row].
    team_attack = [
        df['team_goals_scored_avg_home'] / df['league_home_goals_scored_avg'],
        df['team_goals_scored_avg_away'] / df['league_away_goals_scored_avg'],
    ]
    team_defense = [
        df['team_goals_conceded_avg_home'] / df['league_home_goals_conceded_avg'],
        df['team_goals_conceded_avg_away'] / df['league_away_goals_conceded_avg'],
    ]
    opponent_attack = [
        df['opponent_goals_scored_avg_away'] / df['league_away_goals_scored_avg'],
        df['opponent_goals_scored_avg_home'] / df['league_home_goals_scored_avg'],
    ]
    opponent_defense = [
        df['opponent_goals_conceded_avg_away'] / df['league_away_goals_conceded_avg'],
        df['opponent_goals_conceded_avg_home'] / df['league_home_goals_conceded_avg'],
    ]

    # A NaN default keeps the result numeric. The previous string default
    # ('Other') cannot share a dtype with the float choices.
    df['team_attack_strength'] = np.select(conditions, team_attack, default=np.nan).astype(float)
    df['team_defense_strength'] = np.select(conditions, team_defense, default=np.nan).astype(float)
    df['opponent_attack_strength'] = np.select(conditions, opponent_attack, default=np.nan).astype(float)
    df['opponent_defense_strength'] = np.select(conditions, opponent_defense, default=np.nan).astype(float)

    # Blanket clean-up, kept from the original: applies to the whole frame.
    df.replace(np.nan, 0, inplace=True)
    df.replace('nan', 0, inplace=True)

    return df

In [394]:
def calc_lambda(df):
    """Add ``team_lambda`` and ``opponent_lambda`` columns.

    Each lambda is a league scoring average multiplied by one side's attack
    strength and the other side's defense strength. The league average used
    depends on the row's ``home`` flag:

    * ``team_lambda`` uses ``league_home_goals_scored_avg`` when
      ``home == 1`` and ``league_away_goals_scored_avg`` when ``home == 0``.
    * ``opponent_lambda`` uses the opposite league average.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``home``, the two ``league_*_goals_scored_avg`` columns
        and the four ``*_strength`` columns.

    Returns
    -------
    pandas.DataFrame
        The same object, modified in place. Missing strengths are filled
        with 0 first; both lambda columns are floats, and are NaN where
        ``home`` is neither 0 nor 1.
    """
    strength_columns = ['team_attack_strength',
                        'team_defense_strength',
                        'opponent_attack_strength',
                        'opponent_defense_strength']
    # Assign the result back: fillna(inplace=True) on df[cols] only changes a
    # temporary copy and leaves df untouched.
    df[strength_columns] = df[strength_columns].fillna(0)

    conditions = [
        df['home'] == 1,
        df['home'] == 0,
    ]

    team_expected = [
        df['league_home_goals_scored_avg'] * df['team_attack_strength'] * df['opponent_defense_strength'],
        df['league_away_goals_scored_avg'] * df['team_attack_strength'] * df['opponent_defense_strength'],
    ]

    opponent_expected = [
        df['league_away_goals_scored_avg'] * df['opponent_attack_strength'] * df['team_defense_strength'],
        df['league_home_goals_scored_avg'] * df['opponent_attack_strength'] * df['team_defense_strength'],
    ]

    # A NaN default keeps both columns numeric; a string default would turn
    # them into text (or fail outright on recent NumPy).
    df['team_lambda'] = np.select(conditions, team_expected, default=np.nan).astype(float)
    df['opponent_lambda'] = np.select(conditions, opponent_expected, default=np.nan).astype(float)

    return df

In [395]:
# Add the strength columns, then the lambda columns, then renumber the rows 0..n-1.
data_averages = calc_lambda(calc_strength(data_averages))
data_averages = data_averages.reset_index(drop=True)

In [396]:
# Final check: still exactly two rows per raw match.
assert len(data_averages) == len(data_matches_test_case)*2, "Different to imported data length."