In [7]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [8]:
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_selection import RFECV
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn import feature_selection
from sklearn import metrics
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split



In [9]:
def load_afl_data(pred_round):
    df_2017 = pd.read_csv("../data/afl_results_2017.csv")
    #print(df_2017.shape)
    df_2018 = pd.read_csv("../data/afl_results_2018.csv")
    #print(df_2018.shape)
    df_2019 = pd.read_csv("../data/afl_results_2019.csv")
    #print(df_2019.shape)
    df_2020 = pd.read_csv("../data/afl_results_2020.csv")
    #print(df_2020.shape)
    df_2021 = pd.read_csv("../data/afl_results_2021.csv")
    #print(df_2021.shape)
    df_2022 = pd.read_csv("../data/afl_results_2022.csv")
    pred_round_results = df_2022[df_2022['round.roundNumber'] == pred_round]
    df_2022 = df_2022[df_2022['round.roundNumber'] < pred_round]
    
    
    #print(df_2022.shape)
    df_all = pd.concat([df_2017, df_2018, df_2019, df_2020, df_2021,df_2022], axis=0)
    df_all['Date'] = pd.to_datetime(df_all['match.date']).dt.strftime("%Y-%m-%d")
    df_players_2017 = pd.read_csv("../data/afl_players_stats_2017.csv")
    #print(df_players_2017.shape)
    df_players_2018 = pd.read_csv("../data/afl_players_stats_2018.csv")
    #print(df_players_2018.shape)
    df_players_2019 = pd.read_csv("../data/afl_players_stats_2019.csv")
    #print(df_players_2019.shape)
    df_players_2020 = pd.read_csv("../data/afl_players_stats_2020.csv")
    #print(df_players_2020.shape)
    df_players_2021 = pd.read_csv("../data/afl_players_stats_2021.csv")
    #print(df_players_2021.shape)
    df_players_2022 = pd.read_csv("../data/afl_players_stats_2022.csv")
    df_players_2022 = df_players_2022[df_players_2022['Round'] < pred_round]
    
    #print(df_players_2022.shape)
    df_players = pd.concat([df_players_2017, df_players_2018, df_players_2019,df_players_2020,df_players_2021,df_players_2022], axis=0)
    #print(df_players.shape)
    #df_players.columns
    
    df_fixture = pd.read_csv("../data/fixture_2022.csv")
    df_next_games_teams = df_fixture[(df_fixture['round.roundNumber'] == pred_round)]
    df_next_games_teams = df_next_games_teams[['home.team.name','away.team.name','venue.name','compSeason.year','round.roundNumber']]
    df_next_games_teams = df_next_games_teams.rename(columns={'home.team.name': 'match.homeTeam.name', 'away.team.name': 'match.awayTeam.name','compSeason.year':'round.year'})
    df_next_games_teams['match.matchId'] = np.arange(len(df_next_games_teams))
    
    return df_all, df_players, df_fixture, df_next_games_teams, pred_round_results

def get_aggregate_player_stats(df=None):

    agg_stats = (df.rename(columns={ # Rename columns to lowercase
                    'Home.team': 'match.homeTeam.name',
                    'Away.team': 'match.awayTeam.name',
                    })
                   .groupby(by=['Date', 'Season', 'match.homeTeam.name', 'match.awayTeam.name'], as_index=False) # Groupby to aggregate the stats for each game
                   .sum()
                   #.drop(columns=['DE', 'TOG', 'Match_id']) # Drop columns
                   .assign(date=lambda df: pd.to_datetime(df.Date, format="%Y-%m-%d")) # Create a datetime object
                   .sort_values(by='Date')
                   .reset_index(drop=True))
    return agg_stats

In [10]:
df_all, df_players, df_fixture, df_next_games_teams, pred_round_results = load_afl_data(8)
agg_player = get_aggregate_player_stats(df_players)
afl_df = df_all.merge(agg_player, on=['Date', 'match.homeTeam.name', 'match.awayTeam.name'], how='left')
print(afl_df.shape)

(1053, 123)


In [11]:
# Add average goal diff for home and away team rolling 4 games

afl_df['HTGDIFF'] = afl_df['homeTeamScore.matchScore.goals'] - afl_df['awayTeamScore.matchScore.goals']
afl_df['ATGDIFF'] = afl_df['awayTeamScore.matchScore.goals'] - afl_df['homeTeamScore.matchScore.goals']

def from_dict_value_to_df(d):
    """
    input = dictionary 
    output = dataframe as part of all the values from the dictionary
    """
    df = pd.DataFrame()
    for v in d.values():
        df = pd.concat([df,v])
    return df

def avg_goal_diff(df, avg_h_a_diff, a_h_team, a_h_goal_letter):
    """
    input: 
        df = dataframe with all results
        avg_h_a_diff = name of the new column
        a_h_team = HomeTeam or AwayTeam
        a_h_goal_letter = 'H' for home or 'A' for away
    output: 
        avg_per_team = dictionary with with team as key and columns as values with new column H/ATGDIFF
    """
    df[avg_h_a_diff] = 0
    avg_per_team = {}
    all_teams = df[a_h_team].unique()
    for t in all_teams:
        df_team = df[df[a_h_team]==t].fillna(0)
        result = df_team['{}TGDIFF'.format(a_h_goal_letter)].rolling(4).mean()
        df_team[avg_h_a_diff] = result
        avg_per_team[t] = df_team
    return avg_per_team

d_AVGFTHG = avg_goal_diff(afl_df, 'AVGHTGDIFF', 'match.homeTeam.name', 'H')
df_AVGFTHG = from_dict_value_to_df(d_AVGFTHG)
df_AVGFTHG.sort_index(inplace=True)
d_AVGFTAG = avg_goal_diff(df_AVGFTHG, 'AVGATGDIFF', 'match.awayTeam.name', 'A')
afl_df = from_dict_value_to_df(d_AVGFTAG)
afl_df.sort_index(inplace=True)
afl_df['AVGATGDIFF'].fillna(0, inplace=True)




In [12]:
afl_df['goal_diff'] = afl_df['homeTeamScore.matchScore.goals'] - afl_df['awayTeamScore.matchScore.goals']

for index, row in df_all[df_all['match.status']=='CONCLUDED'].iterrows():
    if afl_df['goal_diff'][index] > 0:
        afl_df.at[index,'result'] = 1   # 1 is a win
    else:
        afl_df.at[index,'result'] = 0  # 0 is a loss 

In [13]:
def previous_data(df, h_or_a_team, column, letter, past_n):
    """
    input: 
        df = dataframe with all results
        a_h_team = HomeTeam or AwayTeam
        column = column selected to get previous data from
    output:
        team_with_past_dict = dictionary with team as a key and columns as values with new 
                              columns with past value
    """
    d = dict()
    team_with_past_dict = dict()
    all_teams = df[h_or_a_team].unique()
    for team in all_teams:
        n_games = len(df[df[h_or_a_team]==team])
        team_with_past_dict[team] = df[df[h_or_a_team]==team]
        for i in range(1, past_n):
            d[i] = team_with_past_dict[team].assign(
                result=team_with_past_dict[team].groupby(h_or_a_team)[column].shift(i)
            ).fillna({'{}_X'.format(column): 0})
            team_with_past_dict[team]['{}_{}_{}'.format(letter, column, i)] = d[i].result
    return team_with_past_dict

def previous_data_call(df, side, column, letter, iterations):
    d = previous_data(df, side, column, letter, iterations)
    df_result= from_dict_value_to_df(d)
    df_result.sort_index(inplace=True)
    return df_result

df_last_home_results = previous_data_call(afl_df, 'match.homeTeam.name', 'result', 'H', 3)
df_last_away_results = previous_data_call(df_last_home_results, 'match.awayTeam.name', 'result', 'A', 3)
df_last_last_HTGDIFF_results = previous_data_call(df_last_away_results, 'match.homeTeam.name', 'HTGDIFF', 'H', 3)
df_last_last_ATGDIFF_results = previous_data_call(df_last_last_HTGDIFF_results, 'match.awayTeam.name', 'ATGDIFF', 'A', 3)
df_last_AVGFTHG_results = previous_data_call(df_last_last_ATGDIFF_results, 'match.homeTeam.name', 'AVGHTGDIFF', 'H', 2)
df_last_AVGFTAG_results = previous_data_call(df_last_AVGFTHG_results, 'match.awayTeam.name', 'AVGATGDIFF', 'A', 2)
afl_df = df_last_AVGFTAG_results.copy()

In [14]:

all_cols = ['match.matchId','match.date', 'match.status', 'match.venue', 'match.homeTeam.name', 'match.awayTeam.name','venue.name', 'venue.state', 'round.name', 'round.year', 'round.roundNumber', 'status',
'homeTeamScore.rushedBehinds', 'homeTeamScore.minutesInFront',
       'homeTeamScore.matchScore.totalScore', 'homeTeamScore.matchScore.goals',
       'homeTeamScore.matchScore.behinds',
       'homeTeamScore.matchScore.superGoals', 'awayTeamScore.rushedBehinds',
       'awayTeamScore.minutesInFront', 'awayTeamScore.matchScore.totalScore',
       'awayTeamScore.matchScore.goals', 'awayTeamScore.matchScore.behinds',
       'awayTeamScore.matchScore.superGoals', 'weather.tempInCelsius',
       'homeTeamScoreChart.goals', 'homeTeamScoreChart.leftBehinds',
       'homeTeamScoreChart.rightBehinds', 'homeTeamScoreChart.leftPosters',
       'homeTeamScoreChart.rightPosters', 'homeTeamScoreChart.rushedBehinds',
       'homeTeamScoreChart.touchedBehinds', 'awayTeamScoreChart.goals',
       'awayTeamScoreChart.leftBehinds', 'awayTeamScoreChart.rightBehinds',
       'awayTeamScoreChart.leftPosters', 'awayTeamScoreChart.rightPosters',
       'awayTeamScoreChart.rushedBehinds', 'awayTeamScoreChart.touchedBehinds', 
       'HQ1G', 'HQ1B', 'HQ2G',
       'HQ2B', 'HQ3G', 'HQ3B', 'HQ4G', 'HQ4B', 'Home.score', 'AQ1G', 'AQ1B',
       'AQ2G', 'AQ2B', 'AQ3G', 'AQ3B', 'AQ4G', 'AQ4B', 'Away.score',
       'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'Hit.Outs',
       'Tackles', 'Rebounds', 'Inside.50s', 'Clearances', 'Clangers',
       'Frees.For', 'Frees.Against', 'Brownlow.Votes', 'Contested.Possessions',
       'Uncontested.Possessions', 'Contested.Marks', 'Marks.Inside.50',
       'One.Percenters', 'Bounces', 'Goal.Assists', 'Time.on.Ground..',
       'Substitute', 'group_id', 'HTGDIFF', 'ATGDIFF', 'AVGHTGDIFF',
       'AVGATGDIFF', 'goal_diff', 'result', 'H_result_1', 'H_result_2',
       'A_result_1', 'A_result_2', 'H_HTGDIFF_1', 'H_HTGDIFF_2', 'A_ATGDIFF_1',
       'A_ATGDIFF_2', 'H_AVGHTGDIFF_1', 'A_AVGATGDIFF_1']

non_feature_cols = ['match.matchId','match.date', 'match.status', 'match.venue', 'match.homeTeam.name', 'match.awayTeam.name','venue.name', 'venue.state', 'round.name', 'round.year', 'round.roundNumber', 'status','Season']
feature_cols = [
       'homeTeamScore.rushedBehinds', 'homeTeamScore.minutesInFront',
       'homeTeamScore.matchScore.totalScore', 'homeTeamScore.matchScore.goals',
       'homeTeamScore.matchScore.behinds',
       'homeTeamScore.matchScore.superGoals', 'awayTeamScore.rushedBehinds',
       'awayTeamScore.minutesInFront', 'awayTeamScore.matchScore.totalScore',
       'awayTeamScore.matchScore.goals', 'awayTeamScore.matchScore.behinds',
       'awayTeamScore.matchScore.superGoals', 'weather.tempInCelsius',
       'homeTeamScoreChart.goals', 'homeTeamScoreChart.leftBehinds',
       'homeTeamScoreChart.rightBehinds', 'homeTeamScoreChart.leftPosters',
       'homeTeamScoreChart.rightPosters', 'homeTeamScoreChart.rushedBehinds',
       'homeTeamScoreChart.touchedBehinds', 'awayTeamScoreChart.goals',
       'awayTeamScoreChart.leftBehinds', 'awayTeamScoreChart.rightBehinds',
       'awayTeamScoreChart.leftPosters', 'awayTeamScoreChart.rightPosters',
       'awayTeamScoreChart.rushedBehinds', 'awayTeamScoreChart.touchedBehinds', 
       'HQ1G', 'HQ1B', 'HQ2G',
       'HQ2B', 'HQ3G', 'HQ3B', 'HQ4G', 'HQ4B', 'Home.score', 'AQ1G', 'AQ1B',
       'AQ2G', 'AQ2B', 'AQ3G', 'AQ3B', 'AQ4G', 'AQ4B', 'Away.score',
       'Kicks', 'Marks', 'Handballs', 'Goals', 'Behinds', 'Hit.Outs',
       'Tackles', 'Rebounds', 'Inside.50s', 'Clearances', 'Clangers',
       'Frees.For', 'Frees.Against', 'Brownlow.Votes', 'Contested.Possessions',
       'Uncontested.Possessions', 'Contested.Marks', 'Marks.Inside.50',
       'One.Percenters', 'Bounces', 'Goal.Assists', 'Time.on.Ground..',
       'Substitute', 'group_id', 'HTGDIFF', 'ATGDIFF', 'AVGHTGDIFF',
       'AVGATGDIFF', 'goal_diff', 'result', 'H_result_1', 'H_result_2',
       'A_result_1', 'A_result_2', 'H_HTGDIFF_1', 'H_HTGDIFF_2', 'A_ATGDIFF_1',
       'A_ATGDIFF_2', 'H_AVGHTGDIFF_1', 'A_AVGATGDIFF_1']

In [15]:
afl_df = afl_df[all_cols] 

afl_df = afl_df.rename(columns={col: 'f_' + col for col in afl_df if col not in non_feature_cols})


In [16]:
# TEST ELO APPLIER



afl_df.head()

Unnamed: 0,match.matchId,match.date,match.status,match.venue,match.homeTeam.name,match.awayTeam.name,venue.name,venue.state,round.name,round.year,...,f_H_result_1,f_H_result_2,f_A_result_1,f_A_result_2,f_H_HTGDIFF_1,f_H_HTGDIFF_2,f_A_ATGDIFF_1,f_A_ATGDIFF_2,f_H_AVGHTGDIFF_1,f_A_AVGATGDIFF_1
0,CD_M20170140101,2017-03-23 08:20:00,CONCLUDED,CD_V40,Carlton,Richmond,MCG,VIC,Round 1,2017,...,,,,,,,,,,
1,CD_M20170140102,2017-03-24 08:50:00,CONCLUDED,CD_V40,Collingwood,Western Bulldogs,MCG,VIC,Round 1,2017,...,,,,,,,,,,
2,CD_M20170140103,2017-03-25 05:35:00,CONCLUDED,CD_V190,St Kilda,Melbourne,Marvel Stadium,VIC,Round 1,2017,...,,,,,,,,,,
3,CD_M20170140104,2017-03-25 05:35:00,CONCLUDED,CD_V60,Sydney Swans,Port Adelaide,SCG,NSW,Round 1,2017,...,,,,,,,,,,
4,CD_M20170140106,2017-03-25 08:25:00,CONCLUDED,CD_V40,Essendon,Hawthorn,MCG,VIC,Round 1,2017,...,,,,,,,,,,


In [17]:

def create_training_and_test_data(afl_df,df_next_games_teams):
    
    # Define a function which returns a DataFrame with the expontential moving average for each numeric stat
    def create_exp_weighted_avgs(df, span):
        # Create a copy of the df with only the game id and the team - we will add cols to this df
        ema_features = df[['match.matchId', 'match.homeTeam.name']].copy()

        feature_names = [col for col in df.columns if col.startswith('f_')] # Get a list of columns we will iterate over

        for feature_name in feature_names:
            feature_ema = (df.groupby('match.homeTeam.name')[feature_name]
                             .transform(lambda row: (row.ewm(span=span)
                                                        .mean()
                                                        .shift(1))))
            ema_features[feature_name] = feature_ema

        return ema_features

        # Define a function which finds the elo for each team in each game and returns a dictionary with the game ID as a key and the
    # elos as the key's value, in a list. It also outputs the probabilities and a dictionary of the final elos for each team
    def elo_applier(df, k_factor):
        # Initialise a dictionary with default elos for each team
        elo_dict = {team: 1500 for team in df['match.homeTeam.name'].unique()}
        elos, elo_probs = {}, {}

        # Get a home and away dataframe so that we can get the teams on the same row
        #home_df = df.loc[df.home_game == 1, ['match.homeTeam.name', 'match.matchId', 'f_margin', 'home_game']].rename(columns={'team': 'home_team'})
        #away_df = df.loc[df.home_game == 0, ['match.homeTeam.name', 'match.matchId']].rename(columns={'team': 'away_team'})

        #df = (pd.merge(home_df, away_df, on='game')
        #        .sort_values(by='game')
        #        .drop_duplicates(subset='game', keep='first')
        #        .reset_index(drop=True))


        # Loop over the rows in the DataFrame
        for index, row in df.iterrows():
            # Get the Game ID
            game_id = row['match.matchId']

            # Get the margin
            margin = row['f_goal_diff']

            # If the game already has the elos for the home and away team in the elos dictionary, go to the next game
            if game_id in elos.keys():
                continue

            # Get the team and opposition
            home_team = row['match.homeTeam.name']
            away_team = row['match.awayTeam.name']

            # Get the team and opposition elo score
            home_team_elo = elo_dict[home_team]
            away_team_elo = elo_dict[away_team]

            # Calculated the probability of winning for the team and opposition
            prob_win_home = 1 / (1 + 10**((away_team_elo - home_team_elo) / 400))
            prob_win_away = 1 - prob_win_home

            # Add the elos and probabilities our elos dictionary and elo_probs dictionary based on the Game ID
            elos[game_id] = [home_team_elo, away_team_elo]
            elo_probs[game_id] = [prob_win_home, prob_win_away]

            # Calculate the new elos of each team
            if margin > 0: # Home team wins; update both teams' elo
                new_home_team_elo = home_team_elo + k_factor*(1 - prob_win_home)
                new_away_team_elo = away_team_elo + k_factor*(0 - prob_win_away)
            elif margin < 0: # Away team wins; update both teams' elo
                new_home_team_elo = home_team_elo + k_factor*(0 - prob_win_home)
                new_away_team_elo = away_team_elo + k_factor*(1 - prob_win_away)
            elif margin == 0: # Drawn game' update both teams' elo
                new_home_team_elo = home_team_elo + k_factor*(0.5 - prob_win_home)
                new_away_team_elo = away_team_elo + k_factor*(0.5 - prob_win_away)

            # Update elos in elo dictionary
            elo_dict[home_team] = new_home_team_elo
            elo_dict[away_team] = new_away_team_elo

        return elos, elo_probs, elo_dict
    
    afl_df['train_data'] = 1
    df_next_games_teams['train_data'] = 0
    
    afl_data = afl_df.append(df_next_games_teams).reset_index(drop=True)
    
    features_rolling_averages = create_exp_weighted_avgs(afl_data, span=10)
    
    features = afl_data[['match.date', 'match.matchId', 'match.homeTeam.name', 'match.awayTeam.name', 'venue.name','round.year','train_data']].copy()
    features = pd.merge(features, features_rolling_averages, on=['match.matchId', 'match.homeTeam.name'])
    
    form_btwn_teams = afl_df[['match.matchId', 'match.homeTeam.name', 'match.awayTeam.name', 'f_goal_diff']].copy()


    elos, elo_probs, elo_dict = elo_applier(afl_data, 30)
    # Add our created features - elo, efficiency etc.
    
    features = (features.assign(f_elo_home=lambda df: df['match.matchId'].map(elos).apply(lambda x: x[0]),
                                                f_elo_away=lambda df: df['match.matchId'].map(elos).apply(lambda x: x[1]))
                                          .reset_index(drop=True))
    
#    form_btwn_teams_inv = pd.DataFrame()

#    for index, row in form_btwn_teams.iterrows():
#        home = row['match.homeTeam.name']
#        away = row['match.awayTeam.name']
#        matchid = row['match.matchId']
#        margin = row['f_goal_diff']

#        form_btwn_teams_inv = form_btwn_teams_inv.append({'match.matchId': matchid, 'match.homeTeam.name': away, 'match.awayTeam.name': home, 'f_goal_diff': -1*margin}, ignore_index=True)

#    form_btwn_teams['f_form_margin_btwn_teams'] = (form_btwn_teams.groupby(['match.homeTeam.name', 'match.awayTeam.name'])['f_goal_diff']
#                                                              .transform(lambda row: row.rolling(5).mean().shift())
#                                                              .fillna(0))

#    form_btwn_teams['f_form_past_5_btwn_teams'] = \
#    (form_btwn_teams.assign(win=lambda df: df.apply(lambda row: 1 if row.f_goal_diff > 0 else 0, axis='columns'))
#                  .groupby(['match.homeTeam.name', 'match.awayTeam.name'])['win']
#                  .transform(lambda row: row.rolling(5).mean().shift() * 5)
#                  .fillna(0))


    #print(features.shape)
    # Merge to our features df
    #features = pd.merge(features, form_btwn_teams_1.drop(columns=['f_goal_diff']), on=['match.matchId', 'match.homeTeam.name', 'match.awayTeam.name'])
    #print(features.shape)
    
    # Get the result and merge to the feature_df

    match_results = (afl_df.assign(result=lambda df: df.apply(lambda row: 1 if row['f_goal_diff'] > 0 else 0, axis=1)))
    # Merge result column to feature_df
    feature_df = pd.merge(features, match_results[['match.matchId', 'result']], on='match.matchId')

    return feature_df,features_rolling_averages, afl_data, features
    

In [18]:
feature_df, features_rolling_averages, afl_data, features = create_training_and_test_data(afl_df,df_next_games_teams)
feature_columns = [col for col in feature_df if col.startswith('f_')]


In [19]:
features['f_elo_home'] = features['f_elo_home']/1500
features['f_elo_away'] = features['f_elo_away']/1500




In [20]:

# Build model from feature_df

feature_df = feature_df.dropna()

all_X = feature_df.loc[:, feature_columns]
all_y = feature_df.loc[:, 'result']

X_train, X_test, y_train, y_test = train_test_split(all_X, all_y, test_size=0.30, random_state=42)

# Scale features
scaler = StandardScaler()
X_train[feature_columns] = scaler.fit_transform(X_train[feature_columns])
X_test[feature_columns] = scaler.transform(X_test[feature_columns])


In [21]:
# Create a list of standard classifiers
classifiers = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),

    #Gaussian Processes
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    
    #Navies Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #SVM
    svm.SVC(probability=True),
    svm.NuSVC(probability=True),
    
    #Discriminant Analysis
    discriminant_analysis.LinearDiscriminantAnalysis(),
    discriminant_analysis.QuadraticDiscriminantAnalysis(),

    
    #xgboost: http://xgboost.readthedocs.io/en/latest/model.html
    XGBClassifier()    
]

# Define a functiom which finds the best algorithms for our modelling task
def find_best_algorithms(classifier_list, X, y):
    # This function is adapted from https://www.kaggle.com/yassineghouzam/titanic-top-4-with-ensemble-modeling
    # Cross validate model with Kfold stratified cross validation
    kfold = StratifiedKFold(n_splits=5)
    
    # Grab the cross validation scores for each algorithm
    cv_results = [cross_val_score(classifier, X, y, scoring = "neg_log_loss", cv = kfold) for classifier in classifier_list]
    cv_means = [cv_result.mean() * -1 for cv_result in cv_results]
    cv_std = [cv_result.std() for cv_result in cv_results]
    algorithm_names = [alg.__class__.__name__ for alg in classifiers]
    
    # Create a DataFrame of all the CV results
    cv_results = pd.DataFrame({
        "Mean Log Loss": cv_means,
        "Log Loss Std": cv_std,
        "Algorithm": algorithm_names
    })
    
    
    return cv_results.sort_values(by='Mean Log Loss').reset_index(drop=True)

In [22]:
best_algos = find_best_algorithms(classifiers, X_train, y_train)
best_algos




Unnamed: 0,Mean Log Loss,Log Loss Std,Algorithm
0,0.660736,0.009009,SVC
1,0.67063,0.013604,NuSVC
2,0.678665,0.017137,RandomForestClassifier
3,0.67993,0.020397,ExtraTreesClassifier
4,0.683995,0.051406,LogisticRegressionCV
5,0.689025,0.002086,AdaBoostClassifier
6,0.692601,0.000326,GaussianProcessClassifier
7,0.713101,0.041002,LinearDiscriminantAnalysis
8,0.715146,0.045681,GradientBoostingClassifier
9,0.950586,0.018488,XGBClassifier


In [23]:
def new_scorer(y,y_pred):
    xx = pd.DataFrame({'y':list(y), 'y_pred': list(y_pred)}, columns=['y','y_pred'])     
    xx.loc[xx['y'].astype(int)==0,'actual_score'] = 1+np.log2(1-xx['y_pred'].astype(float))
   
    xx.loc[xx['y'].astype(int)==1,'actual_score'] = 1+np.log2(xx['y_pred'].astype(float))
   
    return np.mean(xx['actual_score'])

In [24]:
kfold = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(ensemble.RandomForestClassifier(), X_train, y_train, scoring='accuracy', cv=kfold)
cv_scores.mean()

0.5882631038026721

In [25]:
# Try a logistic regression model and see how it performs in terms of accuracy
kfold = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(linear_model.LogisticRegressionCV(), X_train, y_train, scoring='accuracy', cv=kfold)
cv_scores.mean()

0.5953854059609455

In [26]:
#new_scorer(X_train, y_train)

#kfold = StratifiedKFold(n_splits=5)
#cv_scores = cross_val_score(linear_model.LogisticRegressionCV(), X_train, y_train, scoring=new_scorer, cv=kfold)
#cv_scores.mean()

In [27]:
kfold = StratifiedKFold(n_splits=5)
cv_scores = cross_val_score(XGBClassifier(), X_train, y_train, scoring='accuracy', cv=kfold)
cv_scores.mean()



0.5940184994861254

In [28]:
# Define a function which optimises the hyperparameters of our chosen algorithms
def optimise_hyperparameters(train_x, train_y, algorithms, parameters):
    kfold = StratifiedKFold(n_splits=5)
    best_estimators = []
    
    for alg, params in zip(algorithms, parameters):
        gs = GridSearchCV(alg, param_grid=params, cv=kfold, scoring='neg_log_loss', verbose=1)
        gs.fit(train_x, train_y)
        best_estimators.append(gs.best_estimator_)
    return best_estimators

# Define our parameters to run a grid search over
lr_grid = {
    "C": [0.0001, 0.001, 0.01, 0.05, 0.2, 0.5],
    "solver": ["newton-cg", "lbfgs", "liblinear"]
}

# Add our algorithms and parameters to lists to be used in our function
alg_list = [LogisticRegression(), XGBClassifier(), ensemble.RandomForestClassifier()]
param_list = [lr_grid]

In [29]:
# Find the best estimators, then add our other estimators which don't need optimisation
best_estimators = optimise_hyperparameters(X_train, y_train, alg_list, param_list)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


In [30]:
lr_best_params = best_estimators[0].get_params()
lr_best_params

{'C': 0.01,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [31]:
best_estimators

[LogisticRegression(C=0.01)]

In [None]:
XGB_model = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=5,
     min_child_weight=1,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=27)
XGB_model.fit(X_train, y_train)
final_predictions_xgb = XGB_model.predict(X_test)

accuracy = (final_predictions_xgb == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))

In [64]:
XGB_model = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=7,
     min_child_weight=3,
     gamma=0.2,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=27)
XGB_model.fit(X_train, y_train)
final_predictions_xgb = XGB_model.predict(X_test)

accuracy = (final_predictions_xgb == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))


Our accuracy in predicting test data is: 62.54%


In [65]:
importance = np.round(XGB_model.feature_importances_,4)
dictionary = dict(zip(feature_columns, importance))
sorted_dictionary=sorted(dictionary.items(), key=lambda x:x[1], reverse=True)
names=[]
values=[]
for i in range(0, len(importance)):
    print('Feature Importance: {:35} {}%'.format(
        sorted_dictionary[i][0], np.round(sorted_dictionary[i][1]*100,4))
         )
    names.append(sorted_dictionary[i][0])
    values.append(np.round(sorted_dictionary[i][1]*100,4))

Feature Importance: f_Rebounds                          2.72%
Feature Importance: f_A_result_2                        2.27%
Feature Importance: f_elo_home                          2.14%
Feature Importance: f_AQ1B                              2.02%
Feature Importance: f_homeTeamScore.minutesInFront      1.82%
Feature Importance: f_AQ3G                              1.82%
Feature Importance: f_Tackles                           1.8%
Feature Importance: f_HQ3G                              1.75%
Feature Importance: f_elo_away                          1.69%
Feature Importance: f_result                            1.67%
Feature Importance: f_Marks.Inside.50                   1.61%
Feature Importance: f_HQ3B                              1.59%
Feature Importance: f_Away.score                        1.56%
Feature Importance: f_AQ1G                              1.54%
Feature Importance: f_AQ4G                              1.54%
Feature Importance: f_HTGDIFF                           1.53%
Feature I

In [33]:
param_test1 = {
 'max_depth':range(3,10,2),
 'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=140, max_depth=5,
 min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1, seed=27), 
 param_grid = param_test1, scoring='roc_auc',n_jobs=4, cv=5)
gsearch1.fit(X_train,y_train)
#gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
sorted(gsearch1.cv_results_.keys())



['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_max_depth',
 'param_min_child_weight',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [35]:
test_scores = gsearch1.cv_results_["mean_test_score"]
print(test_scores)

params = gsearch1.cv_results_["params"]

print(params[best_index_])

best_index_ = gsearch1.best_index_
print(best_index_)

array([0.64049622, 0.63871422, 0.64055047, 0.63954775, 0.6380201 ,
       0.63496735, 0.63059113, 0.64440708, 0.63415394, 0.6352818 ,
       0.63984514, 0.64196777])

{'max_depth': 7, 'min_child_weight': 3}

7

In [59]:
param_test3 = {
 'gamma':[i/10.0 for i in range(0,5)]
}
gsearch3 = GridSearchCV(estimator = XGBClassifier(
     learning_rate =0.1,
     n_estimators=140,
     max_depth=7,
     min_child_weight=3,
     gamma=0,
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=27), 
 param_grid = param_test3, scoring='roc_auc',n_jobs=4, cv=5)
gsearch3.fit(X_train,y_train)




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     enable_categorical=False, gamma=0,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=7, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=140, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weight

In [60]:
test_scores = gsearch3.cv_results_["mean_test_score"]
test_scores

array([0.64440708, 0.64486907, 0.64539413, 0.63851667, 0.64338918])

In [63]:
sorted(gsearch3.cv_results_.keys())


print(gsearch3.cv_results_["param_gamma"])



masked_array(data=[0.0, 0.1, 0.2, 0.3, 0.4],
             mask=[False, False, False, False, False],
       fill_value='?',
            dtype=object)

In [None]:
# try gamma = 0.2

In [49]:
print("The best_index_ is %d" % best_index_)
#print("The params selected is %d" % gsearch1.cv_results_["params"][best_index_])
print(
    "The corresponding accuracy score is %.2f"
    % gsearch1.cv_results_["mean_test_score"][best_index_]
)
plt.show()

The best_index_ is 7
The corresponding accuracy score is 0.64


In [1225]:
xgb = XGBClassifier()
xgb.fit(X_train, y_train)
final_predictions_xgb = xgb.predict(X_test)

accuracy = (final_predictions_xgb == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))

Our accuracy in predicting test data is: 63.55%


In [1226]:
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
final_predictions_svm = svm_model.predict(X_test)

accuracy = (final_predictions_svm == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))



Our accuracy in predicting test data is: 61.20%


In [72]:
param_test4 = {
 'subsample':[i/10.0 for i in range(6,10)],
 'colsample_bytree':[i/10.0 for i in range(6,10)]
}
gsearch4 = GridSearchCV(estimator = XGBClassifier( learning_rate =0.1, n_estimators=177, max_depth=7,
 min_child_weight=3, gamma=0.2, subsample=0.8, colsample_bytree=0.8,
 objective= 'binary:logistic', nthread=4, scale_pos_weight=1,seed=27), 
 param_grid = param_test4, scoring='roc_auc',n_jobs=4, cv=5)
gsearch4.fit(X_train,y_train)




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.8,
                                     enable_categorical=False, gamma=0.2,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=7, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=177, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_weig

In [76]:
test_scores = gsearch4.cv_results_["mean_test_score"]
print(test_scores)

params = gsearch4.cv_results_["params"]

#print(params[best_index_])

best_index_ = gsearch4.best_index_
print(best_index_)

[0.63210276 0.64153403 0.63952115 0.6301174  0.6339146  0.63166744
 0.63132741 0.63716252 0.63478474 0.6268784  0.64316587 0.64258789
 0.63981939 0.63113047 0.62603141 0.62992805]
10


In [77]:
param_test6 = {
 'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
}
gsearch6 = GridSearchCV(estimator = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=7,
     min_child_weight=3,
     gamma=0.2,
     subsample=0.9,  #0.2
     colsample_bytree=0.7,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=27), 
 param_grid = param_test6, scoring='roc_auc',n_jobs=4, cv=5)
gsearch6.fit(X_train,y_train)




GridSearchCV(cv=5,
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=0.7,
                                     enable_categorical=False, gamma=0.2,
                                     gpu_id=None, importance_type=None,
                                     interaction_constraints=None,
                                     learning_rate=0.1, max_delta_step=None,
                                     max_depth=7, min_child_weight=3,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=1000, n_jobs=None, nthread=4,
                                     num_parallel_tree=None, predictor=None,
                                     random_state=None, reg_alpha=None,
                                     reg_lambda=None, scale_pos_wei

In [80]:
test_scores = gsearch6.cv_results_["mean_test_score"]
print(test_scores)

params = gsearch6.cv_results_["params"]
print(params)
#print(params[best_index_])

best_index_ = gsearch6.best_index_
print(best_index_)

[0.63248993 0.63915488 0.62902933 0.63224597 0.5       ]
[{'reg_alpha': 1e-05}, {'reg_alpha': 0.01}, {'reg_alpha': 0.1}, {'reg_alpha': 1}, {'reg_alpha': 100}]
1


In [85]:
XGB_model = XGBClassifier(
     learning_rate =0.1,
     n_estimators=1000,
     max_depth=7,
     min_child_weight=3,
     gamma=0.2,
     subsample=0.9,  #0.2
     colsample_bytree=0.7,
	 reg_alpha=0.01,
     objective= 'binary:logistic',
     nthread=4,
     scale_pos_weight=1,
     seed=42)
XGB_model.fit(X_train, y_train)
final_predictions_xgb = XGB_model.predict(X_test)

accuracy = (final_predictions_xgb == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))

Our accuracy in predicting test data is: 63.55%


In [1227]:
#new_scorer(y_test,final_predictions)

In [1228]:
importance = np.round(xgb.feature_importances_,4)
dictionary = dict(zip(feature_columns, importance))
sorted_dictionary=sorted(dictionary.items(), key=lambda x:x[1], reverse=True)
names=[]
values=[]
for i in range(0, len(importance)):
    print('Feature Importance: {:35} {}%'.format(
        sorted_dictionary[i][0], np.round(sorted_dictionary[i][1]*100,4))
         )
    names.append(sorted_dictionary[i][0])
    values.append(np.round(sorted_dictionary[i][1]*100,4))

Feature Importance: f_elo_home                          4.08%
Feature Importance: f_AQ1B                              3.61%
Feature Importance: f_AQ4G                              3.47%
Feature Importance: f_Goal.Assists                      3.38%
Feature Importance: f_One.Percenters                    3.25%
Feature Importance: f_homeTeamScore.matchScore.behinds  2.89%
Feature Importance: f_Away.score                        2.28%
Feature Importance: f_homeTeamScore.minutesInFront      2.2%
Feature Importance: f_HQ1G                              2.2%
Feature Importance: f_A_result_2                        2.18%
Feature Importance: f_elo_away                          2.14%
Feature Importance: f_Clangers                          2.04%
Feature Importance: f_homeTeamScoreChart.leftBehinds    2.03%
Feature Importance: f_homeTeamScore.rushedBehinds       2.01%
Feature Importance: f_Marks                             1.94%
Feature Importance: f_Contested.Marks                   1.77%
Feature Im

In [1229]:
kfold = StratifiedKFold(n_splits=10)
cv_scores = cross_val_score(linear_model.LogisticRegression(**lr_best_params), X_train, y_train, scoring='neg_log_loss', cv=kfold)
cv_scores.mean()

-0.6543352746244679

In [1230]:
lr = LogisticRegression(**lr_best_params)
lr.fit(X_train, y_train)
final_predictions_lr = lr.predict(X_test)

accuracy = (final_predictions_lr == y_test).mean() * 100

print("Our accuracy in predicting test data is: {:.2f}%".format(accuracy))

Our accuracy in predicting test data is: 61.54%


In [1231]:
next_round_features = features[features['train_data']==0][feature_columns]

next_round_predictions = lr.predict(next_round_features)


In [1232]:
df_next_games_teams

Unnamed: 0,match.homeTeam.name,match.awayTeam.name,venue.name,round.year,round.roundNumber,match.matchId,train_data
63,Port Adelaide,Western Bulldogs,Adelaide Oval,2022,8,0,0
64,Fremantle,North Melbourne,Optus Stadium,2022,8,1,0
65,Richmond,Collingwood,MCG,2022,8,2,0
66,Sydney Swans,Gold Coast Suns,SCG,2022,8,3,0
67,GWS Giants,Geelong Cats,Manuka Oval,2022,8,4,0
68,Essendon,Hawthorn,Marvel Stadium,2022,8,5,0
69,Brisbane Lions,West Coast Eagles,Gabba,2022,8,6,0
70,Melbourne,St Kilda,MCG,2022,8,7,0
71,Carlton,Adelaide Crows,Marvel Stadium,2022,8,8,0


In [1233]:
pred_round_results.columns


pred_round_results['result'] = np.where(pred_round_results['homeTeamScore.matchScore.totalScore']>pred_round_results['awayTeamScore.matchScore.totalScore'],1,0)

pred_round_results[['match.homeTeam.name','match.awayTeam.name','round.roundNumber','homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore','result']]



Unnamed: 0,match.homeTeam.name,match.awayTeam.name,round.roundNumber,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,result
63,Port Adelaide,Western Bulldogs,8,86,69,1
64,Fremantle,North Melbourne,8,102,24,1
65,Richmond,Collingwood,8,113,86,1
66,Sydney Swans,Gold Coast Suns,8,61,75,0
67,GWS Giants,Geelong Cats,8,35,88,0
68,Essendon,Hawthorn,8,108,81,1
69,Brisbane Lions,West Coast Eagles,8,105,30,1
70,Melbourne,St Kilda,8,93,55,1
71,Carlton,Adelaide Crows,8,116,68,1


In [1234]:
next_round_predictions

array([0, 1, 0, 0, 0, 1, 1, 1, 1], dtype=int64)

In [1235]:
prediction_probs = lr.predict_proba(next_round_features)
prediction_probs.round(2)

array([[0.52, 0.48],
       [0.06, 0.94],
       [1.  , 0.  ],
       [0.62, 0.38],
       [0.53, 0.47],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ],
       [0.  , 1.  ]])

In [1236]:
actual_results = pred_round_results[['match.homeTeam.name','match.awayTeam.name','round.roundNumber','homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore','result']]
actual_results

Unnamed: 0,match.homeTeam.name,match.awayTeam.name,round.roundNumber,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,result
63,Port Adelaide,Western Bulldogs,8,86,69,1
64,Fremantle,North Melbourne,8,102,24,1
65,Richmond,Collingwood,8,113,86,1
66,Sydney Swans,Gold Coast Suns,8,61,75,0
67,GWS Giants,Geelong Cats,8,35,88,0
68,Essendon,Hawthorn,8,108,81,1
69,Brisbane Lions,West Coast Eagles,8,105,30,1
70,Melbourne,St Kilda,8,93,55,1
71,Carlton,Adelaide Crows,8,116,68,1


In [1237]:
df_next_games_teams['pred_home_result'] =  next_round_predictions
df_next_games_teams['pred_home_prob'] = prediction_probs[:,1].round(3)



In [1238]:

actual_results = pred_round_results[['match.homeTeam.name','match.awayTeam.name','round.roundNumber','homeTeamScore.matchScore.totalScore','awayTeamScore.matchScore.totalScore','result']]

df_next_games_teams = pd.merge(df_next_games_teams, actual_results, on=['match.homeTeam.name', 'match.awayTeam.name'])
    
df_next_games_teams


Unnamed: 0,match.homeTeam.name,match.awayTeam.name,venue.name,round.year,round.roundNumber_x,match.matchId,train_data,pred_home_result,pred_home_prob,round.roundNumber_y,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,result
0,Port Adelaide,Western Bulldogs,Adelaide Oval,2022,8,0,0,0,0.483,8,86,69,1
1,Fremantle,North Melbourne,Optus Stadium,2022,8,1,0,1,0.94,8,102,24,1
2,Richmond,Collingwood,MCG,2022,8,2,0,0,0.0,8,113,86,1
3,Sydney Swans,Gold Coast Suns,SCG,2022,8,3,0,0,0.377,8,61,75,0
4,GWS Giants,Geelong Cats,Manuka Oval,2022,8,4,0,0,0.466,8,35,88,0
5,Essendon,Hawthorn,Marvel Stadium,2022,8,5,0,1,1.0,8,108,81,1
6,Brisbane Lions,West Coast Eagles,Gabba,2022,8,6,0,1,1.0,8,105,30,1
7,Melbourne,St Kilda,MCG,2022,8,7,0,1,1.0,8,93,55,1
8,Carlton,Adelaide Crows,Marvel Stadium,2022,8,8,0,1,1.0,8,116,68,1


In [1239]:
df_next_games_teams['score_1'] = 0.0
df_next_games_teams['score_2'] = 0.0

for i in range(len(df_next_games_teams)):
    
    p = df_next_games_teams['pred_home_prob'].values[i] 
    
    if p > 0.9:
        p = 0.9
    elif p < 0.1:
        p = 0.1
    
    if df_next_games_teams['homeTeamScore.matchScore.totalScore'].values[i] == df_next_games_teams['awayTeamScore.matchScore.totalScore'].values[i]:
        df_next_games_teams['score_1'].values[i] = 1.0 + 0.5 * np.log2(p*(1-p))
        df_next_games_teams['score_2'].values[i] = 1.0 + 0.5 * np.log2(p*(1-p))
        df_next_games_teams['score_3'].values[i] = 1.0 + 0.5 * np.log2(q*(1-q))

    elif (df_next_games_teams['pred_home_result'].values[i] == df_next_games_teams['result'].values[i]):
        df_next_games_teams['score_1'].values[i] = 1.0 + np.log2(p)
        if df_next_games_teams['pred_home_result'].values[i] == 1:
            df_next_games_teams['score_2'].values[i] = 1.0 + np.log2(p)
            df_next_games_teams['score_3'].values[i] = 1.0 + np.log2(q)
        elif df_next_games_teams['pred_home_result'].values[i] == 0:
            df_next_games_teams['score_2'].values[i] = 1.0 + np.log2(1.0-p)
            df_next_games_teams['score_3'].values[i] = 1.0 + np.log2(1.0-q)

    elif df_next_games_teams['pred_home_result'].values[i] != df_next_games_teams['result'].values[i]:
        df_next_games_teams['score_1'].values[i] = 1.0 + np.log2(1.0 - p)

        if df_next_games_teams['pred_home_result'].values[i] == 1:
            df_next_games_teams['score_2'].values[i] = 1.0 + np.log2(1.0 - p)
            df_next_games_teams['score_3'].values[i] = 1.0 + np.log2(1.0 - q)
        elif df_next_games_teams['pred_home_result'].values[i] == 0:
            df_next_games_teams['score_2'].values[i] = 1.0 + np.log2(1.0-(1.0-p))
            df_next_games_teams['score_3'].values[i] = 1.0 + np.log2(1.0-(1.0-q))
        
        
        
    

In [1240]:
df_next_games_teams


Unnamed: 0,match.homeTeam.name,match.awayTeam.name,venue.name,round.year,round.roundNumber_x,match.matchId,train_data,pred_home_result,pred_home_prob,round.roundNumber_y,homeTeamScore.matchScore.totalScore,awayTeamScore.matchScore.totalScore,result,score_1,score_2
0,Port Adelaide,Western Bulldogs,Adelaide Oval,2022,8,0,0,0,0.483,8,86,69,1,0.048236,0.048236
1,Fremantle,North Melbourne,Optus Stadium,2022,8,1,0,1,0.94,8,102,24,1,0.847997,0.0
2,Richmond,Collingwood,MCG,2022,8,2,0,0,0.0,8,113,86,1,0.847997,0.847997
3,Sydney Swans,Gold Coast Suns,SCG,2022,8,3,0,0,0.377,8,61,75,0,-0.407364,0.317304
4,GWS Giants,Geelong Cats,Manuka Oval,2022,8,4,0,0,0.466,8,35,88,0,-0.101598,0.094912
5,Essendon,Hawthorn,Marvel Stadium,2022,8,5,0,1,1.0,8,108,81,1,0.847997,0.0
6,Brisbane Lions,West Coast Eagles,Gabba,2022,8,6,0,1,1.0,8,105,30,1,0.847997,0.0
7,Melbourne,St Kilda,MCG,2022,8,7,0,1,1.0,8,93,55,1,0.847997,0.0
8,Carlton,Adelaide Crows,Marvel Stadium,2022,8,8,0,1,1.0,8,116,68,1,0.847997,0.0


In [1241]:
print(df_next_games_teams['score_1'].sum())
print(df_next_games_teams['score_2'].sum())

4.627255913581319
1.3084488075951792


In [99]:


# Can we train and retrain model to get max score for a Round?

# predict rounds and plot score based on model




In [1274]:

from test import *
import test

In [1276]:
asd = test(1)

TypeError: 'module' object is not callable