# 05 - BackTesting

In [1]:
# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os

#notebook settings
pd.set_option('display.max_columns', None)

#pipelines
from sklearn.pipeline import Pipeline

# building a pipeline to preprocess the data
from sklearn.metrics import classification_report

In [2]:
# load the model
# Load the model from the file
filename = os.path.join('..','models','football_model.pkl')
loaded_model = joblib.load(filename)


In [3]:
# load X_test, y_test and back testing data
X_test = pd.read_csv(os.path.join('..','data','processed','X_test.csv'))
y_test = pd.read_csv(os.path.join('..','data','processed','y_test.csv'))
data_for_back_testing = pd.read_csv(os.path.join('..','data','processed','data_for_back_testing.csv'))

# make predictions
y_pred = loaded_model.predict(X_test)

In [4]:
X_test.head()

Unnamed: 0,home_team,away_team,day_of_week,month,day_of_week_sin,day_of_week_cos,month_sin,month_cos,home_roll_3_avg_home_corners,away_roll_3_avg_home_corners,home_roll_3_avg_away_corners,away_roll_3_avg_away_corners,home_roll_3_avg_home_yellow_cards,away_roll_3_avg_home_yellow_cards,home_roll_3_avg_away_yellow_cards,away_roll_3_avg_away_yellow_cards,home_roll_3_avg_home_red_cards,away_roll_3_avg_home_red_cards,home_roll_3_avg_away_red_cards,away_roll_3_avg_away_red_cards,home_roll_3_avg_home_shots_to_away_shots,away_roll_3_avg_home_shots_to_away_shots,home_roll_3_avg_home_fouls_to_away_fouls,away_roll_3_avg_home_fouls_to_away_fouls,home_roll_3_avg_away_shots_to_home_shots,away_roll_3_avg_away_shots_to_home_shots,home_roll_3_avg_away_fouls_to_home_fouls,away_roll_3_avg_away_fouls_to_home_fouls,home_roll_5_avg_home_corners,away_roll_5_avg_home_corners,home_roll_5_avg_away_corners,away_roll_5_avg_away_corners,home_roll_5_avg_home_yellow_cards,away_roll_5_avg_home_yellow_cards,home_roll_5_avg_away_yellow_cards,away_roll_5_avg_away_yellow_cards,home_roll_5_avg_home_red_cards,away_roll_5_avg_home_red_cards,home_roll_5_avg_away_red_cards,away_roll_5_avg_away_red_cards,home_roll_5_avg_home_shots_to_away_shots,away_roll_5_avg_home_shots_to_away_shots,home_roll_5_avg_away_shots_to_home_shots,away_roll_5_avg_away_shots_to_home_shots,home_cumulative_points,away_cumulative_points
0,man city,bolton,6,5,-2.449294e-16,1.0,0.5,-0.8660254,6.0,7.666667,6.0,5.0,1.0,1.333333,2.0,2.333333,0.0,0.0,0.0,0.0,0.707158,2.342857,1.041667,0.944258,1.458689,0.525,1.022222,1.177489,6.2,7.6,6.0,5.0,1.4,1.6,2.8,1.8,0.0,0.0,0.2,0.0,1.452866,1.74619,1.035214,0.787727,39,15
1,swansea,west ham,6,12,-2.449294e-16,1.0,-2.449294e-16,1.0,3.666667,6.666667,5.666667,3.333333,2.333333,2.0,1.666667,0.666667,0.0,0.0,0.0,0.333333,0.615278,2.344444,0.883319,1.156695,1.643182,0.427706,2.734127,0.962251,3.4,4.4,5.0,5.4,2.0,1.8,2.2,1.8,0.0,0.4,0.0,0.2,0.796067,1.576667,1.459935,1.389957,10,15
2,liverpool,man united,5,10,-0.8660254,0.5,-0.8660254,0.5,8.333333,6.333333,3.666667,6.666667,2.333333,2.0,3.333333,0.666667,0.0,0.0,0.0,0.0,1.478211,1.296037,1.543651,6.37037,0.853705,0.788095,0.835294,0.31131,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,8
3,stoke,bournemouth,5,11,-0.8660254,0.5,-0.5,0.8660254,6.0,4.333333,4.666667,5.666667,2.0,1.666667,1.666667,2.0,0.0,0.333333,0.0,0.0,1.394661,1.848485,0.653846,0.914646,0.728214,0.815686,1.581481,1.249183,7.0,4.8,5.8,5.4,2.4,2.0,2.2,1.6,0.0,0.2,0.0,0.2,1.12013,1.964091,1.003595,0.679412,7,5
4,sunderland,man united,5,3,-0.8660254,0.5,1.0,6.123234000000001e-17,7.666667,5.333333,4.666667,5.333333,1.666667,0.666667,2.666667,1.666667,0.0,0.0,0.666667,0.0,1.555556,1.970085,0.701465,0.9,0.840278,0.736957,1.574074,1.491667,6.0,4.0,4.8,5.6,1.2,1.0,2.6,2.0,0.0,0.0,0.4,0.0,1.377778,1.42942,1.504167,1.113602,18,35


In [6]:
# what are the probability of the model predicting the target variable
y_pred_proba = loaded_model.predict_proba(X_test)
display(y_pred_proba)

# what are the team names that goint to win
y_pred = loaded_model.predict(X_test)
display(y_pred)

array([[0.1251052 , 0.10553091, 0.76936394],
       [0.555249  , 0.34006715, 0.10468385],
       [0.23830177, 0.4415077 , 0.32019052],
       ...,
       [0.0630325 , 0.21932243, 0.7176451 ],
       [0.7698022 , 0.16441253, 0.0657852 ],
       [0.00105211, 0.00551937, 0.9934285 ]], dtype=float32)

array([2, 0, 1, ..., 2, 0, 2])

In [7]:
print(data_for_back_testing.shape)
data_for_back_testing.head()

(7320, 3)


Unnamed: 0,implied_home_win_prob,implied_draw_prob,implied_away_win_prob
0,0.404145,0.28601,0.309845
1,0.185958,0.273467,0.540575
2,0.391387,0.285412,0.323201
3,0.540575,0.273467,0.185958
4,0.323341,0.289997,0.386662


In [8]:
y_pred_proba

array([[0.1251052 , 0.10553091, 0.76936394],
       [0.555249  , 0.34006715, 0.10468385],
       [0.23830177, 0.4415077 , 0.32019052],
       ...,
       [0.0630325 , 0.21932243, 0.7176451 ],
       [0.7698022 , 0.16441253, 0.0657852 ],
       [0.00105211, 0.00551937, 0.9934285 ]], dtype=float32)

In [None]:
# Create a DataFrame to store team names and y_true values
team_names = X_test.loc[:, ['home_team', 'away_team']].copy()

#add the tru labels
team_names['true_results'] = y_test

# Add predicted results to the DataFrame decoded from the label encoding
team_names['predicted_results'] = y_pred


# Unpack y_pred_proba (probabilities for [Away, Draw, Home])
team_names[['away_prob', 'draw_prob', 'home_prob']] = pd.DataFrame(y_pred_proba, index=team_names.index)

# Ensure index alignment between team_names and data_for_back_testing for implied probabilities
team_names[['implied_home_win_prob', 'implied_draw_prob', 'implied_away_win_prob']] = data_for_back_testing.loc[team_names.index, ['implied_home_win_prob', 'implied_draw_prob', 'implied_away_win_prob']]

# Reorder columns for better readability
team_names = team_names[['home_team', 'away_team', 'true_results','predicted_results', 
                         'home_prob', 'draw_prob', 'away_prob', 
                         'implied_home_win_prob', 'implied_draw_prob', 
                         'implied_away_win_prob']]


# Print column names to ensure everything is added correctly
print(team_names.columns)

# Display the first few rows of the DataFrame
team_names.head()

Index(['home_team', 'away_team', 'true_results', 'predicted_results',
       'home_prob', 'draw_prob', 'away_prob', 'implied_home_win_prob',
       'implied_draw_prob', 'implied_away_win_prob'],
      dtype='object')


Unnamed: 0,home_team,away_team,true_results,predicted_results,home_prob,draw_prob,away_prob,implied_home_win_prob,implied_draw_prob,implied_away_win_prob
0,man city,bolton,H,2,0.769364,0.105531,0.125105,0.404145,0.28601,0.309845
1,swansea,west ham,D,0,0.104684,0.340067,0.555249,0.185958,0.273467,0.540575
2,liverpool,man united,D,1,0.320191,0.441508,0.238302,0.391387,0.285412,0.323201
3,stoke,bournemouth,A,1,0.348626,0.490506,0.160868,0.540575,0.273467,0.185958
4,sunderland,man united,A,0,0.062179,0.200977,0.736845,0.323341,0.289997,0.386662


In [17]:
# Define a function to calculate the Brier score
def calculate_brier_score(predictions, true_outcome):
    return np.mean((predictions - true_outcome) ** 2)

# Assuming you have the following DataFrame as `team_names`
# Add the one-hot encoded true predictions as a new column based on 'true_predictions'
def encode_result(result):
    # 'H' = [1, 0, 0], 'D' = [0, 1, 0], 'A' = [0, 0, 1]
    if result == 'H':
        return [1, 0, 0]
    elif result == 'D':
        return [0, 1, 0]
    elif result == 'A':
        return [0, 0, 1]

team_names['true_predictions_brier'] = team_names['true_results'].apply(encode_result)

# Now calculate the Brier score for each row using the predicted probabilities
team_names['brier_score_market'] = team_names.apply(
    lambda row: calculate_brier_score(np.array([row['home_prob'], row['draw_prob'], row['away_prob']]), row['true_predictions_brier']),
    axis=1
)

# calculate the brier score for the model
team_names['brier_score_model'] = team_names.apply(
    lambda row: calculate_brier_score(np.array([row['implied_home_win_prob'], row['implied_draw_prob'], row['implied_away_win_prob']]), row['true_predictions_brier']),
    axis=1
)


# Optionally, calculate the average Brier score for the entire dataset
average_brier_score_market = team_names['brier_score_market'].mean()

# Optionally, calculate the average Brier score for the entire dataset
average_brier_score_model = team_names['brier_score_model'].mean()


# encode predicted_results to H,D,A
def decode_result(result):
    if result == 2:
        return 'H'
    elif result == 1:
        return 'D'
    elif result == 0:
        return 'A'
    
team_names['predicted_results'] = team_names['predicted_results'].apply(decode_result)


# Display the Brier score for each row and the average Brier score
display(team_names[['home_team', 'away_team', 'true_results', 'predicted_results', 'brier_score_model', 'brier_score_market']])
print(f"Average Brier Score Market: {average_brier_score_market}")
print(f"Average Brier Score Model: {average_brier_score_model}")

Unnamed: 0,home_team,away_team,true_results,predicted_results,brier_score_model,brier_score_market
0,man city,bolton,H,,0.177616,0.026660
1,swansea,west ham,D,,0.284884,0.251591
2,liverpool,man united,D,,0.256093,0.157074
3,stoke,bournemouth,A,,0.343223,0.355426
4,sunderland,man united,A,,0.188277,0.037836
...,...,...,...,...,...,...
1459,newcastle,stoke,H,,0.095651,0.004713
1460,aston villa,everton,A,,0.256851,0.012267
1461,man city,aston villa,H,,0.151064,0.043933
1462,wolves,everton,A,,0.236278,0.028117


Average Brier Score Market: 0.14939229928433911
Average Brier Score Model: 0.2311581039249381
