In [359]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
import numpy as np
import pandas as pd

In [361]:
# Soccer is a game of fine margins. As such, even when one teams outscores the other, there may be variation in other non-scoring metrics, such as:
  # SPI (Soccer Power Index) ratings
  # xG (Expected Goal) tallies
  # Win probability %
  # Adjusted xG
  # Non-Shot xG
# As such, we will run through the FiveThirtyEight SPI dataset, examining Barclays Premier League matches between 2018-19 and 2020-21.
# If all indicators are in favor of one team, we will label them as "UNANIMOUS", meaning that the performance indicators were unanimous in favor of one team.
# If the indicatorse are mixed between favoring one side or the other, it will be "MIXED."
df = pd.read_csv("https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv").dropna()
df = df[(df.league=="Barclays Premier League") & (df.season >= 2018)].reset_index(drop=True)

In [362]:
# First, we need to generate comparisons between the performance metrics of the home and away team.
# Usually margins like these would be generated as absolute values, but since we want to know how the teams stack up relative to one another,
# we need negative values to clearly differentiate the teams, so we're just subtracing the away performance from the home performance.
df['h_result'] = df['score1'] - df['score2']
df['h_spi'] = df['spi1'] - df['spi2']
df['h_xg'] = df['xg1'] - df['xg2']
df['h_prob'] = df['prob1'] - df['prob2']
df['h_adj'] = df['adj_score1'] - df['adj_score2']
df['h_nsxg'] = df['nsxg1'] - df['nsxg2']

In [363]:
results = []
spis = []
xgs = []
probs = []
adjs = []
nsxgs = []

In [364]:
# This function does a lot of the heavy lifting.
# If a performance metric is greater than 0 (thus, in favor of the home team), it's "HOME."
# Else, if a performance metric is less than 0 (thus, in favor of the away team), it's "AWAY."
# Else, it's a "DRAW" (unlikely with certain metrics like xG, which go to the hundredths decimal point, but not important since we are comparing all six).
def convert(array, list_to_return):
  array = df[array]
  for x in range(len(array)):
    x = array[x]
    if x > 0:
      list_to_return.append("HOME")
    elif x < 0:
      list_to_return.append("AWAY")
    else:
      list_to_return.append("DRAW")

In [365]:
result = convert("h_result", results)
spi = convert("h_spi", spis)
xg = convert("h_xg", xgs)
prob = convert("h_prob", probs)
adj = convert("h_adj", adjs)
nsxg = convert("h_nsxg", nsxgs)

In [366]:
# Inserting our newly-generated lists into the dataframe at the expense of the original calculations.
df = df.drop(columns=['h_result','h_spi','h_xg','h_prob','h_adj','h_nsxg'])
df['result'] = results
df['spi'] = spis
df['xg'] = xgs
df['prob'] = probs
df['adj'] = adjs
df['nsxg'] = xgs

In [367]:
y_list = []
master_result = []

In [368]:
# We create a list for each row of the dataset, containing only our newly-generated metrics.
for y in range(len(df)):
  y1 = df['result'][y]
  y2 = df['spi'][y]
  y3 = df['xg'][y]
  y4 = df['prob'][y]
  y5 = df['adj'][y]
  y6 = df['nsxg'][y]
  y_list.append(list([y1, y2, y3, y4, y5, y6]))

In [369]:
# If all the elements in the row list are the same, it's "UNANIMOUS." Otherwise, "MIXED."
for z in range(len(y_list)):
  z = y_list[z]
  a = z.count(z[0])==len(z)
  if a == True:
    master_result.append("UNANIMOUS")
  else:
    master_result.append("MIXED")

In [370]:
df['master'] = master_result

In [371]:
# Splitting the relevant metrics into predictors or targets.
X_final = df[['score1','score2','spi1','spi2','xg1','xg2','prob1','prob2','adj_score1','adj_score2','nsxg1','nsxg2']]
y_final = df['master']

In [374]:
accuracy_values = []

In [375]:
# The model will run for 100 rounds, with the random_state increasing by 1 each time so our results are reproducible.
def model():
  global count
  count = 0
  while count < 100:
    X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.25, random_state=count)
    model = XGBClassifier(n_estimators=400, eval_metric='mlogloss')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    accuracy_values.append(accuracy)
    count += 1

In [376]:
model = model()

In [395]:
# Quick function to convert the accuracy values to integers (to compute percentages).
def round_it(num):
  num = int(num * 100)
  return num

In [396]:
acc_min = round_it(np.min(accuracy_values))
acc_med = round_it(np.median(accuracy_values))
acc_mean = round_it(np.mean(accuracy_values))
acc_max = round_it(np.max(accuracy_values))

In [397]:
print("Accuracy Measures over {} Rounds:".format(count))
print("Minimum: {}% | Median: {}% | Mean: {}% | Maximum: {}%".format(acc_min, acc_med, acc_mean, acc_max))

Accuracy Measures over 100 Rounds:
Minimum: 90% | Median: 93% | Mean: 93% | Maximum: 97%
