In [73]:
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from xgboost import XGBRFClassifier

In [74]:
# FiveThirtyEight club soccer data: each match's outcome (HOME WIN, AWAY WIN, or DRAW) will be compared
# with the pre-game win probabilities to see whether the favorite actually came out on top.
# dropna() with no arguments discards every row that has a missing value in any column.
SPI_MATCHES_URL = "https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv"
df = pd.read_csv(SPI_MATCHES_URL).dropna()

In [75]:
# Home margin = home goals (score1) minus away goals (score2): the net number of goals
# by which the home side outscored the visitors (negative when the away side scored more).
home_goals, away_goals = df['score1'], df['score2']
df['h_margin'] = home_goals - away_goals

In [76]:
# Accumulator for the per-match result labels; it starts out empty.
results = list()

In [77]:
# Label every match from its home margin:
#   margin > 0    -> "HOME WIN"
#   margin == 0   -> "DRAW"
#   anything else -> "AWAY WIN"
# The list is rebuilt from scratch (instead of appended to) so that re-running this cell
# can never leave `results` longer than the dataframe it is later attached to.
results = [
  "HOME WIN" if margin > 0 else ("DRAW" if margin == 0 else "AWAY WIN")
  for margin in df['h_margin']
]

In [78]:
# Attach the result labels to the dataframe as the 'result' column.
df = df.assign(result=results)

In [79]:
# Home probability margin: prob1 minus prob2, leaving the draw probability out of the picture.
# Its sign tells us which side (home or away) was given the better chance of winning.
home_prob, away_prob = df['prob1'], df['prob2']
df['h_prob_margin'] = home_prob - away_prob
# Accumulator for the per-match favorite labels; it starts out empty.
prob_margins = list()

In [80]:
# Label every match by its pre-game favorite:
#   probability margin >= 0 -> "HOME FAVORITE" (an exact tie is counted for the home side)
#   otherwise               -> "AWAY FAVORITE"
# The list is rebuilt from scratch (instead of appended to) so that re-running this cell
# can never leave `prob_margins` longer than the dataframe it is later attached to.
prob_margins = [
  "HOME FAVORITE" if margin >= 0 else "AWAY FAVORITE"
  for margin in df['h_prob_margin']
]

In [81]:
# Attach the favorite labels to the dataframe as the 'prob_bucket' column.
df = df.assign(prob_bucket=prob_margins)

In [82]:
# The two margin columns were only stepping stones to the labels, so remove them.
unused_columns = ['h_margin', 'h_prob_margin']
df = df.drop(columns=unused_columns)

In [83]:
# To judge whether the pre-match favorite matches the actual outcome, pull out just the two
# label columns (result first, favorite second) and prepare an empty list for the verdicts.
label_columns = ['result', 'prob_bucket']
comparison_df = df[label_columns]
comparisons = list()

In [84]:
# Row positions 0 .. len(comparison_df) - 1; comparison_df has as many rows as the main df.
ran = range(len(comparison_df))

In [85]:
# Combine each match's result with its pre-game favorite:
#   home favored and home won -> "EXPECTED HOME"
#   away favored and away won -> "EXPECTED AWAY"
#   home favored but away won -> "UNEXPECTED AWAY"
#   away favored but home won -> "UNEXPECTED HOME"
#   anything else (a draw)    -> "NO DECISIVE OUTCOME"
outcome_labels = {
  ("HOME WIN", "HOME FAVORITE"): "EXPECTED HOME",
  ("AWAY WIN", "AWAY FAVORITE"): "EXPECTED AWAY",
  ("AWAY WIN", "HOME FAVORITE"): "UNEXPECTED AWAY",
  ("HOME WIN", "AWAY FAVORITE"): "UNEXPECTED HOME",
}
# Walk the two label columns in lockstep by name instead of fetching each row with .iloc and
# indexing it by integer position (positional [] on a string-labelled Series is deprecated).
# The list is rebuilt from scratch so that re-running this cell cannot duplicate entries.
comparisons = [
  outcome_labels.get(pair, "NO DECISIVE OUTCOME")
  for pair in zip(comparison_df['result'], comparison_df['prob_bucket'])
]

In [86]:
# Store the verdicts on the dataframe as 'result_type'. This column is the target variable, so
# keeping it on df is not strictly required, but it keeps everything in one place.
df = df.assign(result_type=comparisons)

In [87]:
# Predictors: the final scores and the pre-game win probabilities of the home and away sides.
# Target: the result type.
# NOTE(review): result_type is derived entirely from the signs of score1 - score2 and
# prob1 - prob2, so these predictors determine the target exactly; the model is recovering
# a fixed rule rather than forecasting anything unknown.
feature_columns = ['score1', 'score2', 'prob1', 'prob2']
X = df[feature_columns]
y = df['result_type']

In [88]:
# With this much data, holding out 20% of the rows for testing is plenty.
# The fixed random_state makes the split reproducible.
splits = train_test_split(X, y, test_size=0.2, random_state=6)
X_train, X_test, y_train, y_test = splits

In [89]:
# Random-forest flavour of XGBoost, with the tree depth set explicitly to 6 instead of relying on
# whatever default the installed xgboost version uses.
# 'multi:softprob' is the documented name of XGBoost's multi-class probability objective.
# NOTE(review): some xgboost releases reject string class labels in fit() - confirm against the
# installed version, and integer-encode y if needed.
model = XGBRFClassifier(objective='multi:softprob', max_depth=6)
model.fit(X_train, y_train)

In [90]:
# Predict a result-type label for every row of the held-out test predictors.
predictions = model.predict(X_test)

In [93]:
# Express the accuracy as a percentage: scale the fraction by 100 first, then round the
# percentage to three decimal places.
accuracy_pct = accuracy_score(y_test, predictions) * 100
accuracy = np.round(accuracy_pct, 3)

In [94]:
# Report the rounded test accuracy percentage.
message = "Test Accuracy: {}%".format(accuracy)
print(message)

Test Accuracy: 99.451%
