In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# The probability margin is the absolute value of the difference
# between the two teams in terms of who is more likely to win: np.abs(df['prob1'] - df['prob2']).
# There is also probtie which predicts the probability the game ends up level.
# Simple three-layer neural network to anticipate which
# bucket of probability margin a given match falls into:
    # If X is less than the mean probability margin (MPM), it's "WITHIN MEAN PROBABILITY MARGIN."
    # If X is greater than or equal to MPM, but less than MPM + 1 standard deviation (SDPM), it's "SLIGHT PROBABILITY ADVANTAGE."
    # If X is greater than MPM + SDPM, but less than MPM + (SDPM * 1.5), it's "MODERATE PROBABILITY ADVANTAGE."
    # If X is greater than MPM + (SDPM * 1.5), meaning it is an "outlier", it's "SIGNIFICANT PROBABILITY ADVANTAGE."
# We will use statistics from the Barclays Premier League from the start of the 2018-19 season.
df = pd.read_csv("https://projects.fivethirtyeight.com/soccer-api/club/spi_matches.csv")
df = df[(df.league=="Barclays Premier League") & (df.season >= 2018)]

In [3]:
# A three-layer model is idea because it has enough filtering capability to generalize and get rid of the noise.
# That said, we will throw in an l2 regularizer just to trim it even further.
def load_model():
    model = tf.keras.models.Sequential()
    model.add(tf.keras.layers.Dense(8, kernel_regularizer=tf.keras.regularizers.l2(0.001), activation='relu'))
    model.add(tf.keras.layers.Dense(8, activation='relu'))
    model.add(tf.keras.layers.Dense(4, activation='softmax'))
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [4]:
# The y variables need to be converted from their existing state after train_test_split to categorical.
# That said, it's a bit more than just calling to_categorical...the array in question has to be factorized, then converted
# to a NumPy array, then converted to categorical.
# This results in a four-column array of one-hot values for the variables.
# We have to do it twice, so best to create a basic function for it.
def cat(array):
    array = tf.keras.utils.to_categorical(np.asarray(pd.factorize(array)[0]))
    return array

In [5]:
# OK, now the fun part. Probability margin, MPM, and SDPM generation.
df['prob_margin'] = np.abs(df['prob1'] - df['prob2'])
mpm = np.mean(df['prob_margin'])
sdpm = np.std(df['prob_margin'])

In [6]:
# Empty list.
pm = []

In [7]:
# It's bucketing time.
for x in df['prob_margin']:
    if x < mpm:
        pm.append("WITHIN MEAN PROBABILITY MARGIN")
    elif (x >= mpm) & (x < (mpm + sdpm)):
        pm.append("SLIGHT PROBABILITY ADVANTAGE")
    elif (x >= (mpm + sdpm)) & (x < (mpm + (sdpm * 1.5))):
        pm.append("MODERATE PROBABILITY ADVANTAGE")
    else:
        pm.append("SIGNIFICANT PROBABILITY ADVANTAGE")

In [8]:
# Assigning PM to our new column, prob_bucket.
df['prob_bucket'] = pm

In [9]:
# Since there are four output classes, we need to have the identical number of columns as our predictor variables (4).
# We haven't used probtie yet, but it would be good to include.
X_final = df[['prob1', 'prob2', 'prob_margin', 'probtie']]
y_final = df[['prob_bucket']].copy()
X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.2, random_state=6)

In [10]:
# Converting y_train and y_test to categorical.
y_train = cat(y_train["prob_bucket"])
y_test = cat(y_test["prob_bucket"])

In [11]:
# We want to see if there is a more optimal number of epochs.
# We will set it to tell us with a patience setting of 5.
early_stop = tf.keras.callbacks.EarlyStopping(monitor="val_loss", mode="min", patience=5, restore_best_weights=True)

In [12]:
# 8 epochs.
model = load_model()
history = model.fit(X_train, y_train, batch_size=128, epochs=25, validation_data=(X_test, y_test), callbacks=[early_stop])

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25


In [13]:
# We could see the number of epochs, but easier to just tie it to an object.
num_epochs = len(history.__dict__['epoch'])

In [14]:
# Given that early_stop stopped after 8 epochs, we will run 8 as well.
# Again, since it's tied to an object, not much extra work involved.
model.fit(X_train, y_train, epochs=num_epochs, batch_size=128, verbose=1, validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<tensorflow.python.keras.callbacks.History at 0x2150d9c0760>

In [15]:
# Generating the predictions.
predictions = model.predict(X_test)

In [16]:
# We'll take it.
loss, accuracy = model.evaluate(y_test, predictions)



In [17]:
# For good measure to close it out.
print("Test Accuracy: {}%".format(np.round(accuracy * 100),2))

Test Accuracy: 97.0%
