In [23]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,roc_auc_score,confusion_matrix,accuracy_score,f1_score,roc_curve
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler
from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.layers import Activation, Dense, Dropout, Embedding, LSTM
from keras.utils import to_categorical
import re
from IPython.display import display
import os
import string
import time
import random
import matplotlib.pyplot as plt
# random.seed(10)

In [2]:
# Import Data
# Read one CSV per season (2012-2021) plus the combined, sorted table of games.
# NOTE(review): the folder is an absolute path on a single machine; the notebook
# only runs elsewhere after this string is edited.
games_folder_path = "C:/Users/Craig/Documents/Thesis/Thomas_Gallagher_Thesis/Data/AFL_Stats_sorted/Year/Games/"


def _read_games_csv(file_name):
    """Read `file_name` from `games_folder_path`, parsing the `date` column as datetimes."""
    return pd.read_csv(games_folder_path + file_name, index_col=False, parse_dates=['date'])


(
    games_2012, games_2013, games_2014, games_2015, games_2016,
    games_2017, games_2018, games_2019, games_2020, games_2021,
) = [
    _read_games_csv(season_file)
    for season_file in (
        '2012.csv', '2013.csv', '2014.csv', '2015.csv', '2016.csv',
        '2017.csv', '2018.csv', '2019.csv', '2020.csv', '2021.csv',
    )
]

all_games = _read_games_csv('games_sorted.csv')

  games_2014 = pd.read_csv(games_folder_path + '2014.csv', index_col=False, parse_dates=['date'])


In [3]:
# One Hot Encoding Teams
# The encoder is later asked to transform both the `homeTeam` and the `awayTeam`
# column, so it is fit on the names from both columns; fitting on `homeTeam`
# alone would make `transform` raise for a team that only ever appears as the
# away side.
encoding = LabelEncoder()
encoding.fit(
    np.concatenate([all_games["homeTeam"].values, all_games["awayTeam"].values])
)

def OHE_Teams(games):
    """Append one-hot indicator columns for the home and away team of each game.

    The team names in ``games["homeTeam"]`` and ``games["awayTeam"]`` are mapped
    to integer codes with the module-level ``encoding`` label encoder and then
    one-hot encoded. The category list of both columns is pinned to every code
    the label encoder knows, so each table gets the same number of indicator
    columns in the same order even when a team is missing from one of them.

    Note that the module-level name ``encoding`` is rebound for venues further
    down the notebook, so this function has to be called before that happens.

    Args:
        games: DataFrame with ``homeTeam`` and ``awayTeam`` columns.

    Returns:
        A new DataFrame: ``games`` followed by the indicator columns (home-team
        block first, then away-team block), labelled with integers from 0.
    """
    home_codes = encoding.transform(games["homeTeam"].values)
    away_codes = encoding.transform(games["awayTeam"].values)

    # One row per game: [home code, away code].
    team_codes = np.column_stack([home_codes, away_codes])

    # Fixed categories keep the output width independent of which teams happen
    # to occur in this particular table.
    known_codes = np.arange(len(encoding.classes_), dtype=team_codes.dtype)
    oneHot = OneHotEncoder(categories=[known_codes, known_codes])

    # Reuse the caller's index so the column-wise concat lines rows up by
    # position even if `games` does not carry a default RangeIndex.
    X_teams = pd.DataFrame(
        oneHot.fit_transform(team_codes).toarray(),
        index=games.index,
    )
    return pd.concat([games, X_teams], axis=1)

# Add the team indicator columns to every season table (in season order),
# then to the combined table. Each name is rebound to the widened frame.
(
    games_2012, games_2013, games_2014, games_2015, games_2016,
    games_2017, games_2018, games_2019, games_2020, games_2021,
) = [
    OHE_Teams(season_games)
    for season_games in (
        games_2012, games_2013, games_2014, games_2015, games_2016,
        games_2017, games_2018, games_2019, games_2020, games_2021,
    )
]

all_games = OHE_Teams(all_games)

In [4]:
# One Hot Encoding Venues
# NOTE(review): this rebinds the module-level name `encoding`, which OHE_Teams
# reads at call time; re-running the team-encoding cell after this one would
# use the venue encoder. The fitted venue encoder itself is not read again in
# the cells visible here.
encoding = LabelEncoder().fit(all_games["venue"].values)

# Venue names of the combined table as a single-column 2-D array, the layout
# OHE_Venues fits its OneHotEncoder on.
all_venues = all_games["venue"].values.reshape(-1, 1)

def OHE_Venues(games):
    """Append one-hot indicator columns for the venue of each game.

    The encoder is fit on the module-level ``all_venues`` array (the venue
    column of the combined table), so every table receives the same venue
    columns in the same order; the new columns are named after the venues.

    Args:
        games: DataFrame with a ``venue`` column.

    Returns:
        A new DataFrame: ``games`` followed by one indicator column per venue
        found in ``all_venues``.
    """
    # OneHotEncoder expects a 2-D input: one row per game, one column.
    venues = games['venue'].values.reshape(-1, 1)

    oneHot = OneHotEncoder()
    oneHot.fit(all_venues)

    # Reuse the caller's index so the column-wise concat lines rows up by
    # position even if `games` does not carry a default RangeIndex.
    X_venues = pd.DataFrame(
        oneHot.transform(venues).toarray(),
        columns=oneHot.categories_[0],
        index=games.index,
    )
    return pd.concat([games, X_venues], axis=1)

# Add the venue indicator columns to every season table (in season order),
# then to the combined table. Each name is rebound to the widened frame.
(
    games_2012, games_2013, games_2014, games_2015, games_2016,
    games_2017, games_2018, games_2019, games_2020, games_2021,
) = [
    OHE_Venues(season_games)
    for season_games in (
        games_2012, games_2013, games_2014, games_2015, games_2016,
        games_2017, games_2018, games_2019, games_2020, games_2021,
    )
]

all_games = OHE_Venues(all_games)

In [5]:

# Keep the `homeWin` column of every table as its label vector; `homeWin` is
# also listed in `drop_values`, so it is removed from the feature tables below.
(
    y_true_2012, y_true_2013, y_true_2014, y_true_2015, y_true_2016,
    y_true_2017, y_true_2018, y_true_2019, y_true_2020, y_true_2021,
) = [
    season_games['homeWin']
    for season_games in (
        games_2012, games_2013, games_2014, games_2015, games_2016,
        games_2017, games_2018, games_2019, games_2020, games_2021,
    )
]
y_true = all_games['homeWin']

# Columns that `set_columns` removes from every table.
drop_values = [
    'gameId', 'venue', 'homeWin', 'homeTeam', 'awayTeam', 'year',
    'date', 'startTime', 'attendance', 'homeTeamScore', 'awayTeamScore', 'round',
]

def set_columns(game_list, columns_to_drop=None):
    """Drop non-feature columns and convert every column label to ``str``.

    The one-hot team columns carry integer labels while the remaining columns
    carry string labels; casting all labels to ``str`` leaves the frame with a
    uniformly typed column index.

    Args:
        game_list: DataFrame to trim. It is not modified.
        columns_to_drop: Column labels to remove. Defaults to the module-level
            ``drop_values`` list, which is what existing callers rely on.

    Returns:
        A new DataFrame without the dropped columns and with string labels.

    Raises:
        KeyError: If one of the columns to drop is not present.
    """
    if columns_to_drop is None:
        columns_to_drop = drop_values

    trimmed = game_list.drop(columns=list(columns_to_drop))
    trimmed.columns = trimmed.columns.astype(str)
    return trimmed

# Reduce every season table (in season order) and the combined table to their
# feature columns. Each name is rebound to the trimmed frame.
(
    games_2012, games_2013, games_2014, games_2015, games_2016,
    games_2017, games_2018, games_2019, games_2020, games_2021,
) = [
    set_columns(season_games)
    for season_games in (
        games_2012, games_2013, games_2014, games_2015, games_2016,
        games_2017, games_2018, games_2019, games_2020, games_2021,
    )
]

all_games = set_columns(all_games)

In [6]:
# normalize the dataset
# Every table is rescaled to [0, 1] independently: `fit_transform` re-learns the
# per-column min/max from the table it is given, and returns a NumPy array, so
# the names below stop referring to DataFrames after this cell.
# NOTE(review): each fit sees all rows of its table, including the rows that are
# sliced off as validation data in the next cell — confirm this is acceptable.
scaler = MinMaxScaler(feature_range=(0, 1))
(
    games_2012, games_2013, games_2014, games_2015, games_2016,
    games_2017, games_2018, games_2019, games_2020, games_2021,
) = [
    scaler.fit_transform(season_games)
    for season_games in (
        games_2012, games_2013, games_2014, games_2015, games_2016,
        games_2017, games_2018, games_2019, games_2020, games_2021,
    )
]

# Fitted last, so `scaler` is left holding the statistics of the combined table.
all_games = scaler.fit_transform(all_games)

In [7]:
def _split_at(features, labels, cut, stop=None):
    """Slice features and labels into a leading part and a trailing part.

    Returns ``(features[:cut], labels[:cut], features[cut:stop], labels[cut:stop])``;
    with ``stop=None`` the trailing part runs to the end.
    """
    return features[:cut], labels[:cut], features[cut:stop], labels[cut:stop]


# Positional split of every season: the first `cut` rows are used for training
# and the remaining rows for validation. The cut points are hard-coded per
# season (171 for most, 170 for 2015, 127 for 2020).
x_train_2012, y_train_2012, x_valid_2012, y_valid_2012 = _split_at(games_2012, y_true_2012, 171)

x_train_2013, y_train_2013, x_valid_2013, y_valid_2013 = _split_at(games_2013, y_true_2013, 171)

x_train_2014, y_train_2014, x_valid_2014, y_valid_2014 = _split_at(games_2014, y_true_2014, 171)

x_train_2015, y_train_2015, x_valid_2015, y_valid_2015 = _split_at(games_2015, y_true_2015, 170)

x_train_2016, y_train_2016, x_valid_2016, y_valid_2016 = _split_at(games_2016, y_true_2016, 171)

x_train_2017, y_train_2017, x_valid_2017, y_valid_2017 = _split_at(games_2017, y_true_2017, 171)

x_train_2018, y_train_2018, x_valid_2018, y_valid_2018 = _split_at(games_2018, y_true_2018, 171)

x_train_2019, y_train_2019, x_valid_2019, y_valid_2019 = _split_at(games_2019, y_true_2019, 171)

x_train_2020, y_train_2020, x_valid_2020, y_valid_2020 = _split_at(games_2020, y_true_2020, 127)

x_train_2021, y_train_2021, x_valid_2021, y_valid_2021 = _split_at(games_2021, y_true_2021, 171)

# Combined table: rows [0, 1447) for training, rows [1447, 1655) for validation;
# any rows from 1655 onwards are left out of both.
x_train, y_train, x_valid, y_valid = _split_at(all_games, y_true, 1447, 1655)

In [8]:
# OHE Y values
# Shared encoder instance; it is refit on every call of OHE_y_values.
oneHot = OneHotEncoder()
def OHE_y_values(y_val):
    """One-hot encode a label vector after folding the value 1 into 0.

    The labels are stacked into a fresh column array (assumes ``y_val`` is
    one-dimensional — TODO confirm), every entry equal to 1 is overwritten with
    0, and the result is one-hot encoded with the module-level ``oneHot``.

    NOTE(review): the reason for merging label 1 into label 0 is not visible
    here — confirm it is intended. Because the encoder is refit per call, the
    number of output columns equals the number of distinct values in this
    particular ``y_val`` after the merge.

    Args:
        y_val: Label values; the caller's object is not modified.

    Returns:
        Dense 2-D array with one indicator column per distinct remaining value.
    """
    labels = np.vstack([y_val]).T

    # Vectorised form of "for each row: if value == 1, set it to 0".
    labels[labels == 1] = 0

    return oneHot.fit_transform(labels).toarray()


# One-hot encode the training and validation labels of every season and of the
# combined split (training part first, then validation part, season by season).
y_train_2012_OHE, y_valid_2012_OHE = OHE_y_values(y_train_2012), OHE_y_values(y_valid_2012)

y_train_2013_OHE, y_valid_2013_OHE = OHE_y_values(y_train_2013), OHE_y_values(y_valid_2013)

y_train_2014_OHE, y_valid_2014_OHE = OHE_y_values(y_train_2014), OHE_y_values(y_valid_2014)

y_train_2015_OHE, y_valid_2015_OHE = OHE_y_values(y_train_2015), OHE_y_values(y_valid_2015)

y_train_2016_OHE, y_valid_2016_OHE = OHE_y_values(y_train_2016), OHE_y_values(y_valid_2016)

y_train_2017_OHE, y_valid_2017_OHE = OHE_y_values(y_train_2017), OHE_y_values(y_valid_2017)

y_train_2018_OHE, y_valid_2018_OHE = OHE_y_values(y_train_2018), OHE_y_values(y_valid_2018)

y_train_2019_OHE, y_valid_2019_OHE = OHE_y_values(y_train_2019), OHE_y_values(y_valid_2019)

y_train_2020_OHE, y_valid_2020_OHE = OHE_y_values(y_train_2020), OHE_y_values(y_valid_2020)

y_train_2021_OHE, y_valid_2021_OHE = OHE_y_values(y_train_2021), OHE_y_values(y_valid_2021)

y_train_OHE, y_valid_OHE = OHE_y_values(y_train), OHE_y_values(y_valid)

In [56]:
# One-hot encode the labels of the complete 2012 table (training and validation
# rows together) and store them as integers.
y_2012_OHE = OHE_y_values(y_true_2012).astype(int)

# Trailing expression: the notebook displays the resulting dtype.
y_2012_OHE.dtype

dtype('int32')

In [9]:
# Insert a length-1 middle axis into the 2012 features and one-hot labels.
# The sizes are spelled out (171 training rows, 36 validation rows, 86 feature
# columns, 2 label columns), so a table of any other size makes reshape raise.
d_Train = np.reshape(x_train_2012, (171, 1, 86))
d_valid = np.reshape(x_valid_2012, (36, 1, 86))

d_y_t = np.reshape(y_train_2012_OHE, (171, 1, 2))
d_y_v = np.reshape(y_valid_2012_OHE, (36, 1, 2))

In [10]:
# Lazily pair every 2012 feature row with its label.
# The built-in `map` needs one iterable per lambda parameter; given only the
# labels, a two-parameter lambda raises TypeError as soon as the map is
# consumed. Both iterables are passed here and the lambda returns the pair.
# NOTE(review): presumably (features, label) pairs are what was intended —
# confirm. A map object can be iterated only once.
trainDs = map(lambda text, label: (text, label), x_train_2012, y_train_2012)
valDs = map(lambda text, label: (text, label), x_valid_2012, y_valid_2012)

In [65]:
# Trailing expression: the notebook displays the repr of `trainDs`.
trainDs

<map at 0x26aa3852400>

In [75]:
def get_lstm_model():
    """Build (without compiling) the stacked-LSTM classifier.

    Layers, in order: int32 input of length 86 -> Embedding(86, 128) ->
    Dropout(0.2) -> LSTM(64, sequences) -> LSTM(64, sequences) -> LSTM(64) ->
    Dense(64, tanh) -> Dense(32, tanh) -> Dense(2, softmax).

    The output layer uses softmax so that the two outputs are probabilities in
    [0, 1]. The training cell pairs this model with one-hot targets and a
    cross-entropy loss configured with ``from_logits=False``, which expects
    probabilities; a tanh head would emit values in [-1, 1] instead.

    Returns:
        An uncompiled ``keras.Model`` named ``"LSTM"``.
    """
    # Fixed-length input of 86 integer values per sample.
    # NOTE(review): the Embedding layer below treats each value as an index
    # into an 86-row lookup table, while the training cell passes MinMax-scaled
    # feature rows into this input — confirm that an embedding lookup is really
    # what is wanted for these features.
    inputs = keras.Input(shape=(86,), dtype="int32")

    # Embed every position in a 128-dimensional vector (masking is disabled),
    # then apply dropout.
    x = layers.Embedding(86, 128, mask_zero=False)(inputs)
    x = layers.Dropout(0.2)(x)

    # Three stacked LSTMs; only the last one collapses the sequence axis.
    x = layers.LSTM(64, return_sequences=True)(x)
    x = layers.LSTM(64, return_sequences=True)(x)
    x = layers.LSTM(64)(x)

    # Classifier head.
    x = layers.Dense(units=64, activation="tanh")(x)
    x = layers.Dense(units=32, activation="tanh")(x)
    outputs = layers.Dense(2, activation="softmax")(x)

    return keras.Model(inputs, outputs, name="LSTM")

In [12]:
import matplotlib.pyplot as plt
def plot_loss_accuracy(history, filepath):
    """Plot training/validation loss and accuracy per epoch and save the figure.

    Args:
        history: Either the ``History`` object returned by ``model.fit`` or its
            ``.history`` dict. The keys ``"loss"``, ``"val_loss"``,
            ``"accuracy"`` and ``"val_accuracy"`` must be present.
        filepath: Destination passed to ``Figure.savefig``.

    Raises:
        KeyError: If one of the four required keys is missing.

    Side effects:
        Switches the global matplotlib style to ``"ggplot"`` and writes the
        figure to ``filepath``.
    """
    # `fit()` returns a History object whose metrics live in `.history`;
    # a plain dict has no such attribute and is used unchanged.
    history = getattr(history, "history", history)

    plt.style.use("ggplot")
    fig, axs = plt.subplots(2, 1)

    # Top panel: loss curves.
    axs[0].plot(history["loss"], label="train_loss")
    axs[0].plot(history["val_loss"], label="val_loss")
    axs[0].set_xlabel("Epoch #")
    axs[0].set_ylabel("Loss")
    axs[0].legend()

    # Bottom panel: accuracy curves.
    axs[1].plot(history["accuracy"], label="train_accuracy")
    axs[1].plot(history["val_accuracy"], label="val_accuracy")
    axs[1].set_xlabel("Epoch #")
    axs[1].set_ylabel("Accuracy")
    axs[1].legend()

    fig.savefig(filepath)

In [80]:
# Build the model and compile it with RMSprop, a binary focal cross-entropy
# loss that expects probabilities (from_logits=False), and an accuracy metric.
print("[INFO] building the LSTM model...")
modelLSTM = get_lstm_model()
modelLSTM.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.BinaryFocalCrossentropy(from_logits=False),
    metrics=["accuracy"],
)

# train the LSTM model
# The whole scaled 2012 table is passed in; Keras holds out the last 20% of the
# rows for validation, and shuffling is off so the row order is kept.
print("[INFO] training the LSTM model...")
historyLSTM = modelLSTM.fit(
    x=games_2012,
    y=y_2012_OHE,
    epochs=10,
    validation_split=0.2,
    shuffle=False,
)

[INFO] building the LSTM model...
[INFO] training the LSTM model...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
