In [None]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment3/'
FOLDERNAME = 'CS229/Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
%cd drive/My\ Drive/$FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive/CS229/Project


In [None]:
# Baseline packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Custom packages
from util import load_dataset
from util import place_bets
from util import evaluate_bets
from custom_loss import custom_loss

# Neural Network packages
from keras.layers import BatchNormalization, Dense, Input, Dropout
from keras.models import Model
from keras import backend as K

from keras.callbacks import EarlyStopping, ModelCheckpoint

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

In [None]:
# Training
x_train = load_dataset("Load_Data/x_train.csv", intercept=True)
y_train = load_dataset("Load_Data/y_train.csv").to_numpy().flatten()

# Validation
x_val = load_dataset("Load_Data/x_val.csv", intercept=True)
y_val = load_dataset("Load_Data/y_val.csv").to_numpy().flatten()

# Test
x_test = load_dataset("Load_Data/x_test.csv", intercept=True)
y_test = load_dataset("Load_Data/y_test.csv").to_numpy().flatten()

In [None]:
def generate_nnlabels(y, x):
  """ Generate labels for use in the neural network model of the form [Win_home, Win_away, No Bet, Odds_home, Odds_away]

  Args:
    y: vector with 0 if win_away and 1 if win_home. Shape (num_bets, )
    x: feature vector containing odds_home and odds_away. Shape (num_bets, dim)
  """
  n = len(y)
  matrix = np.zeros((n, 5))

  matrix[:, 0] = y
  matrix[:, 1] = 1-y
  # matrix[:,2] is already zeros
  matrix[:, 3] = x['Odds_Home'].reset_index(drop=True)
  matrix[:, 4] = x['Odds_Away'].reset_index(drop=True)
  
  return matrix

In [None]:
# Modifying y vectors to be of the form [Win_home, Win_away, No Bet, Odds_home, Odds_away]
ynn_train = generate_nnlabels(y_train, x_train)
ynn_val = generate_nnlabels(y_val, x_val)
ynn_test = generate_nnlabels(y_test, x_test)

In [None]:
# Neural network model adapted from Malafosse, Charles (https://towardsdatascience.com/machine-learning-for-sports-betting-not-a-basic-classification-problem-b42ae4900782#:~:text=Sports%20betting%20is%20one%20of,and%20specifically%20classification%20neural%20networks.&text=Nonetheless%2C%20classic%20classification%20models%20are,network%20to%20achieve%20better%20profitability.)

def odds_loss(y_true, y_pred):
    """
    The function implements the custom loss function
    
    Args:
      true: a vector of dimension batch_size, 5. A label encoded version of the output and the backp1_a and backp1_b
      pred: a vector of probabilities of dimension batch_size , 3.
    
    Returns: 
      Loss value
    """
    win_home_team = y_true[:, 0:1]
    win_away = y_true[:, 1:2]
    no_bet = y_true[:, 2:3]
    odds_a = y_true[:, 3:4]
    odds_b = y_true[:, 4:5]

    gain_loss_vector = K.concatenate([win_home_team * (odds_a - 1) + (1 - win_home_team) * -1,  # payoff when we say home team is going to win
      win_away * (odds_b - 1) + (1 - win_away) * -1,                                            # payoff when we say away team is going to win
      K.zeros_like(odds_a)], axis=1)                                                            # payoff when we do not bet
    return -1 * K.mean(K.sum(gain_loss_vector * y_pred, axis=1))

In [None]:
# Defining model we will run
def get_model(input_dim, output_dim, base=1000, multiplier=0.25, p=0.2):
    inputs = Input(shape=(input_dim,))
    l = BatchNormalization()(inputs)
    l = Dropout(p)(l)
    n = base
    l = Dense(n, activation='relu')(l)
    l = BatchNormalization()(l)
    l = Dropout(p)(l)
    n = int(n * multiplier)
    l = Dense(n, activation='relu')(l)
    l = BatchNormalization()(l)
    l = Dropout(p)(l)
    n = int(n * multiplier)
    l = Dense(n, activation='relu')(l)
    outputs = Dense(output_dim, activation='softmax')(l)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='Nadam', loss=odds_loss)
    return model

In [None]:
input_dim = x_train.shape[1]
output_dim = 3              # Home win, Away win, No Bet

In [None]:
neurons = [100, 150, 200]
multiplier = [0.50, 0.75]
p = [0.3, 0.5]
results_dic = {}


for neuron in neurons:
  for mult in multiplier:
    for dropout in p:
      model = get_model(input_dim, output_dim, neuron, mult, dropout)
      history = model.fit(x_train, ynn_train, validation_data=(x_val, ynn_val),
                epochs=200, batch_size=5000, callbacks=[EarlyStopping(patience=25),ModelCheckpoint('odds_loss.hdf5',save_best_only=True)])
      results_dic[(neuron, mult, dropout)] = [model.evaluate(x_train, ynn_train), model.evaluate(x_val, ynn_val)]

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200


In [None]:
results_dic

{(100, 0.5, 0.3): [-0.7843984961509705, 0.07008127123117447],
 (100, 0.5, 0.5): [-0.6109845638275146, 0.014061084017157555],
 (100, 0.75, 0.3): [-0.8037264943122864, 0.0458122119307518],
 (100, 0.75, 0.5): [-0.6131053566932678, 0.041706256568431854],
 (150, 0.5, 0.3): [-0.8028547763824463, 0.01836675964295864],
 (150, 0.5, 0.5): [-0.6926648616790771, 0.02187945321202278],
 (150, 0.75, 0.3): [-0.8173401355743408, 0.051867857575416565],
 (150, 0.75, 0.5): [-0.6729459166526794, -0.017919056117534637],
 (200, 0.5, 0.3): [-0.8469763994216919, 0.06254413723945618],
 (200, 0.5, 0.5): [-0.7077128887176514, -0.025192582979798317],
 (200, 0.75, 0.3): [-0.8159429430961609, 0.036074016243219376],
 (200, 0.75, 0.5): [-0.6806380748748779, 0.0003795288794208318]}

### Obtaining best performing model

In [None]:
# Obtaining spec of best-performing model on the validation data
counter = 0
for spec, results in results_dic.items():
  if counter == 0:
    best_spec = spec
  elif spec[1] > best_spec:
    best_spec = spec

# Looking at best spec
print(best_spec)

In [None]:
# Running NN with best model
best_model = get_model(input_dim, output_dim, 150, 0.75, 0.50)
history = best_model.fit(x_train, ynn_train, validation_data=(x_val, ynn_val),
          epochs=200, batch_size=5000, callbacks=[EarlyStopping(patience=25),ModelCheckpoint('odds_loss.hdf5',save_best_only=True)])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200


In [None]:
print('Training Loss : {}\nValidation Loss : {}'.format(best_model.evaluate(x_train, ynn_train), best_model.evaluate(x_val, ynn_val)))

Training Loss : -0.08132106065750122
Validation Loss : 0.010935821570456028


### Calculating profit

In [None]:
#  Calculate prediction probabilities in training sample
pred_tr = best_model.predict(x_train)
pred_tr

array([[3.37528977e-10, 9.55596566e-01, 4.44034189e-02],
       [5.96988514e-10, 8.92633021e-01, 1.07366934e-01],
       [5.56922342e-10, 9.03633535e-01, 9.63665172e-02],
       ...,
       [6.58079616e-14, 1.43618891e-07, 9.99999881e-01],
       [4.91420123e-14, 7.75008928e-08, 9.99999881e-01],
       [5.08601865e-15, 2.86958657e-09, 1.00000000e+00]], dtype=float32)

In [None]:
# Determine which bet is the winner 
bet_matrix_tr = np.zeros((pred_tr.shape))
max_bet = np.argmax(pred_tr, axis=1)

In [None]:
# Create matrix with discrete decisions on which bet will be chosen
n = bet_matrix_tr.shape[0]
bet_matrix_tr[np.arange(n), max_bet] = 1
bet_matrix_tr

array([[0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [None]:
# Calculate profit
profit_train_strat = evaluate_bets(bet_matrix_tr, x_train['Odds_Home'], x_train['Odds_Away'], y_train)
profit_train_strat.sum()

print(f"Model achieves profit of ${profit_train_strat.sum():.2f} on the training sample")

Model achieves profit of $17242.26 on the training sample


On validation sample

In [None]:
# Predicting probabilities on validation sample
pred_val = best_model.predict(x_val)

In [None]:
bet_matrix_val = np.zeros((pred_val.shape))
max_bet_val = np.argmax(pred_val, axis=1)

In [None]:
# Create matrix with discrete decisions on which bet will be chosen
n = bet_matrix_val.shape[0]
bet_matrix_val[np.arange(n), max_bet_val] = 1
bet_matrix_val

array([[0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.]])

In [None]:
profit_val_strat = evaluate_bets(bet_matrix_val, x_val['Odds_Home'], x_val['Odds_Away'], y_val)

print(f"Model achieves profit of ${profit_val_strat.sum():.2f} on the validation sample")

Model achieves profit of $-369.71 on the validation sample


### Running experiments: Removing team from the X matrix

In [None]:
# Removing team from features to see if model is able to generalize better without teams
x_train = x_train[['Intercept', 'Time_left', 'Spread', 'Odds_Home', 'Odds_Away', 'Initial_odds_home', 'Initial_odds_away', 'Q_2.0', 'Q_3.0', 'Q_4.0']]
x_val = x_val[['Intercept', 'Time_left', 'Spread', 'Odds_Home', 'Odds_Away', 'Initial_odds_home', 'Initial_odds_away', 'Q_2.0', 'Q_3.0', 'Q_4.0']]
x_test = x_test[['Intercept', 'Time_left', 'Spread', 'Odds_Home', 'Odds_Away', 'Initial_odds_home', 'Initial_odds_away', 'Q_2.0', 'Q_3.0', 'Q_4.0']]

In [None]:
input_dim = x_train.shape[1]

In [None]:
# Running NN with best model
best_model = get_model(input_dim, output_dim, 150, 0.75, 0.50)
history = best_model.fit(x_train, ynn_train, validation_data=(x_val, ynn_val),
          epochs=200, batch_size=5000, callbacks=[EarlyStopping(patience=25),ModelCheckpoint('odds_loss.hdf5',save_best_only=True)])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200


In [None]:
print('Training Loss : {}\nValidation Loss : {}'.format(best_model.evaluate(x_train, ynn_train), best_model.evaluate(x_val, ynn_val)))

Training Loss : -0.08548333495855331
Validation Loss : 0.024215780198574066


In [None]:
pred_val = best_model.predict(x_val)
bet_matrix_val = np.zeros((pred_val.shape))
max_bet_val = np.argmax(pred_val, axis=1)
n = bet_matrix_val.shape[0]
bet_matrix_val[np.arange(n), max_bet_val] = 1
profit_val_strat = evaluate_bets(bet_matrix_val, x_val['Odds_Home'], x_val['Odds_Away'], y_val)

print(f"Model achieves profit of ${profit_val_strat.sum():.2f} on the validation sample")

Model achieves profit of $-1125.94 on the validation sample
