Notebook used to split the main dataset into training, testing, and validation sets. A fixed random seed ensures the data is shuffled consistently when training different models.

Also adds the relevant odds information to the y-vector, as it is required by the model's loss function.

In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler

In [2]:
# Load the feature matrix, the labels, and the game id of every row
data_path = "main_data/feature_data/"
lookback_games = 13

X = np.load(data_path + f"double_mean_lg{lookback_games}_X.npy")
Y = np.load(data_path + f"double_mean_lg{lookback_games}_Y.npy")
ids = np.load(data_path + f"double_mean_lg{lookback_games}_ids.npy")

# NOTE(review): unpickling can execute arbitrary code; only load a trusted file here.
data_master = pd.read_pickle("main_data/master_data.df")

# Add odds to the Y array.
# Build one gameId -> odds lookup per side (first row wins when a game appears
# more than once, matching the previous `.iloc[0]` behaviour) instead of
# re-filtering the whole frame for every single game id.
home_games = data_master[data_master.home_or_away == "HOME"]
away_games = data_master[data_master.home_or_away == "AWAY"]
home_odds_by_game = home_games.drop_duplicates(subset="gameId").set_index("gameId")["odds"]
away_odds_by_game = away_games.drop_duplicates(subset="gameId").set_index("gameId")["odds"]

# `.loc` raises KeyError if a game id has no HOME/AWAY row, so a broken join
# still fails loudly. Casting to float turns None/object odds into NaN so the
# NaN filter below can handle them.
home_odds = home_odds_by_game.loc[ids].to_numpy(dtype=float)
away_odds = away_odds_by_game.loc[ids].to_numpy(dtype=float)

# Resulting columns: [original label(s), home odds, away odds]
Y = np.column_stack([Y, home_odds, away_odds])

# Remove rows that have missing odds.
# Despite its name, the mask is True for rows to KEEP (no NaN in any column of Y).
missing_odds_mask = ~np.isnan(Y).any(axis=1)
X = X[missing_odds_mask]
Y = Y[missing_odds_mask]
# Keep the game ids row-aligned with X and Y after filtering
ids = ids[missing_odds_mask]

In [3]:
# Shuffle X and Y with one shared, reproducible permutation so that every
# feature row stays paired with its label/odds row.
np.random.seed(100)
shuffled_ids = np.random.permutation(len(X))
X = X[shuffled_ids]
Y = Y[shuffled_ids]

In [4]:
# Split into train / test / validation: the first 70% of rows are train, the
# next 15% are test, and whatever remains is validation.
n_rows = X.shape[0]
train_size = int(n_rows * 0.7)
test_size = int(n_rows * 0.15)
train_test_size = train_size + test_size

X_train, Y_train = X[:train_size], Y[:train_size]
X_test, Y_test = X[train_size:train_test_size], Y[train_size:train_test_size]
X_val, Y_val = X[train_test_size:], Y[train_test_size:]

# Scale the X data: statistics are fitted on the training rows only and then
# applied unchanged to the test and validation rows.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

# Cut each array to be a multiple of batch_size (drops the trailing partial batch)
batch_size = 16
train_size = (X_train.shape[0] // batch_size) * batch_size
test_size = (X_test.shape[0] // batch_size) * batch_size
val_size = (X_val.shape[0] // batch_size) * batch_size

X_train = X_train[:train_size]
Y_train = Y_train[:train_size]
X_test = X_test[:test_size]
Y_test = Y_test[:test_size]
X_val = X_val[:val_size]
Y_val = Y_val[:val_size]

In [5]:
# Persist every split to disk so other models can train on identical data
save_path = "data/validation_data/"
for array_name, array in (
    ("X_train", X_train),
    ("Y_train", Y_train),
    ("X_test", X_test),
    ("Y_test", Y_test),
    ("X_val", X_val),
    ("Y_val", Y_val),
):
    np.save(save_path + array_name + ".npy", array)