In [None]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment3/'
FOLDERNAME = 'CS229/Project/Load_Data'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
%cd drive/My\ Drive/$FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive/CS229/Project/Load_Data


In [None]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import train_test_split

# Reload external modules automatically, so that edits to imported .py files
# take effect without restarting the kernel.
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

# Pandas display settings: show up to 300 rows and never hide columns
for option_name, option_value in (('display.max_rows', 300),
                                  ('display.max_columns', None)):
    pd.set_option(option_name, option_value)

### Separating data from Moneyline_alldata.csv into train, validation and test sets

In [None]:
# Load the full moneyline dataset (path is relative to the current working directory);
# index_col=0 uses the first CSV column as the row index instead of a data column.
df = pd.read_csv("Moneyline_alldata.csv", index_col=0)

In [None]:
# Show the loaded frame as the cell output to inspect its columns and values
df

Unnamed: 0,ID,Match_date,Timestamp,Match_time,Q,Time_left,Team_Home,Team_Away,Score_Home,Score_Away,Spread,Odds_Home,Odds_Away,Initial_odds_home,Initial_odds_away,Winner
0,0,2019/12/05 08:30,12/04 16:30,Pre-match,1.0,48.000000,Atlanta Hawks,BKN Nets,0,0,0,1.95,1.860,1.95,1.86,0
1,0,2019/12/05 08:30,12/04 23:11,Pre-match,1.0,48.000000,Atlanta Hawks,BKN Nets,0,0,0,2.30,1.660,1.95,1.86,0
2,0,2019/12/05 08:30,12/04 23:25,Pre-match,1.0,48.000000,Atlanta Hawks,BKN Nets,0,0,0,2.25,1.680,1.95,1.86,0
3,0,2019/12/05 08:30,12/04 23:58,Pre-match,1.0,48.000000,Atlanta Hawks,BKN Nets,0,0,0,2.30,1.660,1.95,1.86,0
4,0,2019/12/05 08:30,12/05 01:10,Pre-match,1.0,48.000000,Atlanta Hawks,BKN Nets,0,0,0,2.35,1.640,1.95,1.86,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299741,1044,2020/01/13 04:30,01/13 06:50,Q4 00:43,4.0,0.716667,Washington Wizards,Utah Jazz,115,122,-7,17.50,1.012,4.20,1.25,0
299742,1044,2020/01/13 04:30,01/13 06:50,Q4 00:43,4.0,0.716667,Washington Wizards,Utah Jazz,115,122,-7,15.50,1.028,4.20,1.25,0
299743,1044,2020/01/13 04:30,01/13 06:50,Q4 00:43,4.0,0.716667,Washington Wizards,Utah Jazz,115,122,-7,17.50,1.012,4.20,1.25,0
299744,1044,2020/01/13 04:30,01/13 06:51,Q4 00:33,4.0,0.550000,Washington Wizards,Utah Jazz,116,122,-6,19.00,1.006,4.20,1.25,0


### Generating variables for model

In [None]:
# Categorical columns that are turned into dummy (indicator) variables
cat_vars = ['Q', 'Team_Home', 'Team_Away']

# Work on a copy so the original df stays untouched
df_dummies = df.copy()

# Append the indicator columns of each categorical variable to the copy.
# drop_first=True leaves out the first level of every variable.
for cat_var in cat_vars:
    df_dummies = df_dummies.join(
        pd.get_dummies(df[cat_var], prefix=cat_var, drop_first=True)
    )

# Every column except the original categorical ones, which the dummies replace
data_vars = df_dummies.columns.values.tolist()
to_keep = [col for col in data_vars if col not in cat_vars]

In [None]:
# Keep just the columns listed in to_keep (dummies in, raw categorical columns out)
df_processed = df_dummies.loc[:, to_keep]

In [None]:
# Start from every processed column ...
final_vars = df_processed.columns.tolist()
# ... and take out the ones that must not reach the model.
# list.remove raises ValueError if one of them is missing from the data.
excluded_vars = ('Match_date', 'Timestamp', 'Match_time', 'Score_Home', 'Score_Away')
for excluded in excluded_vars:
    final_vars.remove(excluded)

# Label column(s) and feature columns (everything that is left and is not a label)
y_var = ['Winner']
X_vars = [name for name in final_vars if name not in y_var]

In [None]:
# Feature frame X and label frame y, both taken from the processed data
X, y = df_processed[X_vars], df_processed[y_var]

### Setting up samples

In [None]:
# Process for splitting data

def train_val_test_splits(X, y, train_share, validation_share, test_share, seed=None):
  """ Splits a feature matrix X and labels y into training, validation and test samples by match ID

  Every distinct value of X['ID'] is randomly assigned to exactly one sample and all rows
  with that ID follow it, so rows of one match never end up in two different samples.
  Because whole matches are assigned, row counts only approximate the requested shares.

  Args:
    X: feature DataFrame, must contain column named ID with a unique match identifier
    y: labels, row-aligned with X (it is indexed with boolean masks built from X)
    train_share, validation_share, test_share: share of matches to assign to each sample,
      must sum to 1
    seed: optional seed for a private random generator, giving a reproducible split.
      With the default None, NumPy's global generator is used, so a prior
      np.random.seed(...) call still controls the result.

  Returns:
    x_train, x_val, x_test: splits of the feature data, without the ID column
    y_train, y_val, y_test: splits of the label data

  Raises:
    AssertionError: if the three shares do not sum to 1
  """
  # Verifying that the input makes sense. Compare with a tolerance: exact float
  # equality rejects valid inputs such as 0.6 + 0.3 + 0.1.
  assert np.isclose(train_share + validation_share + test_share, 1.0), "Share does not sum to 100%"

  # We want to break up samples by separating match IDs into train, validation and test sets
  ID_list = X['ID'].unique()

  # Draw one uniform number in [0, 1) per match
  rng = np.random if seed is None else np.random.RandomState(seed)
  rand = rng.uniform(size=len(ID_list))

  # Create True/False vectors that determine whether each match is part of that sample.
  # The cut-off uses the validation_share argument, not a variable from the notebook scope.
  val_upper = train_share + validation_share
  train_sample = (rand <= train_share)
  val_sample = ((train_share < rand) & (rand < val_upper))
  test_sample = (rand >= val_upper)

  # Obtaining the list of game IDs for each sample
  train_id = ID_list[train_sample]
  val_id = ID_list[val_sample]
  test_id = ID_list[test_sample]

  # Go back to the full sized matrix and flag the rows whose game ID belongs to each sample
  train_matrix = X['ID'].isin(train_id)
  val_matrix = X['ID'].isin(val_id)
  test_matrix = X['ID'].isin(test_id)

  # Remove the ID column from the features. drop() returns a new frame, so the caller's X
  # is not modified; the keyword form is used because pandas 2.0 removed the positional axis.
  X = X.drop(columns='ID')

  # Finally, we extract the games for each sample
  x_train = X[train_matrix]
  x_val = X[val_matrix]
  x_test = X[test_matrix]

  # Do the same for labels now
  y_train = y[train_matrix]
  y_val = y[val_matrix]
  y_test = y[test_matrix]

  return x_train, x_val, x_test, y_train, y_val, y_test

# NOTE: sample sizes are deliberately not printed in this cell. x_train, x_val and
# x_test do not exist until train_val_test_splits has been called, so printing them
# from the cell that only defines the function raises NameError on a fresh kernel
# (Restart & Run All). Report the sizes after the call that creates the splits.



Training sample is 207024
Validation sample is 45397
Test sample is 47325


In [None]:
# Define what percentage we want in training, validation and test
train_share = 0.70
val_share = 0.15
test_share = 0.15

# Seed NumPy's global generator, which the split draws from, so that the partition
# saved to CSV afterwards is identical on every Restart & Run All
SPLIT_SEED = 229
np.random.seed(SPLIT_SEED)

x_train, x_val, x_test, y_train, y_val, y_test = train_val_test_splits(X, y, train_share, val_share, test_share)

# Verifying size of outputs: every row of X must land in exactly one sample
assert len(x_train) + len(x_val) + len(x_test) == len(X), "Splits do not cover all rows exactly once"
print(f"Training sample is {x_train.shape[0]}")
print(f"Validation sample is {x_val.shape[0]}")
print(f"Test sample is {x_test.shape[0]}")

In [None]:
# Write each split to its own CSV file in the working directory.
# Label splits are wrapped in a DataFrame before being written.
split_outputs = [
    ("x_train.csv", x_train),
    ("y_train.csv", pd.DataFrame(y_train)),
    ("x_val.csv", x_val),
    ("y_val.csv", pd.DataFrame(y_val)),
    ("x_test.csv", x_test),
    ("y_test.csv", pd.DataFrame(y_test)),
]
for csv_name, frame in split_outputs:
    frame.to_csv(csv_name)