In [2]:
# This mounts your Google Drive to the Colab VM.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Enter the foldername in your Drive where you have saved the unzipped
# assignment folder, e.g. 'cs231n/assignments/assignment3/'
FOLDERNAME = 'CS229/Project/'
assert FOLDERNAME is not None, "[!] Enter the foldername."

# now that we've mounted your Drive, this ensures that
# the Python interpreter of the Colab VM can load
# python files from within it.
import sys
sys.path.append('/content/drive/My Drive/{}'.format(FOLDERNAME))
%cd drive/My\ Drive/$FOLDERNAME

Mounted at /content/drive
/content/drive/My Drive/CS229/Project


In [3]:
import numpy as np
import pandas as pd
import os

from util import load_dataset
from util import place_bets
from util import evaluate_bets

from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn import metrics

# for auto-reloading external modules
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

pd.set_option('display.max_rows', 300)
pd.set_option('display.max_columns', None)

  import pandas.util.testing as tm


Setting up custom loss function

In [4]:
# In contrast to custom_loss implemented in custom_loss.py, this function returns a vector with the loss for each individual bet, as opposed to a sum total

def custom_loss(true_label, predict_label, odds_home, odds_away):
    """ Computes total betting loss given a series of bets to be made and odds at each time point

    Args:
        true_label: outcomes of the match, 0 is for home team loses, 1 is for home team wins, size (n,)
        predict_label: predictions of the model, size (n,)
        odds_home: odds that bookies pay for a win at home, size (n,)
        odds_away: odds that bookies pay for a win away, size (n,)

    Returns:
        loss: Total betting loss, scalar
    """
    loss_vec = true_label * (predict_label * (odds_home - 1) + predict_label - 1) + (1-true_label) * ((1-predict_label)*(odds_away-1) - predict_label)
    return loss_vec

### Setting up the data

In [5]:
df = pd.read_csv('Load_Data/Moneyline_alldata.csv', index_col=0)

In [6]:
df.head()

Unnamed: 0,ID,Match_date,Timestamp,Match_time,Q,Time_left,Team_Home,Team_Away,Score_Home,Score_Away,Spread,Odds_Home,Odds_Away,Initial_odds_home,Initial_odds_away,Winner
0,0,2019/12/05 08:30,12/04 16:30,Pre-match,1.0,48.0,Atlanta Hawks,BKN Nets,0,0,0,1.95,1.86,1.95,1.86,0
1,0,2019/12/05 08:30,12/04 23:11,Pre-match,1.0,48.0,Atlanta Hawks,BKN Nets,0,0,0,2.3,1.66,1.95,1.86,0
2,0,2019/12/05 08:30,12/04 23:25,Pre-match,1.0,48.0,Atlanta Hawks,BKN Nets,0,0,0,2.25,1.68,1.95,1.86,0
3,0,2019/12/05 08:30,12/04 23:58,Pre-match,1.0,48.0,Atlanta Hawks,BKN Nets,0,0,0,2.3,1.66,1.95,1.86,0
4,0,2019/12/05 08:30,12/05 01:10,Pre-match,1.0,48.0,Atlanta Hawks,BKN Nets,0,0,0,2.35,1.64,1.95,1.86,0


### Descriptive Statistics

In [7]:
winners = df.groupby(['ID']).max()['Winner']
wins = winners.sum()
loss = len(winners) - wins
    
print(f"Number of wins from home team: {wins}")
print(f"Number of wins from away team: {loss}")

Number of wins from home team: 579
Number of wins from away team: 466


### Loading train, validation and test sets

In [8]:
# Training
x_train = load_dataset("Load_Data/x_train.csv", intercept=True)
y_train = load_dataset("Load_Data/y_train.csv").to_numpy().flatten()

# Validation
x_val = load_dataset("Load_Data/x_val.csv", intercept=True)
y_val = load_dataset("Load_Data/y_val.csv").to_numpy().flatten()

# Test
x_test = load_dataset("Load_Data/x_test.csv", intercept=True)
y_test = load_dataset("Load_Data/y_test.csv").to_numpy().flatten()

In [9]:
print(f"number of observations in train: {x_train.shape[0]}, validation: {x_val.shape[0]}, test: {x_test.shape[0]}")

number of observations in train: 211591, validation: 46749, test: 41406


### Running the Model

#### Stats Models

In [10]:
# Building model in stats - useful to see coefficient of results (interpretability)
logit_model=sm.Logit(y_train, x_train)
result=logit_model.fit(method='newton')
print(result.summary2())

Optimization terminated successfully.
         Current function value: 0.470283
         Iterations 7
                                 Results: Logit
Model:                   Logit                 Pseudo R-squared:      0.319      
Dependent Variable:      y                     AIC:                   199151.5017
Date:                    2020-11-19 06:30      BIC:                   199849.3456
No. Observations:        211591                Log-Likelihood:        -99508.    
Df Model:                67                    LL-Null:               -1.4618e+05
Df Residuals:            211523                LLR p-value:           0.0000     
Converged:               1.0000                Scale:                 1.0000     
No. Iterations:          7.0000                                                  
---------------------------------------------------------------------------------
                                  Coef.  Std.Err.    z     P>|z|   [0.025  0.975]
------------------------------

#### Scikit Learn model

In [11]:
# Initializing the logistic regression class for Scikit Learn
logreg = LogisticRegression(max_iter=2000, verbose=1)

In [12]:
# Fit the model with training data
logreg.fit(x_train, y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   44.9s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=2000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=1,
                   warm_start=False)

In [13]:
# Make predictions on validation data. Prediction is a closed 0,1 solution based on which outcome is more likely
predictions = logreg.predict(x_val)
# Show underlying probability of predictions
prob_pred = logreg.predict_proba(x_val)

In [14]:
# prob_pred shows the probability of home team losing in first column (0) and home team winning in the second column (1)
prob_pred

array([[0.45775463, 0.54224537],
       [0.45021683, 0.54978317],
       [0.45775463, 0.54224537],
       ...,
       [0.98992514, 0.01007486],
       [0.98834397, 0.01165603],
       [0.98834397, 0.01165603]])

In [15]:
# Show the prediction
predictions

array([1, 1, 1, ..., 0, 0, 0])

In [16]:
# Score on the training set
score_train = logreg.score(x_train, y_train)
print(score_train)

0.7752220084975259


In [17]:
# Score returns the mean accuracy of the model on the validation set
score = logreg.score(x_val, y_val)
print(score)
# Looks like our model is not overfitting

0.7184538706710304


### Baseline Analysis

Lets first see what the model's accuracy would be if we just predicted the favorite to win each game

In [18]:
# Prediction accuracy in the training sample
score_tr = logreg.score(x_train, y_train)
print(f"Score on the training sample is {100*score_tr:.2f}%")

Score on the training sample is 77.52%


In [19]:
# 1 if Home team is favorite to win the game
x_val['Favorite'] = 0
x_val['Favorite'] = np.where((x_val.Initial_odds_home < x_val.Initial_odds_away), 1, x_val.Favorite)

# Computing score where favorite at the beginning of the game is always expected to win
score_fav = np.mean(x_val['Favorite'] == y_val)

In [20]:
print(f"Model predicts winner accurately {100*score:.2f}% of the time, while blind predictions on the favorite yields {100*score_fav:.2f}%")

Model predicts winner accurately 71.85% of the time, while blind predictions on the favorite yields 62.30%


In [21]:
# Now lets calculate the very clear benchmark of just picking the bet for the team with the highest probability based on casino odds
casino_pick = np.where(x_val['Odds_Home'] > x_val['Odds_Away'], 0, 1)
x_val['Casino_pick'] = casino_pick
score_casino = np.mean(x_val['Casino_pick'] == y_val)
print(f"Casino predicts winner accurately {100*score_casino:.2f}% of the time")

Casino predicts winner accurately 77.62% of the time


### Constructing odds matrix for exploratory analysis

In [22]:
prob_pred

array([[0.45775463, 0.54224537],
       [0.45021683, 0.54978317],
       [0.45775463, 0.54224537],
       ...,
       [0.98992514, 0.01007486],
       [0.98834397, 0.01165603],
       [0.98834397, 0.01165603]])

In [23]:
predictions

array([1, 1, 1, ..., 0, 0, 0])

In [24]:
# Putting it all together in an odds df

# Search for the x_val index values in the original df to find odds_home, odds_away and winner
Odds1 = pd.concat([df.loc[x_val.index.values, 'Odds_Home'], df.loc[x_val.index.values, 'Odds_Away'], df.loc[x_val.index.values, 'Winner']], axis=1)
Odds1.reset_index(inplace=True)
# Concatenate with predictions from the model, 'y' and 'y_pred'
Odds = pd.concat([Odds1, pd.Series(predictions), pd.Series(prob_pred[:, 1])], axis=1, ignore_index=True)
# Rename columns and set index
Odds.columns = ['Index', 'Odds_Home', 'Odds_Away', 'y', 'y_hat', 'pred_odds_home']
Odds.set_index('Index', inplace=True)

# Putting probabilities side by side
Odds['Prob_Home'] = 1 / Odds['Odds_Home']
Odds['Prob_Away'] = 1 / Odds['Odds_Away']
# Adding scores and time left
Odds = pd.concat([Odds, x_val['Favorite'], x_val['Casino_pick'], df.loc[x_val.index.values, 'Score_Home'], df.loc[x_val.index.values, 'Score_Away'], df.loc[x_val.index.values, 'Time_left']], axis=1)

In [25]:
Odds

Unnamed: 0,Odds_Home,Odds_Away,y,y_hat,pred_odds_home,Prob_Home,Prob_Away,Favorite,Casino_pick,Score_Home,Score_Away,Time_left
2792,3.05,1.400,0,1,0.542245,0.327869,0.714286,0,0,0,0,48.000000
2793,2.90,1.430,0,1,0.549783,0.344828,0.699301,0,0,0,0,48.000000
2794,3.05,1.400,0,1,0.542245,0.327869,0.714286,0,0,0,0,48.000000
2795,3.10,1.400,0,1,0.540218,0.322581,0.714286,0,0,0,0,48.000000
2796,3.20,1.370,0,1,0.534688,0.312500,0.729927,0,0,0,0,48.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
298432,34.00,1.002,0,0,0.001277,0.029412,0.998004,0,0,103,111,4.400000
298433,26.00,1.003,0,0,0.004994,0.038462,0.997009,0,0,104,111,4.383333
298434,21.00,1.005,0,0,0.010075,0.047619,0.995025,0,0,104,113,3.800000
298435,19.00,1.006,0,0,0.011656,0.052632,0.994036,0,0,104,116,3.733333


### Calculating profit

In [26]:
# Looking at profit in training sample
predictions_tr = logreg.predict(x_train)
train_profit = custom_loss(y_train, predictions_tr, x_train['Odds_Home'], x_train['Odds_Away'])

print(f"The model achieves a profit of ${train_profit.sum():.2f} in the training sample")

The model achieves a profit of $29729.68 in the training sample


In [27]:
# Calculating profit based on predictions
profit_vec = custom_loss(Odds['y'], Odds['y_hat'], Odds['Odds_Home'], Odds['Odds_Away'])
profit_df = pd.DataFrame(data=profit_vec).rename(columns={0: 'Profit'})

# Calculating profit based on favorite
profit_vec_fav = custom_loss(Odds['y'], Odds['Casino_pick'], Odds['Odds_Home'], Odds['Odds_Away'])
profit_df_fav = pd.DataFrame(data=profit_vec_fav).rename(columns={0: 'Profit_casino_pick'})

In [28]:
odds_df = pd.concat((Odds, profit_df, profit_df_fav), axis=1)

In [29]:
odds_df

Unnamed: 0,Odds_Home,Odds_Away,y,y_hat,pred_odds_home,Prob_Home,Prob_Away,Favorite,Casino_pick,Score_Home,Score_Away,Time_left,Profit,Profit_casino_pick
2792,3.05,1.400,0,1,0.542245,0.327869,0.714286,0,0,0,0,48.000000,-1.000,0.400
2793,2.90,1.430,0,1,0.549783,0.344828,0.699301,0,0,0,0,48.000000,-1.000,0.430
2794,3.05,1.400,0,1,0.542245,0.327869,0.714286,0,0,0,0,48.000000,-1.000,0.400
2795,3.10,1.400,0,1,0.540218,0.322581,0.714286,0,0,0,0,48.000000,-1.000,0.400
2796,3.20,1.370,0,1,0.534688,0.312500,0.729927,0,0,0,0,48.000000,-1.000,0.370
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298432,34.00,1.002,0,0,0.001277,0.029412,0.998004,0,0,103,111,4.400000,0.002,0.002
298433,26.00,1.003,0,0,0.004994,0.038462,0.997009,0,0,104,111,4.383333,0.003,0.003
298434,21.00,1.005,0,0,0.010075,0.047619,0.995025,0,0,104,113,3.800000,0.005,0.005
298435,19.00,1.006,0,0,0.011656,0.052632,0.994036,0,0,104,116,3.733333,0.006,0.006


In [30]:
model_profit = odds_df['Profit'].sum()
favorite_profit = odds_df['Profit_casino_pick'].sum()
print(f"Model generates ${model_profit:.2f} in profit. Betting for the casino favorite always generates ${favorite_profit:.2f} in profit")
print(f"Betting on the model represents a {100* model_profit / odds_df.shape[0]:.2f}% return on an investment of ${odds_df.shape[0]}")

Model generates $-1364.86 in profit. Betting for the casino favorite always generates $-504.50 in profit
Betting on the model represents a -2.92% return on an investment of $46749


In [31]:
profit_vec_nofav = custom_loss(Odds['y'], 1-Odds['Casino_pick'], Odds['Odds_Home'], Odds['Odds_Away'])
no_favorite_profit = profit_vec_nofav.sum()
print(f"Model generates ${model_profit:.2f} in profit. Betting for the opposite of the casino favorite generates ${no_favorite_profit:.2f} in profit")

Model generates $-1364.86 in profit. Betting for the opposite of the casino favorite generates $-8920.49 in profit


### Using Expected Value at each point to determine whether to place a bet

In [32]:
# Instead of looking at hard predictions, lest look at soft predicted probabilities
prob_train = logreg.predict_proba(x_train)

# Lets look at the odds for each team
odds_home_train = x_train['Odds_Home']
odds_away_train = x_train['Odds_Away']

# Making bet decision, using place_bets function from util
bets_train = place_bets(prob_train, odds_home_train, odds_away_train)

# Evaluating profit in the training sample
profit_train = evaluate_bets(bets_train, odds_home_train, odds_away_train, y_train)

In [33]:
# Showing example of bets matrix {Bet home, Bet away, No bet}
bets_train

array([[1., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       ...,
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.]])

In [34]:
# Eliminating "Favorite" and "Casino pick" from x_val
x_val.drop(labels=['Favorite', 'Casino_pick'], axis=1, inplace=True)

In [35]:
# Lets do the same for the validation set
prob_val = logreg.predict_proba(x_val)
odds_home_val = x_val['Odds_Home']
odds_away_val = x_val['Odds_Away']
bets_val = place_bets(prob_val, odds_home_val, odds_away_val)
profit_val = evaluate_bets(bets_val, odds_home_val, odds_away_val, y_val)

In [36]:
# Looking at the results from our model
print(f"The model achieves a profit of ${profit_train.sum():.2f} in the training sample")
print(f"The model achieves a profit of ${profit_val.sum():.2f} in the validation sample. This is a {100* profit_val.sum() / odds_df.shape[0]:.2f}% return on an investment") 

The model achieves a profit of $56056.10 in the training sample
The model achieves a profit of $-2309.20 in the validation sample. This is a -4.94% return on an investment


In [37]:
# Looking at how many bets the model is placings
print(bets_val.sum(axis=0))

[18313. 22066.  6370.]


### Looking at histogram of profits by match