In [8]:
import libsimulation
from src import main

import os, datetime, argparse, requests, urllib.parse, sys, re, traceback, json
import math, numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# NBA data jupyter notebook

__Important__
This notebook is here for you to quickly test with the data.
It is __not__ the final submission, as we will only run your code provided in `src/main.py` and any other files referenced from it.

## Set up some environmental settings

In [9]:
# Initialize some settings
settings = libsimulation.SimulationSettings()
# This prevents you accidentally loading data beyond this point, and also defines the start of the simulation run period
settings.cutoff = '2019-01-01'

# Exploring data

In [10]:
data_loader = libsimulation.NbaDataLoader(settings)

In [11]:
data_loader.getSeason('2011')

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
0,5210,2010-10-26T19:30:00,BOS,MIA,8.0,168.0,4.0,240.0,42.0,88.0,6.0,16.0,29.0,18.0,25.0,6.0,240.0,39.0,80.0,10.0,9.0,21.0,27.0,23.0,2011,Final
1,5211,2010-10-26T22:00:00,POR,PHO,14.0,198.0,2.0,240.0,48.0,106.0,11.0,28.0,22.0,25.0,31.0,4.0,240.0,30.0,92.0,3.0,26.0,20.0,35.0,11.0,2011,Final
2,5212,2010-10-26T22:30:00,LAL,HOU,2.0,222.0,4.0,240.0,44.0,112.0,11.0,26.0,25.0,26.0,35.0,7.0,239.0,53.0,110.0,6.0,33.0,29.0,20.0,28.0,2011,Final
3,5213,2010-10-27T19:00:00,CLE,BOS,,,,,,,,,,,,,,,,,,,,,2011,Scheduled
4,5214,2010-10-27T19:00:00,BKN,DET,3.0,199.0,4.0,240.0,44.0,101.0,4.0,20.0,19.0,31.0,31.0,3.0,239.0,37.0,98.0,7.0,20.0,27.0,20.0,31.0,2011,Final
5,5215,2010-10-27T19:00:00,PHI,MIA,-10.0,184.0,4.0,240.0,38.0,87.0,10.0,15.0,26.0,13.0,33.0,6.0,240.0,37.0,97.0,8.0,18.0,31.0,31.0,17.0,2011,Final
6,5216,2010-10-27T19:00:00,TOR,NY,-5.0,191.0,5.0,239.0,45.0,93.0,6.0,22.0,25.0,25.0,21.0,10.0,240.0,49.0,98.0,4.0,29.0,22.0,23.0,24.0,2011,Final
7,5220,2010-10-27T20:00:00,MEM,ATL,-15.0,223.0,4.0,240.0,39.0,104.0,7.0,22.0,28.0,25.0,29.0,9.0,240.0,44.0,119.0,7.0,30.0,32.0,29.0,28.0,2011,Final
8,5218,2010-10-27T20:00:00,MIN,SAC,-1.0,233.0,8.0,240.0,46.0,116.0,5.0,23.0,35.0,25.0,33.0,4.0,240.0,45.0,117.0,8.0,27.0,26.0,34.0,30.0,2011,Final
9,5219,2010-10-27T20:00:00,OKC,CHI,11.0,201.0,10.0,240.0,44.0,106.0,11.0,30.0,29.0,23.0,24.0,6.0,240.0,51.0,95.0,6.0,25.0,29.0,28.0,13.0,2011,Final


In [14]:
data_loader.getSeason('2018POST')

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
0,12380,2018-04-14T15:00:00,GS,SA,21.0,205.0,6.0,240.0,51.0,113.0,6.0,28.0,29.0,29.0,27.0,4.0,240.0,30.0,92.0,9.0,17.0,24.0,22.0,29.0,2018POST,Final
1,12381,2018-04-14T17:30:00,TOR,WAS,8.0,220.0,7.0,240.0,38.0,114.0,6.0,28.0,27.0,31.0,28.0,3.0,240.0,35.0,106.0,11.0,23.0,36.0,26.0,21.0,2018POST,Final
2,12382,2018-04-14T20:00:00,PHI,MIA,27.0,233.0,5.0,240.0,50.0,130.0,9.0,29.0,27.0,34.0,40.0,6.0,240.0,42.0,103.0,4.0,35.0,25.0,18.0,25.0,2018POST,Final
3,12383,2018-04-14T22:30:00,POR,NO,-2.0,192.0,6.0,240.0,52.0,95.0,10.0,18.0,18.0,27.0,32.0,12.0,240.0,49.0,97.0,8.0,21.0,24.0,30.0,22.0,2018POST,Final
4,12384,2018-04-15T13:00:00,BOS,MIL,6.0,220.0,5.0,265.0,45.0,113.0,8.0,29.0,15.0,26.0,29.0,8.0,265.0,42.0,107.0,8.0,17.0,30.0,19.0,33.0,2018POST,F/OT
5,12385,2018-04-15T15:30:00,CLE,IND,-18.0,178.0,4.0,240.0,46.0,80.0,7.0,14.0,24.0,27.0,15.0,4.0,240.0,44.0,98.0,12.0,33.0,22.0,18.0,25.0,2018POST,Final
6,12386,2018-04-15T18:30:00,OKC,UTA,8.0,224.0,5.0,240.0,46.0,116.0,8.0,25.0,29.0,27.0,35.0,7.0,240.0,42.0,108.0,10.0,25.0,23.0,24.0,36.0,2018POST,Final
7,12387,2018-04-15T21:00:00,HOU,MIN,3.0,205.0,8.0,240.0,37.0,104.0,8.0,27.0,27.0,22.0,28.0,2.0,240.0,47.0,101.0,3.0,21.0,26.0,25.0,29.0,2018POST,Final
8,12388,2018-04-16T20:00:00,PHI,MIA,-10.0,216.0,5.0,240.0,49.0,103.0,7.0,29.0,13.0,33.0,28.0,7.0,240.0,46.0,113.0,8.0,22.0,34.0,30.0,27.0,2018POST,Final
9,12389,2018-04-16T22:30:00,GS,SA,15.0,217.0,5.0,240.0,39.0,116.0,5.0,23.0,24.0,33.0,36.0,2.0,240.0,35.0,101.0,10.0,25.0,28.0,22.0,26.0,2018POST,Final


In [15]:
data_loader.getGame(5210)

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
0,5210,Zydrunas Ilgauskas,2010-10-26T19:30:00,MIA,2011,1,,,10,1,C,3,3
1,5210,Eddie House,2010-10-26T19:30:00,MIA,2011,0,,,26,8,G,0,1
2,5210,Carlos Arroyo,2010-10-26T19:30:00,MIA,2011,0,,,12,3,PG,0,0
3,5210,LeBron James,2010-10-26T19:30:00,MIA,2011,2,,,42,31,F,4,1
4,5210,Chris Bosh,2010-10-26T19:30:00,MIA,2011,1,,,37,8,FC,8,1
5,5210,Dwyane Wade,2010-10-26T19:30:00,MIA,2011,0,,,36,13,G,4,3
6,5210,James Jones,2010-10-26T19:30:00,MIA,2011,0,,,26,6,GF,2,1
7,5210,Udonis Haslem,2010-10-26T19:30:00,MIA,2011,1,,,30,8,PF,11,0
8,5210,Joel Anthony,2010-10-26T19:30:00,MIA,2011,1,,,16,2,FC,7,0
9,5210,Shaquille O'Neal,2010-10-26T19:30:00,BOS,2011,1,,,18,9,C,7,0


In [16]:
data_loader.getPlayers('2011')

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
0,5210,Zydrunas Ilgauskas,2010-10-26T19:30:00,MIA,2011,1,,,10,1,C,3,3
1,5210,Eddie House,2010-10-26T19:30:00,MIA,2011,0,,,26,8,G,0,1
2,5210,Carlos Arroyo,2010-10-26T19:30:00,MIA,2011,0,,,12,3,PG,0,0
3,5210,LeBron James,2010-10-26T19:30:00,MIA,2011,2,,,42,31,F,4,1
4,5210,Chris Bosh,2010-10-26T19:30:00,MIA,2011,1,,,37,8,FC,8,1
5,5210,Dwyane Wade,2010-10-26T19:30:00,MIA,2011,0,,,36,13,G,4,3
6,5210,James Jones,2010-10-26T19:30:00,MIA,2011,0,,,26,6,GF,2,1
7,5210,Udonis Haslem,2010-10-26T19:30:00,MIA,2011,1,,,30,8,PF,11,0
8,5210,Joel Anthony,2010-10-26T19:30:00,MIA,2011,1,,,16,2,FC,7,0
9,5210,Shaquille O'Neal,2010-10-26T19:30:00,BOS,2011,1,,,18,9,C,7,0


# Building and Testing a model

## Load some data into train and validaiton sets

In [17]:
data_loader = libsimulation.NbaDataLoader(settings)

In [18]:
def get_multi_season_game_data(data_loader, first_year, last_year):
    data = [pd.DataFrame(data_loader.getSeason(str(season))) for season in range(first_year, last_year + 1)]
    data = pd.concat(data, axis=0)
    data.dropna(axis=0, inplace=True)
    data.dateTime=pd.to_datetime(data.dateTime)
    data.sort_values('dateTime', inplace=True)
    data.reset_index(inplace=True, drop=True)
    return data

In [19]:
train_data = get_multi_season_game_data(data_loader, 2009, 2016)
test_data = get_multi_season_game_data(data_loader, 2017, 2018)

In [20]:
train_data.head(2)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
0,6265,2008-10-28 20:00:00,BOS,CLE,5.0,175.0,0.0,240.0,36.0,90.0,10.0,22.0,21.0,24.0,23.0,2.0,240.0,41.0,85.0,8.0,28.0,22.0,13.0,22.0,2009,Final
1,6266,2008-10-28 20:30:00,CHI,MIL,13.0,203.0,6.0,240.0,40.0,108.0,9.0,26.0,29.0,27.0,26.0,3.0,240.0,32.0,95.0,5.0,26.0,26.0,23.0,20.0,2009,Final


In [21]:
test_data.head(2)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
0,9642,2016-10-25 19:30:00,CLE,NY,29.0,205.0,5.0,240.0,51.0,117.0,12.0,28.0,20.0,34.0,35.0,6.0,240.0,42.0,88.0,6.0,18.0,27.0,19.0,24.0,2017,Final
1,9643,2016-10-25 22:00:00,POR,UTA,9.0,217.0,3.0,239.0,34.0,113.0,5.0,26.0,28.0,23.0,36.0,5.0,240.0,31.0,104.0,9.0,26.0,20.0,37.0,21.0,2017,Final


## Define some functions to calcuate a Elo ratings over time

In [22]:
## Elo model's probability of home team winning
def home_win_probability(home_elo, away_elo):
    return 1 / (1 + math.pow(10, -(home_elo - away_elo) / 400)) 

## Get new Elo ratings home and away teams after a game
def get_updated_elo(
    home_elo, away_elo, 
    home_victory, ## 1 if home team won, 0 if away team won
    K,  ## model hyperparameter 
): 
    if home_victory not in [0, 1, False, True]:
        raise ValueError(f"home_victory should be 1 if home team won, 0 if away team won. Got {home_victory}")

    P_home_win = home_win_probability(home_elo, away_elo) 
    P_away_win = 1 - P_home_win
  
    # When home team wins 
    if home_victory : 
        home_elo += K * P_away_win
        away_elo -= K * P_home_win
      
    # When away team wins 
    else : 
        home_elo -= K * P_away_win
        away_elo += K * P_home_win
        
    return home_elo, away_elo

## Iterate through games updating each teams Elo rating
def get_elos_over_time(data, ## dataframe of games, must be in order of occurence
                      starting_elo_dict={},  ## dictionary of elo scores by team at the beginning of the data period
                      default_elo=0,  ## elo initally given to a team not in starting_elo_dict
                      K=10,  ## model hyperparameter; higher number means individuals game affects Elo more
                     ):
    
    elo_dict = starting_elo_dict.copy()
    data['homeElo'] = np.nan
    data['awayElo'] = np.nan

    ## Iterate over rows of the dataframe (i.e. over games)
    for i, row in data.iterrows():
        
        home_team = row['homeTeam']
        away_team = row['awayTeam']
        home_elo = elo_dict.get(home_team, default_elo)
        away_elo = elo_dict.get(away_team, default_elo)
        
        ## Put the team's current ELO in the dataframe (this is the teams ELO *before* the match)
        data.loc[i,'homeElo'] = home_elo
        data.loc[i,'awayElo'] = away_elo
        
        ## Calculate the new elo scores and update elo_dict with them
        home_victory = row['pointsDiff'] > 0
        home_elo, away_elo = get_updated_elo(home_elo, away_elo, home_victory, K)
        elo_dict[home_team] = home_elo
        elo_dict[away_team] = away_elo
    
    return elo_dict

In [23]:
K = 10
pre_test_elo_dict = get_elos_over_time(train_data, starting_elo_dict={}, K=K)
post_test_elo_dict = get_elos_over_time(test_data, starting_elo_dict=pre_test_elo_dict, K=K)

In [24]:
train_data.head(2)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status,homeElo,awayElo
0,6265,2008-10-28 20:00:00,BOS,CLE,5.0,175.0,0.0,240.0,36.0,90.0,10.0,22.0,21.0,24.0,23.0,2.0,240.0,41.0,85.0,8.0,28.0,22.0,13.0,22.0,2009,Final,0.0,0.0
1,6266,2008-10-28 20:30:00,CHI,MIL,13.0,203.0,6.0,240.0,40.0,108.0,9.0,26.0,29.0,27.0,26.0,3.0,240.0,32.0,95.0,5.0,26.0,26.0,23.0,20.0,2009,Final,0.0,0.0


In [61]:
test_data.head(2)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status,homeElo,awayElo,EloDifference,EloSum,predictedDiff
0,9642,2016-10-25 19:30:00,CLE,NY,29.0,205.0,5.0,240.0,51.0,117.0,12.0,28.0,20.0,34.0,35.0,6.0,240.0,42.0,88.0,6.0,18.0,27.0,19.0,24.0,2017,Final,-207.640847,-508.064454,300.423606,-715.705301,4.190497
1,9643,2016-10-25 22:00:00,POR,UTA,9.0,217.0,3.0,239.0,34.0,113.0,5.0,26.0,28.0,23.0,36.0,5.0,240.0,31.0,104.0,9.0,26.0,20.0,37.0,21.0,2017,Final,64.598944,-339.203693,403.802637,-274.604749,4.947839


## Look at Elo ratings over time

In [26]:
def plot_team_elo_over_time(data, team):
    team_data = data.query(f'homeTeam == "{team}" | awayTeam == "{team}"').copy()
    team_data['Elo'] = team_data.eval(f'(homeTeam == "{team}") * homeElo + (awayTeam == "{team}") * awayElo')
    team_data = team_data[['dateTime', 'gameId', 'Elo']]
    plt.plot(team_data['dateTime'], team_data['Elo'], label=team)

In [30]:
%matplotlib notebook
combined_data = pd.concat([train_data, test_data])
for team in ['GS', 'MIA', 'NY', 'SA']:
    plot_team_elo_over_time(combined_data, team)
plt.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x290bcb7d630>

## Fit a linear model on our train data

In [31]:
train_data['EloDifference'] = train_data['homeElo'] - train_data['awayElo']
test_data['EloDifference'] = test_data['homeElo'] - test_data['awayElo']

In [32]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=False)
model.fit(X=train_data[['EloDifference']], y=train_data['pointsSum'])

LinearRegression(copy_X=True, fit_intercept=False, n_jobs=1, normalize=False)

In [33]:
## Use the statsmodels library to fit a linear model of Elo difference to points difference
train_data['EloSum'] = train_data['homeElo'] + train_data['awayElo']
test_data['EloSum'] = test_data['homeElo'] + test_data['awayElo']
X = train_data[['EloDifference', 'EloSum']]
X = sm.add_constant(X)
y = train_data['pointsDiff']
model = sm.OLS(y, X).fit()
model.summary()

0,1,2,3
Dep. Variable:,pointsDiff,R-squared:,0.036
Model:,OLS,Adj. R-squared:,0.036
Method:,Least Squares,F-statistic:,152.1
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,1.36e-65
Time:,14:25:26,Log-Likelihood:,-32385.0
No. Observations:,8116,AIC:,64780.0
Df Residuals:,8113,BIC:,64800.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,3.1317,0.173,18.087,0.000,2.792,3.471
EloDifference,0.0049,0.000,17.312,0.000,0.004,0.005
EloSum,0.0006,0.000,2.130,0.033,4.55e-05,0.001

0,1,2,3
Omnibus:,229.91,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,524.005
Skew:,0.132,Prob(JB):,1.64e-114
Kurtosis:,4.217,Cond. No.,769.0


## Make some predicitons on the test set

In [34]:
X_test = test_data[['EloDifference', 'EloSum']]
X_test = sm.add_constant(X_test)
test_data['predictedDiff'] = model.predict(X_test)
test_data.head(2)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status,homeElo,awayElo,EloDifference,EloSum,predictedDiff
0,9642,2016-10-25 19:30:00,CLE,NY,29.0,205.0,5.0,240.0,51.0,117.0,12.0,28.0,20.0,34.0,35.0,6.0,240.0,42.0,88.0,6.0,18.0,27.0,19.0,24.0,2017,Final,-207.640847,-508.064454,300.423606,-715.705301,4.190497
1,9643,2016-10-25 22:00:00,POR,UTA,9.0,217.0,3.0,239.0,34.0,113.0,5.0,26.0,28.0,23.0,36.0,5.0,240.0,31.0,104.0,9.0,26.0,20.0,37.0,21.0,2017,Final,64.598944,-339.203693,403.802637,-274.604749,4.947839


In [54]:
## Check how good our predictions are
sns.lmplot('predictedDiff', 'pointsDiff', test_data);

<IPython.core.display.Javascript object>

  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval


In [37]:
## remodel to see statistics on test data
X = test_data['predictedDiff']
y = test_data['pointsDiff']
test_model = sm.OLS(y, X).fit()
test_model.summary()

0,1,2,3
Dep. Variable:,pointsDiff,R-squared:,0.055
Model:,OLS,Adj. R-squared:,0.055
Method:,Least Squares,F-statistic:,142.9
Date:,"Wed, 27 Nov 2019",Prob (F-statistic):,4.6e-32
Time:,14:26:42,Log-Likelihood:,-9899.7
No. Observations:,2460,AIC:,19800.0
Df Residuals:,2459,BIC:,19810.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
predictedDiff,0.6380,0.053,11.956,0.000,0.533,0.743

0,1,2,3
Omnibus:,11.813,Durbin-Watson:,1.999
Prob(Omnibus):,0.003,Jarque-Bera (JB):,14.866
Skew:,0.064,Prob(JB):,0.000591
Kurtosis:,3.359,Cond. No.,1.0


## Set up a prediction method to incorporate the Elo model and return valid predictions

In [38]:
# Write some code
def predict(required_predictions, data_loader, log=lambda x: print(x)):
    first_year = 2016
    
    log('Loading training data')
    train_data = get_multi_season_game_data(data_loader, first_year=first_year, last_year=2020)
    
    log('Getting Elo ratings over time on train data')
    elo_dict = get_elos_over_time(train_data, starting_elo_dict={}, K=10)
    train_data['EloDifference'] = train_data['homeElo'] - train_data['awayElo']
    train_data['EloSum'] = train_data['homeElo'] + train_data['awayElo']
    
    log('Fitting linear model from Elo difference and sum to points difference')
    X = train_data[['EloDifference', 'EloSum']]
    X = sm.add_constant(X)
    y = train_data['pointsDiff']
    diff_model = sm.OLS(y, X).fit()
    
    log('Fitting linear model from Elo difference and sum to points sum')
    y = train_data['pointsSum']
    sum_model = sm.OLS(y, X).fit()
    
    log('Generating predictions')
#     required_predictions = pd.DataFrame(required_predictions)
    tmp = required_predictions[['homeTeam', 'awayTeam']].copy()
    tmp['homeElo'] = [elo_dict[team] for team in tmp['homeTeam']]
    tmp['awayElo'] = [elo_dict[team] for team in tmp['awayTeam']]
    tmp['EloDifference'] = tmp.eval('homeElo - awayElo')
    tmp['EloSum'] = tmp.eval('homeElo + awayElo')
    X = tmp[['EloDifference', 'EloSum']]
    X = sm.add_constant(X)
    tmp['predictedDiff'] = diff_model.predict(X)
    tmp['predictedSum'] = sum_model.predict(X)
    
    required_predictions['predictedDiff'] = tmp['predictedDiff']
    required_predictions['predictedSum'] = tmp['predictedSum']
    
    log('Finished')
    
#     return required_predictions.to_dict('records')
    return required_predictions

In [65]:
required_predictions = test_data[:1000][['homeTeam', 'awayTeam', 'dateTime', 'gameId']]
required_predictions = predict(required_predictions, data_loader)

Loading training data
Getting Elo ratings over time on train data
Fitting linear model from Elo difference and sum to points difference
Fitting linear model from Elo difference and sum to points sum
Generating predictions
Finished


In [66]:
required_predictions.head()

Unnamed: 0,homeTeam,awayTeam,dateTime,gameId,predictedDiff,predictedSum
0,CLE,NY,2016-10-25 19:30:00,9642,10.377105,215.376327
1,POR,UTA,2016-10-25 22:00:00,9643,2.715057,209.70301
2,GS,SA,2016-10-25 22:30:00,9644,3.898871,205.084536
3,IND,DAL,2016-10-26 19:00:00,9645,6.783833,212.57252
4,ORL,MIA,2016-10-26 19:00:00,9646,-3.975014,213.225348


In [74]:
def single_game_error(predictedDiff, predictedSum, actualDiff, actualSum):
    return abs(predictedSum - actualSum)
    return abs(predictedDiff - actualDiff) + abs(predictedSum - actualSum)

## This function adds new columns to the input dataframe (in place) corresponding to the score for each game 
## as well as returning the total score over the entire dataframe
def score_predictions(predictions):
    x1 = predictions['predictedDiff']
    x2 = predictions['predictedSum']
    y1 = predictions['pointsDiff']
    y2 = predictions['pointsSum']
    
    ## baseline model 
    x1_baseline = 0  ## no information about who will win
    x2_baseline = 200  ## avergae points total between 2009 and 2016 seasons
    
    predictions['error'] = single_game_error(x1, x2, y1, y2)
    predictions['baseline_error'] = single_game_error(x1_baseline, x2_baseline, y1, y2)
    
    predictions['score'] = predictions.eval('baseline_error - error')
    
    return predictions.score.sum()

In [83]:
test_data.tail()

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status,homeElo,awayElo,EloDifference,EloSum,predictedDiff
2455,12373,2018-04-11 20:00:00,ORL,WAS,9.0,193.0,7.0,240.0,42.0,101.0,6.0,26.0,29.0,21.0,25.0,1.0,240.0,50.0,92.0,5.0,23.0,22.0,30.0,17.0,2018,Final,-886.893412,-801.145153,-85.748259,-1688.038565,1.747585
2456,12374,2018-04-11 20:00:00,PHI,MIL,35.0,225.0,7.0,240.0,57.0,130.0,15.0,46.0,34.0,25.0,25.0,5.0,240.0,43.0,95.0,6.0,18.0,26.0,31.0,20.0,2018,Final,-1493.873358,-843.218576,-650.654782,-2337.091934,-1.383879
2457,12377,2018-04-11 22:30:00,POR,UTA,9.0,195.0,9.0,240.0,46.0,102.0,9.0,28.0,27.0,25.0,22.0,7.0,240.0,53.0,93.0,5.0,21.0,20.0,18.0,34.0,2018,Final,86.450794,-326.254515,412.705308,-239.803721,5.011238
2458,12376,2018-04-11 22:30:00,LAC,LAL,-15.0,215.0,1.0,240.0,41.0,100.0,8.0,32.0,21.0,21.0,26.0,6.0,240.0,49.0,115.0,8.0,39.0,25.0,28.0,23.0,2018,Final,27.540078,-620.297237,647.837315,-592.75716,5.958408
2459,12378,2018-04-11 22:30:00,SAC,HOU,13.0,179.0,3.0,240.0,48.0,96.0,6.0,28.0,24.0,27.0,17.0,6.0,240.0,40.0,83.0,2.0,22.0,22.0,19.0,20.0,2018,Final,-2193.178107,192.76759,-2385.945697,-2000.410518,-9.6708


In [76]:
tmp = pd.merge(required_predictions[['gameId', 'predictedDiff', 'predictedSum']], test_data[['gameId', 'pointsDiff', 'pointsSum']], on='gameId', how='left')
print('Total score across entire dataframe: ' + str(score_predictions(tmp)))

Total score across entire dataframe: 2073.3916300505393


In [77]:
plt.figure()
plt.plot(np.cumsum(tmp['score']))
plt.xlabel('game')
plt.title('cumulative score');

<IPython.core.display.Javascript object>

In [46]:
# Run a simulation
settings.predict = predict
simulation_result = libsimulation.runSimulation(settings)

Loading prediction matches starting from 2019-01-01
Starting call to user defined function
Loading training data
Getting Elo ratings over time on train data
Fitting linear model from Elo difference and sum to points difference
Fitting linear model from Elo difference and sum to points sum
Generating predictions
Finished
User defined function completed
Game 13029. Actual results: home 122 - away 116. Actual: sum 238 - diff 6. Predicted results: sum 208.32286030795518 - predictedDiff 5.064402016991545
Game 13030. Actual results: home 121 - away 98. Actual: sum 219 - diff 23. Predicted results: sum 211.24899107306746 - predictedDiff 2.695421927300327
Game 13031. Actual results: home 115 - away 108. Actual: sum 223 - diff 7. Predicted results: sum 216.36754912536117 - predictedDiff 8.689807526873517
Game 13032. Actual results: home 108 - away 113. Actual: sum 221 - diff -5. Predicted results: sum 213.141182613383 - predictedDiff -2.8713771240486863
Game 13033. Actual results: home 113 - aw

In [47]:
simulation_result.head(2)

Unnamed: 0,gameId,date,homeTeam,awayTeam,predictedSum,predictedDiff
0,13029,2019-01-01T19:30:00,TOR,UTA,208.32286,5.064402
1,13030,2019-01-01T20:00:00,MIL,DET,211.248991,2.695422
