In [174]:
import libsimulation
from src import main

import os, datetime, argparse, requests, urllib.parse, sys, re, traceback, json
import math, numpy as np
from matplotlib import pyplot as plt

%matplotlib inline
import pandas as pd
import statsmodels.api as sm
import seaborn as sns
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [4]:
# Initialize some settings
settings = libsimulation.SimulationSettings()
# This prevents you accidentally loading data beyond this point, and also defines the start of the simulation run period
settings.cutoff = '2019-01-01'

In [300]:
data_loader = libsimulation.NbaDataLoader(settings)

In [13]:
df = pd.concat((data_loader.getPlayers(year) for year in np.arange(2009, 2019).astype(str)), sort=False).reset_index(drop=True)

In [14]:
df.head()

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
0,6273,John Salmons,2008-10-29T20:00:00,SAC,2009,0,,,41,24,GF,5,0
1,6273,Jason Thompson,2008-10-29T20:00:00,SAC,2009,1,,,22,18,FC,10,0
2,6273,Kevin Martin,2008-10-29T20:00:00,SAC,2009,0,,,39,17,GF,6,2
3,6273,Spencer Hawes,2008-10-29T20:00:00,SAC,2009,6,,,33,12,FC,14,0
4,6273,Beno Udrih,2008-10-29T20:00:00,SAC,2009,0,,,35,10,PG,5,0


In [15]:
df.tail()

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
278255,12378,Frank Mason III,2018-04-11T22:30:00,SAC,2018,0,Heel,Out,0,0,PG,0,0
278256,12378,Iman Shumpert,2018-04-11T22:30:00,SAC,2018,0,Foot,Out,0,0,SG,0,0
278257,12378,Aaron Jackson,2018-04-11T22:30:00,HOU,2018,0,,,34,8,PF,3,0
278258,12378,Le'bryan Nash,2018-04-11T22:30:00,HOU,2018,0,,,0,0,PF,0,0
278259,12378,Tim Quarterman,2018-04-11T22:30:00,HOU,2018,0,,,0,0,SG,0,0


In [16]:
df.shape

(278260, 13)

In [19]:
for player in df.name.unique():
    print(player, df[df.name==player].team.unique())

John Salmons ['SAC' 'CHI' 'MIL' 'TOR' 'NO' 'PHO']
Jason Thompson ['SAC' 'GS' 'TOR']
Kevin Martin ['SAC' 'HOU' 'OKC' 'MIN' 'SA']
Spencer Hawes ['SAC' 'PHI' 'CLE' 'LAC' 'CHA' 'MIL']
Beno Udrih ['SAC' 'MIL' 'ORL' 'NY' 'MEM' 'MIA' 'DET']
Mikki Moore ['SAC' 'BOS' 'GS']
Bobby Jackson ['SAC']
Shelden Williams ['SAC' 'MIN' 'BOS' 'DEN' 'BKN']
Quincy Douby ['SAC' 'TOR']
Kenny Thomas ['SAC']
Al Jefferson ['MIN' 'UTA' 'CHA' 'IND']
Rashad McCants ['MIN' 'SAC']
Ryan Gomes ['MIN' 'LAC' 'LAL' 'OKC']
Randy Foye ['MIN' 'WAS' 'LAC' 'LAL' 'UTA' 'DEN' 'OKC' 'BKN']
Kevin Love ['MIN' 'CLE']
Mike Miller ['MIN' 'WAS' 'MIA' 'MEM' 'CLE' 'DEN']
Corey Brewer ['MIN' 'DEN' 'HOU' 'LAL' 'OKC']
Craig Smith ['MIN' 'LAC' 'LAL' 'POR']
Kevin Ollie ['MIN' 'OKC']
Rodney Carney ['MIN' 'PHI' 'GS' 'MEM']
Tyronn Lue ['MIL' 'ORL']
Michael Redd ['MIL' 'PHO']
Malik Allen ['MIL' 'DEN' 'ORL']
Richard Jefferson ['MIL' 'SA' 'GS' 'UTA' 'DAL' 'CLE' 'DEN']
Charlie Bell ['MIL' 'GS']
Dan Gadzuric ['MIL' 'GS' 'NY']
Luke Ridnour ['MIL' 'MIN' 

Chris Andersen ['DEN' 'MIA' 'MEM' 'CLE']
Nene Hilario ['DEN' 'WAS' 'HOU']
Dahntay Jones ['DEN' 'IND' 'DAL' 'ATL' 'LAC' 'CLE']
J.R. Smith ['DEN' 'NY' 'CLE']
Linas Kleiza ['DEN' 'TOR']
Renaldo Balkman ['DEN' 'NY']
Brevin Knight ['UTA']
Andrei Kirilenko ['UTA' 'MIN' 'BKN' 'PHI']
Mehmet Okur ['UTA' 'BKN']
Carlos Boozer ['UTA' 'CHI' 'LAL']
Kyle Korver ['UTA' 'CHI' 'ATL' 'CLE']
C.J. Miles ['UTA' 'CLE' 'IND' 'TOR']
Ronnie Price ['UTA' 'PHO' 'POR' 'ORL' 'LAL']
Ronnie Brewer ['UTA' 'CHI' 'NY' 'OKC' 'HOU']
Paul Millsap ['UTA' 'ATL' 'DEN']
Peja Stojakovic ['NO' 'TOR' 'DAL']
James Posey ['NO' 'IND']
Morris Peterson ['NO' 'OKC']
Tyson Chandler ['NO' 'CHA' 'DAL' 'NY' 'PHO']
Mike James ['NO' 'WAS' 'CHI' 'DAL' 'PHO']
Rasual Butler ['NO' 'LAC' 'LAL' 'TOR' 'IND' 'WAS' 'SA']
Devin Brown ['NO']
David West ['NO' 'IND' 'SA' 'GS']
Chris Paul ['NO' 'LAC' 'LAL' 'HOU']
Hilton Armstrong ['NO' 'WAS' 'ATL' 'GS']
Stephen Jackson ['GS' 'CHA' 'MIL' 'SA' 'LAC']
Al Harrington ['GS' 'NY' 'DEN' 'ORL' 'WAS']
Corey Maggett

Jodie Meeks ['MIL' 'PHI' 'LAL' 'DET' 'ORL' 'WAS']
DaJuan Summers ['DET' 'NO' 'LAC']
Taylor Griffin ['PHO' 'CHA']
Trey Gilder ['MEM']
Tyler Hansbrough ['IND' 'TOR' 'CHA']
Marcus Thornton ['NO' 'SAC' 'BKN' 'BOS' 'PHO' 'HOU' 'DET' 'WAS' 'CLE']
Dante Cunningham ['POR' 'CHA' 'MEM' 'MIN' 'NO' 'BKN']
Danny Green ['CLE' 'SA']
Earl Boykins ['WAS' 'MIL' 'HOU']
Jamaal Tinsley ['MEM' 'UTA']
Primoz Brezec ['PHI']
Chris Hunter ['GS']
Marcus Haislip ['SA']
Josh Childress ['PHO' 'BKN' 'NO']
Armon Johnson ['POR' 'BKN']
Ish Smith ['HOU' 'GS' 'ORL' 'MIL' 'PHO' 'OKC' 'PHI' 'NO' 'DET']
Derrick Caracter ['LAL']
Derrick Favors ['BKN' 'UTA']
Damion James ['BKN' 'SA']
Evan Turner ['PHI' 'IND' 'BOS' 'POR']
Landry Fields ['NY' 'TOR']
Timofey Mozgov ['NY' 'DEN' 'CLE' 'LAL' 'BKN']
Jordan Crawford ['ATL' 'WAS' 'BOS' 'GS' 'NO']
Xavier Henry ['MEM' 'NO' 'LAL']
Pooh Jeter ['SAC']
DeMarcus Cousins ['SAC' 'NO']
Hassan Whiteside ['SAC' 'MEM' 'MIA']
Nikola Pekovic ['MIN']
Wesley Johnson ['MIN' 'PHO' 'LAL' 'LAC']
Omer Asik

Quincy Miller ['DEN' 'SAC' 'DET']
Viacheslav Kravtsov ['DET' 'PHO']
Jarvis Varnado ['BOS' 'MIA' 'CHI' 'PHI']
Patrick Beverley ['HOU' 'LAC']
Mickael Gelabale ['MIN']
Aron Baynes ['SA' 'DET' 'BOS']
Fab Melo ['BOS']
Tim Ohlbrecht ['HOU']
Henry Sims ['NO' 'CLE' 'PHI' 'BKN']
Justin Holiday ['PHI' 'GS' 'ATL' 'CHI' 'NY']
Josh Akognon ['DAL']
Victor Oladipo ['ORL' 'OKC' 'IND']
Solomon Hill ['IND' 'NO']
Tony Snell ['CHI' 'MIL']
Erik Murphy ['CHI' 'UTA']
Reggie Bullock ['LAC' 'PHO' 'DET']
Ryan Kelly ['LAL' 'ATL']
Elias Harris ['LAL']
Mason Plumlee ['BKN' 'POR' 'DEN']
Anthony Bennett ['CLE' 'MIN' 'TOR' 'BKN']
Matthew Dellavedova ['CLE' 'MIL']
Carrick Felix ['CLE' 'WAS']
Sergey Karasev ['CLE' 'BKN']
Michael Carter-Williams ['PHI' 'MIL' 'CHI' 'CHA']
Brandon Davies ['PHI' 'BKN']
Hollis Thompson ['PHI' 'NO']
Nerlens Noel ['PHI' 'DAL']
Vitor Faverani ['BOS']
Kelly Olynyk ['BOS' 'MIA']
Phil Pressey ['BOS' 'PHI' 'PHO']
Dwight Buycks ['TOR' 'LAL' 'DET']
Glen Rice Jr. ['WAS']
Otto Porter ['WAS']
Kentaviou

Ivica Zubac ['LAL']
Danuel House ['WAS' 'PHO']
Sheldon Mcclellan ['WAS']
Daniel Ochefu ['WAS']
Tomas Satoransky ['WAS']
Deandre Bembry ['ATL']
Malcolm Delaney ['ATL']
Taurean Prince ['ATL']
Cristiano Da Silva Felicio ['CHI']
Denzel Livingston ['CHI']
Paul Zipser ['CHI']
Diamond Stone ['LAC']
Brice Johnson ['LAC' 'DET' 'MEM']
Yogi Ferrell ['BKN' 'DAL']
Jonathan Gibson ['DAL' 'BOS']
Derrick Jones ['PHO' 'MIA']
Pierre Jackson ['DAL']
Chasson Randle ['PHI' 'NY']
Okaro White ['MIA' 'ATL' 'CLE']
Denzel Valentine ['CHI']
Mike Tobey ['CHA']
Marcus Georges-hunt ['MIA' 'ORL']
Ben Bentil ['DAL']
Quinn Cook ['DAL' 'NO' 'GS']
Isaiah Taylor ['HOU' 'ATL']
David Nwaba ['LAL' 'CHI']
Shawn Long ['PHI']
Wayne Selden ['NO' 'MEM']
Jarrod Uthoff ['DAL']
Gary Payton ['MIL' 'LAL']
Alex Poythress ['PHI' 'IND']
Patricio Garino ['ORL']
Jayson Tatum ['BOS']
Abdel Nader ['BOS']
Semi Ojeleye ['BOS']
Daniel Theis ['BOS']
Kadeem Allen ['BOS']
Jabari Bird ['BOS']
Guerschon Yabusele ['BOS']
Cedi Osman ['CLE']
Ante Zizi

In [952]:
def get_multi_season_players_data(data_loader, first_year, last_year):
    res = pd.concat((data_loader.getPlayers(year) for year in np.arange(first_year, last_year + 1).astype(str)), sort=False).reset_index(drop=True)
    res['dateTime'] = pd.to_datetime(res['dateTime'])
    return res.sort_values('dateTime')

def get_multi_season_games_data(data_loader, first_year, last_year):
    res = pd.concat((data_loader.getSeason(year) for year in np.arange(first_year, last_year + 1).astype(str)), sort=False).reset_index(drop=True)
    res['dateTime'] = pd.to_datetime(res['dateTime'])
    return res.sort_values('dateTime')

def get_avg_over_time(players_data):
    return players_data[players_data.minutes!=0].set_index('dateTime').sort_index().groupby('name').points.rolling(5).mean()
#     return players_data.set_index('dateTime').sort_index().groupby('name').points.rolling(5).mean()

def get_last_game_players(team, game_id, games_data, data_loader):
    date = games_data[games_data['gameId'] == game_id].dateTime.values[0]
    last_game_id = games_data[((games_data['homeTeam'] == team) | (games_data['awayTeam'] == team)) & (games_data['dateTime'] < date)].gameId.iloc[-1]
    return data_loader.getGame(last_game_id).name

def predict_points(team, game_id, data_loader, avgs, games_data):
    date = games_data[games_data['gameId'] == game_id].dateTime.values[0]
    last_game_id = games_data[((games_data['homeTeam'] == team) | (games_data['awayTeam'] == team)) & (games_data['dateTime'] < date)].gameId.iloc[-1]
    last_players = data_loader.getGame(last_game_id).query('team == @team').name
    return avgs.loc[last_players].reset_index().query('dateTime < @date').drop_duplicates(subset='name', keep='last').points.sum()

def predict_points_2(team, game_id, data_loader, games_data, data):
    date = games_data[games_data['gameId'] == game_id].dateTime.values[0]
    last_game_id = games_data[((games_data['homeTeam'] == team) | (games_data['awayTeam'] == team)) & (games_data['dateTime'] < date)].gameId.iloc[-1]
    return data[(data.gameId == last_game_id) & (data.team == team)].predicted_points.sum()

def predict(required_predictions, data_loader, log=lambda x: print(x)):
    first_year = 2009
    if 'date' in required_predictions.columns:
        required_predictions['dateTime'] = pd.to_datetime(required_predictions['date'])
    log('Loading training data')
    players_data = get_multi_season_players_data(data_loader, first_year=first_year, last_year=2020)
#     players_data['points_pm'] = players_data['points'] / players_data['minutes'].clip(lower=1)
    games_data = get_multi_season_games_data(data_loader, first_year, last_year=2020).dropna()
    games_data = pd.concat((games_data, required_predictions), sort=False).sort_values('dateTime').dropna(subset=['gameId'])
    train_data = games_data[games_data.dateTime < required_predictions.iloc[0].dateTime].iloc[-1000:].copy()
    
    players_data = players_data.drop_duplicates(subset=['name', 'dateTime'])
#     features = pd.concat((players_data.set_index('dateTime').sort_index().groupby('name').points.rolling(5).mean(),
#                          players_data.set_index('dateTime').sort_index().groupby('name').minutes.rolling(5).mean()), axis=1)
#     features.columns = ['avg_points', 'avg_minutes']
#     df = pd.concat((players_data.set_index(['name', 'dateTime']), features), join='inner', axis=1).sort_index().reset_index()
#     df['next_points'] = df.points.shift(-1)
#     df = df.sort_values('dateTime').dropna(subset=['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes', 'next_points'])
#     train = df[df.gameId.isin(train_data.gameId)].copy()
#     x_train = train[['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes']]
#     y_train = train.next_points.values
#     test = df[df.gameId.isin(required_predictions.gameId)].copy()
#     x_test = test[['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes']]
#     y_test = test.next_points.values

#     gbr = GradientBoostingRegressor(n_estimators=300, max_depth=3, learning_rate=0.1)
#     gbr.fit(x_train, y_train)
#     y_pred = gbr.predict(x_test)
#     test['predicted_points'] = y_pred
#     train['predicted_points'] = gbr.predict(x_train)
    
#     log('Getting Elo ratings over time on train data')
    avgs = get_avg_over_time(players_data)
    
    log('Generating predictions on train data')
#     required_predictions = pd.DataFrame(required_predictions)
    train_data['homePoints'] = train_data.apply(lambda x: predict_points(x.homeTeam, x.gameId, data_loader, avgs, games_data), axis=1)
    train_data['awayPoints'] = train_data.apply(lambda x: predict_points(x.awayTeam, x.gameId, data_loader, avgs, games_data), axis=1)
    train_data['predictedDiff'] = train_data['homePoints'] - train_data['awayPoints']
    train_data['predictedSum'] = train_data['homePoints'] + train_data['awayPoints']
    
    log('Fitting linear model from train predictions to points difference')
    X = train_data[['predictedDiff', 'predictedSum']]
    X = sm.add_constant(X)
    y = train_data['pointsDiff']
    diff_model = sm.OLS(y, X).fit()
    
    log('Fitting linear model from train predictions to points sum')
    y = train_data['pointsSum']
    sum_model = sm.OLS(y, X).fit()
    
    log('Generating predictions')
#     required_predictions = pd.DataFrame(required_predictions)
    tmp = required_predictions[['homeTeam', 'awayTeam', 'gameId']].copy()
    tmp['homePoints'] = tmp.apply(lambda x: predict_points(x.homeTeam, x.gameId, data_loader, avgs, games_data), axis=1)
    tmp['awayPoints'] = tmp.apply(lambda x: predict_points(x.awayTeam, x.gameId, data_loader, avgs, games_data), axis=1)
    tmp['predictedDiff'] = tmp['homePoints'] - tmp['awayPoints']
    tmp['predictedSum'] = tmp['homePoints'] + tmp['awayPoints']
                               
    X = tmp[['predictedDiff', 'predictedSum']]
    X = sm.add_constant(X)
    tmp['adjPredictedDiff'] = diff_model.predict(X)
    tmp['adjPredictedSum'] = sum_model.predict(X)
    
    required_predictions['unadjPredictedDiff'] = tmp['predictedDiff']
    required_predictions['unadjPredictedSum'] = tmp['predictedSum']
    
    required_predictions['predictedDiff'] = tmp['adjPredictedDiff']
    required_predictions['predictedSum'] = tmp['adjPredictedSum']
    
    log('Finished')
    
#     return required_predictions.to_dict('records')
    return required_predictions

def predict_rolling_mean(required_predictions, data_loader, log=lambda x: print(x)):
    first_year = 2009
    if 'date' in required_predictions.columns:
        required_predictions['dateTime'] = pd.to_datetime(required_predictions['date'])
    log('Loading training data')
#     players_data = get_multi_season_players_data(data_loader, first_year=first_year, last_year=2020)
#     players_data['points_pm'] = players_data['points'] / players_data['minutes'].clip(lower=1)
    games_data = get_multi_season_games_data(data_loader, first_year, last_year=2020).dropna()
#     games_data = pd.concat((games_data, required_predictions), sort=False).sort_values('dateTime').dropna(subset=['gameId'])
#     Carful with this code, first game of every team is last game of preceding team bcs of the shift
    predicted_home = (games_data.set_index('gameId').groupby('homeTeam').homeScore.rolling(5).mean().shift().reset_index(level=0, drop=True))
    predicted_away = (games_data.set_index('gameId').groupby('awayTeam').awayScore.rolling(5).mean().shift().reset_index(level=0, drop=True))
    predictions = pd.concat((predicted_home, predicted_away), axis=1)
    
    required_predictions = pd.concat((required_predictions.set_index('gameId'), predictions), join='inner', axis=1).reset_index()
    required_predictions['predictedDiff'] = required_predictions['homeScore'] - required_predictions['awayScore']
    required_predictions['predictedSum'] = required_predictions['homeScore'] + required_predictions['awayScore']
    
    log('Finished')
    
    return required_predictions

def predict_adj_rolling_mean(required_predictions, data_loader, log=lambda x: print(x)):
    first_year = 2009
    if 'date' in required_predictions.columns:
        required_predictions['dateTime'] = pd.to_datetime(required_predictions['date'])
    log('Loading training data')
    games_data = get_multi_season_games_data(data_loader, first_year, last_year=2020).dropna()
    train_data = games_data[games_data.dateTime < required_predictions.iloc[0].dateTime].iloc[-1000:].copy()
    
    log('Generating predictions on train data')
    tmp = predict_rolling_mean(train_data[['homeTeam', 'awayTeam', 'dateTime', 'gameId', 'pointsDiff', 'pointsSum']], data_loader, log)
    
    log('Fitting linear model from train predictions to points difference')
    X = tmp[['predictedDiff', 'predictedSum']]
    X = sm.add_constant(X)
    y = tmp['pointsDiff']
    diff_model = sm.OLS(y, X).fit()
    
    log('Fitting linear model from train predictions to points sum')
    y = tmp['pointsSum']
    sum_model = sm.OLS(y, X).fit()
    
    log('Generating predictions')
    required_predictions = predict_rolling_mean(required_predictions, data_loader, log)
    
    X = required_predictions[['predictedDiff', 'predictedSum']]
    X = sm.add_constant(X)
    required_predictions.loc['adjPredictedDiff'] = diff_model.predict(X)
    required_predictions.loc['adjPredictedSum'] = sum_model.predict(X)
    
    required_predictions.loc['unadjPredictedDiff'] = required_predictions['predictedDiff']
    required_predictions.loc['unadjPredictedSum'] = required_predictions['predictedSum']
    
    required_predictions.loc['predictedDiff'] = required_predictions['adjPredictedDiff']
    required_predictions.loc['predictedSum'] = required_predictions['adjPredictedSum']
    
    return required_predictions

In [957]:
pred = predict(test_data[['homeTeam', 'awayTeam', 'dateTime', 'gameId', 'pointsDiff', 'pointsSum']][:-2], data_loader)

Loading training data
Generating predictions on train data
Fitting linear model from train predictions to points difference
Fitting linear model from train predictions to points sum
Generating predictions
Finished


In [798]:
test_data = get_multi_season_games_data(data_loader, 2018, 2018).dropna()
required_predictions = test_data[['homeTeam', 'awayTeam', 'dateTime', 'gameId', 'pointsDiff', 'pointsSum']]

In [673]:
test_data.tail()

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
1223,10862,2017-04-12 20:00:00,IND,ATL,18.0,190.0,7.0,240.0,41.0,104.0,5.0,22.0,28.0,34.0,20.0,4.0,240.0,37.0,86.0,5.0,15.0,28.0,25.0,18.0,2017,Final
1229,10871,2017-04-12 21:00:00,UTA,SA,4.0,198.0,5.0,240.0,36.0,101.0,10.0,32.0,21.0,29.0,19.0,2.0,240.0,42.0,97.0,12.0,23.0,23.0,26.0,25.0,2017,Final
1231,10869,2017-04-12 22:30:00,LAC,SAC,20.0,210.0,6.0,240.0,43.0,115.0,4.0,30.0,23.0,34.0,28.0,1.0,240.0,37.0,95.0,2.0,23.0,24.0,26.0,22.0,2017,Final
1230,10868,2017-04-12 22:30:00,GS,LAL,15.0,203.0,8.0,240.0,54.0,109.0,14.0,43.0,21.0,29.0,16.0,2.0,240.0,40.0,94.0,13.0,28.0,22.0,20.0,24.0,2017,Final
1232,10870,2017-04-12 22:30:00,POR,NO,-3.0,203.0,2.0,240.0,53.0,100.0,4.0,30.0,23.0,32.0,15.0,3.0,240.0,42.0,103.0,13.0,22.0,31.0,25.0,25.0,2017,Final


In [777]:
pred = predict_adj_rolling_mean(test_data[['homeTeam', 'awayTeam', 'dateTime', 'gameId', 'pointsDiff', 'pointsSum']], data_loader)

Loading training data
Generating predictions on train data
Loading training data
Finished
Fitting linear model from train predictions to points difference
Fitting linear model from train predictions to points sum
Generating predictions
Loading training data
Finished


In [1001]:
def single_game_error(predictedDiff, predictedSum, actualDiff, actualSum):
    return abs(predictedSum - actualSum)
    return abs(predictedDiff - actualDiff)
    return abs(predictedDiff - actualDiff) + abs(predictedSum - actualSum)

## This function adds new columns to the input dataframe (in place) corresponding to the score for each game 
## as well as returning the total score over the entire dataframe
def score_predictions(predictions):
    x1 = predictions['predictedDiff']
    x2 = predictions['predictedSum']
    y1 = predictions['pointsDiff']
    y2 = predictions['pointsSum']
    
    ## baseline model 
    x1_baseline = 0  ## no information about who will win
    x2_baseline = 200  ## avergae points total between 2009 and 2016 seasons
    
    predictions['error'] = single_game_error(x1, x2, y1, y2)
    predictions['baseline_error'] = single_game_error(x1_baseline, x2_baseline, y1, y2)
    
    predictions['score'] = predictions.eval('baseline_error - error')
    
    return predictions.score.sum()

In [None]:
pred['predictedDiffOld'] = pred['predictedDiff']

In [1000]:
pred['predictedSumOld'] = pred['predictedSum']

In [1027]:
pred['predictedDiff'] = pred['predictedDiffOld'] * 0.8
pred['predictedSum'] = pred['predictedSumOld'] * 1.

In [1028]:
pred.head()

Unnamed: 0,homeTeam,awayTeam,dateTime,gameId,pointsDiff,pointsSum,unadjPredictedDiff,unadjPredictedSum,predictedDiff,predictedSum,predictedDiffOld,predictedSumOld
0,CLE,BOS,2017-10-17 20:00:00,11148,3.0,201.0,27.8,270.6,2.653887,212.85573,3.317358,212.85573
1,GS,HOU,2017-10-17 22:30:00,11149,-1.0,243.0,-9.8,299.8,3.529327,213.531988,4.411658,213.531988
2,DET,CHA,2017-10-18 19:00:00,11150,12.0,192.0,12.6,267.8,2.678262,212.536237,3.347827,212.536237
3,IND,BKN,2017-10-18 19:00:00,11151,9.0,271.0,-15.2,269.2,2.869982,212.216628,3.587477,212.216628
4,ORL,MIA,2017-10-18 19:00:00,11152,7.0,225.0,-49.2,287.6,3.480997,212.503197,4.351247,212.503197


In [1029]:
pred.tail()

Unnamed: 0,homeTeam,awayTeam,dateTime,gameId,pointsDiff,pointsSum,unadjPredictedDiff,unadjPredictedSum,predictedDiff,predictedSum,predictedDiffOld,predictedSumOld
1222,MIA,TOR,2018-04-11 20:00:00,12371,7.0,225.0,3.8,257.4,2.494288,211.994876,3.11786,211.994876
1220,CHI,DET,2018-04-11 20:00:00,12369,-32.0,206.0,20.2,338.2,4.22297,215.497108,5.278713,215.497108
1219,BOS,BKN,2018-04-11 20:00:00,12367,13.0,207.0,20.6,339.0,4.238717,215.535002,5.298396,215.535002
1223,MIN,DEN,2018-04-11 20:00:00,12372,6.0,218.0,-9.4,263.8,2.714729,212.075948,3.393412,212.075948
1229,POR,UTA,2018-04-11 22:30:00,12377,9.0,195.0,-4.2,259.0,2.576471,211.951501,3.220589,211.951501


In [1030]:
tmp = pd.merge(pred[['gameId', 'predictedDiff', 'predictedSum']], test_data[['gameId', 'pointsDiff', 'pointsSum']], on='gameId', how='left')
# tmp['predictedSum'] = 200
# tmp['predictedDiff'] = 0
print('Total score across entire dataframe:', str(score_predictions(tmp)))
print('Average score:', str(score_predictions(tmp) / tmp.shape[0]))

Total score across entire dataframe: 3424.535457272023
Average score: 2.7887096557589763


In [1004]:
plt.figure()
plt.plot(np.cumsum(tmp['score']))
plt.xlabel('game')
plt.title('cumulative score');

<IPython.core.display.Javascript object>

In [610]:
%matplotlib notebook
plt.figure()
# (tmp['pointsDiff'] - tmp['predictedDiff']).plot()
# (tmp['pointsDiff'] - tmp['predictedDiff']).rolling(100, 1).mean().plot()
tmp.pointsSum.plot()
tmp.predictedSum.plot()
plt.xlabel('game')
plt.grid()
plt.title('cumulative score')

<IPython.core.display.Javascript object>

Text(0.5,1,'cumulative score')

In [464]:
tmp.head()

Unnamed: 0,gameId,predictedDiff,predictedSum,pointsDiff,pointsSum,error,baseline_error,score
0,9642,13.4,202.2,29.0,205.0,18.4,34.0,15.6
1,9643,17.0,206.2,9.0,217.0,18.8,26.0,7.2
2,9644,12.2,206.2,-29.0,229.0,64.0,58.0,-6.0
3,9645,18.2,205.4,9.0,251.0,54.8,60.0,5.2
4,9646,-1.6,212.0,-12.0,204.0,18.4,16.0,-2.4


In [465]:
tmp.tail()

Unnamed: 0,gameId,predictedDiff,predictedSum,pointsDiff,pointsSum,error,baseline_error,score
995,10638,4.29929,403.83135,28.0,230.0,197.532059,58.0,-139.532059
996,10639,8.365787,380.111686,5.0,235.0,148.477472,40.0,-108.477472
997,10640,-8.21139,402.140999,32.0,224.0,218.352389,56.0,-162.352389
998,10641,9.482475,406.356337,-18.0,226.0,207.838811,44.0,-163.838811
999,10642,10.55002,371.270032,6.0,168.0,207.820052,38.0,-169.820052


In [315]:
players_data = get_multi_season_players_data(data_loader, 2009, 2020)
games_data = get_multi_season_games_data(data_loader, 2009, 2020).dropna()

In [336]:
avgs = get_avg_over_time(players_data)

In [318]:
print(predict_points('NY', 9642, data_loader, avgs, games_data))
print(predict_points('CLE', 9642, data_loader, avgs, games_data))

118.24118196980916
142.8245574816147


In [670]:
test_data.tail(10)

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
2458,12375,2018-04-11 20:00:00,OKC,MEM,14.0,260.0,3.0,240.0,45.0,137.0,8.0,38.0,39.0,28.0,32.0,2.0,240.0,42.0,123.0,9.0,28.0,29.0,39.0,27.0,2018,Final
2457,12368,2018-04-11 20:00:00,NO,SA,24.0,220.0,5.0,240.0,51.0,122.0,11.0,27.0,34.0,29.0,32.0,1.0,240.0,38.0,98.0,9.0,26.0,17.0,29.0,26.0,2018,Final
2454,12370,2018-04-11 20:00:00,CLE,NY,-12.0,208.0,3.0,240.0,48.0,98.0,9.0,20.0,17.0,34.0,27.0,3.0,240.0,39.0,110.0,10.0,20.0,37.0,27.0,26.0,2018,Final
2455,12371,2018-04-11 20:00:00,MIA,TOR,7.0,225.0,7.0,265.0,56.0,116.0,8.0,27.0,20.0,24.0,34.0,10.0,265.0,47.0,109.0,5.0,29.0,24.0,25.0,27.0,2018,F/OT
2453,12369,2018-04-11 20:00:00,CHI,DET,-32.0,206.0,1.0,240.0,40.0,87.0,8.0,21.0,29.0,24.0,13.0,4.0,240.0,47.0,119.0,12.0,35.0,31.0,28.0,25.0,2018,Final
2452,12367,2018-04-11 20:00:00,BOS,BKN,13.0,207.0,6.0,240.0,62.0,110.0,6.0,27.0,26.0,30.0,27.0,6.0,240.0,44.0,97.0,10.0,22.0,21.0,27.0,27.0,2018,Final
2456,12372,2018-04-11 20:00:00,MIN,DEN,6.0,218.0,2.0,265.0,40.0,112.0,10.0,29.0,33.0,24.0,15.0,4.0,265.0,47.0,106.0,4.0,26.0,28.0,27.0,20.0,2018,F/OT
2462,12377,2018-04-11 22:30:00,POR,UTA,9.0,195.0,9.0,240.0,46.0,102.0,9.0,28.0,27.0,25.0,22.0,7.0,240.0,53.0,93.0,5.0,21.0,20.0,18.0,34.0,2018,Final
2461,12376,2018-04-11 22:30:00,LAC,LAL,-15.0,215.0,1.0,240.0,41.0,100.0,8.0,32.0,21.0,21.0,26.0,6.0,240.0,49.0,115.0,8.0,39.0,25.0,28.0,23.0,2018,Final
2463,12378,2018-04-11 22:30:00,SAC,HOU,13.0,179.0,3.0,240.0,48.0,96.0,6.0,28.0,24.0,27.0,17.0,6.0,240.0,40.0,83.0,2.0,22.0,22.0,19.0,20.0,2018,Final


In [326]:
tmp.head(10)

Unnamed: 0,gameId,predictedDiff,predictedSum,pointsDiff,pointsSum,error,baseline_error,score
0,9642,24.583376,261.065739,29.0,205.0,60.482364,34.0,-26.482364
1,9643,0.530516,187.969655,9.0,217.0,37.499829,26.0,-11.499829
2,9644,-34.101742,265.812091,-29.0,229.0,41.913833,58.0,16.086167
3,9645,-23.791744,244.034854,9.0,251.0,39.75689,60.0,20.24311
4,9646,-43.221796,251.229321,-12.0,204.0,78.451117,16.0,-62.451117
5,9647,-0.474005,185.869522,5.0,239.0,58.604483,44.0,-14.604483
6,9648,12.349054,201.751558,18.0,200.0,7.402504,18.0,10.597496
7,9651,-39.382541,209.782487,-6.0,200.0,43.165027,6.0,-37.165027
8,9649,12.470436,240.255336,-5.0,209.0,48.725772,14.0,-34.725772
9,9650,-8.714829,224.421659,-11.0,203.0,23.70683,14.0,-9.70683


In [354]:
players_data.head()

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
162,6265,Zydrunas Ilgauskas,2008-10-28 20:00:00,CLE,2009,0,,,30,15,C,8,0
179,6265,Glen Davis,2008-10-28 20:00:00,BOS,2009,0,,,15,2,FC,4,1
178,6265,Leon Powe,2008-10-28 20:00:00,BOS,2009,0,,,23,13,FC,2,0
177,6265,Rajon Rondo,2008-10-28 20:00:00,BOS,2009,0,,,29,14,PG,5,3
176,6265,Tony Allen,2008-10-28 20:00:00,BOS,2009,0,,,17,11,GF,3,1


In [357]:
players_data[players_data.minutes==0]

Unnamed: 0,gameId,name,dateTime,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
662,6297,Rodney Carney,2008-11-01 20:00:00,MIN,2009,0,,,0,0,GF,0,0
947,6311,Kosta Koufos,2008-11-03 22:30:00,UTA,2009,0,,,0,0,FC,0,0
945,6311,Morris Almond,2008-11-03 22:30:00,UTA,2009,0,,,0,0,GF,0,0
946,6311,Kyrylo Fesenko,2008-11-03 22:30:00,UTA,2009,0,,,0,0,C,0,0
1139,6322,Kevin Ollie,2008-11-05 20:00:00,MIN,2009,0,,,0,0,PG,0,0
1291,6327,Jason Hart,2008-11-05 22:30:00,LAC,2009,0,,,0,0,PG,0,0
1469,6336,Matt Bonner,2008-11-07 20:00:00,SA,2009,0,,,0,0,FC,0,0
1568,6340,J.J. Barea,2008-11-07 22:30:00,DAL,2009,0,,,0,0,G,0,0
1567,6340,James Singleton,2008-11-07 22:30:00,DAL,2009,0,,,0,0,F,0,0
1747,6349,Hassan Adams,2008-11-09 13:00:00,TOR,2009,0,,,0,0,PG,0,0


In [451]:
games_data.tail()

Unnamed: 0,gameId,dateTime,homeTeam,awayTeam,pointsDiff,pointsSum,homeBlocks,homeMinutes,homeRebounds,homeScore,homeSteals,quarter0home,quarter1home,quarter2home,quarter3home,awayBlocks,awayMinutes,awayRebounds,awayScore,awaySteals,quarter0away,quarter1away,quarter2away,quarter3away,season,status
11231,13024,2018-12-31 19:00:00,HOU,MEM,12.0,214.0,4.0,240.0,46.0,113.0,5.0,27.0,34.0,25.0,27.0,4.0,240.0,39.0,101.0,11.0,18.0,20.0,35.0,28.0,2019,Final
11232,13025,2018-12-31 19:00:00,SA,BOS,9.0,231.0,4.0,240.0,43.0,120.0,6.0,17.0,29.0,46.0,28.0,8.0,240.0,40.0,111.0,5.0,23.0,29.0,30.0,29.0,2019,Final
11234,13027,2018-12-31 20:00:00,OKC,DAL,20.0,224.0,3.0,240.0,46.0,122.0,13.0,32.0,28.0,32.0,30.0,3.0,240.0,49.0,102.0,7.0,21.0,22.0,32.0,27.0,2019,Final
11233,13026,2018-12-31 20:00:00,NO,MIN,9.0,237.0,2.0,240.0,44.0,123.0,3.0,27.0,36.0,25.0,35.0,9.0,240.0,39.0,114.0,10.0,22.0,29.0,38.0,25.0,2019,Final
11235,13028,2018-12-31 21:00:00,PHO,GS,-23.0,241.0,3.0,240.0,38.0,109.0,7.0,27.0,25.0,35.0,22.0,8.0,240.0,44.0,132.0,7.0,31.0,38.0,39.0,24.0,2019,Final


In [445]:
games_data.homeTeam.nunique()

30

In [452]:
30*29 * 10

8700

In [447]:
games_data.shape

(11124, 26)

In [690]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error

  from numpy.core.umath_tests import inner1d


In [706]:
players_data = players_data.drop_duplicates(subset=['name', 'dateTime'])

features = pd.concat((players_data.set_index('dateTime').sort_index().groupby('name').points.rolling(5).mean(),
                     players_data.set_index('dateTime').sort_index().groupby('name').minutes.rolling(5).mean()), axis=1)
features.columns = ['avg_points', 'avg_minutes']

df = pd.concat((players_data.set_index(['name', 'dateTime']), features), join='inner', axis=1).sort_index().reset_index()

df['next_points'] = df.points.shift(-1)

In [729]:
df.head()

Unnamed: 0,name,dateTime,gameId,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals,avg_points,avg_minutes,next_points
0,A.J. Hammons,2017-10-18 19:00:00,11152,MIA,2018,0,Illness,Questionable,0,0,C,0,0,,,0
1,A.J. Hammons,2017-10-21 20:00:00,11178,MIA,2018,0,Illness,Questionable,0,0,C,0,0,,,0
2,A.J. Hammons,2017-10-23 19:30:00,11189,MIA,2018,0,,,0,0,C,0,0,,,0
3,A.J. Hammons,2017-10-25 20:00:00,11206,MIA,2018,0,,,0,0,C,0,0,,,0
4,A.J. Hammons,2017-10-28 20:00:00,11226,MIA,2018,0,,,0,0,C,0,0,0.0,0.0,0


In [748]:
df = df.sort_values('dateTime').dropna(subset=['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes', 'next_points'])
train = df.iloc[:25000].copy()
x_train = train[['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes']]
y_train = train.next_points.values
test = df.iloc[25001:].copy()
x_test = test[['minutes', 'points', 'rebounds', 'steals', 'avg_points', 'avg_minutes']]
y_test = test.next_points.values

gbr = GradientBoostingRegressor(n_estimators=300, max_depth=3, learning_rate=0.1)
gbr.fit(x_train, y_train)
y_pred = gbr.predict(x_test)

In [755]:
mean_squared_error(y_test, y_pred)

31.715456061087227

In [756]:
gbr.feature_importances_

array([0.16746723, 0.15201295, 0.11434272, 0.07651333, 0.29047022,
       0.19919356])

In [757]:
test['pred'] = y_pred

In [759]:
test[test.name=='Dirk Nowitzki']

Unnamed: 0,name,dateTime,gameId,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals,avg_points,avg_minutes,next_points,pred
77489,Dirk Nowitzki,2009-11-13 20:00:00,6185,DAL,2010,3,,,28,20,PF,11,0,28.2,35.4,25,26.009116
77490,Dirk Nowitzki,2009-11-15 18:00:00,6199,DAL,2010,2,,,38,25,PF,6,1,25.2,35.2,32,22.337047
77491,Dirk Nowitzki,2009-11-16 20:00:00,6205,DAL,2010,1,,,42,32,PF,11,2,25.8,36.4,41,24.096964
77492,Dirk Nowitzki,2009-11-18 21:30:00,6224,DAL,2010,1,,,44,41,PF,12,0,29.4,38.6,20,32.133321
77493,Dirk Nowitzki,2009-11-20 20:30:00,6235,DAL,2010,2,,,38,20,PF,5,1,27.6,38.0,25,22.912421
77494,Dirk Nowitzki,2009-11-25 20:30:00,6251,DAL,2010,2,,,36,25,PF,8,2,28.6,39.6,31,26.057887
77495,Dirk Nowitzki,2009-11-27 20:00:00,6263,DAL,2010,2,,,30,31,PF,4,1,29.8,38.0,28,23.233825
77496,Dirk Nowitzki,2010-10-27 20:30:00,5221,DAL,2011,1,,,36,28,FC,13,0,29.0,36.8,27,27.964225
77497,Dirk Nowitzki,2010-10-29 20:30:00,5237,DAL,2011,0,,,37,27,FC,7,0,26.2,35.4,16,22.747766
77498,Dirk Nowitzki,2010-10-31 15:30:00,5250,DAL,2011,3,,,30,16,FC,7,0,25.4,33.8,35,22.199548


In [938]:
plt.figure()
players_data[(players_data.team == 'SA') & (players_data.name == 'Tony Parker')].set_index('dateTime').minutes.plot()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2033610a390>

In [943]:
players_data.set_index('dateTime').query('name == "Tony Parker"').loc['2014-12':]

Unnamed: 0_level_0,gameId,name,team,season,blocks,injuryBodyPart,injuryStatus,minutes,points,position,rebounds,steals
dateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2014-12-01 19:00:00,305,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-03 19:30:00,323,Tony Parker,SA,2015,0,,,29,9,G,1,0
2014-12-05 20:00:00,339,Tony Parker,SA,2015,1,,,22,14,G,1,0
2014-12-06 20:30:00,347,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-09 21:00:00,370,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-10 20:30:00,379,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-12 21:30:00,395,Tony Parker,SA,2015,0,,,17,5,G,0,1
2014-12-14 20:00:00,410,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-15 22:00:00,417,Tony Parker,SA,2015,0,,,0,0,PG,0,0
2014-12-17 20:30:00,431,Tony Parker,SA,2015,0,,,0,0,PG,0,0


In [945]:
players_data[players_data.minutes!=0].set_index('dateTime').sort_index().groupby('name').points.rolling(5).mean()

name                dateTime           
A.J. Price          2009-10-30 20:00:00     NaN
                    2009-11-03 19:00:00     NaN
                    2009-11-06 19:00:00     NaN
                    2009-11-11 19:00:00     NaN
                    2010-11-16 19:00:00     3.4
                    2010-11-18 19:00:00     6.2
                    2010-11-20 19:00:00     6.4
                    2010-12-06 19:00:00     5.2
                    2010-12-11 19:00:00     5.6
                    2010-12-29 19:00:00     4.8
                    2011-01-08 19:00:00     2.6
                    2011-01-19 22:30:00     3.2
                    2011-01-22 22:00:00     4.6
                    2011-01-23 20:00:00     5.2
                    2011-01-26 19:00:00     5.4
                    2011-01-28 19:00:00     6.4
                    2011-01-29 20:00:00     7.4
                    2011-01-31 19:00:00     7.6
                    2011-02-02 19:00:00     7.4
                    2011-02-06 12:00:00     7.2
