In [None]:
pip install espn_api

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
pip install ordered_set

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting ordered_set
  Downloading ordered_set-4.1.0-py3-none-any.whl (7.6 kB)
Installing collected packages: ordered-set
Successfully installed ordered-set-4.1.0


In [None]:
# Basketball API
from espn_api.basketball import League
from espn_api.basketball import Player
import pandas as pd
import numpy as np
import time
import datetime as dt
import math
import plotly
import plotly.graph_objects as go
from ordered_set import OrderedSet as oset

# Init
# The league is private, so ESPN needs the `espn_s2` and `SWID` session cookies.
# They are read from the environment (ESPN_S2 / ESPN_SWID) so that live session
# cookies are never stored in the notebook; a missing variable raises KeyError.
# NOTE(review): the cookie values previously committed here should be treated as
# leaked and invalidated (log out of ESPN to rotate them).
import os

LEAGUE_ID = 1483720569
ESPN_S2 = os.environ['ESPN_S2']
ESPN_SWID = os.environ['ESPN_SWID']

# One League object per season; both use the same league id and credentials.
league2022 = League(league_id=LEAGUE_ID, year=2022, espn_s2=ESPN_S2, swid=ESPN_SWID)
league2023 = League(league_id=LEAGUE_ID, year=2023, espn_s2=ESPN_S2, swid=ESPN_SWID, fetch_league=True)

In [None]:
#Preview a handful of free agents instead of rendering all 1000 players in the cell output.
league2023.free_agents(size=10)

# Load Player Data

In [None]:
#A complete player list needs every team roster plus the free-agent pool.

#Free agents for the previous (2022) and the current (2023) season, fetched in that order.
freeagents2022, freeagents2023 = [
    seasonLeague.free_agents(size=1000) for seasonLeague in (league2022, league2023)
]

#Function to get all players that are on teams
def getPlayersOnRosters(teams):
  """Return one flat list containing every player on every team's roster.

  teams: iterable of objects exposing a `roster` iterable of players.

  Returns a new list ordered by team, then by position within each roster.
  An empty `teams` yields an empty list. The result is always a fresh list,
  so it never aliases a team's own `roster` (the previous implementation
  returned `teams[0].roster` itself when there was only one team).
  """
  return [player for team in teams for player in team.roster]

#One list per season: rostered players first, followed by that season's free agents.
players2022 = getPlayersOnRosters(league2022.teams) + freeagents2022
players2023 = getPlayersOnRosters(league2023.teams) + freeagents2023

#Function that will create a MultiIndex dataframe of stats for each player
def createPlayerDataFrame(player):
  """Build a stats table for one player, indexed by (player name, stat key).

  `player.stats` is transposed so that each of its top-level keys becomes a row.
  The first index level is named 'name'; the second level keeps whatever name
  the transposed index had, so it stays unnamed when the stats keys carry no
  name. Other cells address that level as 'ilevel_1'.
  """
  perPeriod = pd.DataFrame(player.stats).T
  return perPeriod.assign(name=player.name).set_index(['name', perPeriod.index])

#Function that will step through the list of players, create their stat dataframe, and concat all dataframes together
def createAllPlayersDataFrame(players):
  """Stack the per-player stat tables of `players` into one DataFrame.

  players: iterable of player objects accepted by createPlayerDataFrame.

  Returns the row-wise concatenation of every player's table, in input order.
  An empty `players` returns an empty DataFrame instead of raising.
  """
  #Build every frame first and concatenate once; concatenating inside the loop
  #re-copies the accumulated frame on each iteration.
  frames = [createPlayerDataFrame(player) for player in players]
  if not frames:
    return pd.DataFrame()
  return pd.concat(frames)

#Per-season stat tables, one row per (player name, stat key).
playerStats2022, playerStats2023 = [
    createAllPlayersDataFrame(seasonPlayers) for seasonPlayers in (players2022, players2023)
]

#Outer-join the two seasons on the shared index so a player missing from either
#season is kept; overlapping column names get the suffix of their season.
playerStats22_23 = playerStats2022.join(
    playerStats2023,
    how='outer',
    lsuffix='_22',
    rsuffix='_23',
)

#The player data for the 2022 and 2023 seasons are all located in one dataframe.


#Helper functions for calculating regression lines
#Calculate Sxx, Syy, and Sxy
def calcSxx(x):
  """Return sum(x^2) - (sum(x))^2 / n, the sum of squared deviations of x."""
  n = len(x)
  sumOfSquares = np.sum(x**2)
  squareOfSum = np.sum(x)**2
  return sumOfSquares - (squareOfSum / n)
def calcSyy(y):
  """Return sum(y^2) - (sum(y))^2 / n, the sum of squared deviations of y."""
  n = len(y)
  sumOfSquares = np.sum(y**2)
  squareOfSum = np.sum(y)**2
  return sumOfSquares - (squareOfSum / n)
def calcSxy(x,y):
  """Return sum(x*y) - sum(x)*sum(y) / n, with n taken from len(x)."""
  n = len(x)
  sumOfProducts = np.sum(x*y)
  productOfSums = np.sum(x)*np.sum(y)
  return sumOfProducts - (productOfSums / n)

#Calculate the regression line
def calcLinearRegression(x,y):
  """Fit the simple least-squares line y = beta0 + beta1 * x.

  Returns a 3-tuple (beta0hat, beta1hat, Rsquared): the intercept, the slope
  Sxy / Sxx, and the coefficient of determination Sxy^2 / (Sxx * Syy).
  """
  xMean = np.sum(x) / len(x)
  yMean = np.sum(y) / len(y)
  crossDeviation = calcSxy(x,y)
  xDeviation = calcSxx(x)
  yDeviation = calcSyy(y)
  slope = crossDeviation / xDeviation
  intercept = yMean - (slope * xMean)
  return intercept, slope, crossDeviation**2 / (xDeviation*yDeviation)

# ESPN Projection Analysis
Let's look at how well the ESPN projections perform. Let's use linear regression to determine how correlated the projections are with actual stats. Since the 2022 season has already happened, we will use stats from that year.

Let's start by looking at projected and actual score totals for the 2022 season.


## Totals

In [None]:
#Season-total fantasy score per player for 2022: ESPN's projection and the actual result.
#'ilevel_1' addresses the unnamed second index level (the stat key); it is dropped
#afterwards so both frames are indexed by player name only.
playersProjectedTotal22 = (
    playerStats22_23.query("ilevel_1 == '2022_projected'")['applied_total_22']
    .droplevel(1)
    .to_frame()
)
playersActualTotal22 = (
    playerStats22_23.query("ilevel_1 == '2022'")['applied_total_22']
    .droplevel(1)
    .to_frame()
)

#Join the Projected and Actual into one DataFrame and keep only players that have BOTH values.
#A NaN actual must be dropped as well as a NaN projection: np.sum on a pandas Series skips NaN
#while len() still counts the row, so a half-empty row would silently bias the regression sums.
playersTotal22 = playersActualTotal22.join(playersProjectedTotal22, how='outer', lsuffix='_Actual', rsuffix='_Projected')
playersTotal22 = playersTotal22.dropna(subset=['applied_total_22_Projected', 'applied_total_22_Actual'])

#Plot Actual Scores vs Projected Scores
fig = go.Figure()
fig.add_trace(go.Scatter(
    x=playersTotal22['applied_total_22_Projected'],
    y=playersTotal22['applied_total_22_Actual'],
    mode='markers'
))
fig.update_layout(
    title="Actual vs Projected 2022 Total Fantasy Score",
    xaxis_title="2022 Projected Score",
    yaxis_title="2022 Actual Score",
)
fig.show()

#calcSxx, calcSyy, calcSxy and calcLinearRegression are already defined in the player-data
#cell earlier in this notebook; that single definition is reused here rather than
#redefining identical copies that would silently shadow it.

#Regress actual totals on projected totals; regressionInfo is (intercept, slope, R^2).
regressionInfo = calcLinearRegression(
    playersTotal22['applied_total_22_Projected'],
    playersTotal22['applied_total_22_Actual'],
)
print("Regression Line: y = {0} + {1}x".format(*regressionInfo[:2]))
print("Rsquared: ", regressionInfo[2])
print("R: ",np.sqrt(regressionInfo[2]))
print("R: ",np.sqrt(regressionInfo[2]))

Regression Line: y = -12.159220452601403 + 0.7552869827223359x
Rsquared:  0.4716491117390445
R:  0.6867671452093821


We have our equation for the regression line, which allows us to predict what a player's actual score will be given the ESPN prediction, and a scatter plot of actual score vs projected score. We should put the line onto this scatter plot. 

An additional note is that we can see there are a few projected scores that have an actual score of 0. This is most likely due to players who sustained season-ending injuries right at the start of the season, probably in training camp or preseason. It would be unfair to the ESPN model if we included these datapoints, as it's impossible to predict when a player will get a season-ending injury. So, let's remove those datapoints, recalculate and add the regression line and see what we get.

In [None]:
#Get rid of rows where actual score is 0
playersTotal22 = playersTotal22[playersTotal22['applied_total_22_Actual'] != 0]

#Recalculate new regression line; regressionInfo is (intercept, slope, R^2)
regressionInfo = calcLinearRegression(playersTotal22.loc[:,'applied_total_22_Projected'], playersTotal22.loc[:,'applied_total_22_Actual'])
print("Regression Line: y = {0} + {1}x".format(regressionInfo[0],regressionInfo[1]))
print("Rsquared: ", regressionInfo[2])
print("R: ",np.sqrt(regressionInfo[2]))

#Evenly spaced x values to draw the regression line over.
#(retstep is not requested: it would make linspace return an (array, step) tuple.)
fakeX = np.linspace(0, 5000, 5000)

#y values come from the coefficients fitted above, so the drawn line always matches
#the printed equation instead of numbers pasted from an earlier run.
genY = regressionInfo[0] + regressionInfo[1] * fakeX

#add the trace to the plot.
fig2 = go.Figure()
fig2.add_trace(go.Scatter(
    x=playersTotal22.loc[:,'applied_total_22_Projected'],
    y=playersTotal22.loc[:,'applied_total_22_Actual'],
    mode='markers',
    name='Players'
))
fig2.add_trace(go.Scatter(
    x=fakeX,
    y=genY,
    mode='lines',
    name='Regression line'
))
fig2.update_layout(
    title="Actual vs Projected 2022 Total Fantasy Score",
    xaxis_title="2022 Projected Score",
    yaxis_title="2022 Actual Score",
)
fig2.show()

Regression Line: y = 38.70832592354918 + 0.7618421361477875x
Rsquared:  0.5161416188698928
R:  0.7184299679647925


So, removing the datapoints that have an actual score of 0 increases the R value, or the correlation coefficient, to 72%. Not bad! Turns out ESPN is pretty good at predicting the total. However, we can dive deeper.

I'm interested in knowing if ESPN is actually really good at predicting a player's averages over the season, and not so great at predicting how many games they play. Therefore, let's switch our focus from player totals to player averages. We will follow the same procedure, but for averages.

## Averages

#### ESPN Projections

In [None]:
#Season-average fantasy score per player for 2022: ESPN's projection and the actual result.
#'ilevel_1' addresses the unnamed second index level (the stat key); it is dropped
#afterwards so both frames are indexed by player name only.
playersProjectedAvg22 = (
    playerStats22_23.query("ilevel_1 == '2022_projected'")['applied_avg_22']
    .droplevel(1)
    .to_frame()
)
playersActualAvg22 = (
    playerStats22_23.query("ilevel_1 == '2022'")['applied_avg_22']
    .droplevel(1)
    .to_frame()
)

#Join the Projected and Actual into one DataFrame and keep only players that have BOTH values.
#A NaN actual must be dropped as well as a NaN projection: np.sum on a pandas Series skips NaN
#while len() still counts the row, so a half-empty row would silently bias the regression sums.
playersAvg22 = playersActualAvg22.join(playersProjectedAvg22, how='outer', lsuffix='_Actual', rsuffix='_Projected')
playersAvg22 = playersAvg22.dropna(subset=['applied_avg_22_Projected', 'applied_avg_22_Actual'])

#Again, let's drop rows that have an actual score of 0.
playersAvg22 = playersAvg22[playersAvg22['applied_avg_22_Actual'] != 0]

#Calculate regression line; regressionInfo_avg is (intercept, slope, R^2)
regressionInfo_avg = calcLinearRegression(playersAvg22.loc[:,'applied_avg_22_Projected'], playersAvg22.loc[:,'applied_avg_22_Actual'])
print("Regression Line: y = {0} + {1}x".format(regressionInfo_avg[0],regressionInfo_avg[1]))
print("Rsquared: ", regressionInfo_avg[2])
print("R: ",np.sqrt(regressionInfo_avg[2]))

#Evenly spaced x values to draw the regression line over.
#(retstep is not requested: it would make linspace return an (array, step) tuple.)
fakeX_avg = np.linspace(0, 60, 120)

#y values of the fitted line at those x positions
genY_avg = regressionInfo_avg[0] + regressionInfo_avg[1] * fakeX_avg

#Plot it all
fig3 = go.Figure()
fig3.add_trace(go.Scatter(
    x=playersAvg22.loc[:,'applied_avg_22_Projected'],
    y=playersAvg22.loc[:,'applied_avg_22_Actual'],
    mode='markers',
    name='Players'
))
fig3.add_trace(go.Scatter(
    x=fakeX_avg,
    y=genY_avg,
    mode='lines',
    name='Regression line'
))
fig3.update_layout(
    title="Actual vs Projected 2022 Average Fantasy Score, ESPN Projections",
    xaxis_title="2022 Projected Score",
    yaxis_title="2022 Actual Score",
)
fig3.show()

Regression Line: y = 0.502485703204929 + 0.915872295589305x
Rsquared:  0.7311619790977633
R:  0.8550801009833893


Wow! An 86% correlation! So, ESPN is even better at predicting the averages of a player. This leads me to believe they are not great at predicting how many games a player will play during the whole season. 

Now that we've looked at how well ESPN's projections perform, let's analyze another projection system: Basketball Reference's SPS.

#### SPS Projections

In [None]:
#SPS projections for 2022-23, downloaded as csv from basketball-reference.com.
#NOTE(review): the path is under /content/drive, so Google Drive presumably has to be
#mounted before this cell runs - no mount step is visible in this notebook; confirm.
spsPath_2023 = "/content/drive/MyDrive/Personal Projects/NBA_Fantasy/SPS_2022-23.txt"
dfSPS = pd.read_csv(spsPath_2023)

In [None]:
#Function that calculates our league's FPTS based on the categories.
def calcFPTS(row, weights=None):
  """Return the fantasy points for one stat line.

  row: mapping-like object (dict, pandas Series/row) indexed by stat category.
  weights: optional mapping of category -> points per unit. When omitted, this
    league's scoring is used: FG 2, FGA -1, 3P 1, FT 1, FTA -1, TRB 1, AST 2,
    STL 4, BLK 4, TOV -2, PTS 1.

  Returns the sum of weight * row[category] over every category in `weights`.
  Raises KeyError (from the lookup) if `row` lacks a weighted category.
  """
  if weights is None:
    #Default league scoring; terms are added in the same order as the original formula.
    weights = {
        'FG': 2, 'FGA': -1, '3P': 1, 'FT': 1, 'FTA': -1, 'TRB': 1,
        'AST': 2, 'STL': 4, 'BLK': 4, 'TOV': -2, 'PTS': 1,
    }
  return sum(weight * row[category] for category, weight in weights.items())

#Score every projected stat line with the league's rules and store it as the FPTS column.
fptsProjected = dfSPS.apply(calcFPTS, axis=1)
dfSPS['FPTS'] = fptsProjected

#Show the projections ranked from highest to lowest FPTS.
dfSPS.sort_values(by="FPTS", ascending=False)


Unnamed: 0,Rk,Player,Type,FG,FGA,3P,3PA,FT,FTA,ORB,...,STL,BLK,TOV,PF,PTS,FG%,3P%,FT%,-9999,FPTS
6,7,Nikola Jokić,Projected,10.6,18.5,1.4,4.0,5.1,6.2,2.9,...,1.5,0.8,3.7,2.8,27.7,0.572,0.350,0.827,jokicni01,62.3
0,1,Giannis Antetokounmpo,Projected,11.1,20.1,1.2,4.1,8.1,11.5,2.1,...,1.2,1.4,3.6,3.3,31.6,0.555,0.301,0.706,antetgi01,59.8
1,2,Joel Embiid,Projected,10.1,20.3,1.4,3.9,9.9,12.0,2.4,...,1.2,1.5,3.4,2.9,31.6,0.500,0.368,0.824,embiijo01,55.3
2,3,Luka Dončić,Projected,10.2,21.4,3.1,8.7,5.8,7.7,1.0,...,1.1,0.6,4.3,2.3,29.2,0.474,0.358,0.750,doncilu01,54.6
8,9,LeBron James,Projected,10.3,20.3,2.6,7.3,4.2,5.7,1.0,...,1.2,0.8,3.6,2.1,27.2,0.506,0.350,0.727,jamesle01,50.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599,600,Solomon Hill,Projected,3.0,8.0,1.6,5.0,1.0,1.4,1.1,...,1.2,0.4,1.2,3.0,8.6,0.371,0.319,0.762,hillso01,22.0
595,596,Tony Snell,Projected,3.4,7.8,2.1,5.1,0.5,0.6,0.7,...,0.7,0.4,0.9,2.8,9.5,0.442,0.405,0.888,snellto01,21.1
591,592,Rodney Hood,Projected,3.7,9.4,1.5,4.6,1.1,1.3,0.9,...,0.9,0.3,1.0,2.2,9.9,0.390,0.333,0.840,hoodro01,20.4
588,589,Semi Ojeleye,Projected,3.4,9.0,1.8,5.2,1.5,1.9,1.6,...,0.7,0.3,1.0,2.5,10.2,0.383,0.348,0.785,ojelese01,20.4


Interesting results. However, this doesn't project how many games the player will play. So while we cannot compare this to ESPN's total projected score, we can compare it to their Averages. Let's do that for the 21-22 season.

In [None]:
#import data from drive, downloaded as csv from basketball reference.com
dfSPS_22 = pd.read_csv("/content/drive/MyDrive/Personal Projects/NBA_Fantasy/SPS_2021-22.txt")

#Create new column that contains FPTS for each player
dfSPS_22['FPTS'] = dfSPS_22.apply(calcFPTS, axis=1)

#Make player the index and sort rows by FPTS
dfSPS_22 = dfSPS_22.set_index('Player',drop=True)
dfSPS_22 = dfSPS_22.sort_values("FPTS",ascending=False)

#Inner-join the actual 2022 averages with the SPS projections on the player-name index.
#The result must be bound to dfSPS_22_merge, the name every later statement reads
#(it was previously assigned to dfSPS_22_merge2, which raised a NameError below).
#NOTE(review): the join matches exact name strings; any spelling or diacritic
#difference between the two sources would drop that player - verify the overlap.
dfSPS_22_merge = playersActualAvg22.merge(dfSPS_22, left_index=True, right_index=True)

#Keep only rows with both values present: np.sum on a Series skips NaN while len()
#still counts the row, which would bias the regression sums.
dfSPS_22_merge = dfSPS_22_merge.dropna(subset=['applied_avg_22', 'FPTS'])

#Again, let's drop rows that have an actual score of 0.
dfSPS_22_merge = dfSPS_22_merge[dfSPS_22_merge['applied_avg_22'] != 0]

#Calculate regression line; regressionInfo_avg_SPS is (intercept, slope, R^2)
regressionInfo_avg_SPS = calcLinearRegression(dfSPS_22_merge.loc[:,'FPTS'], dfSPS_22_merge.loc[:,'applied_avg_22'])
print("Regression Line: y = {0} + {1}x".format(regressionInfo_avg_SPS[0],regressionInfo_avg_SPS[1]))
print("Rsquared: ", regressionInfo_avg_SPS[2])
print("R: ",np.sqrt(regressionInfo_avg_SPS[2]))

NameError: ignored

A much weaker correlation than ESPN's projections. For completeness' sake, let's look at the graph.

In [None]:
#Evenly spaced x values to draw the regression line over.
#(retstep is not requested: it would make linspace return an (array, step) tuple.)
fakeX_sps = np.linspace(0, 65, 120)

#y values come from the SPS regression fitted in the previous cell, so the drawn line
#always matches the printed equation instead of coefficients pasted from an earlier run.
genY_sps = regressionInfo_avg_SPS[0] + regressionInfo_avg_SPS[1] * fakeX_sps

#Plot it all (own figure name, so the ESPN averages figure in fig3 is not overwritten)
fig4 = go.Figure()
fig4.add_trace(go.Scatter(
    x=dfSPS_22_merge.loc[:,'FPTS'],
    y=dfSPS_22_merge.loc[:,'applied_avg_22'],
    mode='markers',
    name='Players'
))
fig4.add_trace(go.Scatter(
    x=fakeX_sps,
    y=genY_sps,
    mode='lines',
    name='Regression line'
))
fig4.update_layout(
    title="Actual vs Projected 2022 Average Fantasy Score, SPS Projections",
    xaxis_title="2022 Projected Score",
    yaxis_title="2022 Actual Score",
)
fig4.show()

While there is definitely a correlation here, it's pretty small compared to ESPN's (63% vs 86%). Thus, we should stick with ESPN's projections; at least on the averages.