## User Inputs

In [66]:
# Last season of the three-season window analysed below. Kept as a string
# because later cells splice it directly into input/output file names.
year = "1962"

## Libraries

In [119]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn
import statsmodels.api as sm
from sklearn import linear_model
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.metrics import r2_score
from sklearn.model_selection import RepeatedKFold
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

## Functions

In [111]:
def my_mean(x):
    """Return the mean of the strictly positive entries of ``x``.

    Zero and negative entries are excluded before averaging.
    """
    positive = x > 0
    return x[positive].mean()

def my_count(x):
    """Return how many non-missing entries of ``x`` are strictly positive."""
    positive = x > 0
    return x[positive].count()

def _lookup_single(frame, key_column, key, value_column):
    """Return the single ``value_column`` entry whose ``key_column`` equals ``key``.

    Returns ``None`` when zero or several rows match, or when the matched value
    cannot be converted to ``float``; callers treat ``None`` as "skip this row".
    """
    matches = frame.loc[frame[key_column] == key, value_column]
    if len(matches) != 1:
        return None
    try:
        return float(matches.iloc[0])
    except (TypeError, ValueError):
        return None


def create_adjusted(final, prior=None, coefs=None):
    """Add the prior-season contribution to each player's ``Impact`` in place.

    For every row of ``final`` the adjusted impact is

        prior Impact * coef of "prior_impact<Player>_T" + current Impact

    Rows are left untouched when the player has no unique entry in ``prior``,
    when the matching coefficient row is missing from ``coefs``, or when the
    player is not unique in ``final``.

    The previous implementation wrote results with ``DataFrame.set_value``,
    which no longer exists in pandas >= 1.0; the resulting AttributeError was
    swallowed by a bare ``except``, so no row was ever adjusted. Values are now
    written with ``.at`` and skips are explicit.

    Parameters
    ----------
    final : DataFrame with ``Player`` and ``Impact`` columns (modified in place).
    prior : DataFrame with ``Player`` and ``Impact`` columns; defaults to the
        notebook-level ``prior_impact``.
    coefs : DataFrame with ``index`` and ``coef`` columns; defaults to the
        notebook-level ``df``.

    Returns
    -------
    The same ``final`` object, for call-site convenience.
    """
    if prior is None:
        prior = prior_impact
    if coefs is None:
        coefs = df

    for label, row in final.iterrows():
        player = str(row['Player'])
        pri = _lookup_single(prior, 'Player', player, 'Impact')
        pri_coef = _lookup_single(coefs, 'index', "prior_impact" + player + "_T", 'coef')
        curr_imp = _lookup_single(final, 'Player', player, 'Impact')
        if pri is None or pri_coef is None or curr_imp is None:
            # No usable prior for this row: keep the current estimate as is.
            continue
        final.at[label, 'Impact'] = (pri * pri_coef) + curr_imp
    return final

def read_boxes(year):
    """Load one season of box scores and prepare it for regression.

    Reads ``BoxScores/<year>box.xlsx``, expands the ``Location`` column into
    one indicator column per distinct value, drops ``Location`` and a stray
    ``Unnamed: 0`` column if the sheet has one, fills missing cells with 0 and
    renumbers the rows from 0.

    Parameters
    ----------
    year : value convertible with ``int`` (e.g. ``'1962'`` or ``1962``).

    Returns
    -------
    DataFrame with the cleaned box scores.
    """
    boxs = pd.read_excel('BoxScores/' + str(int(year)) + 'box.xlsx')

    # Create home, away, and neutral game split factors. Floats are requested
    # explicitly because newer pandas returns boolean dummies by default and
    # every regressor should be numeric.
    dummies = pd.get_dummies(boxs['Location'], dtype=float)

    # Add dummy variables and clean up dataset
    boxs = pd.concat([boxs, dummies], axis=1)
    boxs = boxs.drop(columns=['Location'])
    # The exported index column is only present in some sheets.
    boxs = boxs.drop(columns=['Unnamed: 0'], errors='ignore')
    boxs = boxs.fillna(0).reset_index(drop=True)

    # Disabled experiments kept for reference:
    #boxs['Blowout']=np.where(boxs.Differential>=16,1,0)
    #boxs['Differential']=np.where(boxs.Differential>0,1,0)
    return boxs

## Clean Boxscore Data

In [149]:
# Later cells splice ``year`` into file names, so normalise it to a string here
# explicitly (the previous loop did this implicitly by reusing ``year`` as its
# loop variable, which also clobbered the user input).
year = str(year)

# Three-season window: the two seasons before ``year`` plus ``year`` itself.
years = [str(int(year) - 2), str(int(year) - 1), year]

# How many copies of each season's rows enter the regression, oldest season
# first. Replicating rows makes recent seasons count more in the fit.
SEASON_WEIGHTS = (1, 2, 8)

season_frames = []
for season, weight in zip(years, SEASON_WEIGHTS):
    season_box = read_boxes(season)
    season_frames.extend([season_box] * weight)

# Column labels of the most recent season; used later to keep only entries
# that appear in that season.
roster_last = season_box.columns.unique()

# Stack all (replicated) seasons; columns absent from a season become 0.
boxs = pd.concat(season_frames, axis=0).fillna(0)

# Create X and y variables for regression
# (``pop`` removes the target from ``boxs``; later cells rely on that).
y = boxs.pop('Differential')
X = boxs.copy()

# Creates clean copy of the dataset
boxes = pd.concat([X, y], axis=1)

In [87]:
# for col in boxs.columns:
#     try:
#         boxs[col]=np.where((boxs[col]>=.75)&(boxs.Differential>=20),1,boxs[col])
#         boxs[col]=np.where((boxs[col]<.75)&(boxs.Differential>=20),1,boxs[col]/.75)
#     except:
#         pass
    
# # Create X and y variables for regression
# y=boxs.pop('Differential')
# X=boxs.copy()

## Run Uninformed Regression

In [126]:
# define model evaluation method: 10-fold CV repeated 3 times with a fixed seed
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

# Candidate penalties 0.01, 0.02, ..., 0.99. Zero is left out of the grid
# because RidgeCV only accepts strictly positive alphas.
alphas = np.arange(1, 100) / 100

# define model
model = RidgeCV(alphas=alphas, cv=cv, scoring='neg_mean_absolute_error')
# fit model
model.fit(X, y)

# model coefficients, one row per design-matrix column
df = pd.DataFrame({'index': X.columns, 'coef': model.coef_})

# summarize chosen configuration
print('alpha: %f' % model.alpha_)

alpha: 0.010000


In [150]:
# Fit ordinary least squares of the point differential on the design matrix.
# No constant column is added (``sm.add_constant`` stays disabled, as before).
est = sm.OLS(y, X)
regr = est.fit()

# Coefficient table taken straight from the fitted results. This mirrors the
# columns of the printed summary table but keeps full precision and avoids
# round-tripping the summary through HTML parsing.
conf_int = regr.conf_int()
df = pd.DataFrame({
    'coef': regr.params,
    'std err': regr.bse,
    't': regr.tvalues,
    'P>|t|': regr.pvalues,
    '[0.025': conf_int[0],
    '0.975]': conf_int[1],
})
print(regr.summary())

                            OLS Regression Results                            
Dep. Variable:           Differential   R-squared:                       0.821
Model:                            OLS   Adj. R-squared:                  0.793
Method:                 Least Squares   F-statistic:                     29.91
Date:                Fri, 13 May 2022   Prob (F-statistic):               0.00
Time:                        15:10:38   Log-Likelihood:                -7332.3
No. Observations:                2186   AIC:                         1.525e+04
Df Residuals:                    1895   BIC:                         1.690e+04
Df Model:                         290                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
@                      -2.8563    

### Show Impact and Save

In [147]:
# Create total minutes played, avg minutes played out of total possible, and total games
# NOTE(review): ``boxs`` holds the season-replicated rows, so the sums and
# counts below appear to include the replication weights - confirm intended.
avgz = pd.DataFrame([boxs.sum(), boxs.apply(my_mean), boxs.apply(my_count)]).T.reset_index()
avgz.columns = ['index', 'minutes', 'avg_minutes', 'gamez']

# Regression summary: variable name + coefficient only.
# Guarded so the cell can be re-run without ``reset_index`` failing on an
# already-present ``index`` column.
if 'index' not in df.columns:
    df = df.reset_index()
df = df[['index', 'coef']]

# X and Y variables together
boxes = pd.concat([X, y], axis=1)

# Get rid of player duplicate as opposing player (names ending in "_T")
final = df[~df['index'].str.endswith('_T')]
#final = final[final.index.isin(roster_last)]

# Rename the Regression coefficient
# (replaces any cell whose value is exactly 'index' with 'Baseline')
final = final.replace('index', 'Baseline')

# Add in our summary statistics and clean column names
final = pd.merge(final, avgz, how='left', on='index')
final.columns = ['Player', 'Impact', 'Minutes', 'AvgMinutes', 'Games']

# Create scaled impacts: coefficients are sign-flipped for every row except
# the location indicators, which keep the regression sign.
is_location = final['Player'].isin(['@', 'H', 'N'])
final['Impact'] = np.where(is_location, final['Impact'], -1 * final['Impact'])
final['ImpactPerGame'] = final['Impact'] * final['AvgMinutes']
final['ImpactPerGameScaled'] = (
    (final['ImpactPerGame'] - final['ImpactPerGame'].mean())
    / final['ImpactPerGame'].std(ddof=0)
)
final['ImpactScaled'] = (final['Impact'] - final['Impact'].mean()) / final['Impact'].std(ddof=0)
final = final.sort_values('ImpactPerGame', ascending=False)

# Only use players in the last year
final = final[final['Player'].isin(roster_last)]
final

Unnamed: 0,Player,Impact,Minutes,AvgMinutes,Games,ImpactPerGame,ImpactPerGameScaled,ImpactScaled
16,Bob Boozer,63.070778,65.958333,0.622248,106.0,39.245692,2.609250,1.180557
117,Richie Guerin,40.602607,92.490959,0.848541,109.0,34.452973,2.301904,0.780051
13,Bill Smith,86.354318,4.166667,0.260417,16.0,22.488104,1.534625,1.595598
139,Walter Dukes,41.368040,55.750000,0.493363,113.0,20.409453,1.401326,0.793695
50,Ed Burton,184.865489,0.833333,0.104167,8.0,19.256822,1.327411,3.351608
...,...,...,...,...,...,...,...,...
34,Cleveland Buckner,-98.695839,17.666667,0.245370,72.0,-24.217034,-1.460465,-1.703012
109,Phil Jordon,-39.513088,63.395833,0.621528,102.0,-24.558482,-1.482361,-0.648051
40,Dave Gambee,-55.054386,72.666667,0.567708,128.0,-31.254834,-1.911782,-0.925082
39,Dave Budd,-76.556575,47.228774,0.421685,112.0,-32.282796,-1.977703,-1.308369


In [148]:
# Write the uninformed impact table for this season (row index omitted).
impact_path = 'Impact/' + year + 'impact2.xlsx'
final.to_excel(impact_path, index=False)

## Run Informed Regression

In [None]:
# Add in prior impact to the boxscores.
# Prefer last season's informed estimates; if that file does not exist
# (first season processed), fall back to last season's uninformed estimates.
prior_year = str(int(year) - 1)
try:
    prior_impact = pd.read_excel('InformedImpact/' + prior_year + 'impactInformed2.xlsx')
except FileNotFoundError:
    # If first year, won't have informed prior impact
    prior_impact = pd.read_excel('Impact/' + prior_year + 'impact2.xlsx')

# Build the prior-weighted columns first and attach them in one step.
prior_columns = {}
for col in boxes.columns:
    # Prior home, away, and neutral should have no impact
    if col in ['H', '@', 'N']:
        continue

    # Use a prior only when the column name matches exactly one player row.
    matches = prior_impact.loc[prior_impact.Player == col, 'Impact']
    if len(matches) != 1:
        continue
    try:
        pri = float(matches.iloc[0])
    except (TypeError, ValueError):
        continue

    # Prior impact for both player and opposing version duplicate
    own = str(col)
    opposing = own + "_T"
    if own not in boxes.columns:
        continue
    prior_columns['prior_impact' + own] = pri * boxes[own].to_numpy()
    if opposing in boxes.columns:
        prior_columns['prior_impact' + opposing] = pri * boxes[opposing].to_numpy()

if prior_columns:
    # Drop same-named columns left by an earlier run so re-running the cell
    # replaces them instead of creating duplicates.
    stale = [name for name in prior_columns if name in boxes.columns]
    boxes = pd.concat(
        [boxes.drop(columns=stale), pd.DataFrame(prior_columns, index=boxes.index)],
        axis=1,
    )

# Create a copy of the original cleaned up Boxscores
boxs2 = boxes.copy()

# Create X and y variables for regression
y = boxs2.pop('Differential')
X = boxs2.copy()

# Create Regression coefficient and Run regression.
# No constant column is added: the model is fitted on X as is.
est = sm.OLS(y, X)
regr = est.fit()

# Coefficient table taken straight from the fitted results (same columns as
# the printed summary table, full precision, no HTML round trip).
conf_int = regr.conf_int()
df = pd.DataFrame({
    'coef': regr.params,
    'std err': regr.bse,
    't': regr.tvalues,
    'P>|t|': regr.pvalues,
    '[0.025': conf_int[0],
    '0.975]': conf_int[1],
})
print(regr.summary())

### Show Impact and Save

In [None]:
# Create total minutes played, avg minutes played out of total possible, and total games
avgz = pd.DataFrame([boxs.sum(), boxs.apply(my_mean), boxs.apply(my_count)]).T.reset_index()
avgz.columns = ['index', 'minutes', 'avg_minutes', 'gamez']

# Regression summary: variable name + coefficient only.
# Guarded so the cell can be re-run without ``reset_index`` failing on an
# already-present ``index`` column. ``create_adjusted`` reads this ``df``.
if 'index' not in df.columns:
    df = df.reset_index()
df = df[['index', 'coef']]

# X and Y variables together
boxes = pd.concat([X, y], axis=1)

# Get rid of player duplicate as opposing player (names ending in "_T")
final = df[~df['index'].str.endswith('_T')]

# Get rid of prior impact factor
final = final[~final['index'].str.contains('prior_impact')]

# Rename the Regression coefficient
# (replaces any cell whose value is exactly 'index' with 'Baseline')
final = final.replace('index', 'Baseline')

# Add in our summary statistics and clean column names
final = pd.merge(final, avgz, how='left', on='index')
final.columns = ['Player', 'Impact', 'Minutes', 'AvgMinutes', 'Games']

# Set up base impact variables
# NOTE(review): the uninformed cell flips the sign back for the '@', 'H' and
# 'N' rows; here those rows stay negated - confirm which is intended.
final['Impact'] = -1 * final['Impact']
final['OldImpact'] = final['Impact']

# Adjust impact with last year's prior impact as factor
final = create_adjusted(final)

# Create scaled impacts
final['ImpactPerGame'] = final['Impact'] * final['AvgMinutes']
final['ImpactPerGameScaled'] = (
    (final['ImpactPerGame'] - final['ImpactPerGame'].mean())
    / final['ImpactPerGame'].std(ddof=0)
)
final['ImpactScaled'] = (final['Impact'] - final['Impact'].mean()) / final['Impact'].std(ddof=0)
final = final.sort_values('ImpactPerGame', ascending=False)
final

In [None]:
# Write the informed impact table for this season (row index omitted).
informed_path = 'InformedImpact/' + year + 'impactInformed2.xlsx'
final.to_excel(informed_path, index=False)