## User Inputs

In [50]:
year = '1967'

## Libraries

In [51]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
import statsmodels.api as sm
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

## Functions

In [52]:
def my_mean(x):
    return x[x>0].mean()

def my_count(x):
    return x[x>0].count()

def create_adjusted(final):
    for j,row in final.iterrows():
        try:
            i = str(row['Player'])
            pri = float(prior_impact[prior_impact.Player==i]['Impact'])
            pri_coef = float(df[df['index']=="prior_impact"+i+"_T"]['coef'])
            curr_imp = float(final[final.Player==i]['Impact'])
            adj_imp = float((pri * pri_coef)+curr_imp)
            final.set_value(j,'Impact',adj_imp)
        except:
            pass
    return final

## Clean Boxscore Data

In [86]:
boxs = pd.read_excel('Box/'+str(int(year))+'box.xlsx')

# Create home, away, and neutral game split factors
dummies = pd.get_dummies(boxs['Location'])

# Add dummy variables and clean up dataset
boxs = pd.concat([boxs, dummies], axis=1)
boxs.drop('Location',axis=1,inplace=True)
boxs = boxs.fillna(0).reset_index().drop('index',axis=1)
#boxs['Blowout']=np.where(boxs.Differential>=16,1,0)
#boxs['Differential']=np.where(boxs.Differential>0,1,0)

# Create X and y variables for regression
#y=boxs.pop('Differential')
#X=boxs.copy()

# Creates clean copy of the dataset
boxes=pd.concat([X,y], axis=1)

In [87]:
for col in boxs.columns:
    try:
        boxs[col]=np.where((boxs[col]>=.75)&(boxs.Differential>=20),1,boxs[col])
        boxs[col]=np.where((boxs[col]<.75)&(boxs.Differential>=20),1,boxs[col]/.75)
    except:
        pass
    
# Create X and y variables for regression
y=boxs.pop('Differential')
X=boxs.copy()

## Run Uninformed Regression

In [88]:
# Create Regression coefficient and Run regression
X2 = sm.add_constant(X)
est = sm.OLS(y, X)
regr = est.fit()

# Regression output results
df = pd.read_html(regr.summary().tables[1].as_html(),header=0,index_col=0)[0]
print(regr.summary())

                            OLS Regression Results                            
Dep. Variable:           Differential   R-squared:                       0.719
Model:                            OLS   Adj. R-squared:                  0.494
Method:                 Least Squares   F-statistic:                     3.192
Date:                Tue, 10 May 2022   Prob (F-statistic):           4.95e-22
Time:                        16:54:22   Log-Likelihood:                -1969.1
No. Observations:                 560   AIC:                             4438.
Df Residuals:                     310   BIC:                             5520.
Df Model:                         249                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Unnamed: 0          -138.1233    123

### Show Impact and Save

In [21]:
# Create total minutes played, avg minutes played out of total possible, and total games
avgz = pd.DataFrame([boxs.sum(),boxs.apply(my_mean),boxs.apply(my_count)]).T.reset_index()
avgz.columns = ['index','minutes','avg_minutes','gamez']

# Regression summary
df = df.reset_index()
df = df[['index','coef']]

# X and Y variables together
boxes=pd.concat([X,y], axis=1)

# Get rid of player duplicate as opposing player
final = df[~(df['index'].str[-2:].str.contains('_T'))]

# Get rid of prior impact factor
final = final[~(final['index'].str.contains('prior_impact'))]

# Rename the Regression coefficient
final=final.replace('index', 'Baseline')

# Add in our summary statistics and clean column names
final = pd.merge(final,avgz,how='left', on='index')
final.columns = ['Player','Impact','Minutes','AvgMinutes','Games']

# Create scaled impacts
final['Impact']=-1*final['Impact']
final['ImpactPerGame']=final['Impact']*final['AvgMinutes']
final['ImpactPerGameScaled'] = (final.ImpactPerGame - final.ImpactPerGame.mean())/final.ImpactPerGame.std(ddof=0)
final['ImpactScaled'] = (final.Impact - final.Impact.mean())/final.Impact.std(ddof=0)
final.sort_values('ImpactPerGame',ascending=False,inplace=True)
final

Unnamed: 0,Player,Impact,Minutes,AvgMinutes,Games,ImpactPerGame,ImpactPerGameScaled,ImpactScaled
71,Kevin Loughery,1.6191,19.728435,0.616514,32.0,0.998197,2.955104,1.709573
33,Don Ohl,1.2561,21.969665,0.732322,30.0,0.919870,2.734791,1.356467
45,Gerry Ward,3.4648,0.625000,0.208333,3.0,0.721833,2.177769,3.504966
1,Adrian Smith,0.9906,23.810142,0.721519,33.0,0.714737,2.157810,1.098204
112,Willis Reed,0.4925,33.034049,0.805709,41.0,0.396811,1.263574,0.613680
...,...,...,...,...,...,...,...,...
20,Charlie Hardnett,-2.4175,2.354167,0.235417,10.0,-0.569120,-1.453321,-2.217004
108,Wayne Embry,-1.0058,17.871855,0.616271,29.0,-0.619845,-1.595998,-0.843782
79,Luke Jackson,-1.1476,40.471698,0.697788,58.0,-0.800781,-2.104921,-0.981717
56,Jerry Greenspan,-5.1324,0.520833,0.173611,3.0,-0.891042,-2.358798,-4.857906


In [None]:
final.to_excel('Impact/'+year+'impact2.xlsx',index=False)

## Run Informed Regression

In [None]:
# Add in prior impact to the boxscores
try:
    prior_impact = pd.read_excel('InformedImpact/'+str(int(year)-1)+'impactInformed2.xlsx')
except:
    # If first year, won't have informed prior impact
    prior_impact = pd.read_excel('Impact/'+str(int(year)-1)+'impact2.xlsx')
for i in boxes:
    try:
        # Prior home, away, and neutral should have no impact
        if i not in ['H','@','N']:
            # Prior impact for both player and opposing version duplicate
            pri = float(prior_impact[prior_impact.Player==i]['Impact'])
            boxes['prior_impact'+str(i)] = pri * boxes[str(i)]
            boxes['prior_impact'+str(i)+"_T"] = pri * boxes[str(i)+"_T"]
    except:
        pass
    
# Create a copy of the original cleaned up Boxscores
boxs2 = boxes.copy()

# Create X and y variables for regression
y=boxs2.pop('Differential')
X=boxs2.copy()

# Create Regression coefficient and Run regression
X2 = sm.add_constant(X)
est = sm.OLS(y, X)
regr = est.fit()

# Regression output results
df = pd.read_html(regr.summary().tables[1].as_html(),header=0,index_col=0)[0]
print(regr.summary())

### Show Impact and Save

In [None]:
# Create total minutes played, avg minutes played out of total possible, and total games
avgz = pd.DataFrame([boxs.sum(),boxs.apply(my_mean),boxs.apply(my_count)]).T.reset_index()
avgz.columns = ['index','minutes','avg_minutes','gamez']

# Regression summary
df = df.reset_index()
df = df[['index','coef']]

# X and Y variables together
boxes=pd.concat([X,y], axis=1)

# Get rid of player duplicate as opposing player
final = df[~(df['index'].str[-2:].str.contains('_T'))]

# Get rid of prior impact factor
final = final[~(final['index'].str.contains('prior_impact'))]

# Rename the Regression coefficient
final=final.replace('index', 'Baseline')

# Add in our summary statistics and clean column names
final = pd.merge(final,avgz,how='left', on='index')
final.columns = ['Player','Impact','Minutes','AvgMinutes','Games']

# Set up base impact variables
final['Impact']=-1*final['Impact']
final['OldImpact']=final['Impact']

# Adjust impact with last year's prior impact as factor
final = create_adjusted(final)

# Create scaled imacts
final['ImpactPerGame']=final['Impact']*final['AvgMinutes']
final['ImpactPerGameScaled'] = (final.ImpactPerGame - final.ImpactPerGame.mean())/final.ImpactPerGame.std(ddof=0)
final['ImpactScaled'] = (final.Impact - final.Impact.mean())/final.Impact.std(ddof=0)
final.sort_values('ImpactPerGame',ascending=False,inplace=True)
final

In [None]:
final.to_excel('InformedImpact/'+year+'impactInformed2.xlsx',index=False)