## User Inputs

In [None]:
# Season to process, given as a numeric string. It is used to build the
# boxscore input path and the impact output paths, and int(year)-1 selects
# the prior season's impact workbook.
year = '1973'

## Libraries

In [None]:
import pandas as pd
import warnings
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sb
import statsmodels.api as sm
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
warnings.filterwarnings("ignore")

## Functions

In [None]:
def my_mean(x):
    """Return the mean of the entries of ``x`` that are strictly positive.

    Entries that are zero or negative are masked out before averaging.
    """
    positive_mask = x > 0
    positive_values = x[positive_mask]
    return positive_values.mean()

def my_count(x):
    """Return how many entries of ``x`` are strictly positive.

    Entries that are zero or negative are masked out before counting.
    """
    positive_values = x[x > 0]
    n_positive = positive_values.count()
    return n_positive

def create_adjusted(final):
    """Add each player's weighted prior-season impact to their current impact.

    For every row of ``final`` this looks up:

    * the player's prior ``Impact`` in the module-level ``prior_impact`` table,
    * the regression coefficient in the module-level ``df`` table whose
      ``index`` equals ``"prior_impact" + player + "_T"``,
    * the player's current ``Impact`` in ``final``.

    The row's ``Impact`` is then overwritten with
    ``prior * coefficient + current``.

    A row is left unchanged when any of the three lookups does not yield
    exactly one value, or when a value cannot be converted to ``float``
    (for example a player with no prior-season entry).

    Args:
        final: DataFrame with ``Player`` and ``Impact`` columns. It is
            modified in place.

    Returns:
        The same ``final`` object, after adjustment.

    Raises:
        AttributeError, KeyError: if ``prior_impact``, ``df`` or ``final``
            lack the expected columns. These indicate a malformed table and
            are no longer silently swallowed.
    """
    for j, row in final.iterrows():
        i = str(row['Player'])

        prior_rows = prior_impact.loc[prior_impact.Player == i, 'Impact']
        coef_rows = df.loc[df['index'] == "prior_impact" + i + "_T", 'coef']
        curr_rows = final.loc[final.Player == i, 'Impact']

        # Only adjust when every lookup is unambiguous; anything else is
        # skipped, which keeps the best-effort behaviour for players without
        # a prior rating.
        if not (len(prior_rows) == 1 and len(coef_rows) == 1 and len(curr_rows) == 1):
            continue

        try:
            pri = float(prior_rows.iloc[0])
            pri_coef = float(coef_rows.iloc[0])
            curr_imp = float(curr_rows.iloc[0])
        except (TypeError, ValueError):
            # Non-numeric cell: leave this player's impact untouched.
            continue

        # DataFrame.set_value no longer exists in pandas; .at is the scalar
        # setter that replaces it.
        final.at[j, 'Impact'] = (pri * pri_coef) + curr_imp
    return final

## Clean Boxscore Data

In [None]:
# Load the boxscore workbook for the selected season
box_path = 'BoxScores/'+str(int(year))+'box.xlsx'
boxs = pd.read_excel(box_path)

# One indicator column per distinct Location value
# (home, away, and neutral game split factors)
dummies = pd.get_dummies(boxs['Location'])

# Append the indicator columns, drop the Location column they replace,
# then zero-fill missing cells and renumber the rows
boxs = pd.concat([boxs, dummies], axis=1).drop('Location', axis=1)
boxs = boxs.fillna(0).reset_index().drop('index', axis=1)

# Split into response (Differential) and predictors (everything else)
y = boxs.pop('Differential')
X = boxs.copy()

# Predictors and response side by side, kept as a clean copy of the dataset
boxes = pd.concat([X, y], axis=1)

## Run Uninformed Regression

In [None]:
# Fit an ordinary least squares model of y (the Differential column) on the
# predictor columns in X.
# Create Regression coefficient and Run regression
# NOTE(review): X2 (X passed through sm.add_constant) is built but the model
# below is fit on X, not X2 — confirm that omitting X2 is intended.
X2 = sm.add_constant(X)
est = sm.OLS(y, X)
regr = est.fit()

# Regression output results
# The second table of the summary is rendered to HTML and parsed back into a
# DataFrame (first row as header, first column as index) so later cells can
# read its 'coef' column.
df = pd.read_html(regr.summary().tables[1].as_html(),header=0,index_col=0)[0]
print(regr.summary())

### Show Impact and Save

In [None]:
# Per-column summary of the boxscores: column total, mean of the entries > 0,
# and number of entries > 0
summary_rows = [boxs.sum(), boxs.apply(my_mean), boxs.apply(my_count)]
avgz = pd.DataFrame(summary_rows).T.reset_index()
avgz.columns = ['index','minutes','avg_minutes','gamez']

# Reduce the regression table to term name and coefficient
df = df.reset_index()[['index','coef']]

# Predictors and response side by side
boxes = pd.concat([X, y], axis=1)

# Keep only the terms that are neither an opposing-player duplicate
# (name ending in "_T") nor a prior impact factor
is_opposing_term = df['index'].str[-2:].str.contains('_T')
is_prior_term = df['index'].str.contains('prior_impact')
final = df[~(is_opposing_term | is_prior_term)]

# Rename the Regression coefficient
final = final.replace('index', 'Baseline')

# Attach the summary statistics and give the columns their final names
final = final.merge(avgz, how='left', on='index')
final.columns = ['Player','Impact','Minutes','AvgMinutes','Games']

# Flip the sign of the coefficient, weight it by average minutes, and
# standardise both versions (population standard deviation, ddof=0)
final['Impact'] = -1*final['Impact']
per_game = final['Impact']*final['AvgMinutes']
final['ImpactPerGame'] = per_game
final['ImpactPerGameScaled'] = (per_game - per_game.mean())/per_game.std(ddof=0)
impact = final['Impact']
final['ImpactScaled'] = (impact - impact.mean())/impact.std(ddof=0)

# Highest per-game impact first
final = final.sort_values('ImpactPerGame', ascending=False)
final

In [None]:
# Save the uninformed impact table for this season.
# The year is normalised with str(int(year)), the same way the input paths are
# built, so the file name matches what next season's run will look for and an
# integer `year` does not fail here after all the computation is done.
final.to_excel('Impact/'+str(int(year))+'impact.xlsx',index=False)

## Run Informed Regression

In [None]:
# Add in prior impact to the boxscores
# Prefer last season's informed impact table; fall back to the uninformed one
# only when the informed workbook does not exist.
prior_year = str(int(year)-1)
try:
    prior_impact = pd.read_excel('InformedImpact/'+prior_year+'impactInformed.xlsx')
except FileNotFoundError:
    # If first year, won't have informed prior impact
    prior_impact = pd.read_excel('Impact/'+prior_year+'impact.xlsx')

# Iterate over a snapshot of the column labels, because the loop body adds
# new columns to `boxes`.
for i in list(boxes.columns):
    # Prior home, away, and neutral should have no impact
    if i in ('H','@','N'):
        continue

    name = str(i)
    prior_rows = prior_impact.loc[prior_impact.Player==i, 'Impact']

    # Columns without exactly one prior rating (opposing "_T" duplicates,
    # the response column, players new this season) get no prior feature.
    if len(prior_rows) != 1 or name not in boxes.columns:
        continue
    try:
        pri = float(prior_rows.iloc[0])
    except (TypeError, ValueError):
        # Non-numeric prior rating: skip this column.
        continue

    # Prior impact for both player and opposing version duplicate
    boxes['prior_impact'+name] = pri * boxes[name]
    if name+"_T" in boxes.columns:
        boxes['prior_impact'+name+"_T"] = pri * boxes[name+"_T"]
    
# Create a copy of the original cleaned up Boxscores
# (taken from `boxes`, so it includes any prior_impact* columns present on it)
boxs2 = boxes.copy()

# Create X and y variables for regression
# y is the Differential column; X is every remaining column of boxs2.
y=boxs2.pop('Differential')
X=boxs2.copy()

# Create Regression coefficient and Run regression
# NOTE(review): X2 (X passed through sm.add_constant) is built but the model
# below is fit on X, not X2 — confirm that omitting X2 is intended.
X2 = sm.add_constant(X)
est = sm.OLS(y, X)
regr = est.fit()

# Regression output results
# The second table of the summary is rendered to HTML and parsed back into a
# DataFrame (first row as header, first column as index) so later cells can
# read its 'coef' column.
df = pd.read_html(regr.summary().tables[1].as_html(),header=0,index_col=0)[0]
print(regr.summary())

### Show Impact and Save

In [None]:
# Per-column summary of the boxscores: column total, mean of the entries > 0,
# and number of entries > 0
summary_rows = [boxs.sum(), boxs.apply(my_mean), boxs.apply(my_count)]
avgz = pd.DataFrame(summary_rows).T.reset_index()
avgz.columns = ['index','minutes','avg_minutes','gamez']

# Reduce the regression table to term name and coefficient
df = df.reset_index()[['index','coef']]

# Predictors and response side by side
boxes = pd.concat([X, y], axis=1)

# Keep only the terms that are neither an opposing-player duplicate
# (name ending in "_T") nor a prior impact factor
is_opposing_term = df['index'].str[-2:].str.contains('_T')
is_prior_term = df['index'].str.contains('prior_impact')
final = df[~(is_opposing_term | is_prior_term)]

# Rename the Regression coefficient
final = final.replace('index', 'Baseline')

# Attach the summary statistics and give the columns their final names
final = final.merge(avgz, how='left', on='index')
final.columns = ['Player','Impact','Minutes','AvgMinutes','Games']

# Flip the sign of the coefficient and keep the unadjusted value for reference
final['Impact'] = -1*final['Impact']
final['OldImpact'] = final['Impact']

# Adjust impact with last year's prior impact as factor
final = create_adjusted(final)

# Create scaled impacts: weight by average minutes and standardise both
# versions (population standard deviation, ddof=0)
per_game = final['Impact']*final['AvgMinutes']
final['ImpactPerGame'] = per_game
final['ImpactPerGameScaled'] = (per_game - per_game.mean())/per_game.std(ddof=0)
impact = final['Impact']
final['ImpactScaled'] = (impact - impact.mean())/impact.std(ddof=0)

# Highest per-game impact first
final = final.sort_values('ImpactPerGame', ascending=False)
final

In [None]:
# Save the informed impact table for this season.
# The year is normalised with str(int(year)), the same way the input paths are
# built, so the file name matches what next season's run will look for and an
# integer `year` does not fail here after all the computation is done.
final.to_excel('InformedImpact/'+str(int(year))+'impactInformed.xlsx',index=False)