In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer

import os

import requests
from bs4 import BeautifulSoup
import time

from collections import Counter

import matplotlib.pyplot as plt

%matplotlib inline

## Read in Player_By_Game Data from Scraper

In [2]:
# Read in all the partial data pieces

df1 = pd.read_csv('../data/player_boxscores/offset_0-200000.csv')
df2 = pd.read_csv('../data/player_boxscores/offset_200000-300000.csv')
df3 = pd.read_csv('../data/player_boxscores/offset_300000-475000.csv')
df4 = pd.read_csv('../data/player_boxscores/offset_475000-550000.csv')
df5 = pd.read_csv('../data/player_boxscores/offset_550000-600000.csv')
df6 = pd.read_csv('../data/player_boxscores/offset_600000-725000.csv')
df7 = pd.read_csv('../data/player_boxscores/offset_725000-909000.csv')

In [3]:
# Aggregate all the segmented data into one dataframe

df = pd.concat([df1, df2, df3, df4, df5, df6, df7])

In [4]:
# Check starting shape

df.shape

(909700, 31)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,player,pos,date_game,school_id,opp_id,game_result,gs,mp,fg,...,orb,drb,trb,ast,stl,blk,tov,pf,pts,game_score
0,0,Nate Wolters,G,2013-02-07,South Dakota State,IPFW,W,1.0,40,17,...,2,2,4,3,1,1,3,0,53,42.6
1,1,Mike James,G,2011-01-04,Lamar,,W,0.0,28,18,...,1,4,5,3,1,0,1,4,52,36.3
2,2,Marshon Brooks,G,2011-02-23,Providence,Notre Dame,L,1.0,40,20,...,2,3,5,4,1,0,1,4,52,42.3
3,3,Jimmer Fredette,G,2011-03-11,Brigham Young,New Mexico,W,1.0,40,22,...,0,2,2,4,1,0,2,2,52,36.5
4,4,Markus Howard,G,2018-01-03,Marquette,Providence,W,1.0,44,17,...,0,1,1,2,2,0,2,1,52,39.8


## EDA And Cleaning of Player_By_Game Data

In [6]:
df.columns

Index(['Unnamed: 0', 'player', 'pos', 'date_game', 'school_id', 'opp_id',
       'game_result', 'gs', 'mp', 'fg', 'fga', 'fg_pct', 'fg2', 'fg2a',
       'fg2_pct', 'fg3', 'fg3a', 'fg3_pct', 'ft', 'fta', 'ft_pct', 'orb',
       'drb', 'trb', 'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'game_score'],
      dtype='object')

In [7]:
df.isnull().sum()

Unnamed: 0          0
player              0
pos                91
date_game           0
school_id           0
opp_id          38014
game_result         0
gs               1558
mp                  0
fg                  0
fga                 0
fg_pct         107301
fg2                 0
fg2a                0
fg2_pct        173922
fg3                 0
fg3a                0
fg3_pct        384069
ft                  0
fta                 0
ft_pct         415522
orb                 0
drb                 0
trb                 0
ast                 0
stl                 0
blk                 0
tov                 0
pf                  0
pts                 0
game_score          0
dtype: int64

In [8]:
# Drop these columns because they are derived explicity from existing features
df.drop(['fg_pct', 'fg2_pct', 'fg3_pct', 'ft_pct', 'trb'], axis=1, inplace=True)

In [9]:
# Drop erroneous column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [10]:
# If we don't know whether they started or not, chances are they did not. We'll set np.nan 
# of 'gs' to 0.
df['gs'] = df['gs'].fillna(value=0)

In [11]:
# Check and drop duplicated rows, probably resulting from scraping overlap
df.duplicated().sum()

1526

In [12]:
df.drop_duplicates(inplace=True)

In [13]:
df.isnull().sum()

player             0
pos               90
date_game          0
school_id          0
opp_id         37961
game_result        0
gs                 0
mp                 0
fg                 0
fga                0
fg2                0
fg2a               0
fg3                0
fg3a               0
ft                 0
fta                0
orb                0
drb                0
ast                0
stl                0
blk                0
tov                0
pf                 0
pts                0
game_score         0
dtype: int64

In [14]:
# We will be bringing in a lot of opponent data as features, so not knowing the opp_id is 
# useless to us. So let's drop the np.nan values in opp_id

df.dropna(inplace=True)

In [15]:
df.isnull().sum()

player         0
pos            0
date_game      0
school_id      0
opp_id         0
game_result    0
gs             0
mp             0
fg             0
fga            0
fg2            0
fg2a           0
fg3            0
fg3a           0
ft             0
fta            0
orb            0
drb            0
ast            0
stl            0
blk            0
tov            0
pf             0
pts            0
game_score     0
dtype: int64

In [16]:
# Check resulting shape. We still have 96% of our original rows. Pretty good
df.shape

(870131, 25)

In [17]:
# Convert date_game to datetime type

df['date_game'] = pd.to_datetime(df['date_game'])

In [18]:
# Create a year and month feature

df['year'] = df['date_game'].map(lambda x: x.year)
df['month'] = df['date_game'].map(lambda x: x.month)

In [19]:
# Get an idea of timeframe 

print(df['date_game'].min())
print(df['date_game'].max())

2010-11-08 00:00:00
2018-03-07 00:00:00


In [20]:
# Make a 'season' feature, where the year it ended is defined as the season

season_list = []
for i in df['date_game']:
    if i.month == 11 or i.month == 12:
        season_list.append(i.year + 1)
    else:
        season_list.append(i.year)

In [21]:
df['season'] = season_list

In [22]:
# Codify Win/Loss with 1/0
df['W'] = df['game_result'].map(lambda x: 1 if x == 'W' else 0)

In [23]:
# Dummify position
#df = pd.concat([df, pd.get_dummies(df['pos'])], axis=1)

In [24]:
# Drop game_result: We have one-hot encoded
df.drop('game_result', axis=1, inplace=True)

In [25]:
df.isnull().sum()

player        0
pos           0
date_game     0
school_id     0
opp_id        0
gs            0
mp            0
fg            0
fga           0
fg2           0
fg2a          0
fg3           0
fg3a          0
ft            0
fta           0
orb           0
drb           0
ast           0
stl           0
blk           0
tov           0
pf            0
pts           0
game_score    0
year          0
month         0
season        0
W             0
dtype: int64

In [26]:
df.dtypes

player                object
pos                   object
date_game     datetime64[ns]
school_id             object
opp_id                object
gs                   float64
mp                     int64
fg                     int64
fga                    int64
fg2                    int64
fg2a                   int64
fg3                    int64
fg3a                   int64
ft                     int64
fta                    int64
orb                    int64
drb                    int64
ast                    int64
stl                    int64
blk                    int64
tov                    int64
pf                     int64
pts                    int64
game_score           float64
year                   int64
month                  int64
season                 int64
W                      int64
dtype: object

In [27]:
df.to_csv('../data/player_boxscores_df.csv')

## Bring in School Data

In [28]:
school_df = pd.read_csv('../data/schools.csv')

In [29]:
school_df.head()

Unnamed: 0.1,Unnamed: 0,School,WinLossPct,SRS,SOS,Year
0,0,Air Force,0.323,-4.9,3.13,2010
1,1,Akron,0.686,2.82,-1.5,2010
2,2,Alabama A&M,0.407,-20.19,-13.71,2010
3,3,Alabama-Birmingham,0.735,9.46,2.9,2010
4,4,Alabama State,0.516,-14.41,-12.02,2010


# Player-by-Player Evaluation

In [30]:
df.set_index(['player', 'date_game'], drop=False, inplace=True)

In [31]:
stats = ['gs', 'mp', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta','orb', 'drb', 'ast', 
         'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

In [32]:
# Specifically needed for rolling mean and rolling median

lookbacks = ['_3day_', '_10day_', '_30day_', '_60day_', '_90day_', '_120day_']

metrics = ['mean', 'median']

In [33]:
# Specifically needed for rolling mean and rolling median

all_rolling = []

for i in metrics:
    for j in lookbacks:
        for k in stats:
            all_rolling.append(i+j+k)

In [34]:
# Note: There will be some noise introduced by players with same name (no unique id associated)
players = df['player'].unique()
len(players)

16566

In [35]:
stats.append('date_game')

In [None]:
count = 0
stamp = datetime.now()
for player in players:
    if count % 500 == 0:
        diff_time = datetime.now() - stamp
        stamp = datetime.now()
        print('parsing...', count)
        print('last parsing block took: ', (diff_time.seconds / 60), ' minutes')
    player_df = df.loc[player][stats].sort_values('date_game')
    
    ewm_01 = player_df.drop('date_game', axis=1).ewm(alpha = 0.1, min_periods=1).mean().shift()
    ewm_03 = player_df.drop('date_game', axis=1).ewm(alpha = 0.3, min_periods=1).mean().shift()
    ewm_05 = player_df.drop('date_game', axis=1).ewm(alpha = 0.5, min_periods=1).mean().shift()
    ewm_07 = player_df.drop('date_game', axis=1).ewm(alpha = 0.7, min_periods=1).mean().shift()
    ewm_09 = player_df.drop('date_game', axis=1).ewm(alpha = 0.9, min_periods=1).mean().shift()
    ewm_10 = player_df.drop('date_game', axis=1).ewm(alpha = 1.0, min_periods=1).mean().shift()
    
    this_df = pd.concat([ewm_01, ewm_03, ewm_05, ewm_07, ewm_09, ewm_10], axis=1)

    this_df['player'] = player

    this_df.to_csv('../data/player_ewm/'+player.replace(' ', '_')+'.csv')
    count += 1

## Player-by-Player Aggregation

In [None]:
for (dirpath, dirnames, filenames) in os.walk('../data/player_ewm/'):
    break

In [None]:
len(filenames)

In [None]:
f = filenames # [:3] for testing

with open('../data/player_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/player_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [None]:
player_ewm_df = pd.read_csv('../data/player_ewm_df.csv', skiprows=1, header=None)

In [None]:
stats = pd.Series(stats)

ewm_cols = []
for i in ['01', '03', '05', '07', '09', '10']:
    for j in stats.drop(18):
        ewm_cols.append('ewm'+j+'_'+i)

In [None]:
ewm_cols.insert(0, 'date_game')
ewm_cols.append('player')
len(ewm_cols)

In [None]:
player_ewm_df.columns = ewm_cols

In [None]:
player_ewm_df.head()

In [None]:
player_ewm_df['date_game'] = pd.to_datetime(player_ewm_df['date_game'])

In [None]:
player_ewm_df.set_index(['player', 'date_game'], drop=False, inplace=True)

In [None]:
joined_df = df.join(player_ewm_df, how='left', on=['player', 'date_game'], rsuffix='_right')

## Player-by-Player Modeling

In [None]:
ewm_cols.append('pts')

In [None]:
joined_df.isnull().sum()

In [None]:
joined_df.shape

In [None]:
joined_df.fillna(0, inplace=True)

In [None]:
trial_df = joined_df[ewm_cols].drop(['date_game', 'player'], axis=1)

In [None]:
trial_df.shape

In [None]:
X = trial_df.drop('pts', axis=1)
y = trial_df['pts']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

### LinReg

In [None]:
player_linreg = LinearRegression()
player_linreg.fit(X_train, y_train)

In [None]:
score_lr = cross_val_score(player_linreg, X_train, y_train, cv=5)
score_lr

In [None]:
score_lr.mean()

#### Need to GroupBy to get Team Score Predictions

In [None]:
temp_lr = X

In [None]:
lr_preds = player_linreg.predict(temp_lr)

In [None]:
lr_preds.shape

In [None]:
joined_df['player_preds'] = lr_preds

In [None]:
team_actual = joined_df.groupby(['school_id', 'date_game']).sum()['pts']
team_preds = joined_df.groupby(['school_id', 'date_game']).sum()['player_preds']

In [None]:
print(team_actual.shape)
team_actual.isnull().sum()

In [None]:
print(team_preds.shape)
team_preds.isnull().sum()

In [None]:
r2_score(team_actual, team_preds)

In [None]:
plt.scatter(team_actual, team_preds, alpha=0.2)
plt.xlim(0,150)
plt.ylim(0,150)

### RFRegressor

In [None]:
player_rf = RandomForestRegressor(max_depth=2)
player_rf.fit(X_train, y_train)

In [None]:
score_rf = cross_val_score(player_rf, X_train, y_train, cv=5)
score_rf

In [None]:
score_rf.mean()

#### Need to GroupBy to get Team Score Predictions

# Team Evaluation

In [36]:
team_df = df.groupby(['school_id', 'opp_id', 'date_game']).sum()

Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [37]:
# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
team_df.drop(['gs', 'mp', 'year', 'month', 'season'], axis=1, inplace=True)

In [38]:
team_df.reset_index(inplace=True)

In [39]:
team_df.head()

Unnamed: 0,school_id,opp_id,date_game,fg,fga,fg2,fg2a,fg3,fg3a,ft,...,orb,drb,ast,stl,blk,tov,pf,pts,game_score,W
0,Abilene Christian,Air Force,2017-12-06,23,54,14,29,9,25,7,...,7,17,11,8,4,12,16,62,40.3,12
1,Abilene Christian,Arkansas State,2017-11-13,25,45,22,32,3,13,16,...,5,14,11,4,2,13,19,69,46.1,0
2,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,24,45,14,25,10,20,11,...,5,22,15,7,2,18,21,69,47.7,11
3,Abilene Christian,Boise State,2014-12-20,11,55,5,36,6,19,5,...,11,10,4,4,0,9,12,33,2.2,0
4,Abilene Christian,Bowling Green State,2017-11-26,34,64,28,48,6,16,14,...,10,26,18,2,6,8,18,88,73.6,12


In [40]:
team_df[(team_df['school_id']=='Nevada-Las Vegas') & (team_df['date_game'] == '2018-03-07')]

Unnamed: 0,school_id,opp_id,date_game,fg,fga,fg2,fg2a,fg3,fg3a,ft,...,orb,drb,ast,stl,blk,tov,pf,pts,game_score,W


In [41]:
team_df[(team_df['school_id']=='Air Force') & (team_df['date_game'] == '2017-12-06')].index[0]

128

In [42]:
# STOP RUNNING THIS FUCKING CELL YOU FUCKING MORON, THIS SHIT TAKES FOREVER
index_dict = {}
for i in team_df.index:
    row = team_df.loc[i]
    try:
        opp_index = team_df[(team_df['school_id']==row['opp_id']) & (team_df['date_game'] == row['date_game'])].index[0]
    except:
        opp_index = np.nan
    index_dict[i] = opp_index

In [43]:
index_dict[0]

128

In [45]:
len(index_dict)

87171

In [50]:
def_stats = ['fg', 'fga', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 
             'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

team_opp_df = pd.DataFrame(columns=def_stats)

count = 0
for key in range(0,87171):
    if count % 500 == 0:
        print('parsed...', count)
    try:
        team_opp_df = team_opp_df.append(team_df.loc[index_dict[key]][def_stats])
    except:
        team_opp_df = team_opp_df.append({'fg':np.nan, 'fga':np.nan, 'fg2':np.nan, 'fg2a':np.nan, 
                            'fg3':np.nan, 'fg3a':np.nan, 'ft':np.nan, 'fta':np.nan, 
                            'orb':np.nan, 'drb':np.nan, 'ast':np.nan, 'stl':np.nan, 
                            'blk':np.nan, 'tov':np.nan, 'pf':np.nan, 'pts':np.nan, 
                            'game_score':np.nan, 'W':np.nan}, ignore_index=True)
    count += 1

parsed... 0
parsed... 500
parsed... 1000
parsed... 1500
parsed... 2000
parsed... 2500
parsed... 3000
parsed... 3500
parsed... 4000
parsed... 4500
parsed... 5000
parsed... 5500
parsed... 6000
parsed... 6500
parsed... 7000
parsed... 7500
parsed... 8000
parsed... 8500
parsed... 9000
parsed... 9500
parsed... 10000
parsed... 10500
parsed... 11000
parsed... 11500
parsed... 12000
parsed... 12500
parsed... 13000
parsed... 13500
parsed... 14000
parsed... 14500
parsed... 15000
parsed... 15500
parsed... 16000
parsed... 16500
parsed... 17000
parsed... 17500
parsed... 18000
parsed... 18500
parsed... 19000
parsed... 19500
parsed... 20000
parsed... 20500
parsed... 21000
parsed... 21500
parsed... 22000
parsed... 22500
parsed... 23000
parsed... 23500
parsed... 24000
parsed... 24500
parsed... 25000
parsed... 25500
parsed... 26000
parsed... 26500
parsed... 27000
parsed... 27500
parsed... 28000
parsed... 28500
parsed... 29000
parsed... 29500
parsed... 30000
parsed... 30500
parsed... 31000
parsed... 31500


In [51]:
team_opp_df.isnull().sum()

fg            50
fga           50
fg2           50
fg2a          50
fg3           50
fg3a          50
ft            50
fta           50
orb           50
drb           50
ast           50
stl           50
blk           50
tov           50
pf            50
pts           50
game_score    50
W             50
dtype: int64

In [58]:
# Merge them before dropping NA's
print(team_opp_df.shape)
print(team_df.shape)

(87171, 18)
(87171, 21)


In [59]:
team_opp_df.head()

Unnamed: 0,fg,fga,fg2,fg2a,fg3,fg3a,ft,fta,orb,drb,ast,stl,blk,tov,pf,pts,game_score,W
0,25,51,22,33,3,18,5,8,5,28,15,5,2,20,14,58,34.3,0
1,28,48,21,35,7,13,20,26,9,15,7,6,1,9,12,83,66.8,7
2,20,53,15,34,5,19,16,20,11,18,5,3,2,14,19,61,29.7,0
3,31,52,21,32,10,20,5,6,8,32,17,2,6,12,7,77,71.1,13
4,30,72,22,48,8,24,15,22,17,20,9,4,3,8,18,83,56.9,0


In [60]:
opp_df_cols = ['def_fg', 'def_fga', 'def_fg2', 'def_fg2a', 'def_fg3', 'def_fg3a', 'def_ft', 
               'def_fta', 'def_orb', 'def_drb', 'def_ast', 'def_stl', 'def_blk', 'def_tov', 
               'def_pf', 'def_pts', 'def_game_score', 'def_W']

In [61]:
team_opp_df.columns = opp_df_cols

In [65]:
team_df = pd.concat([team_df, team_opp_df], axis=1)

In [67]:
team_df.isnull().sum()

school_id          0
opp_id             0
date_game          0
fg                 0
fga                0
fg2                0
fg2a               0
fg3                0
fg3a               0
ft                 0
fta                0
orb                0
drb                0
ast                0
stl                0
blk                0
tov                0
pf                 0
pts                0
game_score         0
W                  0
def_fg            49
def_fga           49
def_fg2           49
def_fg2a          49
def_fg3           49
def_fg3a          49
def_ft            49
def_fta           49
def_orb           49
def_drb           49
def_ast           49
def_stl           49
def_blk           49
def_tov           49
def_pf            49
def_pts           49
def_game_score    49
def_W             49
dtype: int64

In [68]:
team_df.dropna(inplace=True)

In [72]:
teams = team_df['school_id'].unique()

In [136]:
team_df.set_index(['school_id', 'date_game'], drop=False, inplace=True)

In [85]:
# Change to 0/1 Flags
team_df['W'] = team_df['W'].apply(lambda x: 1 if x > 0 else 0)
team_df['def_W'] = team_df['def_W'].apply(lambda x: 1 if x > 0 else 0)

In [88]:
team_df.loc['Abilene Christian'].sort_values('date_game')

Unnamed: 0_level_0,school_id,opp_id,date_game,fg,fga,fg2,fg2a,fg3,fg3a,ft,...,def_orb,def_drb,def_ast,def_stl,def_blk,def_tov,def_pf,def_pts,def_game_score,def_W
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Abilene Christian,Abilene Christian,Duquesne,2013-11-09,22,63,15,38,7,25,24,...,10,30,19,4,2,10,22,94,77.3,1
Abilene Christian,Abilene Christian,St. Bonaventure,2013-11-11,14,40,10,24,4,16,15,...,18,22,15,7,7,9,23,75,65.6,1
Abilene Christian,Abilene Christian,Maryland,2013-11-13,17,54,7,33,10,21,0,...,12,26,12,9,10,17,8,67,58.3,1
Abilene Christian,Abilene Christian,Iowa,2013-11-17,14,54,11,31,3,23,10,...,5,30,25,11,5,5,10,103,109.4,1
Abilene Christian,Abilene Christian,Texas Christian,2013-11-19,19,50,10,30,9,20,17,...,8,22,11,7,3,6,20,71,56.0,1
Abilene Christian,Abilene Christian,Xavier,2013-11-25,23,53,14,29,9,24,10,...,16,29,18,8,7,8,16,93,91.3,1
Abilene Christian,Abilene Christian,Towson,2013-11-30,18,55,15,36,3,19,8,...,4,25,12,1,9,14,17,74,57.3,1
Abilene Christian,Abilene Christian,New Orleans,2014-01-09,28,63,21,40,7,23,18,...,7,31,13,2,3,13,22,87,61.3,1
Abilene Christian,Abilene Christian,Southeastern Louisiana,2014-01-11,23,69,16,50,7,19,24,...,4,35,17,2,5,12,23,85,57.4,1
Abilene Christian,Abilene Christian,Oral Roberts,2014-01-16,19,47,14,37,5,10,16,...,11,19,10,8,6,11,15,82,69.0,1


In [94]:
team_rolling_stats = ['date_game', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 
                      'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W', 'def_fg2', 
                      'def_fg2a', 'def_fg3', 'def_fg3a', 'def_ft', 'def_fta', 'def_orb', 
                      'def_drb', 'def_ast', 'def_stl', 'def_blk', 'def_tov', 'def_pf', 
                      'def_pts', 'def_game_score', 'def_W']

In [116]:
count = 0
stamp = datetime.now()
for team in teams:
    if count % 500 == 0:
        diff_time = datetime.now() - stamp
        stamp = datetime.now()
        print('parsing...', count)
        print('last parsing block took: ', (diff_time.seconds / 60), ' minutes')
    one_team_df = team_df.loc[team][team_rolling_stats].sort_values('date_game')
    
    ewm_01 = one_team_df.drop('date_game', axis=1).ewm(alpha = 0.1, min_periods=1).mean().shift()
    ewm_03 = one_team_df.drop('date_game', axis=1).ewm(alpha = 0.3, min_periods=1).mean().shift()
    ewm_05 = one_team_df.drop('date_game', axis=1).ewm(alpha = 0.5, min_periods=1).mean().shift()
    ewm_07 = one_team_df.drop('date_game', axis=1).ewm(alpha = 0.7, min_periods=1).mean().shift()
    ewm_09 = one_team_df.drop('date_game', axis=1).ewm(alpha = 0.9, min_periods=1).mean().shift()
    ewm_10 = one_team_df.drop('date_game', axis=1).ewm(alpha = 1.0, min_periods=1).mean().shift()
    
    this_df = pd.concat([ewm_01, ewm_03, ewm_05, ewm_07, ewm_09, ewm_10], axis=1)

    this_df['date_game'] = one_team_df['date_game']

    this_df.to_csv('../data/team_ewm/'+team.replace(' ', '_')+'.csv')
    count += 1

parsing... 0
last parsing block took:  0.0  minutes


count = 0
stamp = datetime.now()
for player in players:
    if count % 500 == 0:
        diff_time = datetime.now() - stamp
        stamp = datetime.now()
        print('parsing...', count)
        print('last parsing block took: ', (diff_time.seconds / 60), ' minutes')
    player_df = df.loc[player][stats].sort_values('date_game')

    mean_3day = player_df.drop('date_game', axis=1).rolling(window=3, center=False, min_periods=1).mean().shift()
    mean_10day = player_df.drop('date_game', axis=1).rolling(window=10, center=False, min_periods=1).mean().shift()
    mean_30day = player_df.drop('date_game', axis=1).rolling(window=30, center=False, min_periods=1).mean().shift()
    mean_60day = player_df.drop('date_game', axis=1).rolling(window=60, center=False, min_periods=1).mean().shift()
    mean_90day = player_df.drop('date_game', axis=1).rolling(window=90, center=False, min_periods=1).mean().shift()
    mean_120day = player_df.drop('date_game', axis=1).rolling(window=120, center=False, min_periods=1).mean().shift()

    #median_3day = player_df.drop('date_game', axis=1).rolling(window=3, center=False, min_periods=1).median()
    #median_10day = player_df.drop('date_game', axis=1).rolling(window=10, center=False, min_periods=1).median()
    #median_30day = player_df.drop('date_game', axis=1).rolling(window=30, center=False, min_periods=1).median()
    #median_60day = player_df.drop('date_game', axis=1).rolling(window=60, center=False, min_periods=1).median()
    #median_90day = player_df.drop('date_game', axis=1).rolling(window=90, center=False, min_periods=1).median()
    #median_120day = player_df.drop('date_game', axis=1).rolling(window=120, center=False, min_periods=1).median()
 
    this_df = pd.concat([mean_3day, mean_10day, mean_30day, mean_60day, mean_90day, mean_120day,
                        median_3day, median_10day, median_30day, median_60day, median_90day,
                        median_120day], axis=1)
    
    this_df['player'] = player

    this_df.to_csv('../data/player_rolling/'+player.replace(' ', '_')+'.csv')
    count += 1

In [118]:
for (dirpath, dirnames, filenames) in os.walk('../data/team_ewm/'):
    break

In [119]:
len(filenames)

352

In [120]:
f = filenames # [:3] for testing

with open('../data/team_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/team_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [121]:
team_ewm_df = pd.read_csv('../data/team_ewm_df.csv', skiprows=1, header=None)

In [122]:
team_ewm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,184,185,186,187,188,189,190,191,192,193
0,High Point,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,27.0,19.0,14.0,8.0,11.0,19.0,79.0,79.6,1.0,2010-11-23
1,High Point,13.263158,39.894737,4.842105,14.105263,19.105263,31.105263,10.0,20.052632,7.894737,...,26.0,16.0,3.0,1.0,14.0,24.0,71.0,45.2,1.0,2010-11-28
2,High Point,13.166052,38.826568,5.269373,13.328413,19.435424,28.852399,8.154982,22.247232,9.409594,...,27.0,7.0,4.0,7.0,10.0,24.0,70.0,45.8,1.0,2010-12-02
3,High Point,14.86246,38.876999,4.609479,12.36057,19.308811,28.895318,7.528351,22.175342,9.290491,...,26.0,3.0,7.0,1.0,19.0,23.0,64.0,29.0,0.0,2010-12-04
4,High Point,18.070572,38.662841,3.728065,13.493492,18.745012,28.188298,8.131914,22.132524,10.928915,...,16.0,11.0,10.0,4.0,22.0,22.0,73.0,43.8,0.0,2010-12-08


In [124]:
team_rolling_stats = pd.Series(team_rolling_stats)

In [125]:
team_rolling_stats

0          date_game
1                fg2
2               fg2a
3                fg3
4               fg3a
5                 ft
6                fta
7                orb
8                drb
9                ast
10               stl
11               blk
12               tov
13                pf
14               pts
15        game_score
16                 W
17           def_fg2
18          def_fg2a
19           def_fg3
20          def_fg3a
21            def_ft
22           def_fta
23           def_orb
24           def_drb
25           def_ast
26           def_stl
27           def_blk
28           def_tov
29            def_pf
30           def_pts
31    def_game_score
32             def_W
dtype: object

In [126]:
team_ewm_cols = []
for i in ['01', '03', '05', '07', '09', '10']:
    for j in team_rolling_stats.drop(0):
        team_ewm_cols.append('ewm'+j+'_'+i)

In [128]:
len(team_ewm_cols)

192

In [129]:
team_ewm_cols.insert(0, 'school_id')
team_ewm_cols.append('date_game')
len(team_ewm_cols)

194

In [130]:
team_ewm_df.columns = team_ewm_cols

In [133]:
team_ewm_df['date_game'] = pd.to_datetime(team_ewm_df['date_game'])
team_ewm_df.set_index(['school_id', 'date_game'], drop=False, inplace=True)

In [142]:
team_joined_df = team_df.join(team_ewm_df, how='left', on=['school_id', 'date_game'], rsuffix='_right')

In [147]:
team_joined_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,opp_id,date_game,fg,fga,fg2,fg2a,fg3,fg3a,ft,...,ewmdef_drb_10,ewmdef_ast_10,ewmdef_stl_10,ewmdef_blk_10,ewmdef_tov_10,ewmdef_pf_10,ewmdef_pts_10,ewmdef_game_score_10,ewmdef_W_10,date_game_right
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Abilene Christian,2017-12-06,Abilene Christian,Air Force,2017-12-06,23,54,14,29,9,25,7,...,20.0,9.0,4.0,3.0,8.0,18.0,83.0,56.9,0.0,2017-12-06
Abilene Christian,2017-11-13,Abilene Christian,Arkansas State,2017-11-13,25,45,22,32,3,13,16,...,19.0,18.0,4.0,2.0,9.0,8.0,81.0,69.2,1.0,2017-11-13
Abilene Christian,2014-12-23,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,24,45,14,25,10,20,11,...,28.0,11.0,8.0,6.0,22.0,21.0,59.0,35.5,0.0,2014-12-23
Abilene Christian,2014-12-20,Abilene Christian,Boise State,2014-12-20,11,55,5,36,6,19,5,...,35.0,17.0,5.0,2.0,14.0,13.0,83.0,75.8,1.0,2014-12-20
Abilene Christian,2017-11-26,Abilene Christian,Bowling Green State,2017-11-26,34,64,28,48,6,16,14,...,21.0,17.0,7.0,2.0,12.0,12.0,88.0,75.4,1.0,2017-11-26


In [148]:
team_joined_df.shape

(87136, 233)

## Team Modeling

In [150]:
# Will drop the "first" observation of each team
team_joined_df.dropna(inplace=True)

In [152]:
list(team_joined_df.columns)

['school_id',
 'opp_id',
 'date_game',
 'fg',
 'fga',
 'fg2',
 'fg2a',
 'fg3',
 'fg3a',
 'ft',
 'fta',
 'orb',
 'drb',
 'ast',
 'stl',
 'blk',
 'tov',
 'pf',
 'pts',
 'game_score',
 'W',
 'def_fg',
 'def_fga',
 'def_fg2',
 'def_fg2a',
 'def_fg3',
 'def_fg3a',
 'def_ft',
 'def_fta',
 'def_orb',
 'def_drb',
 'def_ast',
 'def_stl',
 'def_blk',
 'def_tov',
 'def_pf',
 'def_pts',
 'def_game_score',
 'def_W',
 'school_id_right',
 'ewmfg2_01',
 'ewmfg2a_01',
 'ewmfg3_01',
 'ewmfg3a_01',
 'ewmft_01',
 'ewmfta_01',
 'ewmorb_01',
 'ewmdrb_01',
 'ewmast_01',
 'ewmstl_01',
 'ewmblk_01',
 'ewmtov_01',
 'ewmpf_01',
 'ewmpts_01',
 'ewmgame_score_01',
 'ewmW_01',
 'ewmdef_fg2_01',
 'ewmdef_fg2a_01',
 'ewmdef_fg3_01',
 'ewmdef_fg3a_01',
 'ewmdef_ft_01',
 'ewmdef_fta_01',
 'ewmdef_orb_01',
 'ewmdef_drb_01',
 'ewmdef_ast_01',
 'ewmdef_stl_01',
 'ewmdef_blk_01',
 'ewmdef_tov_01',
 'ewmdef_pf_01',
 'ewmdef_pts_01',
 'ewmdef_game_score_01',
 'ewmdef_W_01',
 'ewmfg2_03',
 'ewmfg2a_03',
 'ewmfg3_03',
 'ewmfg3

In [178]:
team_ewm_cols.append('pts')

In [179]:
team_modeling_feats = pd.Series(team_ewm_cols)

In [181]:
team_modeling_feats.drop([0, 193], inplace=True)

In [202]:
trial_df = team_joined_df[test_list]

In [203]:
X = trial_df.drop('pts', axis=1)
y = trial_df['pts']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(60748, 13)
(60748,)
(26036, 13)
(26036,)


In [201]:
team_joined_df.loc['Abilene Christian'][test_list].sort_index()

Unnamed: 0_level_0,pts,def_pts,ewmpts_01,ewmdef_pts_01,ewmpts_03,ewmdef_pts_03,ewmpts_05,ewmdef_pts_05,ewmpts_07,ewmdef_pts_07,ewmpts_09,ewmdef_pts_09,ewmpts_10,ewmdef_pts_10
date_game,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2013-11-11,47,75,75.000000,94.000000,75.000000,94.000000,75.000000,94.000000,75.000000,94.000000,75.000000,94.000000,75.0,94.0
2013-11-13,44,67,60.263158,84.000000,58.529412,82.823529,56.333333,81.333333,53.461538,79.384615,49.545455,76.727273,47.0,75.0
2013-11-17,41,103,54.261993,77.726937,51.894977,75.598174,49.285714,73.142857,46.654676,70.474820,44.549550,67.963964,44.0,67.0
2013-11-19,64,71,50.405641,85.075894,47.593762,86.416107,44.866667,89.066667,42.664079,93.428370,41.354635,99.499550,41.0,103.0
2013-11-25,65,93,53.725306,81.638641,53.509971,80.856947,54.741935,79.741935,57.635605,77.690267,61.735667,73.849698,64.0,71.0
2013-11-30,47,74,56.131554,84.063386,57.416589,84.985594,59.952381,86.476190,62.794442,88.414898,64.673570,91.084987,65.0,93.0
2014-01-09,81,87,54.381219,82.134437,54.011161,81.394145,53.425197,80.188976,51.735914,78.322262,48.767355,75.708497,47.0,74.0
2014-01-11,77,85,59.055011,82.988745,62.603122,83.178782,67.266667,83.607843,72.222118,84.397077,77.776736,85.870850,81.0,87.0
2014-01-16,59,82,61.984425,83.317070,67.103804,83.748122,72.142857,84.305284,75.566701,84.819131,77.077674,85.087085,77.0,85.0
2014-01-18,73,72,61.526215,83.114855,64.601993,83.208441,65.565005,83.151515,63.969942,82.845728,60.807767,82.308708,59.0,82.0


In [200]:
test_list = []
for i in team_joined_df.columns:
    if 'pts' in i:
        test_list.append(i)

### Team LinReg

In [204]:
team_linreg = LinearRegression()
team_linreg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [205]:
score_lr = cross_val_score(team_linreg, X_train, y_train, cv=5)
score_lr

array([0.2323768 , 0.22094993, 0.21195707, 0.20990442, 0.20817141])

## Engineer Defender Features

In [None]:
school_list = school_df['School']

In [None]:
game_dates = df['date_game'].unique()

In [None]:
# TEST!!!
game_dates = '2010-11-08'

In [None]:
for m in game_dates:
    for n in school_list:    
        date_mask = m
        school_mask = n

        df_myteam = df[(df['date_game']==date_mask) & (df['school_id']==school_mask)]
        df_myteam = df_myteam.sort_values(['gs', 'mp'], ascending=False)

        opp_school = list(df_myteam['opp_id'])[0]

        df_opp = df[(df['date_game']==date_mask) & (df['school_id']==opp_school)]
        df_opp = df_opp.sort_values(['gs', 'mp'], ascending=False)

        largest = min([len(df_opp), len(df_myteam)])

        df_myteam = df_myteam[0:largest]
        df_opp = df_opp[0:largest]

        my_team_index = df_myteam.index
        matchup_index = []

        opp_pos_list = df_opp['pos']

        for i in df_myteam['pos']:
            if i == 'PG':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'G':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break

            if i == 'SG':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'G':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break  

            if i == 'SF':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'F':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break

            if i == 'PF':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'C':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'F':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break

            if i == 'C':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'C':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break

            if i == 'G':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SG':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'G':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break

            if i == 'F':
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'PF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'SF':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
                for j in df_opp['pos'].index:
                    if df_opp['pos'][j] == 'F':
                        matchup_index.append(j)
                        df_opp['pos'].drop(j, inplace=True)
                        break
            try:
                matchup_index.append(df_opp['pos'].index[0])
                df_opp['pos'].drop(df_opp['pos'].index[0], inplace=True)
            except:
                pass
        df.merge()
        # Do a thing that appends select cols to the right


# Modeling

In [None]:
# Specific to rolling mean and rolling median
#all_rolling.append('pts')

In [None]:
ewm_cols.append('pts')

In [None]:
test_df = joined_df.dropna()[ewm_cols].drop(['date_game', 'player'], axis=1)

In [None]:
test_df.head()

In [None]:
# Come back to set X to drop just pts and 'Unnamed:0'

#X = test_df.drop(['pts'], axis=1)
X = test_df.drop('pts', axis=1)
y = test_df['pts']

In [None]:
X.columns

In [None]:
#from sklearn.preprocessing import StandardScaler

In [None]:
#ss = StandardScaler()
#ss.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
linreg = LinearRegression(normalize=True)
linreg.fit(X_train, y_train)

In [None]:
cross_val_score(linreg, X_train, y_train)

In [None]:
preds = linreg.predict(X)

In [None]:
fig, ax = plt.subplots(figsize=(10,6))

plt.scatter(preds, y_test)

In [None]:
linreg.score(X_test, y_test)

In [None]:
y_test.max()

In [None]:
preds.max()

In [None]:
joined_df.head()

In [None]:
orig_df = joined_df.dropna()

In [None]:
orig_df['preds'] = preds

In [None]:
temp = orig_df[['school_id', 'date_game', 'pts', 'preds']].groupby(['school_id', 'date_game']).sum()

In [None]:
fig, ax = plt.subplots(figsize=(14,6))
plt.scatter(temp['pts'], temp['preds'])

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
r2_score(y, preds)

In [None]:
temp['pts'].mean()

In [None]:
temp['mean'] = temp['pts'].mean()

In [None]:
len(temp['pts'])

In [None]:
mean_score = np.full(86849, temp['pts'].mean())

In [None]:
real_scores = temp['pts'].values

In [None]:
r2_score(real_scores, mean_score)

In [None]:
mean_squared_error(real_scores, mean_score)

In [None]:
mean_squared_error(real_scores, temp['preds'])

In [None]:
r2_score(real_scores, temp['preds'])

In [None]:
plt.hist(real_scores, bins=50)

In [None]:
np.std(real_scores)

In [None]:
print(len(temp['pts'].values))
print(len(mean_score))

In [None]:
mean_sc

In [None]:
joined_df.head()

In [None]:
test_df.groupby(['school_id', 'date_game']).sum()

# Exploratory

In [None]:
test = joined_df.loc['Grayson Allen'][['mp','pts', 'ewmpts_05', 'ewmgame_score_05', 'date_game']].sort_values('date_game')
test

In [None]:
fig, ax = plt.subplots(figsize=(14,6))
plt.plot(test['pts'])
#plt.plot(test['mp'])
plt.plot(test['ewmgame_score_05'])

In [None]:
ewm_test = player_df.drop('date_game', axis=1).ewm(alpha = 0.5).mean().head()
ewm_test

In [None]:
ewm_test.shift()

In [None]:
player_df.head()

In [None]:
test = df.groupby(['school_id', 'season']).sum().sort_values('pts', ascending=False)
test

In [None]:
test.corr()