In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.feature_extraction.text import CountVectorizer

import os

import requests
from bs4 import BeautifulSoup
import time

from collections import Counter

import matplotlib.pyplot as plt

%matplotlib inline

## Read in Player_By_Game Data from Scraper

In [None]:
# Read in all the partial data pieces

df1 = pd.read_csv('../data/player_boxscores/offset_0-200000.csv')
df2 = pd.read_csv('../data/player_boxscores/offset_200000-300000.csv')
df3 = pd.read_csv('../data/player_boxscores/offset_300000-475000.csv')
df4 = pd.read_csv('../data/player_boxscores/offset_475000-550000.csv')
df5 = pd.read_csv('../data/player_boxscores/offset_550000-600000.csv')
df6 = pd.read_csv('../data/player_boxscores/offset_600000-725000.csv')
df7 = pd.read_csv('../data/player_boxscores/offset_725000-909000.csv')

In [None]:
# Aggregate all the segmented data into one dataframe

df = pd.concat([df1, df2, df3, df4, df5, df6, df7])

In [None]:
# Check starting shape (rows, columns) of the combined frame

df.shape

In [None]:
# Preview the first five rows
df.head()

## EDA And Cleaning of Player_By_Game Data

In [None]:
# List the available columns
df.columns

In [None]:
# Count missing values per column
df.isnull().sum()

In [None]:
# Drop these columns because they are derived explicity from existing features
df.drop(['fg_pct', 'fg2_pct', 'fg3_pct', 'ft_pct', 'trb'], axis=1, inplace=True)

In [None]:
# Drop erroneous column
df.drop('Unnamed: 0', axis=1, inplace=True)

In [None]:
# If we don't know whether they started or not, chances are they did not. We'll set np.nan 
# of 'gs' to 0.
df['gs'] = df['gs'].fillna(value=0)

In [None]:
# Count fully duplicated rows (probably the result of overlapping scrape offsets)
df.duplicated().sum()

In [None]:
# Remove the duplicated rows in place, keeping the first occurrence of each
df.drop_duplicates(inplace=True)

In [None]:
# Re-check missing values per column after de-duplication
df.isnull().sum()

In [None]:
# We will be bringing in a lot of opponent data as features, so rows without an opp_id
# are useless to us.
# NOTE(review): dropna() without `subset` removes rows that have a null in ANY column,
# not only opp_id. If only opp_id should be required, use dropna(subset=['opp_id']).

df.dropna(inplace=True)

In [None]:
# Confirm that no nulls remain
df.isnull().sum()

In [None]:
# Check resulting shape. We still have 96% of our original rows. Pretty good
df.shape

In [None]:
# Convert date_game to datetime type

df['date_game'] = pd.to_datetime(df['date_game'])

In [None]:
# Create a year and month feature

df['year'] = df['date_game'].map(lambda x: x.year)
df['month'] = df['date_game'].map(lambda x: x.month)

In [None]:
# Get an idea of timeframe 

print(df['date_game'].min())
print(df['date_game'].max())

In [None]:
# Make a 'season' feature, where the year it ended is defined as the season

season_list = []
for i in df['date_game']:
    if i.month == 11 or i.month == 12:
        season_list.append(i.year + 1)
    else:
        season_list.append(i.year)

In [None]:
df['season'] = season_list

In [None]:
# Codify Win/Loss with 1/0
df['W'] = df['game_result'].map(lambda x: 1 if x == 'W' else 0)

In [None]:
# Dummify position
#df = pd.concat([df, pd.get_dummies(df['pos'])], axis=1)

In [None]:
# Drop game_result: We have one-hot encoded
df.drop('game_result', axis=1, inplace=True)

In [None]:
# Final null check after feature engineering
df.isnull().sum()

In [None]:
# Confirm column dtypes (date_game should now be datetime)
df.dtypes

In [None]:
# Persist the cleaned frame. to_csv writes the row index by default, so the saved file
# gains an unnamed leading column when read back.
df.to_csv('../data/player_boxscores_df.csv')

## Bring in School Data

In [None]:
# Load the school reference table
school_df = pd.read_csv('../data/schools.csv')

In [None]:
# Preview the school table
school_df.head()

# Player-by-Player Evaluation

In [None]:
df.set_index(['player', 'date_game'], drop=False, inplace=True)

In [None]:
stats = ['gs', 'mp', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta','orb', 'drb', 'ast', 
         'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

In [None]:
# Specifically needed for rolling mean and rolling median

lookbacks = ['_3day_', '_10day_', '_30day_', '_60day_', '_90day_', '_120day_']

metrics = ['mean', 'median']

In [None]:
# Specifically needed for rolling mean and rolling median

all_rolling = []

for i in metrics:
    for j in lookbacks:
        for k in stats:
            all_rolling.append(i+j+k)

In [None]:
# Note: There will be some noise introduced by players with same name (no unique id associated)
players = df['player'].unique()
len(players)

In [None]:
stats.append('date_game')

In [None]:
# For every player, compute exponentially weighted means of each stat at six smoothing
# factors and write the result to one CSV per player.
count = 0
stamp = datetime.now()
for player in players:
    if count % 500 == 0:
        # progress report every 500 players
        diff_time = datetime.now() - stamp
        stamp = datetime.now()
        print('parsing...', count)
        print('last parsing block took: ', (diff_time.seconds / 60), ' minutes')
    player_df = df.loc[player][stats].sort_values('date_game')
    stat_history = player_df.drop('date_game', axis=1)

    # shift() moves every average down one game, so a game's features are built
    # only from the games that precede it
    ewm_frames = [
        stat_history.ewm(alpha=alpha, min_periods=1).mean().shift()
        for alpha in (0.1, 0.3, 0.5, 0.7, 0.9, 1.0)
    ]

    this_df = pd.concat(ewm_frames, axis=1)
    this_df['player'] = player

    this_df.to_csv('../data/player_ewm/'+player.replace(' ', '_')+'.csv')
    count += 1

## Player-by-Player Aggregation

In [None]:
for (dirpath, dirnames, filenames) in os.walk('../data/player_ewm/'):
    break

In [None]:
len(filenames)

In [None]:
f = filenames # [:3] for testing

with open('../data/player_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/player_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [None]:
# The combined file has no header row (each per-player header line was skipped while
# concatenating), so every line is data: read it all instead of skipping the first row.
player_ewm_df = pd.read_csv('../data/player_ewm_df.csv', header=None)

In [None]:
stats = pd.Series(stats)

ewm_cols = []
for i in ['01', '03', '05', '07', '09', '10']:
    for j in stats.drop(18):
        ewm_cols.append('ewm'+j+'_'+i)

In [None]:
ewm_cols.insert(0, 'date_game')
ewm_cols.append('player')
len(ewm_cols)

In [None]:
# Label the header-less frame; requires len(ewm_cols) to equal the number of columns read
player_ewm_df.columns = ewm_cols

In [None]:
player_ewm_df.head()

In [None]:
# Parse the dates so they can be matched against df's datetime date_game values
player_ewm_df['date_game'] = pd.to_datetime(player_ewm_df['date_game'])

In [None]:
# Same (player, date_game) index as df; drop=False keeps both as columns as well
player_ewm_df.set_index(['player', 'date_game'], drop=False, inplace=True)

In [None]:
# Left join: df's 'player'/'date_game' columns are matched against player_ewm_df's index.
# Column names present in both frames get '_right' appended on the right-hand side.
# NOTE(review): if a (player, date_game) pair occurs more than once on the right
# (e.g. two players sharing a name), the join will duplicate rows - confirm.
joined_df = df.join(player_ewm_df, how='left', on=['player', 'date_game'], rsuffix='_right')

## Player-by-Player Modeling

In [None]:
ewm_cols.append('pts')

In [None]:
joined_df.isnull().sum()

In [None]:
joined_df.shape

In [None]:
joined_df.fillna(0, inplace=True)

In [None]:
trial_df = joined_df[ewm_cols].drop(['date_game', 'player'], axis=1)

In [None]:
trial_df.shape

In [None]:
# Features are every smoothed stat; the target is the points scored in the game
X = trial_df.drop('pts', axis=1)
y = trial_df['pts']

# Fixed random_state so the split (and every score below) is reproducible across runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

### LinReg

In [None]:
# Ordinary least squares on the training split
player_linreg = LinearRegression()
player_linreg.fit(X_train, y_train)

In [None]:
# 5-fold cross-validation on the training split; each fold refits a clone of the
# estimator, and the scores are the estimator's default score (R^2 for regressors)
score_lr = cross_val_score(player_linreg, X_train, y_train, cv=5)
score_lr

In [None]:
# Average score across the five folds
score_lr.mean()

#### Need to GroupBy to get Team Score Predictions

In [None]:
# Predict for every row of X so that each game can be summed per team.
# Note: X includes the rows the model was fit on (X_train), so the team-level metrics
# computed from these predictions are not a held-out estimate.
temp_lr = X

In [None]:
lr_preds = player_linreg.predict(temp_lr)

In [None]:
lr_preds.shape

In [None]:
# Attach by position; relies on X having been built from joined_df without dropping rows
joined_df['player_preds'] = lr_preds

In [None]:
team_actual = joined_df.groupby(['school_id', 'date_game']).sum()['pts']
team_preds = joined_df.groupby(['school_id', 'date_game']).sum()['player_preds']

In [None]:
print(team_actual.shape)
team_actual.isnull().sum()

In [None]:
print(team_preds.shape)
team_preds.isnull().sum()

In [None]:
r2_score(team_actual, team_preds)

In [None]:
plt.scatter(team_actual, team_preds, alpha=0.2)
plt.xlim(0,150)
plt.ylim(0,150)

### RFRegressor

In [None]:
player_rf = RandomForestRegressor(max_depth=2)
player_rf.fit(X_train, y_train)

In [None]:
score_rf = cross_val_score(player_rf, X_train, y_train, cv=5)
score_rf

In [None]:
score_rf.mean()

#### Need to GroupBy to get Team Score Predictions

# Team Evaluation

In [None]:
team_df = df.groupby(['school_id', 'opp_id', 'date_game']).sum()

In [None]:
# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
team_df.drop(['gs', 'mp', 'year', 'month', 'season'], axis=1, inplace=True)

In [None]:
team_df.reset_index(inplace=True)

In [None]:
team_df.head()

In [None]:
team_df[(team_df['school_id']=='Nevada-Las Vegas') & (team_df['date_game'] == '2018-03-07')]

In [None]:
team_df[(team_df['school_id']=='Air Force') & (team_df['date_game'] == '2017-12-06')].index[0]

In [None]:
# Map every team-game row label to the row label holding the OPPONENT's totals for the
# same game (same date_game, school_id equal to this row's opp_id). Rows without a
# counterpart map to np.nan.
#
# A single pass builds a (school_id, date_game) -> first row label lookup. This replaces a
# boolean-mask scan of the whole frame for every row, which made the cell extremely slow,
# and removes the bare `except` that hid every kind of error.
first_row_for_game = {}
for row_label, school, game_date in zip(team_df.index, team_df['school_id'], team_df['date_game']):
    # setdefault keeps the first row seen for a key
    first_row_for_game.setdefault((school, game_date), row_label)

index_dict = {
    row_label: first_row_for_game.get((opponent, game_date), np.nan)
    for row_label, opponent, game_date in zip(team_df.index, team_df['opp_id'], team_df['date_game'])
}

In [None]:
def_stats = ['fg', 'fga', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 'ast', 
             'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

# Build the opponent-stat frame in one step, row-aligned with team_df: row i holds the
# def_stats of the opponent of team_df row i, or all-NaN when no opponent row is known.
# This replaces a row-by-row DataFrame.append loop over a hard-coded row count whose
# fallback branch discarded its result, so rows without an opponent were silently left out.
opp_row_labels = pd.Series(index_dict, dtype='float64').reindex(team_df.index)
has_opponent = opp_row_labels.notna()

team_opp_df = pd.DataFrame(np.nan, index=team_df.index, columns=def_stats)
team_opp_df.loc[has_opponent, :] = (
    team_df.loc[opp_row_labels[has_opponent].astype(int), def_stats].values
)

In [None]:
# Display the opponent-stat frame
team_opp_df

In [None]:
# 'def_'-prefixed counterparts of def_stats, in the same order
opp_df_cols = ['def_fg', 'def_fga', 'def_fg2', 'def_fg2a', 'def_fg3', 'def_fg3a', 'def_ft', 
               'def_fta', 'def_orb', 'def_drb', 'def_ast', 'def_stl', 'def_blk', 'def_tov', 
               'def_pf', 'def_pts', 'def_game_score', 'def_W']

In [None]:
# Opponent row label recorded for team_df row 0
index_dict[0]

In [None]:
# Displays the full mapping (one entry per team_df row)
index_dict

In [None]:
team_opp_df

In [None]:
df.head()

In [None]:
# NOTE(review): 'mp' was dropped from team_df in place earlier in this section, so this
# lookup raises KeyError unless team_df is rebuilt without that drop.
team_df['mp']

In [None]:
# For every player, compute rolling means of each stat over six window lengths and write
# the result to one CSV per player.
count = 0
stamp = datetime.now()
windows = (3, 10, 30, 60, 90, 120)
for player in players:
    if count % 500 == 0:
        # progress report every 500 players
        diff_time = datetime.now() - stamp
        stamp = datetime.now()
        print('parsing...', count)
        print('last parsing block took: ', (diff_time.seconds / 60), ' minutes')
    player_df = df.loc[player][stats].sort_values('date_game')
    stat_history = player_df.drop('date_game', axis=1)

    # shift() moves every average down one game, so a game's features are built only
    # from the games that precede it
    rolling_means = [
        stat_history.rolling(window=window, center=False, min_periods=1).mean().shift()
        for window in windows
    ]

    # Rolling medians are disabled. Their definitions were commented out while the names
    # were still passed to pd.concat, which raised NameError. If they are brought back,
    # they need the same .shift() as the means to avoid using the current game.
    this_df = pd.concat(rolling_means, axis=1)
    
    this_df['player'] = player

    this_df.to_csv('../data/player_rolling/'+player.replace(' ', '_')+'.csv')
    count += 1

In [None]:
for (dirpath, dirnames, filenames) in os.walk('../data/player_dfs/'):
    break

In [None]:
# Any idea why 16,567 files were made, but there are 16,566 players?
len(filenames)

In [None]:
f = filenames # [:3] for testing

with open('../data/rolling_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/player_dfs/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [None]:
# The combined file has no header row (each per-player header line was skipped while
# concatenating), so every line is data: read it all instead of skipping the first row.
rolling_df = pd.read_csv('../data/rolling_df.csv', header=None)

In [None]:
stats = pd.Series(stats)
stats

In [None]:
ewm_cols = []
for i in ['01', '03', '05', '07', '09', '10']:
    for j in stats.drop(18):
        ewm_cols.append('ewm'+j+'_'+i)

In [None]:
ewm_cols.insert(0, 'date_game')
ewm_cols.append('player')
len(ewm_cols)

In [None]:
# Show the generated column names
ewm_cols

In [None]:
# Label the header-less frame; requires len(ewm_cols) to equal the number of columns read.
# NOTE(review): these names carry the 'ewm' prefix although the frame is loaded from
# rolling_df.csv - confirm the naming is intended.
rolling_df.columns = ewm_cols

In [None]:
rolling_df.shape

In [None]:
rolling_df.head()

In [None]:
# Specific to rolling mean and rolling median
#rolling_df_cols = all_rolling
#rolling_df_cols.insert(0, 'date_game')
#rolling_df_cols.append('player')
#len(rolling_df_cols)

In [None]:
# Specific to rolling mean and rolling median
#rolling_df.columns = rolling_df_cols

In [None]:
rolling_df.set_index(['player', 'date_game'], drop=False, inplace=True)

In [None]:
rolling_df.head()

In [None]:
joined_df = df.join(rolling_df, how='left', on=['player', 'date_game'], rsuffix='_right')

In [None]:
joined_df.head()

## Engineer Defender Features

In [None]:
school_list = school_df['School']

In [None]:
game_dates = df['date_game'].unique()

In [None]:
# TEST!!!
game_dates = '2010-11-08'

In [None]:
# Opposing positions each position is matched against, most preferred first.
matchup_preferences = {
    'PG': ['PG', 'SG', 'G'],
    'SG': ['SG', 'PG', 'G'],
    'SF': ['SF', 'PF', 'F'],
    'PF': ['PF', 'SF', 'C', 'F'],
    'C': ['C', 'PF'],
    'G': ['PG', 'SG', 'G', 'SF'],
    'F': ['PF', 'SF', 'F'],
}


def match_positions(my_positions, opp_positions):
    """Pair each of my players with exactly one opposing player.

    For every position in ``my_positions`` (processed in order), the first still-unmatched
    opponent whose position is the most preferred one available is chosen. If none of the
    preferred positions is left, the first remaining opponent is used.

    Parameters
    ----------
    my_positions : pd.Series
        Positions of my team's players, highest-priority player first.
    opp_positions : pd.Series
        Positions of the opposing players, highest-priority player first. Its index
        labels identify the opposing players.

    Returns
    -------
    list
        One entry per element of ``my_positions``: the index label of the matched
        opponent, or None when no opponent is left.
    """
    remaining = list(opp_positions.items())  # (index label, position), still unmatched
    matchups = []
    for my_pos in my_positions:
        pick = None
        for wanted in matchup_preferences.get(my_pos, []):
            pick = next((k for k, (_, pos) in enumerate(remaining) if pos == wanted), None)
            if pick is not None:
                break
        if pick is None and remaining:
            pick = 0  # no preferred position left: take the first remaining opponent
        # pop() removes the opponent so nobody is matched twice
        matchups.append(remaining.pop(pick)[0] if pick is not None else None)
    return matchups


# Accept a single date string as well as a collection of dates
dates_to_match = [game_dates] if isinstance(game_dates, str) else game_dates

# (date, school) -> list of (my player index label, matched opponent index label)
matchups_by_game = {}

for m in dates_to_match:
    for n in school_list:
        df_myteam = df[(df['date_game'] == m) & (df['school_id'] == n)]
        if df_myteam.empty:
            continue  # this school did not play on this date
        # Starters first, then by minutes played
        df_myteam = df_myteam.sort_values(['gs', 'mp'], ascending=False)

        opp_school = df_myteam['opp_id'].iloc[0]

        df_opp = df[(df['date_game'] == m) & (df['school_id'] == opp_school)]
        df_opp = df_opp.sort_values(['gs', 'mp'], ascending=False)

        # Only pair as many players as the shorter of the two rosters has
        largest = min(len(df_opp), len(df_myteam))

        df_myteam = df_myteam[0:largest]
        df_opp = df_opp[0:largest]

        matchup_index = match_positions(df_myteam['pos'], df_opp['pos'])
        matchups_by_game[(m, n)] = list(zip(df_myteam.index, matchup_index))

# TODO: use matchups_by_game to append selected opponent columns to the right of each
# player's row (this replaces a placeholder df.merge() call that had no arguments).


# Modeling

In [None]:
# Specific to rolling mean and rolling median
#all_rolling.append('pts')

In [None]:
ewm_cols.append('pts')

In [None]:
test_df = joined_df.dropna()[ewm_cols].drop(['date_game', 'player'], axis=1)

In [None]:
test_df.head()

In [None]:
# Come back to set X to drop just pts and 'Unnamed:0'

#X = test_df.drop(['pts'], axis=1)
X = test_df.drop('pts', axis=1)
y = test_df['pts']

In [None]:
X.columns

In [None]:
#from sklearn.preprocessing import StandardScaler

In [None]:
#ss = StandardScaler()
#ss.fit_transform(X)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
linreg = LinearRegression(normalize=True)
linreg.fit(X_train, y_train)

In [None]:
cross_val_score(linreg, X_train, y_train)

In [None]:
preds = linreg.predict(X)

In [None]:
fig, ax = plt.subplots(figsize=(10,6))

plt.scatter(preds, y_test)

In [None]:
linreg.score(X_test, y_test)

In [None]:
y_test.max()

In [None]:
preds.max()

In [None]:
joined_df.head()

In [None]:
orig_df = joined_df.dropna()

In [None]:
orig_df['preds'] = preds

In [None]:
temp = orig_df[['school_id', 'date_game', 'pts', 'preds']].groupby(['school_id', 'date_game']).sum()

In [None]:
fig, ax = plt.subplots(figsize=(14,6))
plt.scatter(temp['pts'], temp['preds'])

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [None]:
r2_score(y, preds)

In [None]:
temp['pts'].mean()

In [None]:
temp['mean'] = temp['pts'].mean()

In [None]:
len(temp['pts'])

In [None]:
mean_score = np.full(86849, temp['pts'].mean())

In [None]:
real_scores = temp['pts'].values

In [None]:
r2_score(real_scores, mean_score)

In [None]:
mean_squared_error(real_scores, mean_score)

In [None]:
mean_squared_error(real_scores, temp['preds'])

In [None]:
r2_score(real_scores, temp['preds'])

In [None]:
plt.hist(real_scores, bins=50)

In [None]:
np.std(real_scores)

In [None]:
print(len(temp['pts'].values))
print(len(mean_score))

In [None]:
mean_sc

In [None]:
joined_df.head()

In [None]:
test_df.groupby(['school_id', 'date_game']).sum()

# Exploratory

In [None]:
# One player's games in date order: actual minutes/points next to two smoothed features
test = joined_df.loc['Grayson Allen'][['mp','pts', 'ewmpts_05', 'ewmgame_score_05', 'date_game']].sort_values('date_game')
test

In [None]:
# Actual points versus the smoothed game score for that player
fig, ax = plt.subplots(figsize=(14,6))
plt.plot(test['pts'])
#plt.plot(test['mp'])
plt.plot(test['ewmgame_score_05'])

In [None]:
# player_df is whatever the most recent per-player loop left behind (its last player),
# so this cell depends on that loop having been run.
ewm_test = player_df.drop('date_game', axis=1).ewm(alpha = 0.5).mean().head()
ewm_test

In [None]:
# Shows the effect of shift(): every row moves down one position, the first becomes NaN
ewm_test.shift()

In [None]:
player_df.head()

In [None]:
# Season totals per school, highest-scoring first (reuses the name `test` from above)
test = df.groupby(['school_id', 'season']).sum().sort_values('pts', ascending=False)
test

In [None]:
# Pairwise correlations between the summed columns
test.corr()