In [None]:
# Look into Home/Away data
# Refine offensive/defensive characterization
# Could explore boosted models
# Could explore neural nets

## Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

#from sklearn.linear_model import LinearRegression
#from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#from sklearn.metrics import r2_score
#from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import PCA

import os
import re
from datetime import datetime
from datetime import timedelta
from operator import itemgetter
from string import digits

import requests
from bs4 import BeautifulSoup
import time

from collections import Counter

import matplotlib.pyplot as plt

%matplotlib inline

## Read in Player_By_Game Data from Scraper

In [2]:
df = pd.read_csv('../data/player_boxscores_df.csv', index_col=0)
df['date_game'] = pd.to_datetime(df['date_game'])
df.set_index(['player', 'date_game'], drop=False, inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,player,pos,date_game,school_id,opp_id,gs,mp,fg,fga,fg2,...,stl,blk,tov,pf,pts,game_score,year,month,season,W
player,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Nate Wolters,2013-02-07,Nate Wolters,G,2013-02-07,South Dakota State,IPFW,1.0,40,17,28,8,...,1,1,3,0,53,42.6,2013,2,2013,1
Marshon Brooks,2011-02-23,Marshon Brooks,G,2011-02-23,Providence,Notre Dame,1.0,40,20,28,14,...,1,0,1,4,52,42.3,2011,2,2011,0
Jimmer Fredette,2011-03-11,Jimmer Fredette,G,2011-03-11,Brigham Young,New Mexico,1.0,40,22,37,15,...,1,0,2,2,52,36.5,2011,3,2011,1
Markus Howard,2018-01-03,Markus Howard,G,2018-01-03,Marquette,Providence,1.0,44,17,29,6,...,2,0,2,1,52,39.8,2018,1,2018,1
Mike Daum,2017-02-18,Mike Daum,F,2017-02-18,South Dakota State,IPFW,1.0,36,14,29,7,...,0,1,2,0,51,39.8,2017,2,2017,1


In [3]:
# May be useful to scrape and get Class/Height/Weight of Players, then generate a "Mismatch"
# feature to apply to the team_level game.

In [7]:
df.duplicated().sum()

0

In [19]:
df.shape

(870131, 28)

In [8]:
print(df['date_game'].max())
print(df['date_game'].min())

2018-03-07 00:00:00
2010-11-08 00:00:00


In [17]:
len(df[df['season']==2011]['school_id'].unique())

345

In [18]:
len(df[df['year']==2018]['school_id'].unique())

351

## Bring in School Data from Scraper

In [20]:
school_df = pd.read_csv('../data/schools.csv', index_col=0)
school_df.head()

Unnamed: 0,School,WinLossPct,SRS,SOS,Year
0,Air Force,0.323,-4.9,3.13,2010
1,Akron,0.686,2.82,-1.5,2010
2,Alabama A&M,0.407,-20.19,-13.71,2010
3,Alabama-Birmingham,0.735,9.46,2.9,2010
4,Alabama State,0.516,-14.41,-12.02,2010


In [21]:
school_df = school_df.rename(columns = {'School':'school_id', 'Year':'year'})

In [22]:
school_df.set_index(['school_id', 'year'], drop=False, inplace=True)

In [23]:
school_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,WinLossPct,SRS,SOS,year
school_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Air Force,2010,Air Force,0.323,-4.9,3.13,2010
Akron,2010,Akron,0.686,2.82,-1.5,2010
Alabama A&M,2010,Alabama A&M,0.407,-20.19,-13.71,2010
Alabama-Birmingham,2010,Alabama-Birmingham,0.735,9.46,2.9,2010
Alabama State,2010,Alabama State,0.516,-14.41,-12.02,2010


In [24]:
# May be useful scrape Ortg/Drtg and apply as features to the team_level game.

# Evaluation at the Team Level

### Set up "MyTeam"

In [25]:
myteam_df = df.groupby(['school_id', 'opp_id', 'date_game']).sum()

Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [26]:
# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
myteam_df.drop(['gs', 'mp', 'month', 'year', 'season'], axis=1, inplace=True)

In [27]:
# Drop 'fg' and 'fga' since these are captured in 'fg2'/'fg2a'/'fg3'/'fg3a'
myteam_df.drop(['fg', 'fga'], axis=1, inplace=True)

In [28]:
# Change to 0/1 Flags
myteam_df['W'] = myteam_df['W'].apply(lambda x: 1 if x > 0 else 0)

In [29]:
myteam_df.reset_index(inplace=True)

In [30]:
myteam_df.set_index(['school_id', 'date_game'], drop=False, inplace=True)

In [31]:
myteam_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,opp_id,date_game,fg2,fg2a,fg3,fg3a,ft,fta,orb,drb,ast,stl,blk,tov,pf,pts,game_score,W
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abilene Christian,2017-12-06,Abilene Christian,Air Force,2017-12-06,14,29,9,25,7,15,7,17,11,8,4,12,16,62,40.3,1
Abilene Christian,2017-11-13,Abilene Christian,Arkansas State,2017-11-13,22,32,3,13,16,20,5,14,11,4,2,13,19,69,46.1,0
Abilene Christian,2014-12-23,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,14,25,10,20,11,16,5,22,15,7,2,18,21,69,47.7,1
Abilene Christian,2014-12-20,Abilene Christian,Boise State,2014-12-20,5,36,6,19,5,6,11,10,4,4,0,9,12,33,2.2,0
Abilene Christian,2017-11-26,Abilene Christian,Bowling Green State,2017-11-26,28,48,6,16,14,18,10,26,18,2,6,8,18,88,73.6,1


### Set up "YourTeam" (which will be base defensive characteristics on)

In [32]:
yourteam_df = df.groupby(['opp_id', 'school_id', 'date_game']).sum()

# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
yourteam_df.drop(['gs', 'mp', 'year', 'month', 'season'], axis=1, inplace=True)

# Drop 'fg' and 'fga' since these are captured in 'fg2'/'fg2a'/'fg3'/'fg3a'
yourteam_df.drop(['fg', 'fga'], axis=1, inplace=True)

# Change to 0/1 Flags
yourteam_df['W'] = yourteam_df['W'].apply(lambda x: 1 if x > 0 else 0)

yourteam_df.reset_index(inplace=True)

yourteam_df.set_index(['opp_id', 'date_game'], drop=False, inplace=True)

Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [33]:
yourteam_df.columns = 'heldto_' + yourteam_df.columns

In [34]:
yourteam_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,heldto_opp_id,heldto_school_id,heldto_date_game,heldto_fg2,heldto_fg2a,heldto_fg3,heldto_fg3a,heldto_ft,heldto_fta,heldto_orb,heldto_drb,heldto_ast,heldto_stl,heldto_blk,heldto_tov,heldto_pf,heldto_pts,heldto_game_score,heldto_W
opp_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abilene Christian,2017-12-06,Abilene Christian,Air Force,2017-12-06,22,33,3,18,5,8,5,28,15,5,2,20,14,58,34.3,0
Abilene Christian,2017-11-13,Abilene Christian,Arkansas State,2017-11-13,21,35,7,13,20,26,9,15,7,6,1,9,12,83,66.8,1
Abilene Christian,2014-12-23,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,15,34,5,19,16,20,11,18,5,3,2,14,19,61,29.7,0
Abilene Christian,2014-12-20,Abilene Christian,Boise State,2014-12-20,21,32,10,20,5,6,8,32,17,2,6,12,7,77,71.1,1
Abilene Christian,2017-11-26,Abilene Christian,Bowling Green State,2017-11-26,22,48,8,24,15,22,17,20,9,4,3,8,18,83,56.9,0


### "MyTeam" Rolling/EWM Statistics Generation

In [35]:
myteam_stats = ['date_game', 'opp_id', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 
         'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

In [36]:
teams = df['school_id'].unique()

In [37]:
len(teams)

353

In [None]:
for team in teams:
    one_team_df = myteam_df.loc[team][myteam_stats].sort_values('date_game')
    
    ewm_03 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=3).mean().shift()
    ewm_10 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=10).mean().shift()
    ewm_20 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=20).mean().shift()
    rm_30 = one_team_df.drop(['date_game', 'opp_id'], axis=1).rolling(window=30).mean().shift()

    this_df = pd.concat([ewm_03, ewm_10, ewm_20, rm_30], axis=1)
    
    this_df['school_id'] = team
    this_df['date_game'] = one_team_df['date_game']
    this_df['opp_id'] = one_team_df['opp_id']

    this_df.to_csv('../data/myteam_ewm/'+team.replace(' ', '_')+'.csv')

In [None]:
for (dirpath, dirnames, filenames) in os.walk('../data/myteam_ewm/'):
    break

f = filenames # [:3] for testing

with open('../data/myteam_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/myteam_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [38]:
myteam_ewm_df = pd.read_csv('../data/myteam_ewm_df.csv', header=None)

In [39]:
myteam_ewm_df.shape

(87171, 68)

In [40]:
# Drop duplicate of opposing team
myteam_ewm_df.drop(66, axis=1, inplace=True)

In [41]:
myteam_ewm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,67
0,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
1,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
2,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
3,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
4,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [42]:
myteam_stats = pd.Series(myteam_stats)

In [43]:
myteam_cols = []
for i in ['ewm03', 'ewm10', 'ewm20', 'rm30']:
    for j in myteam_stats.drop([0,1]):
        myteam_cols.append(i+j)

In [44]:
myteam_cols.insert(0, 'date_game')
myteam_cols.append('school_id')
myteam_cols.append('opp_id')

In [45]:
myteam_ewm_df.columns = myteam_cols

In [46]:
myteam_ewm_df.head()

Unnamed: 0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30ast,rm30stl,rm30blk,rm30tov,rm30pf,rm30pts,rm30game_score,rm30W,school_id,opp_id
0,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
1,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
2,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
3,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
4,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [47]:
myteam_ewm_df['date_game'] = pd.to_datetime(myteam_ewm_df['date_game'])

### "YourTeam" Rolling/EWM Statistics Generation

In [48]:
yourteam_stats = ['heldto_school_id', 'heldto_date_game', 'heldto_fg2',
       'heldto_fg2a', 'heldto_fg3', 'heldto_fg3a', 'heldto_ft', 'heldto_fta',
       'heldto_orb', 'heldto_drb', 'heldto_ast', 'heldto_stl', 'heldto_blk',
       'heldto_tov', 'heldto_pf', 'heldto_pts', 'heldto_game_score',
       'heldto_W']

In [49]:
for team in teams:
    try:
        one_team_df = yourteam_df.loc[team][yourteam_stats].sort_values('heldto_date_game')

        ewm_03 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=3).mean().shift()
        ewm_10 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=10).mean().shift()
        ewm_20 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=20).mean().shift()
        rm_30 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).rolling(window=30).mean().shift()

        this_df = pd.concat([ewm_03, ewm_10, ewm_20, rm_30], axis=1)

        this_df['heldto_opp_id'] = team
        this_df['heldto_date_game'] = one_team_df['heldto_date_game']
        this_df['heldto_school_id'] = one_team_df['heldto_school_id']

        this_df.to_csv('../data/yourteam_ewm/'+team.replace(' ', '_')+'.csv')
    except:
        pass

In [50]:
for (dirpath, dirnames, filenames) in os.walk('../data/yourteam_ewm/'):
    break

f = filenames # [:3] for testing

with open('../data/yourteam_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/yourteam_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [51]:
yourteam_ewm_df = pd.read_csv('../data/yourteam_ewm_df.csv', header=None)

In [52]:
yourteam_ewm_df.shape

(87171, 68)

In [53]:
# Drop duplicate of opposing team
yourteam_ewm_df.drop(66, axis=1, inplace=True)

In [54]:
yourteam_stats = pd.Series(yourteam_stats)

In [55]:
yourteam_cols = []
for i in ['ewm03', 'ewm10', 'ewm20', 'rm30']:
    for j in yourteam_stats.drop([0,1]):
        yourteam_cols.append(i+j)

In [56]:
yourteam_cols.insert(0, 'date_game')
yourteam_cols.append('heldto_opp_id')
yourteam_cols.append('heldto_school_id')

In [57]:
yourteam_ewm_df.columns = yourteam_cols

In [58]:
yourteam_ewm_df['date_game'] = pd.to_datetime(yourteam_ewm_df['date_game'])

## Join "MyTeam" and "YourTeam"

In [59]:
temp_my = myteam_ewm_df.set_index(['school_id', 'date_game'], drop=False)

In [60]:
temp_my.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30ast,rm30stl,rm30blk,rm30tov,rm30pf,rm30pts,rm30game_score,rm30W,school_id,opp_id
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
High Point,2010-11-23,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
High Point,2010-11-28,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
High Point,2010-12-02,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
High Point,2010-12-04,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [61]:
# Rename these cols so that 'join' will work later
yourteam_ewm_df.rename(columns={'heldto_school_id':'school_id'}, inplace=True)
yourteam_ewm_df.rename(columns={'heldto_opp_id':'opp_id'}, inplace=True)

In [62]:
temp_your = yourteam_ewm_df.set_index(['school_id', 'date_game'], drop=False)

In [63]:
temp_your.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03heldto_fg2,ewm03heldto_fg2a,ewm03heldto_fg3,ewm03heldto_fg3a,ewm03heldto_ft,ewm03heldto_fta,ewm03heldto_orb,ewm03heldto_drb,ewm03heldto_ast,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,opp_id,school_id
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Old Dominion,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
Citadel,2010-11-23,2010-11-23,19.0,36.0,7.0,19.0,20.0,34.0,17.0,27.0,19.0,...,,,,,,,,,High Point,Citadel
Hampton,2010-11-28,2010-11-28,18.333333,40.0,5.666667,17.0,20.0,31.333333,15.666667,26.333333,17.0,...,,,,,,,,,High Point,Hampton
Gardner-Webb,2010-12-02,2010-12-02,16.428571,39.428571,4.714286,13.571429,24.571429,35.142857,10.714286,26.714286,11.285714,...,,,,,,,,,High Point,Gardner-Webb
North Carolina-Asheville,2010-12-04,2010-12-04,18.866667,38.666667,2.2,14.333333,23.2,34.533333,11.933333,26.333333,6.866667,...,,,,,,,,,High Point,North Carolina-Asheville


In [64]:
joined = temp_my.join(temp_your, how='left', on=['school_id', 'date_game'], rsuffix='_r')

In [65]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,opp_id_r,school_id_r
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,Old Dominion,High Point
High Point,2010-11-23,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,Citadel,High Point
High Point,2010-11-28,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,Hampton,High Point
High Point,2010-12-02,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,Gardner-Webb,High Point
High Point,2010-12-04,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,North Carolina-Asheville,High Point


In [66]:
joined.shape

(87185, 134)

In [67]:
joined.isnull().sum()

date_game                     0
ewm03fg2                    353
ewm03fg2a                   353
ewm03fg3                    353
ewm03fg3a                   353
ewm03ft                     353
ewm03fta                    353
ewm03orb                    353
ewm03drb                    353
ewm03ast                    353
ewm03stl                    353
ewm03blk                    353
ewm03tov                    353
ewm03pf                     353
ewm03pts                    353
ewm03game_score             353
ewm03W                      353
ewm10fg2                    353
ewm10fg2a                   353
ewm10fg3                    353
ewm10fg3a                   353
ewm10ft                     353
ewm10fta                    353
ewm10orb                    353
ewm10drb                    353
ewm10ast                    353
ewm10stl                    353
ewm10blk                    353
ewm10tov                    353
ewm10pf                     353
                          ...  
ewm20hel

In [68]:
# EWM gave us first day NA's, RM gave us first 30 days NA's
joined.dropna(inplace=True)

In [69]:
# Bring in the target column, the W's
joined = joined.join(myteam_df['W'], how='left', on=['school_id', 'date_game'])

In [70]:
# Drop repeat data
joined.drop(['date_game', 'school_id', 'date_game_r', 'opp_id_r', 'school_id_r'], 
            axis=1, inplace=True)

### Add SOS and SRS data

In [71]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30heldto_drb,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,20.166667,12.666667,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0
High Point,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,19.5,13.7,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0
High Point,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,20.966667,13.966667,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1
High Point,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,24.233333,10.666667,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0
High Point,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,20.166667,13.7,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0


In [72]:
joined.reset_index(drop=False, inplace=True)

In [73]:
joined['year'] = joined['date_game'].apply(lambda x: x.year)

In [74]:
joined.set_index(['school_id', 'year'], drop=False, inplace=True)

In [75]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,year
school_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2011,High Point,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,...,12.666667,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0,2011
High Point,2011,High Point,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,...,13.7,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0,2011
High Point,2011,High Point,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,...,13.966667,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1,2011
High Point,2011,High Point,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,...,10.666667,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0,2011
High Point,2011,High Point,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,...,13.7,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0,2011


In [76]:
joined = joined.join(school_df[['SRS', 'SOS']], how='left', on=['school_id', 'year'])

In [77]:
joined.drop(['school_id', 'year'], axis=1, inplace=True)

In [78]:
joined.reset_index(drop=False, inplace=True)

In [79]:
joined.set_index(['opp_id', 'year'], drop=False, inplace=True)

In [80]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,year,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,...,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Central Florida,2011,High Point,2011,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,...,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47
Tennessee Tech,2011,High Point,2011,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,...,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47
Citadel,2011,High Point,2011,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,...,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47
Hampton,2011,High Point,2011,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,...,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47
Campbell,2011,High Point,2011,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,...,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47


In [81]:
school_df.drop(['year', 'school_id'], axis=1, inplace=True)

In [82]:
school_df.reset_index(drop=False, inplace=True)

In [83]:
school_df = school_df.rename(columns={'school_id':'opp_id', 'SRS':'opp_SRS', 'SOS':'opp_SOS'})

In [84]:
school_df.set_index(['opp_id', 'year'], drop=False, inplace=True)

In [85]:
school_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,opp_id,year,WinLossPct,opp_SRS,opp_SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Air Force,2010,Air Force,2010,0.323,-4.9,3.13
Akron,2010,Akron,2010,0.686,2.82,-1.5
Alabama A&M,2010,Alabama A&M,2010,0.407,-20.19,-13.71
Alabama-Birmingham,2010,Alabama-Birmingham,2010,0.735,9.46,2.9
Alabama State,2010,Alabama State,2010,0.516,-14.41,-12.02


In [86]:
joined = joined.join(school_df[['opp_SRS', 'opp_SOS']], how='left', on=['opp_id', 'year'])

In [87]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,year,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,...,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS,opp_SRS,opp_SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Central Florida,2011,High Point,2011,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,...,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47,6.25,1.97
Tennessee Tech,2011,High Point,2011,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,...,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47,-5.64,-5.77
Citadel,2011,High Point,2011,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,...,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47,-9.84,-2.84
Hampton,2011,High Point,2011,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,...,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47,-5.2,-8.77
Campbell,2011,High Point,2011,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,...,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47,-6.67,-4.27


In [88]:
list(joined.columns)

['school_id',
 'year',
 'date_game',
 'ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'opp_id',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_f

In [89]:
joined.drop(['year', 'opp_id'], axis=1, inplace=True)

In [90]:
joined.reset_index(inplace=True, drop=False)

In [91]:
joined.drop(['year'], axis=1, inplace=True)

In [92]:
joined.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [93]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS,opp_SRS,opp_SOS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47,6.25,1.97
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47,-5.64,-5.77
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47,-9.84,-2.84
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47,-5.2,-8.77
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47,-6.67,-4.27


In [94]:
joined.isnull().sum()

ewm03fg2                    0
ewm03fg2a                   0
ewm03fg3                    0
ewm03fg3a                   0
ewm03ft                     0
ewm03fta                    0
ewm03orb                    0
ewm03drb                    0
ewm03ast                    0
ewm03stl                    0
ewm03blk                    0
ewm03tov                    0
ewm03pf                     0
ewm03pts                    0
ewm03game_score             0
ewm03W                      0
ewm10fg2                    0
ewm10fg2a                   0
ewm10fg3                    0
ewm10fg3a                   0
ewm10ft                     0
ewm10fta                    0
ewm10orb                    0
ewm10drb                    0
ewm10ast                    0
ewm10stl                    0
ewm10blk                    0
ewm10tov                    0
ewm10pf                     0
ewm10pts                    0
                         ... 
ewm20heldto_drb             0
ewm20heldto_ast             0
ewm20heldt

In [95]:
joined.dropna(inplace=True)

In [96]:
list(joined.dtypes)

[dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('floa

### Add Poll Rankings (Abandoned, little improvement)

In [None]:
# Leave 2018 out for now, structured a little differently
polls_2010 = pd.read_csv('../data/polls_rank/2010_polls.csv', sep='\t', header=None)
polls_2011 = pd.read_csv('../data/polls_rank/2011_polls.csv', sep='\t', header=None)
polls_2012 = pd.read_csv('../data/polls_rank/2012_polls.csv', sep='\t', header=None)
polls_2013 = pd.read_csv('../data/polls_rank/2013_polls.csv', sep='\t', header=None)
polls_2014 = pd.read_csv('../data/polls_rank/2014_polls.csv', sep='\t', header=None)
polls_2015 = pd.read_csv('../data/polls_rank/2015_polls.csv', sep='\t', header=None)
polls_2016 = pd.read_csv('../data/polls_rank/2016_polls.csv', sep='\t', header=None)
polls_2017 = pd.read_csv('../data/polls_rank/2017_polls.csv', sep='\t', header=None)

In [None]:
polls_2010.replace('Final', '2010-03-15', inplace=True)
polls_2010.replace('Preseason', '2009-11-09', inplace=True)

polls_2011.replace('Final', '2011-03-15', inplace=True)
polls_2011.replace('Preseason', '2010-11-09', inplace=True)

polls_2012.replace('Final', '2012-03-15', inplace=True)
polls_2012.replace('Preseason', '2011-11-09', inplace=True)

polls_2013.replace('Final', '2013-03-15', inplace=True)
polls_2013.replace('Preseason', '2012-11-09', inplace=True)

polls_2014.replace('Final', '2014-03-15', inplace=True)
polls_2014.replace('Preseason', '2013-11-09', inplace=True)

polls_2015.replace('Final', '2015-03-15', inplace=True)
polls_2015.replace('Preseason', '2014-11-09', inplace=True)

polls_2016.replace('Final', '2016-03-15', inplace=True)
polls_2016.replace('Preseason', '2015-11-09', inplace=True)

polls_2017.replace('Final', '2017-03-15', inplace=True)
polls_2017.replace('Preseason', '2016-11-09', inplace=True)

In [None]:
# AP Polls from 0:492, Coaches Polls from 494: (2010, 2011, 2013, 2014)
# AP Polls from 0:493, Coaches Polls from 495: (2012, 2015, 2016)
# AP Polls from 0:494, Coaches Polls from 496: (2017)
polls_2017.loc[495]

In [None]:
AP_polls_2010 = polls_2010.loc[:492]
Coach_polls_2010 = polls_2010.loc[494:]

AP_polls_2011 = polls_2011.loc[:492]
Coach_polls_2011 = polls_2011.loc[494:]

AP_polls_2012 = polls_2012.loc[:493]
Coach_polls_2012 = polls_2012.loc[495:]

AP_polls_2013 = polls_2013.loc[:492]
Coach_polls_2013 = polls_2013.loc[494:]

AP_polls_2014 = polls_2014.loc[:492]
Coach_polls_2014 = polls_2014.loc[494:]

AP_polls_2015 = polls_2015.loc[:493]
Coach_polls_2015 = polls_2015.loc[495:]

AP_polls_2016 = polls_2016.loc[:493]
Coach_polls_2016 = polls_2016.loc[495:]

AP_polls_2017 = polls_2017.loc[:494]
Coach_polls_2017 = polls_2017.loc[496:]

In [None]:
AP_polls = pd.concat([AP_polls_2010, AP_polls_2011, AP_polls_2012, AP_polls_2013, AP_polls_2014,
                      AP_polls_2015, AP_polls_2016, AP_polls_2017])

Coach_polls = pd.concat([Coach_polls_2010, Coach_polls_2011, Coach_polls_2012, Coach_polls_2013,
                         Coach_polls_2014, Coach_polls_2015, Coach_polls_2016, Coach_polls_2017])

In [None]:
polls_cols = ['wk', 'date_game', 'rank', 'school_id', 'prev_rank', 'chnge', 'conf']

In [None]:
AP_polls.columns = polls_cols
Coach_polls.columns = polls_cols

In [None]:
AP_drop_indices = AP_polls[AP_polls['wk']=='Wk'].index
Coach_drop_indices = Coach_polls[Coach_polls['wk']=='Wk'].index

In [None]:
AP_polls.drop(AP_drop_indices, inplace=True)
Coach_polls.drop(Coach_drop_indices, inplace=True)

In [None]:
print(AP_polls.shape)
print(Coach_polls.shape)

In [None]:
AP_polls.reset_index(drop=True, inplace=True)
Coach_polls.reset_index(drop=True, inplace=True)

In [None]:
AP_polls.head()

In [None]:
AP_polls['rank'] = AP_polls['rank'].astype(int)
Coach_polls['rank'] = Coach_polls['rank'].astype(int)

AP_polls['date_game'] = pd.to_datetime(AP_polls['date_game'])
Coach_polls['date_game'] = pd.to_datetime(Coach_polls['date_game'])

In [None]:
AP_polls.head()

In [None]:
joined.reset_index(inplace=True, drop=False)

In [None]:
joined.sort_values(['date_game', 'school_id'], inplace=True)
AP_polls.sort_values(['date_game', 'school_id'], inplace=True)
Coach_polls.sort_values(['date_game', 'school_id'], inplace=True)

In [None]:
joined.reset_index(inplace=True, drop=True)
AP_polls.reset_index(inplace=True, drop=True)
Coach_polls.reset_index(inplace=True, drop=True)

In [None]:
#joined['AP_poll_rank'] = 30
#joined['coach_poll_rank'] = 30
#joined['opp_AP_poll_rank'] = 30
#joined['opp_coach_poll_rank'] = 30

In [None]:
AP_polls = AP_polls[['date_game', 'rank', 'school_id']]
Coach_polls = Coach_polls[['date_game', 'rank', 'school_id']]

In [None]:
print(AP_polls.shape)
print(Coach_polls.shape)

In [None]:
for i in range(0, 3670):
    for j in range(1,7):
        AP_polls = AP_polls.append(AP_polls.loc[i].replace(AP_polls['date_game'][i], AP_polls['date_game'][i]+timedelta(days=j)))
        AP_polls.reset_index(drop=True, inplace=True)        

In [None]:
AP_polls.set_index(['date_game', 'school_id'], drop=False, inplace=True)
joined.set_index(['date_game', 'school_id'], drop=False, inplace=True)

In [None]:
joined = joined.join(AP_polls['rank'], how='left', on=['date_game', 'school_id'], rsuffix='_r')

In [None]:
joined.shape

In [None]:
for i in range(0, 3655):
    for j in range(1,7):
        Coach_polls = Coach_polls.append(Coach_polls.loc[i].replace(Coach_polls['date_game'][i], Coach_polls['date_game'][i]+timedelta(days=j)))
        Coach_polls.reset_index(drop=True, inplace=True)        

In [None]:
Coach_polls.set_index(['date_game', 'school_id'], drop=False, inplace=True)

In [None]:
joined = joined.join(Coach_polls['rank'], how='left', on=['date_game', 'school_id'], rsuffix='_r')

In [None]:
# Why is a 'left' join making extra rows?
joined.shape

In [None]:
joined.head()

In [None]:
joined.drop(['school_id', 'date_game', 'opp_id'], axis=1, inplace=True)

In [None]:
joined.fillna(30, inplace=True)

In [None]:
list(joined.dtypes)

In [None]:
joined.head()

## Make Some Interaction Features

In [97]:
len(joined.columns)

133

In [98]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [99]:
joined.columns[63]

'rm30W'

In [100]:
interact_pair = []
for i in range(0,64):
    if 'W' not in joined.columns[i]:
        interact_pair.append([joined.columns[i], joined.columns[i+64]])

In [101]:
interact_pair[:5]

[['ewm03fg2', 'ewm03heldto_fg2'],
 ['ewm03fg2a', 'ewm03heldto_fg2a'],
 ['ewm03fg3', 'ewm03heldto_fg3'],
 ['ewm03fg3a', 'ewm03heldto_fg3a'],
 ['ewm03ft', 'ewm03heldto_ft']]

In [102]:
interact_pair.append(['SOS', 'opp_SOS'])
interact_pair.append(['SRS', 'opp_SRS'])

In [103]:
len(interact_pair)

62

In [104]:
for i, j in interact_pair:
    joined[i+j] = joined[i] - joined[j]

In [105]:
joined.shape

(75355, 195)

In [106]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30drbrm30heldto_drb,rm30astrm30heldto_ast,rm30stlrm30heldto_stl,rm30blkrm30heldto_blk,rm30tovrm30heldto_tov,rm30pfrm30heldto_pf,rm30ptsrm30heldto_pts,rm30game_scorerm30heldto_game_score,SOSopp_SOS,SRSopp_SRS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,-0.1,-1.866667,-0.1,-0.8,2.8,2.066667,0.1,-5.94,-6.44,-19.07
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,0.633333,-2.733333,0.233333,-0.433333,-0.433333,0.766667,-6.433333,-7.926667,1.3,-7.18
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,-0.9,-2.633333,0.866667,-0.9,4.833333,4.166667,-6.633333,-15.22,-1.63,-2.98
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,-3.966667,0.7,1.733333,-0.033333,0.166667,-0.233333,-0.1,1.423333,4.3,-7.62
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,-0.1,-2.3,-0.266667,-1.4,1.166667,-0.133333,-3.733333,-9.02,-0.2,-6.15


In [107]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [None]:
joined.to_csv('../data/joined_modeling.csv')

# Baseline

In [110]:
# Count since 2003
# https://www.teamrankings.com/ncb/odds-history/win/
spread = pd.read_csv('../data/point_spread_history.csv', sep='\t', header=None)

In [111]:
spread_cols = ['closing_spread', 'game_count', 'record', 'null1', 'null2', 'null3', 'null4',
            'null5', 'null6', 'null7', 'null8']
spread.columns = spread_cols

In [112]:
spread.loc[85]

closing_spread            0
game_count              914
record            457-457-0
null1                 50.0%
null2                     0
null3             220-237-0
null4                 48.1%
null5                 -0.71
null6             237-220-0
null7                 51.9%
null8                  0.71
Name: 85, dtype: object

In [113]:
# Regex: ^[0-9]{1,4}(?=-)   Will find the first column (incorrect)
# Regex: (?<=-)[0-9]{1,4}(?=-) Will find the second column (correct)

In [114]:
incorrect = []
correct = []
for i in spread['record'][:85]:
    inc = re.findall('^[0-9]{1,4}(?=-)', i)
    cor = re.findall('(?<=-)[0-9]{1,4}(?=-)', i)
    
    incorrect.append(int(inc[0]))
    correct.append(int(cor[0]))

In [115]:
sum(correct) / (sum(correct) + sum(incorrect))

0.7430196294257935

### Vegas correctly picks the winner in the opening line 74.3% of the time since 2013

# Modeling

In [None]:
#temp_cols = []
#for i in test4.columns:
#    if 'pts' in i:
#        temp_cols.append(i)

In [None]:
#test4[temp_cols].corr()

In [None]:
#cols_of_interest = ['ewm03pts', 'ewm07pts', 'rm30pts', 'rm30heldto_pts']

In [None]:
#cols_of_interest = ['ewm03fg3', 'ewm07fg3', 'rm30fg3', 'rm30heldto_fg3']

In [None]:
#cols_of_interest = ['ewm03ft', 'ewm07ft', 'rm30ft', 'rm30heldto_ft']

In [None]:
#joined[joined.columns[:64].append(joined.columns[128:])]

## Begin Modeling Here (Can skip everything above now)

### Bring in modeling df

In [2]:
joined = pd.read_csv('../data/joined_modeling.csv')

In [3]:
joined.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [4]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30drbrm30heldto_drb,rm30astrm30heldto_ast,rm30stlrm30heldto_stl,rm30blkrm30heldto_blk,rm30tovrm30heldto_tov,rm30pfrm30heldto_pf,rm30ptsrm30heldto_pts,rm30game_scorerm30heldto_game_score,SOSopp_SOS,SRSopp_SRS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,-0.1,-1.866667,-0.1,-0.8,2.8,2.066667,0.1,-5.94,-6.44,-19.07
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,0.633333,-2.733333,0.233333,-0.433333,-0.433333,0.766667,-6.433333,-7.926667,1.3,-7.18
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,-0.9,-2.633333,0.866667,-0.9,4.833333,4.166667,-6.633333,-15.22,-1.63,-2.98
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,-3.966667,0.7,1.733333,-0.033333,0.166667,-0.233333,-0.1,1.423333,4.3,-7.62
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,-0.1,-2.3,-0.266667,-1.4,1.166667,-0.133333,-3.733333,-9.02,-0.2,-6.15


In [None]:
#drop_cols = []
#for col in joined.columns:
#    if 'SRS' in col:
#        drop_cols.append(col)

In [None]:
#joined.drop(drop_cols, axis=1, inplace=True)

In [108]:
joined.shape

(75355, 195)

In [38]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [107]:
cols_interest = list(joined.columns[:133])
#cols_interest = joined.columns[128:133]

In [108]:
cols_interest

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [54]:
#cols_interest.append(joined.columns[130])

In [55]:
#cols_interest.append(joined.columns[132])

### Split X and y, and apply Scaling

In [154]:
#X = joined[joined.columns[:64].append(joined.columns[128:])].drop('W', axis=1)
X = joined[cols_interest].drop('W', axis=1)
#X = team_b_df.drop('W', axis=1)
y = joined['W']
#y = team_b_df['W']

In [155]:
ss = StandardScaler()
X = ss.fit_transform(X)

### Apply some PCA to reduce features

In [137]:
pca = PCA(n_components=50)
pca = pca.fit(X)

In [138]:
pca.explained_variance_

array([18.99564042, 17.27010269, 12.42070411,  8.2121557 ,  7.39360937,
        6.08029454,  4.8569003 ,  4.75379698,  4.24374435,  3.66004452,
        3.32185342,  3.15004117,  2.80948152,  2.68697144,  2.32570721,
        2.21367291,  1.94215655,  1.88492781,  1.72275091,  1.52227339,
        1.45170423,  1.27736111,  1.18634258,  1.10493616,  1.00720926,
        0.94243617,  0.87424196,  0.79484227,  0.72909466,  0.72231591,
        0.63279467,  0.59393612,  0.55893006,  0.54049493,  0.50610423,
        0.46165248,  0.45161205,  0.43478386,  0.4098997 ,  0.38831478,
        0.37125649,  0.33351421,  0.31474496,  0.29193043,  0.27940687,
        0.27434997,  0.25230706,  0.23371698,  0.19979726,  0.1928314 ])

In [139]:
X = pca.transform(X)

## Train/Test Split

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(52748, 132)
(52748,)
(22607, 132)
(22607,)


## Logistic Regression

In [141]:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [142]:
scores = cross_val_score(logreg, X_train, y_train)
print(np.mean(scores))

0.7190604765659181


## Grid Search on Logistic Regression

In [157]:
lr_params = {'penalty':['l1', 'l2'],
             'tol':[.00001, .0001, .001],
             'C':[.001, .01, 1, 10],
            }

In [158]:
grid_lr = GridSearchCV(LogisticRegression(), lr_params, cv=5)
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'tol': [1e-05, 0.0001, 0.001], 'C': [0.001, 0.01, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [159]:
scores = cross_val_score(grid_lr, X_train, y_train)
np.mean(scores)

0.7254116307289898

In [160]:
grid_lr.best_params_

{'C': 0.01, 'penalty': 'l2', 'tol': 1e-05}

In [161]:
scores = cross_val_score(grid_lr, X_test, y_test)
np.mean(scores)

0.7269430379390164

In [162]:
preds = grid_lr.predict(X_test)

In [None]:
print(classification_report(y_test, preds))
pd.DataFrame(confusion_matrix(y_test, preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

### AUC ROC Score and Curve

In [None]:
preds = logreg.predict(X_test)

In [None]:
print(classification_report(y_test, preds))
pd.DataFrame(confusion_matrix(y_test, preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

In [None]:
proba_pairs = logreg.predict_proba(X_test)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

In [None]:
threshold[4200]

In [None]:
fpr, tpr, threshold = roc_curve(y_test, probas)
roc_auc = auc(fpr, tpr)

fix, ax = plt.subplots(figsize=(14,10))

plt.title('Receiver Operating Characteristic', fontsize=20)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.scatter(fpr[4200], tpr[4200], s=200, label='Threshold = 50%')
plt.legend(loc = 'lower right', fontsize=14)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=16)
plt.show()

### What about double predictions?

In [None]:
test = joined
test.head()

In [None]:
test.reset_index(inplace=True)

In [None]:
test.sort_values(['date_game', 'school_id', 'opp_id'], inplace=True)

In [None]:
test.reset_index(inplace=True, drop=True)

In [None]:
test.shape

In [None]:
test.head()

In [None]:
test.index[-1]

In [None]:
team_a = []
team_b = []
for i in test.index:
    if i % 500 == 0:
        print('parsing...', i)
    if i not in team_b:
        for j in range(i, test.index[-1]):
            if test['school_id'][j] == test['opp_id'][i] and test['date_game'][i] == test['date_game'][j]:
                team_a.append(i)
                team_b.append(j)
                break

In [None]:
print(len(team_a))
print(len(team_b))

In [None]:
team_a_df = test.loc[team_a]
team_b_df = test.loc[team_b]

In [None]:
team_a_df.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)
team_b_df.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [None]:
team_a_preds = logreg.predict(team_a_df.drop('W', axis=1))
team_b_preds = logreg.predict(team_b_df.drop('W', axis=1))

In [None]:
test_sums = team_a_preds + team_b_preds

In [None]:
# In some instances, both teams are being predicted to win, and in others, neither team is 
# predicted to win!
Counter(test_sums)

In [None]:
team_a_probas = logreg.predict_proba(team_a_df.drop('W', axis=1))
team_b_probas = logreg.predict_proba(team_b_df.drop('W', axis=1))

In [None]:
team_a_probas

In [None]:
team_a_df.head()

## Random Forest Classifier

In [122]:
rfc = RandomForestClassifier(max_depth=20)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [123]:
scores = cross_val_score(rfc, X_train, y_train)
np.mean(scores)

0.665257406789609

In [None]:
feat_series = pd.Series(rfc.feature_importances_, index=joined.drop('W', axis=1).columns)
feat_series.head()

In [None]:
# SRS = Simple Rating System, takes into account SOS
# SOS = Strength of Schedule
feat_series.sort_values(ascending=False)

## GridCV

In [124]:
rf_params = {'n_estimators':[2,5,10,20],
             'criterion':['gini', 'entropy'],
             'max_depth':[5,10,20,None]}
             #'min_samples_split':[3,4,5]}

In [125]:
gridcv_rf = RandomizedSearchCV(RandomForestClassifier(), rf_params, cv=5)

In [126]:
gridcv_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [2, 5, 10, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 20, None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [127]:
scores = gridcv_rf.score(X_train, y_train)
np.mean(scores)

0.779707287480094

In [129]:
scores = gridcv_rf.score(X_test, y_test)
np.mean(scores)

0.7055779183438758

In [134]:
gridcv_rf.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 20}

In [None]:
test = pd.read_csv('../data/test.csv', sep='\t', header=None)
test.columns = ['wk', 'date', 'rank', 'school', 'prev_rank', 'chnge', 'conf']

## KNN Classifier

In [118]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [119]:
scores = cross_val_score(knn, X_train, y_train)
print(np.mean(scores))

0.6579964848735292


## March Madness!

In [None]:
page1 = pd.read_csv('../data/marchmadness_2018/scores_pg1.csv', sep='\t')
page2 = pd.read_csv('../data/marchmadness_2018/scores_pg2.csv', sep='\t')

In [None]:
page1.reset_index(inplace=True)

In [None]:
print(page1.shape)
print(page2.shape)

In [None]:
page1.columns = ['rank', 'year', 'date', 'region', 'round', 'school_id', 'pts', 'opp_id',
                'opp_pts', 'ot', 'pts_df', 'location']

page2.columns = ['rank', 'year', 'date', 'region', 'round', 'school_id', 'pts', 'opp_id',
                'opp_pts', 'ot', 'pts_df', 'location']

In [None]:
page2.drop(20, inplace=True)

In [None]:
mm_2018 = pd.concat([page1, page2], ignore_index=True)

In [None]:
mm_2018.reset_index(drop=True, inplace=True)

In [None]:
schools_list = []
opp_list = []
school_wins = []

remove_digits = str.maketrans('', '', digits)
for i in mm_2018.index:
    school_dirty = mm_2018['school_id'][i]
    school_clean = school_dirty.translate(remove_digits).lstrip()
    
    opp_dirty = mm_2018['opp_id'][i]
    opp_clean = opp_dirty.translate(remove_digits).lstrip()
    
    schools_list.append(school_clean)
    opp_list.append(opp_clean)
    
    if mm_2018['pts'][i] > mm_2018['opp_pts'][i]:
        school_wins.append(1)
    else:
        school_wins.append(0)

In [None]:
print(len(schools_list))
print(len(opp_list))
print(len(school_wins))

In [None]:
mm_2018['school_id'] = schools_list
mm_2018['opp_id'] = opp_list
mm_2018['W'] = school_wins

In [None]:
mm_2018.head()

In [None]:
notin_list = []
for team in mm_2018['school_id']:
    if team not in df['school_id'].unique():
        notin_list.append(team)

In [None]:
notin_list

In [None]:
mm_2018['rank'] = mm_2018['rank'].apply(lambda x: int(x))
mm_2018.sort_values('rank', inplace=True, ascending=False)
mm_2018.reset_index(drop=True, inplace=True)

In [None]:
def row_maker(school, opp):
    school_max_date = joined.loc[school].index.max()[0]
    #opp = joined.loc[school, school_max_date].index[0]
    opp_max_date = joined.loc[opp].index.max()[0]
    
    row = pd.concat([joined.loc[school, school_max_date][joined.columns[0:64]].reset_index(drop=True),
                     joined.loc[opp, opp_max_date][joined.columns[64:128]].reset_index(drop=True),
                     joined.loc[school, school_max_date][joined.columns[129:131]].reset_index(drop=True),
                     joined.loc[opp, opp_max_date][joined.columns[129:131]].reset_index(drop=True).rename(columns={'SRS':'opp_SRS', 'SOS':'opp_SOS'})], 
                     axis=1)
    
    for i, j in interact_pair:
        row[i+j] = row[i] - row[j]
    
    return(row)

In [None]:
mm_2018[mm_2018['school_id']=='St. Bonaventure']

In [None]:
first4_indices = np.arange(0, 8, 1)
firstround_indices = np.arange(8, 72, 1)
secondround_indices = np.arange(72, 104, 1)
sweet16_indices = np.arange(104, 120, 1)
regional_indices = np.arange(120, 128, 1)
final4_indices = np.arange(128, 132, 1)
final_indices = np.arange(132, 134, 1)

In [None]:
# Predictions of 'First 4' Round
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in mm_2018.index[firstround_indices]:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)

In [None]:
temp_df

In [None]:
print(classification_report(mm_2018['W'][firstround_indices], logreg.predict(temp_df)))
pd.DataFrame(confusion_matrix(mm_2018['W'][firstround_indices], logreg.predict(temp_df)), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

In [None]:
incorrect_pick = []
t_preds = logreg.predict(temp_df)
for i in firstround_indices:
    if mm_2018['W'][i] != t_preds[i-8]:
        incorrect_pick.append(mm_2018['school_id'][i])

In [None]:
incorrect_pick

In [None]:
list(row_maker('Villanova', 'Georgetown').columns)

In [None]:
list(joined.columns)

In [None]:
joined.shape