In [None]:
# Look into Home/Away data
# Refine offensive/defensive characterization
# Could explore boosted models
# Could explore neural nets

## Imports

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

#from sklearn.linear_model import LinearRegression
#from sklearn.ensemble import RandomForestRegressor

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
#from sklearn.metrics import r2_score
#from sklearn.feature_extraction.text import CountVectorizer

from sklearn.decomposition import PCA

import os
import re
from datetime import datetime
from datetime import timedelta
from operator import itemgetter
from string import digits

import requests
from bs4 import BeautifulSoup
import time

from collections import Counter

import matplotlib.pyplot as plt

%matplotlib inline

## Read in Player_By_Game Data from Scraper

In [198]:
df = pd.read_csv('../data/player_boxscores_df.csv', index_col=0)
df['date_game'] = pd.to_datetime(df['date_game'])
df.set_index(['player', 'date_game'], drop=False, inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,player,pos,date_game,school_id,opp_id,gs,mp,fg,fga,fg2,...,stl,blk,tov,pf,pts,game_score,year,month,season,W
player,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Nate Wolters,2013-02-07,Nate Wolters,G,2013-02-07,South Dakota State,IPFW,1.0,40,17,28,8,...,1,1,3,0,53,42.6,2013,2,2013,1
Marshon Brooks,2011-02-23,Marshon Brooks,G,2011-02-23,Providence,Notre Dame,1.0,40,20,28,14,...,1,0,1,4,52,42.3,2011,2,2011,0
Jimmer Fredette,2011-03-11,Jimmer Fredette,G,2011-03-11,Brigham Young,New Mexico,1.0,40,22,37,15,...,1,0,2,2,52,36.5,2011,3,2011,1
Markus Howard,2018-01-03,Markus Howard,G,2018-01-03,Marquette,Providence,1.0,44,17,29,6,...,2,0,2,1,52,39.8,2018,1,2018,1
Mike Daum,2017-02-18,Mike Daum,F,2017-02-18,South Dakota State,IPFW,1.0,36,14,29,7,...,0,1,2,0,51,39.8,2017,2,2017,1


In [199]:
# May be useful to scrape and get Class/Height/Weight of Players, then generate a "Mismatch"
# feature to apply to the team_level game.

In [200]:
df.duplicated().sum()

0

In [201]:
df.shape

(870131, 28)

In [202]:
print(df['date_game'].max())
print(df['date_game'].min())

2018-03-07 00:00:00
2010-11-08 00:00:00


In [203]:
len(df[df['season']==2011]['school_id'].unique())

345

In [204]:
len(df[df['year']==2018]['school_id'].unique())

351

## Bring in School Data from Scraper

In [235]:
school_df = pd.read_csv('../data/schools.csv', index_col=0)
school_df.head()

Unnamed: 0,School,WinLossPct,SRS,SOS,Year
0,Air Force,0.323,-4.9,3.13,2010
1,Akron,0.686,2.82,-1.5,2010
2,Alabama A&M,0.407,-20.19,-13.71,2010
3,Alabama-Birmingham,0.735,9.46,2.9,2010
4,Alabama State,0.516,-14.41,-12.02,2010


In [236]:
school_df = school_df.rename(columns = {'School':'school_id', 'Year':'year'})

In [237]:
school_df.set_index(['school_id', 'year'], drop=False, inplace=True)

In [238]:
school_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,WinLossPct,SRS,SOS,year
school_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Air Force,2010,Air Force,0.323,-4.9,3.13,2010
Akron,2010,Akron,0.686,2.82,-1.5,2010
Alabama A&M,2010,Alabama A&M,0.407,-20.19,-13.71,2010
Alabama-Birmingham,2010,Alabama-Birmingham,0.735,9.46,2.9,2010
Alabama State,2010,Alabama State,0.516,-14.41,-12.02,2010


In [239]:
# May be useful scrape Ortg/Drtg and apply as features to the team_level game.

# Evaluation at the Team Level

### Set up "MyTeam"

In [240]:
myteam_df = df.groupby(['school_id', 'opp_id', 'date_game']).sum()

Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [241]:
# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
myteam_df.drop(['gs', 'mp', 'month', 'year', 'season'], axis=1, inplace=True)

In [242]:
# Drop 'fg' and 'fga' since these are captured in 'fg2'/'fg2a'/'fg3'/'fg3a'
myteam_df.drop(['fg', 'fga'], axis=1, inplace=True)

In [243]:
# Change to 0/1 Flags
myteam_df['W'] = myteam_df['W'].apply(lambda x: 1 if x > 0 else 0)

In [244]:
myteam_df.reset_index(inplace=True)

In [245]:
myteam_df.set_index(['school_id', 'date_game'], drop=False, inplace=True)

In [246]:
myteam_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,opp_id,date_game,fg2,fg2a,fg3,fg3a,ft,fta,orb,drb,ast,stl,blk,tov,pf,pts,game_score,W
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abilene Christian,2017-12-06,Abilene Christian,Air Force,2017-12-06,14,29,9,25,7,15,7,17,11,8,4,12,16,62,40.3,1
Abilene Christian,2017-11-13,Abilene Christian,Arkansas State,2017-11-13,22,32,3,13,16,20,5,14,11,4,2,13,19,69,46.1,0
Abilene Christian,2014-12-23,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,14,25,10,20,11,16,5,22,15,7,2,18,21,69,47.7,1
Abilene Christian,2014-12-20,Abilene Christian,Boise State,2014-12-20,5,36,6,19,5,6,11,10,4,4,0,9,12,33,2.2,0
Abilene Christian,2017-11-26,Abilene Christian,Bowling Green State,2017-11-26,28,48,6,16,14,18,10,26,18,2,6,8,18,88,73.6,1


### Set up "YourTeam" (which will be base defensive characteristics on)

In [247]:
yourteam_df = df.groupby(['opp_id', 'school_id', 'date_game']).sum()

# Sum of 'game_started' flag and 'minutes_played' meaningless, so we can drop
yourteam_df.drop(['gs', 'mp', 'year', 'month', 'season'], axis=1, inplace=True)

# Drop 'fg' and 'fga' since these are captured in 'fg2'/'fg2a'/'fg3'/'fg3a'
yourteam_df.drop(['fg', 'fga'], axis=1, inplace=True)

# Change to 0/1 Flags
yourteam_df['W'] = yourteam_df['W'].apply(lambda x: 1 if x > 0 else 0)

yourteam_df.reset_index(inplace=True)

yourteam_df.set_index(['opp_id', 'date_game'], drop=False, inplace=True)

Defaulting to column but this will raise an ambiguity error in a future version
  if __name__ == '__main__':


In [248]:
yourteam_df.columns = 'heldto_' + yourteam_df.columns

In [249]:
yourteam_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,heldto_opp_id,heldto_school_id,heldto_date_game,heldto_fg2,heldto_fg2a,heldto_fg3,heldto_fg3a,heldto_ft,heldto_fta,heldto_orb,heldto_drb,heldto_ast,heldto_stl,heldto_blk,heldto_tov,heldto_pf,heldto_pts,heldto_game_score,heldto_W
opp_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abilene Christian,2017-12-06,Abilene Christian,Air Force,2017-12-06,22,33,3,18,5,8,5,28,15,5,2,20,14,58,34.3,0
Abilene Christian,2017-11-13,Abilene Christian,Arkansas State,2017-11-13,21,35,7,13,20,26,9,15,7,6,1,9,12,83,66.8,1
Abilene Christian,2014-12-23,Abilene Christian,Arkansas-Pine Bluff,2014-12-23,15,34,5,19,16,20,11,18,5,3,2,14,19,61,29.7,0
Abilene Christian,2014-12-20,Abilene Christian,Boise State,2014-12-20,21,32,10,20,5,6,8,32,17,2,6,12,7,77,71.1,1
Abilene Christian,2017-11-26,Abilene Christian,Bowling Green State,2017-11-26,22,48,8,24,15,22,17,20,9,4,3,8,18,83,56.9,0


### "MyTeam" Rolling/EWM Statistics Generation

In [250]:
myteam_stats = ['date_game', 'opp_id', 'fg2', 'fg2a', 'fg3', 'fg3a', 'ft', 'fta', 'orb', 'drb', 
         'ast', 'stl', 'blk', 'tov', 'pf', 'pts', 'game_score', 'W']

In [251]:
teams = df['school_id'].unique()

In [252]:
len(teams)

353

In [None]:
for team in teams:
    one_team_df = myteam_df.loc[team][myteam_stats].sort_values('date_game')
    
    ewm_03 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=3).mean().shift()
    ewm_10 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=10).mean().shift()
    ewm_20 = one_team_df.drop(['date_game', 'opp_id'], axis=1).ewm(span=20).mean().shift()
    rm_30 = one_team_df.drop(['date_game', 'opp_id'], axis=1).rolling(window=30).mean().shift()

    this_df = pd.concat([ewm_03, ewm_10, ewm_20, rm_30], axis=1)
    
    this_df['school_id'] = team
    this_df['date_game'] = one_team_df['date_game']
    this_df['opp_id'] = one_team_df['opp_id']

    this_df.to_csv('../data/myteam_ewm/'+team.replace(' ', '_')+'.csv')

In [None]:
for (dirpath, dirnames, filenames) in os.walk('../data/myteam_ewm/'):
    break

f = filenames # [:3] for testing

with open('../data/myteam_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/myteam_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [253]:
myteam_ewm_df = pd.read_csv('../data/myteam_ewm_df.csv', header=None)

In [254]:
myteam_ewm_df.shape

(87171, 68)

In [255]:
# Drop duplicate of opposing team
myteam_ewm_df.drop(66, axis=1, inplace=True)

In [256]:
myteam_ewm_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,57,58,59,60,61,62,63,64,65,67
0,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
1,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
2,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
3,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
4,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [257]:
myteam_stats = pd.Series(myteam_stats)

In [258]:
myteam_cols = []
for i in ['ewm03', 'ewm10', 'ewm20', 'rm30']:
    for j in myteam_stats.drop([0,1]):
        myteam_cols.append(i+j)

In [259]:
myteam_cols.insert(0, 'date_game')
myteam_cols.append('school_id')
myteam_cols.append('opp_id')

In [260]:
myteam_ewm_df.columns = myteam_cols

In [261]:
myteam_ewm_df.head()

Unnamed: 0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30ast,rm30stl,rm30blk,rm30tov,rm30pf,rm30pts,rm30game_score,rm30W,school_id,opp_id
0,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
1,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
2,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
3,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
4,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [262]:
myteam_ewm_df['date_game'] = pd.to_datetime(myteam_ewm_df['date_game'])

### "YourTeam" Rolling/EWM Statistics Generation

In [263]:
yourteam_stats = ['heldto_school_id', 'heldto_date_game', 'heldto_fg2',
       'heldto_fg2a', 'heldto_fg3', 'heldto_fg3a', 'heldto_ft', 'heldto_fta',
       'heldto_orb', 'heldto_drb', 'heldto_ast', 'heldto_stl', 'heldto_blk',
       'heldto_tov', 'heldto_pf', 'heldto_pts', 'heldto_game_score',
       'heldto_W']

In [264]:
for team in teams:
    try:
        one_team_df = yourteam_df.loc[team][yourteam_stats].sort_values('heldto_date_game')

        ewm_03 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=3).mean().shift()
        ewm_10 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=10).mean().shift()
        ewm_20 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).ewm(span=20).mean().shift()
        rm_30 = one_team_df.drop(['heldto_date_game', 'heldto_school_id'], axis=1).rolling(window=30).mean().shift()

        this_df = pd.concat([ewm_03, ewm_10, ewm_20, rm_30], axis=1)

        this_df['heldto_opp_id'] = team
        this_df['heldto_date_game'] = one_team_df['heldto_date_game']
        this_df['heldto_school_id'] = one_team_df['heldto_school_id']

        this_df.to_csv('../data/yourteam_ewm/'+team.replace(' ', '_')+'.csv')
    except:
        pass

In [265]:
for (dirpath, dirnames, filenames) in os.walk('../data/yourteam_ewm/'):
    break

f = filenames # [:3] for testing

with open('../data/yourteam_ewm_df.csv', 'wb') as output:
    for filename in f:
        with open('../data/yourteam_ewm/'+filename, 'rb') as _input:
            for i, line in enumerate(_input):
                if i == 0:
                    continue       
                #print(line) # for testing
                output.write(line)

In [314]:
yourteam_ewm_df = pd.read_csv('../data/yourteam_ewm_df.csv', header=None)

In [315]:
yourteam_ewm_df.shape

(87171, 68)

In [316]:
# Drop duplicate of opposing team
yourteam_ewm_df.drop(66, axis=1, inplace=True)

In [317]:
yourteam_stats = pd.Series(yourteam_stats)

In [318]:
yourteam_cols = []
for i in ['ewm03', 'ewm10', 'ewm20', 'rm30']:
    for j in yourteam_stats.drop([0,1]):
        yourteam_cols.append(i+j)

In [319]:
yourteam_cols.insert(0, 'date_game')
yourteam_cols.append('heldto_opp_id')
yourteam_cols.append('heldto_school_id')

In [320]:
yourteam_ewm_df.columns = yourteam_cols

In [321]:
yourteam_ewm_df['date_game'] = pd.to_datetime(yourteam_ewm_df['date_game'])

## Join "MyTeam" and "YourTeam"

In [274]:
temp_my = myteam_ewm_df.set_index(['school_id', 'date_game'], drop=False)

In [275]:
temp_my.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30ast,rm30stl,rm30blk,rm30tov,rm30pf,rm30pts,rm30game_score,rm30W,school_id,opp_id
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
High Point,2010-11-23,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,High Point,Citadel
High Point,2010-11-28,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,High Point,Hampton
High Point,2010-12-02,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,High Point,Gardner-Webb
High Point,2010-12-04,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,High Point,North Carolina-Asheville


In [276]:
# Rename these cols so that 'join' will work later
yourteam_ewm_df.rename(columns={'heldto_school_id':'school_id'}, inplace=True)
yourteam_ewm_df.rename(columns={'heldto_opp_id':'opp_id'}, inplace=True)

In [277]:
temp_your = yourteam_ewm_df.set_index(['school_id', 'date_game'], drop=False)

In [278]:
temp_your.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03heldto_fg2,ewm03heldto_fg2a,ewm03heldto_fg3,ewm03heldto_fg3a,ewm03heldto_ft,ewm03heldto_fta,ewm03heldto_orb,ewm03heldto_drb,ewm03heldto_ast,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,opp_id,school_id
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Old Dominion,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
Citadel,2010-11-23,2010-11-23,19.0,36.0,7.0,19.0,20.0,34.0,17.0,27.0,19.0,...,,,,,,,,,High Point,Citadel
Hampton,2010-11-28,2010-11-28,18.333333,40.0,5.666667,17.0,20.0,31.333333,15.666667,26.333333,17.0,...,,,,,,,,,High Point,Hampton
Gardner-Webb,2010-12-02,2010-12-02,16.428571,39.428571,4.714286,13.571429,24.571429,35.142857,10.714286,26.714286,11.285714,...,,,,,,,,,High Point,Gardner-Webb
North Carolina-Asheville,2010-12-04,2010-12-04,18.866667,38.666667,2.2,14.333333,23.2,34.533333,11.933333,26.333333,6.866667,...,,,,,,,,,High Point,North Carolina-Asheville


In [279]:
joined = temp_my.join(temp_your, how='left', on=['school_id', 'date_game'], rsuffix='_r')

In [280]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,opp_id_r,school_id_r
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2010-11-15,2010-11-15,,,,,,,,,,...,,,,,,,,,Old Dominion,High Point
High Point,2010-11-23,2010-11-23,8.0,32.0,8.0,22.0,17.0,29.0,10.0,19.0,10.0,...,,,,,,,,,Citadel,High Point
High Point,2010-11-28,2010-11-28,14.666667,42.0,4.0,12.0,19.666667,31.666667,10.0,20.333333,7.333333,...,,,,,,,,,Hampton,High Point
High Point,2010-12-02,2010-12-02,13.714286,39.142857,5.142857,12.0,19.857143,27.857143,7.142857,23.571429,10.0,...,,,,,,,,,Gardner-Webb,High Point
High Point,2010-12-04,2010-12-04,16.533333,39.066667,4.0,10.933333,19.4,28.466667,6.533333,22.733333,9.466667,...,,,,,,,,,North Carolina-Asheville,High Point


In [281]:
joined.shape

(87185, 134)

In [282]:
joined.isnull().sum()

date_game                     0
ewm03fg2                    353
ewm03fg2a                   353
ewm03fg3                    353
ewm03fg3a                   353
ewm03ft                     353
ewm03fta                    353
ewm03orb                    353
ewm03drb                    353
ewm03ast                    353
ewm03stl                    353
ewm03blk                    353
ewm03tov                    353
ewm03pf                     353
ewm03pts                    353
ewm03game_score             353
ewm03W                      353
ewm10fg2                    353
ewm10fg2a                   353
ewm10fg3                    353
ewm10fg3a                   353
ewm10ft                     353
ewm10fta                    353
ewm10orb                    353
ewm10drb                    353
ewm10ast                    353
ewm10stl                    353
ewm10blk                    353
ewm10tov                    353
ewm10pf                     353
                          ...  
ewm20hel

In [283]:
# EWM gave us first day NA's, RM gave us first 30 days NA's
joined.dropna(inplace=True)

In [284]:
# Bring in the target column, the W's
joined = joined.join(myteam_df['W'], how='left', on=['school_id', 'date_game'])

In [285]:
# Drop repeat data
joined.drop(['date_game', 'school_id', 'date_game_r', 'opp_id_r', 'school_id_r'], 
            axis=1, inplace=True)

### Add SOS and SRS data

In [286]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30heldto_drb,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W
school_id,date_game,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,20.166667,12.666667,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0
High Point,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,19.5,13.7,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0
High Point,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,20.966667,13.966667,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1
High Point,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,24.233333,10.666667,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0
High Point,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,20.166667,13.7,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0


In [287]:
joined.reset_index(drop=False, inplace=True)

In [288]:
joined['year'] = joined['date_game'].apply(lambda x: x.year)

In [289]:
joined.set_index(['school_id', 'year'], drop=False, inplace=True)

In [290]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,year
school_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
High Point,2011,High Point,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,...,12.666667,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0,2011
High Point,2011,High Point,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,...,13.7,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0,2011
High Point,2011,High Point,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,...,13.966667,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1,2011
High Point,2011,High Point,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,...,10.666667,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0,2011
High Point,2011,High Point,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,...,13.7,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0,2011


In [291]:
joined = joined.join(school_df[['SRS', 'SOS']], how='left', on=['school_id', 'year'])

In [292]:
joined.drop(['school_id', 'year'], axis=1, inplace=True)

In [293]:
joined.reset_index(drop=False, inplace=True)

In [294]:
joined.set_index(['opp_id', 'year'], drop=False, inplace=True)

In [295]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,year,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,...,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Central Florida,2011,High Point,2011,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,...,6.9,3.566667,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47
Tennessee Tech,2011,High Point,2011,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,...,6.566667,3.133333,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47
Citadel,2011,High Point,2011,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,...,6.0,3.7,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47
Hampton,2011,High Point,2011,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,...,5.2,2.833333,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47
Campbell,2011,High Point,2011,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,...,6.966667,4.033333,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47


In [296]:
school_df.drop(['year', 'school_id'], axis=1, inplace=True)

In [297]:
school_df.reset_index(drop=False, inplace=True)

In [298]:
school_df = school_df.rename(columns={'school_id':'opp_id', 'SRS':'opp_SRS', 'SOS':'opp_SOS'})

In [299]:
school_df.set_index(['opp_id', 'year'], drop=False, inplace=True)

In [300]:
school_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,opp_id,year,WinLossPct,opp_SRS,opp_SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Air Force,2010,Air Force,2010,0.323,-4.9,3.13
Akron,2010,Akron,2010,0.686,2.82,-1.5
Alabama A&M,2010,Alabama A&M,2010,0.407,-20.19,-13.71
Alabama-Birmingham,2010,Alabama-Birmingham,2010,0.735,9.46,2.9
Alabama State,2010,Alabama State,2010,0.516,-14.41,-12.02


In [301]:
joined = joined.join(school_df[['opp_SRS', 'opp_SOS']], how='left', on=['opp_id', 'year'])

In [302]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,school_id,year,date_game,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,...,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS,opp_SRS,opp_SOS
opp_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Central Florida,2011,High Point,2011,2011-11-18,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,...,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47,6.25,1.97
Tennessee Tech,2011,High Point,2011,2011-11-22,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,...,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47,-5.64,-5.77
Citadel,2011,High Point,2011,2011-11-26,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,...,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47,-9.84,-2.84
Hampton,2011,High Point,2011,2011-11-29,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,...,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47,-5.2,-8.77
Campbell,2011,High Point,2011,2011-12-03,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,...,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47,-6.67,-4.27


In [303]:
list(joined.columns)

['school_id',
 'year',
 'date_game',
 'ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'opp_id',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_f

In [304]:
joined.drop(['year', 'opp_id'], axis=1, inplace=True)

In [305]:
joined.reset_index(inplace=True, drop=False)

In [306]:
joined.drop(['year'], axis=1, inplace=True)

In [307]:
joined.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [308]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,W,SRS,SOS,opp_SRS,opp_SOS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,12.2,19.233333,64.533333,44.153333,0.433333,0,-12.82,-4.47,6.25,1.97
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,15.3,20.366667,71.566667,46.986667,0.433333,0,-12.82,-4.47,-5.64,-5.77
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,10.166667,16.6,71.566667,54.456667,0.7,1,-12.82,-4.47,-9.84,-2.84
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,14.9,20.6,65.566667,38.533333,0.366667,0,-12.82,-4.47,-5.2,-8.77
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,13.833333,20.266667,69.133333,48.686667,0.566667,0,-12.82,-4.47,-6.67,-4.27


In [309]:
joined.isnull().sum()

ewm03fg2                    0
ewm03fg2a                   0
ewm03fg3                    0
ewm03fg3a                   0
ewm03ft                     0
ewm03fta                    0
ewm03orb                    0
ewm03drb                    0
ewm03ast                    0
ewm03stl                    0
ewm03blk                    0
ewm03tov                    0
ewm03pf                     0
ewm03pts                    0
ewm03game_score             0
ewm03W                      0
ewm10fg2                    0
ewm10fg2a                   0
ewm10fg3                    0
ewm10fg3a                   0
ewm10ft                     0
ewm10fta                    0
ewm10orb                    0
ewm10drb                    0
ewm10ast                    0
ewm10stl                    0
ewm10blk                    0
ewm10tov                    0
ewm10pf                     0
ewm10pts                    0
                         ... 
ewm20heldto_drb             0
ewm20heldto_ast             0
ewm20heldt

In [310]:
joined.dropna(inplace=True)

In [311]:
list(joined.dtypes)

[dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('float64'),
 dtype('floa

### Add Poll Rankings (Abandoned, little improvement)

In [None]:
# Leave 2018 out for now, structured a little differently
polls_2010 = pd.read_csv('../data/polls_rank/2010_polls.csv', sep='\t', header=None)
polls_2011 = pd.read_csv('../data/polls_rank/2011_polls.csv', sep='\t', header=None)
polls_2012 = pd.read_csv('../data/polls_rank/2012_polls.csv', sep='\t', header=None)
polls_2013 = pd.read_csv('../data/polls_rank/2013_polls.csv', sep='\t', header=None)
polls_2014 = pd.read_csv('../data/polls_rank/2014_polls.csv', sep='\t', header=None)
polls_2015 = pd.read_csv('../data/polls_rank/2015_polls.csv', sep='\t', header=None)
polls_2016 = pd.read_csv('../data/polls_rank/2016_polls.csv', sep='\t', header=None)
polls_2017 = pd.read_csv('../data/polls_rank/2017_polls.csv', sep='\t', header=None)

In [None]:
polls_2010.replace('Final', '2010-03-15', inplace=True)
polls_2010.replace('Preseason', '2009-11-09', inplace=True)

polls_2011.replace('Final', '2011-03-15', inplace=True)
polls_2011.replace('Preseason', '2010-11-09', inplace=True)

polls_2012.replace('Final', '2012-03-15', inplace=True)
polls_2012.replace('Preseason', '2011-11-09', inplace=True)

polls_2013.replace('Final', '2013-03-15', inplace=True)
polls_2013.replace('Preseason', '2012-11-09', inplace=True)

polls_2014.replace('Final', '2014-03-15', inplace=True)
polls_2014.replace('Preseason', '2013-11-09', inplace=True)

polls_2015.replace('Final', '2015-03-15', inplace=True)
polls_2015.replace('Preseason', '2014-11-09', inplace=True)

polls_2016.replace('Final', '2016-03-15', inplace=True)
polls_2016.replace('Preseason', '2015-11-09', inplace=True)

polls_2017.replace('Final', '2017-03-15', inplace=True)
polls_2017.replace('Preseason', '2016-11-09', inplace=True)

In [None]:
# AP Polls from 0:492, Coaches Polls from 494: (2010, 2011, 2013, 2014)
# AP Polls from 0:493, Coaches Polls from 495: (2012, 2015, 2016)
# AP Polls from 0:494, Coaches Polls from 496: (2017)
polls_2017.loc[495]

In [None]:
AP_polls_2010 = polls_2010.loc[:492]
Coach_polls_2010 = polls_2010.loc[494:]

AP_polls_2011 = polls_2011.loc[:492]
Coach_polls_2011 = polls_2011.loc[494:]

AP_polls_2012 = polls_2012.loc[:493]
Coach_polls_2012 = polls_2012.loc[495:]

AP_polls_2013 = polls_2013.loc[:492]
Coach_polls_2013 = polls_2013.loc[494:]

AP_polls_2014 = polls_2014.loc[:492]
Coach_polls_2014 = polls_2014.loc[494:]

AP_polls_2015 = polls_2015.loc[:493]
Coach_polls_2015 = polls_2015.loc[495:]

AP_polls_2016 = polls_2016.loc[:493]
Coach_polls_2016 = polls_2016.loc[495:]

AP_polls_2017 = polls_2017.loc[:494]
Coach_polls_2017 = polls_2017.loc[496:]

In [None]:
AP_polls = pd.concat([AP_polls_2010, AP_polls_2011, AP_polls_2012, AP_polls_2013, AP_polls_2014,
                      AP_polls_2015, AP_polls_2016, AP_polls_2017])

Coach_polls = pd.concat([Coach_polls_2010, Coach_polls_2011, Coach_polls_2012, Coach_polls_2013,
                         Coach_polls_2014, Coach_polls_2015, Coach_polls_2016, Coach_polls_2017])

In [None]:
polls_cols = ['wk', 'date_game', 'rank', 'school_id', 'prev_rank', 'chnge', 'conf']

In [None]:
AP_polls.columns = polls_cols
Coach_polls.columns = polls_cols

In [None]:
AP_drop_indices = AP_polls[AP_polls['wk']=='Wk'].index
Coach_drop_indices = Coach_polls[Coach_polls['wk']=='Wk'].index

In [None]:
AP_polls.drop(AP_drop_indices, inplace=True)
Coach_polls.drop(Coach_drop_indices, inplace=True)

In [None]:
print(AP_polls.shape)
print(Coach_polls.shape)

In [None]:
AP_polls.reset_index(drop=True, inplace=True)
Coach_polls.reset_index(drop=True, inplace=True)

In [None]:
AP_polls.head()

In [None]:
AP_polls['rank'] = AP_polls['rank'].astype(int)
Coach_polls['rank'] = Coach_polls['rank'].astype(int)

AP_polls['date_game'] = pd.to_datetime(AP_polls['date_game'])
Coach_polls['date_game'] = pd.to_datetime(Coach_polls['date_game'])

In [None]:
AP_polls.head()

In [None]:
joined.reset_index(inplace=True, drop=False)

In [None]:
joined.sort_values(['date_game', 'school_id'], inplace=True)
AP_polls.sort_values(['date_game', 'school_id'], inplace=True)
Coach_polls.sort_values(['date_game', 'school_id'], inplace=True)

In [None]:
joined.reset_index(inplace=True, drop=True)
AP_polls.reset_index(inplace=True, drop=True)
Coach_polls.reset_index(inplace=True, drop=True)

In [None]:
#joined['AP_poll_rank'] = 30
#joined['coach_poll_rank'] = 30
#joined['opp_AP_poll_rank'] = 30
#joined['opp_coach_poll_rank'] = 30

In [None]:
AP_polls = AP_polls[['date_game', 'rank', 'school_id']]
Coach_polls = Coach_polls[['date_game', 'rank', 'school_id']]

In [None]:
print(AP_polls.shape)
print(Coach_polls.shape)

In [None]:
for i in range(0, 3670):
    for j in range(1,7):
        AP_polls = AP_polls.append(AP_polls.loc[i].replace(AP_polls['date_game'][i], AP_polls['date_game'][i]+timedelta(days=j)))
        AP_polls.reset_index(drop=True, inplace=True)        

In [None]:
AP_polls.set_index(['date_game', 'school_id'], drop=False, inplace=True)
joined.set_index(['date_game', 'school_id'], drop=False, inplace=True)

In [None]:
joined = joined.join(AP_polls['rank'], how='left', on=['date_game', 'school_id'], rsuffix='_r')

In [None]:
joined.shape

In [None]:
for i in range(0, 3655):
    for j in range(1,7):
        Coach_polls = Coach_polls.append(Coach_polls.loc[i].replace(Coach_polls['date_game'][i], Coach_polls['date_game'][i]+timedelta(days=j)))
        Coach_polls.reset_index(drop=True, inplace=True)        

In [None]:
Coach_polls.set_index(['date_game', 'school_id'], drop=False, inplace=True)

In [None]:
joined = joined.join(Coach_polls['rank'], how='left', on=['date_game', 'school_id'], rsuffix='_r')

In [None]:
# Why is a 'left' join making extra rows?
joined.shape

In [None]:
joined.head()

In [None]:
joined.drop(['school_id', 'date_game', 'opp_id'], axis=1, inplace=True)

In [None]:
joined.fillna(30, inplace=True)

In [None]:
list(joined.dtypes)

In [None]:
joined.head()

## Make Some Interaction Features

In [97]:
len(joined.columns)

133

In [98]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [99]:
joined.columns[63]

'rm30W'

In [100]:
interact_pair = []
for i in range(0,64):
    if 'W' not in joined.columns[i]:
        interact_pair.append([joined.columns[i], joined.columns[i+64]])

In [101]:
interact_pair[:5]

[['ewm03fg2', 'ewm03heldto_fg2'],
 ['ewm03fg2a', 'ewm03heldto_fg2a'],
 ['ewm03fg3', 'ewm03heldto_fg3'],
 ['ewm03fg3a', 'ewm03heldto_fg3a'],
 ['ewm03ft', 'ewm03heldto_ft']]

In [102]:
interact_pair.append(['SOS', 'opp_SOS'])
interact_pair.append(['SRS', 'opp_SRS'])

In [103]:
len(interact_pair)

62

In [104]:
for i, j in interact_pair:
    joined[i+j] = joined[i] - joined[j]

In [105]:
joined.shape

(75355, 195)

In [106]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30drbrm30heldto_drb,rm30astrm30heldto_ast,rm30stlrm30heldto_stl,rm30blkrm30heldto_blk,rm30tovrm30heldto_tov,rm30pfrm30heldto_pf,rm30ptsrm30heldto_pts,rm30game_scorerm30heldto_game_score,SOSopp_SOS,SRSopp_SRS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,-0.1,-1.866667,-0.1,-0.8,2.8,2.066667,0.1,-5.94,-6.44,-19.07
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,0.633333,-2.733333,0.233333,-0.433333,-0.433333,0.766667,-6.433333,-7.926667,1.3,-7.18
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,-0.9,-2.633333,0.866667,-0.9,4.833333,4.166667,-6.633333,-15.22,-1.63,-2.98
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,-3.966667,0.7,1.733333,-0.033333,0.166667,-0.233333,-0.1,1.423333,4.3,-7.62
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,-0.1,-2.3,-0.266667,-1.4,1.166667,-0.133333,-3.733333,-9.02,-0.2,-6.15


In [107]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [None]:
joined.to_csv('../data/joined_modeling.csv')

# Baseline

In [110]:
# Count since 2003
# https://www.teamrankings.com/ncb/odds-history/win/
spread = pd.read_csv('../data/point_spread_history.csv', sep='\t', header=None)

In [111]:
spread_cols = ['closing_spread', 'game_count', 'record', 'null1', 'null2', 'null3', 'null4',
            'null5', 'null6', 'null7', 'null8']
spread.columns = spread_cols

In [112]:
spread.loc[85]

closing_spread            0
game_count              914
record            457-457-0
null1                 50.0%
null2                     0
null3             220-237-0
null4                 48.1%
null5                 -0.71
null6             237-220-0
null7                 51.9%
null8                  0.71
Name: 85, dtype: object

In [113]:
# Regex: ^[0-9]{1,4}(?=-)   Will find the first column (incorrect)
# Regex: (?<=-)[0-9]{1,4}(?=-) Will find the second column (correct)

In [114]:
incorrect = []
correct = []
for i in spread['record'][:85]:
    inc = re.findall('^[0-9]{1,4}(?=-)', i)
    cor = re.findall('(?<=-)[0-9]{1,4}(?=-)', i)
    
    incorrect.append(int(inc[0]))
    correct.append(int(cor[0]))

In [115]:
sum(correct) / (sum(correct) + sum(incorrect))

0.7430196294257935

### Vegas correctly picks the winner in the opening line 74.3% of the time since 2013

# Modeling

## Begin Modeling Here (Can skip everything above now)

### Bring in modeling df

In [2]:
joined = pd.read_csv('../data/joined_modeling.csv')

In [3]:
joined.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [4]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30drbrm30heldto_drb,rm30astrm30heldto_ast,rm30stlrm30heldto_stl,rm30blkrm30heldto_blk,rm30tovrm30heldto_tov,rm30pfrm30heldto_pf,rm30ptsrm30heldto_pts,rm30game_scorerm30heldto_game_score,SOSopp_SOS,SRSopp_SRS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,-0.1,-1.866667,-0.1,-0.8,2.8,2.066667,0.1,-5.94,-6.44,-19.07
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,0.633333,-2.733333,0.233333,-0.433333,-0.433333,0.766667,-6.433333,-7.926667,1.3,-7.18
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,-0.9,-2.633333,0.866667,-0.9,4.833333,4.166667,-6.633333,-15.22,-1.63,-2.98
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,-3.966667,0.7,1.733333,-0.033333,0.166667,-0.233333,-0.1,1.423333,4.3,-7.62
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,-0.1,-2.3,-0.266667,-1.4,1.166667,-0.133333,-3.733333,-9.02,-0.2,-6.15


In [None]:
#drop_cols = []
#for col in joined.columns:
#    if 'SRS' in col:
#        drop_cols.append(col)

In [None]:
#joined.drop(drop_cols, axis=1, inplace=True)

In [108]:
joined.shape

(75355, 195)

In [38]:
list(joined.columns)

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [5]:
cols_interest = list(joined.columns[:133])
#cols_interest = joined.columns[128:133]

In [6]:
cols_interest

['ewm03fg2',
 'ewm03fg2a',
 'ewm03fg3',
 'ewm03fg3a',
 'ewm03ft',
 'ewm03fta',
 'ewm03orb',
 'ewm03drb',
 'ewm03ast',
 'ewm03stl',
 'ewm03blk',
 'ewm03tov',
 'ewm03pf',
 'ewm03pts',
 'ewm03game_score',
 'ewm03W',
 'ewm10fg2',
 'ewm10fg2a',
 'ewm10fg3',
 'ewm10fg3a',
 'ewm10ft',
 'ewm10fta',
 'ewm10orb',
 'ewm10drb',
 'ewm10ast',
 'ewm10stl',
 'ewm10blk',
 'ewm10tov',
 'ewm10pf',
 'ewm10pts',
 'ewm10game_score',
 'ewm10W',
 'ewm20fg2',
 'ewm20fg2a',
 'ewm20fg3',
 'ewm20fg3a',
 'ewm20ft',
 'ewm20fta',
 'ewm20orb',
 'ewm20drb',
 'ewm20ast',
 'ewm20stl',
 'ewm20blk',
 'ewm20tov',
 'ewm20pf',
 'ewm20pts',
 'ewm20game_score',
 'ewm20W',
 'rm30fg2',
 'rm30fg2a',
 'rm30fg3',
 'rm30fg3a',
 'rm30ft',
 'rm30fta',
 'rm30orb',
 'rm30drb',
 'rm30ast',
 'rm30stl',
 'rm30blk',
 'rm30tov',
 'rm30pf',
 'rm30pts',
 'rm30game_score',
 'rm30W',
 'ewm03heldto_fg2',
 'ewm03heldto_fg2a',
 'ewm03heldto_fg3',
 'ewm03heldto_fg3a',
 'ewm03heldto_ft',
 'ewm03heldto_fta',
 'ewm03heldto_orb',
 'ewm03heldto_drb',
 'e

In [54]:
#cols_interest.append(joined.columns[130])

In [55]:
#cols_interest.append(joined.columns[132])

### Split X and y, and apply Scaling

In [147]:
#X = joined[joined.columns[:64].append(joined.columns[128:])].drop('W', axis=1)
X = joined[cols_interest].drop('W', axis=1)
#X = team_b_df.drop('W', axis=1)
y = joined['W']
#y = team_b_df['W']

In [148]:
ss = StandardScaler()
X = ss.fit_transform(X)

### Apply some PCA to reduce features

In [149]:
pca = PCA(n_components=50)
pca = pca.fit(X)

In [150]:
pca.explained_variance_

array([18.99564042, 17.27010269, 12.42070411,  8.2121557 ,  7.39360937,
        6.08029454,  4.8569003 ,  4.75379698,  4.24374435,  3.66004452,
        3.32185342,  3.15004117,  2.80948152,  2.68697144,  2.32570721,
        2.21367291,  1.94215655,  1.88492781,  1.72275091,  1.52227339,
        1.45170423,  1.27736111,  1.18634258,  1.10493617,  1.00720926,
        0.94243617,  0.87424196,  0.79484226,  0.72909465,  0.72231592,
        0.63279465,  0.5939362 ,  0.55893001,  0.54049491,  0.50610406,
        0.46165252,  0.45161221,  0.43478475,  0.40989952,  0.38831738,
        0.37126595,  0.3335199 ,  0.31475104,  0.29193395,  0.27941333,
        0.2743362 ,  0.25236852,  0.23378064,  0.19967883,  0.192123  ])

In [151]:
X = pca.transform(X)

In [152]:
X = pd.DataFrame(X, index=joined[cols_interest].drop('W', axis=1).index)

## Train/Test Split

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(52748, 50)
(52748,)
(22607, 50)
(22607,)


## Logistic Regression

In [36]:
logreg = LogisticRegression(C=.01)
logreg.fit(X_train, y_train)

LogisticRegression(C=0.01, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [37]:
scores = cross_val_score(logreg, X_train, y_train)
print(np.mean(scores))

0.7224918589745378


## Grid Search on Logistic Regression

In [157]:
lr_params = {'penalty':['l1', 'l2'],
             'tol':[.00001, .0001, .001],
             'C':[.001, .01, 1, 10],
            }

In [158]:
grid_lr = GridSearchCV(LogisticRegression(), lr_params, cv=5)
grid_lr.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'tol': [1e-05, 0.0001, 0.001], 'C': [0.001, 0.01, 1, 10]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [159]:
scores = cross_val_score(grid_lr, X_train, y_train)
np.mean(scores)

0.7254116307289898

In [160]:
grid_lr.best_params_

{'C': 0.01, 'penalty': 'l2', 'tol': 1e-05}

In [161]:
scores = cross_val_score(grid_lr, X_test, y_test)
np.mean(scores)

0.7269430379390164

In [162]:
preds = grid_lr.predict(X_test)

In [164]:
print(classification_report(y_test, preds))
pd.DataFrame(confusion_matrix(y_test, preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.73      0.73      0.73     11385
          1       0.72      0.73      0.73     11222

avg / total       0.73      0.73      0.73     22607



Unnamed: 0,Pred -,Pred +
Act -,8258,3127
Act +,3010,8212


### AUC ROC Score and Curve

In [15]:
preds = logreg.predict(X_test)

In [16]:
print(classification_report(y_test, preds))
pd.DataFrame(confusion_matrix(y_test, preds), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.72      0.71      0.72     11375
          1       0.71      0.72      0.72     11232

avg / total       0.72      0.72      0.72     22607



Unnamed: 0,Pred -,Pred +
Act -,8121,3254
Act +,3147,8085


In [None]:
proba_pairs = logreg.predict_proba(X_test)
probas = [item[1] for item in proba_pairs]
roc_auc_score(y_test, probas)

In [None]:
threshold[4200]

In [None]:
fpr, tpr, threshold = roc_curve(y_test, probas)
roc_auc = auc(fpr, tpr)

fix, ax = plt.subplots(figsize=(14,10))

plt.title('Receiver Operating Characteristic', fontsize=20)
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.scatter(fpr[4200], tpr[4200], s=200, label='Threshold = 50%')
plt.legend(loc = 'lower right', fontsize=14)
plt.plot([0, 1], [0, 1],'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate', fontsize=16)
plt.xlabel('False Positive Rate', fontsize=16)
plt.show()

### What about double predictions?

In [153]:
pred_proba = logreg.predict_proba(X)

In [154]:
preds = logreg.predict(X)

In [156]:
preds

array([0, 0, 0, ..., 0, 1, 0])

In [157]:
X['W'] = y

In [158]:
prob1 = []
for i in test_pred_proba:
    prob1.append(i[1])

In [159]:
X['prob1'] = prob1

In [160]:
X['orig_pred'] = preds

In [161]:
X.head(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,W,prob1,orig_pred
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,3.79096,-6.081436,-1.205655,0.215862,2.258905,0.954076,-1.209752,0.740085,0.972303,0.216865,...,-0.387634,-0.927539,-0.085734,-0.220914,0.641706,-0.403588,-0.399778,0,0.075002,0
High Point,2011-11-22,Tennessee Tech,3.931356,-4.571748,-0.828146,0.611397,-4.823761,-0.536561,-1.679852,-1.790301,-1.487629,-1.358986,...,-0.436251,1.335334,0.706973,-0.846529,0.075936,0.319487,-0.486209,0,0.213931,0
High Point,2011-11-26,Citadel,6.209464,-2.413404,-0.129863,-1.072159,2.149813,0.434139,-0.494764,1.703296,-1.207812,0.071694,...,-0.349529,0.291418,0.200816,-0.693933,-0.314911,-0.540619,-0.184733,1,0.386038,0
High Point,2011-11-29,Hampton,0.454386,-3.254574,-1.482772,-0.301871,-0.669659,0.866319,-3.333138,-1.82865,-0.053367,0.775072,...,-0.202736,-0.184447,-0.551318,-0.392278,0.676153,-0.259727,-0.033954,0,0.282891,0
High Point,2011-12-03,Campbell,5.278938,-1.81961,-1.936405,4.385084,-1.638864,0.187775,-3.209625,-2.566547,0.070555,-1.235582,...,-0.565004,0.200756,-0.096707,-0.521479,0.26998,0.035179,0.316416,0,0.272463,0
High Point,2011-12-07,Wake Forest,5.23273,-2.315327,-0.397413,0.3043,0.472577,1.2863,-2.069609,0.057848,0.052331,1.356419,...,-0.592243,-0.53874,-0.056561,-0.097292,0.22629,-0.33143,-0.089769,0,0.12048,0
High Point,2011-12-17,Marshall,-0.394789,-5.053176,0.909499,4.140854,2.090304,3.5204,-2.886082,0.333212,-0.893369,0.142537,...,-0.059149,-0.530275,-0.712492,-0.404795,0.334221,-0.138082,0.145277,0,0.055102,0
High Point,2011-12-21,Wofford,-1.135603,-7.5956,1.792859,2.453736,1.151321,2.714658,-1.918204,1.011617,-1.743226,1.319047,...,0.464003,-0.309558,0.797933,-0.37111,-0.095657,0.366033,0.091093,1,0.06951,0
High Point,2011-12-31,Coastal Carolina,-0.086295,-3.248458,3.407372,-0.924722,0.798661,1.534494,-1.024319,1.502055,-0.257058,-1.589805,...,0.334158,-0.179296,-0.697307,-0.000307,0.26295,-0.030801,0.618555,0,0.123307,0
High Point,2012-01-02,Charleston Southern,3.979149,-1.698766,0.273933,0.802957,1.957876,0.869807,-2.623152,-1.365201,0.777798,-1.119881,...,0.34884,0.537024,0.4015,-0.460619,0.173691,-0.219939,0.14177,1,0.295662,0


In [162]:
X.reset_index(inplace=True)

In [163]:
X.sort_values(['date_game', 'school_id', 'opp_id'], inplace=True)

In [164]:
X.reset_index(inplace=True, drop=True)

In [165]:
X.head()

Unnamed: 0,school_id,date_game,opp_id,0,1,2,3,4,5,6,...,43,44,45,46,47,48,49,W,prob1,orig_pred
0,James Madison,2011-02-26,Virginia Commonwealth,-1.126635,-0.3159,-0.732924,1.114919,4.413185,-2.85909,0.709293,...,-0.043985,-0.549453,1.005969,-0.614596,-0.547648,0.222794,-0.611651,1,0.339285,0
1,Virginia Commonwealth,2011-02-26,James Madison,-1.602078,-0.714932,2.932586,-0.46873,0.988985,-0.492004,0.011567,...,0.26556,-0.246081,-0.424792,0.047127,0.417467,-0.100864,-0.225713,0,0.733492,1
2,Detroit Mercy,2011-03-01,Loyola (IL),-4.309646,0.462461,-4.008595,-3.783444,1.136622,2.059967,1.488198,...,0.229844,0.701309,0.642304,-0.37353,0.652018,0.030005,0.721704,1,0.598441,1
3,Loyola (IL),2011-03-01,Detroit Mercy,2.631673,-0.63723,1.151751,-2.029922,1.445821,-0.139568,-1.141033,...,0.117277,0.191076,0.078192,0.119577,0.899368,-0.436771,-0.070236,0,0.424766,0
4,Illinois State,2011-03-03,Southern Illinois,3.206396,-5.072742,-0.699457,0.307324,0.468525,-1.346195,2.464895,...,-0.353864,-0.819297,0.306177,0.120034,0.426111,-0.436024,-0.275414,0,0.424961,0


In [166]:
X.index[-1]

75354

In [167]:
team_a = []
team_b = []
for i in X.index:
    if i % 500 == 0:
        print('parsing...', i)
    if i not in team_b:
        for j in range(i, X.index[-1]):
            if X['school_id'][j] == X['opp_id'][i] and X['date_game'][i] == X['date_game'][j]:
                team_a.append(i)
                team_b.append(j)
                break

parsing... 0
parsing... 500
parsing... 1000
parsing... 1500
parsing... 2000
parsing... 2500
parsing... 3000
parsing... 3500
parsing... 4000
parsing... 4500
parsing... 5000
parsing... 5500
parsing... 6000
parsing... 6500
parsing... 7000
parsing... 7500
parsing... 8000
parsing... 8500
parsing... 9000
parsing... 9500
parsing... 10000
parsing... 10500
parsing... 11000
parsing... 11500
parsing... 12000
parsing... 12500
parsing... 13000
parsing... 13500
parsing... 14000
parsing... 14500
parsing... 15000
parsing... 15500
parsing... 16000
parsing... 16500
parsing... 17000
parsing... 17500
parsing... 18000
parsing... 18500
parsing... 19000
parsing... 19500
parsing... 20000
parsing... 20500
parsing... 21000
parsing... 21500
parsing... 22000
parsing... 22500
parsing... 23000
parsing... 23500
parsing... 24000
parsing... 24500
parsing... 25000
parsing... 25500
parsing... 26000
parsing... 26500
parsing... 27000
parsing... 27500
parsing... 28000
parsing... 28500
parsing... 29000
parsing... 29500
pars

In [168]:
print(len(team_a))
print(len(team_b))

37651
37651


In [169]:
X_team_a = X.loc[team_a]
X_team_b = X.loc[team_b]

In [170]:
X_team_a.head()

Unnamed: 0,school_id,date_game,opp_id,0,1,2,3,4,5,6,...,43,44,45,46,47,48,49,W,prob1,orig_pred
0,James Madison,2011-02-26,Virginia Commonwealth,-1.126635,-0.3159,-0.732924,1.114919,4.413185,-2.85909,0.709293,...,-0.043985,-0.549453,1.005969,-0.614596,-0.547648,0.222794,-0.611651,1,0.339285,0
2,Detroit Mercy,2011-03-01,Loyola (IL),-4.309646,0.462461,-4.008595,-3.783444,1.136622,2.059967,1.488198,...,0.229844,0.701309,0.642304,-0.37353,0.652018,0.030005,0.721704,1,0.598441,1
4,Illinois State,2011-03-03,Southern Illinois,3.206396,-5.072742,-0.699457,0.307324,0.468525,-1.346195,2.464895,...,-0.353864,-0.819297,0.306177,0.120034,0.426111,-0.436024,-0.275414,0,0.424961,0
6,Belmont,2011-03-04,Mercer,-4.5739,4.045212,1.637228,5.285766,-5.21734,1.86489,-4.796025,...,-0.875938,-0.510293,0.32453,-0.480558,0.553419,-0.926061,-0.079848,1,0.854592,1
7,Cleveland State,2011-03-04,Wright State,-4.856683,-2.594846,0.701281,1.773014,-2.965519,0.62455,2.183951,...,0.042935,-0.082859,-0.215088,0.362639,-0.557945,-0.334266,-0.933414,1,0.607643,1


In [172]:
X_team_b.head()

Unnamed: 0,school_id,date_game,opp_id,0,1,2,3,4,5,6,...,43,44,45,46,47,48,49,W,prob1,orig_pred
1,Virginia Commonwealth,2011-02-26,James Madison,-1.602078,-0.714932,2.932586,-0.46873,0.988985,-0.492004,0.011567,...,0.26556,-0.246081,-0.424792,0.047127,0.417467,-0.100864,-0.225713,0,0.733492,1
3,Loyola (IL),2011-03-01,Detroit Mercy,2.631673,-0.63723,1.151751,-2.029922,1.445821,-0.139568,-1.141033,...,0.117277,0.191076,0.078192,0.119577,0.899368,-0.436771,-0.070236,0,0.424766,0
5,Southern Illinois,2011-03-03,Illinois State,1.79211,-6.424995,-0.760293,-1.635151,-0.490843,-1.128347,2.227399,...,0.207982,0.238657,-0.354454,-0.461278,0.03168,-0.609977,-0.109822,1,0.527602,1
12,Mercer,2011-03-04,Belmont,-4.762402,-5.47194,-3.072437,4.89628,-3.375397,1.722534,-2.797522,...,0.131848,0.53211,0.226264,0.077479,-0.296104,0.513045,-0.621028,0,0.160063,0
19,Wright State,2011-03-04,Cleveland State,1.511035,-4.620702,1.719798,4.319319,-1.35354,-0.016617,1.63146,...,0.614284,-0.491927,0.698683,0.002417,-0.347355,-0.057007,-0.813374,0,0.328176,0


In [173]:
prob_opp1 = []
for i in team_b:
    prob_opp1.append(X.loc[i, 'prob1'])

In [174]:
X_team_a['prob_opp1'] = prob_opp1

In [175]:
X_team_a.head()

Unnamed: 0,school_id,date_game,opp_id,0,1,2,3,4,5,6,...,44,45,46,47,48,49,W,prob1,orig_pred,prob_opp1
0,James Madison,2011-02-26,Virginia Commonwealth,-1.126635,-0.3159,-0.732924,1.114919,4.413185,-2.85909,0.709293,...,-0.549453,1.005969,-0.614596,-0.547648,0.222794,-0.611651,1,0.339285,0,0.733492
2,Detroit Mercy,2011-03-01,Loyola (IL),-4.309646,0.462461,-4.008595,-3.783444,1.136622,2.059967,1.488198,...,0.701309,0.642304,-0.37353,0.652018,0.030005,0.721704,1,0.598441,1,0.424766
4,Illinois State,2011-03-03,Southern Illinois,3.206396,-5.072742,-0.699457,0.307324,0.468525,-1.346195,2.464895,...,-0.819297,0.306177,0.120034,0.426111,-0.436024,-0.275414,0,0.424961,0,0.527602
6,Belmont,2011-03-04,Mercer,-4.5739,4.045212,1.637228,5.285766,-5.21734,1.86489,-4.796025,...,-0.510293,0.32453,-0.480558,0.553419,-0.926061,-0.079848,1,0.854592,1,0.160063
7,Cleveland State,2011-03-04,Wright State,-4.856683,-2.594846,0.701281,1.773014,-2.965519,0.62455,2.183951,...,-0.082859,-0.215088,0.362639,-0.557945,-0.334266,-0.933414,1,0.607643,1,0.328176


In [176]:
W_adj = []
for i in X_team_a.index:
    if X_team_a.loc[i, 'prob1'] > X_team_a.loc[i, 'prob_opp1']:
        W_adj.append(1)
    else:
        W_adj.append(0)

In [178]:
X_team_a['W_adj'] = W_adj

In [179]:
X_team_a.head()

Unnamed: 0,school_id,date_game,opp_id,0,1,2,3,4,5,6,...,45,46,47,48,49,W,prob1,orig_pred,prob_opp1,W_adj
0,James Madison,2011-02-26,Virginia Commonwealth,-1.126635,-0.3159,-0.732924,1.114919,4.413185,-2.85909,0.709293,...,1.005969,-0.614596,-0.547648,0.222794,-0.611651,1,0.339285,0,0.733492,0
2,Detroit Mercy,2011-03-01,Loyola (IL),-4.309646,0.462461,-4.008595,-3.783444,1.136622,2.059967,1.488198,...,0.642304,-0.37353,0.652018,0.030005,0.721704,1,0.598441,1,0.424766,1
4,Illinois State,2011-03-03,Southern Illinois,3.206396,-5.072742,-0.699457,0.307324,0.468525,-1.346195,2.464895,...,0.306177,0.120034,0.426111,-0.436024,-0.275414,0,0.424961,0,0.527602,0
6,Belmont,2011-03-04,Mercer,-4.5739,4.045212,1.637228,5.285766,-5.21734,1.86489,-4.796025,...,0.32453,-0.480558,0.553419,-0.926061,-0.079848,1,0.854592,1,0.160063,1
7,Cleveland State,2011-03-04,Wright State,-4.856683,-2.594846,0.701281,1.773014,-2.965519,0.62455,2.183951,...,-0.215088,0.362639,-0.557945,-0.334266,-0.933414,1,0.607643,1,0.328176,1


In [180]:
X_team_a.set_index(['school_id', 'date_game', 'opp_id'], inplace=True)

In [181]:
X_team_a['W'].sum()

18206

In [182]:
X_team_a['W_adj'].sum()

17995

In [183]:
print(classification_report(X_team_a['W'], X_team_a['orig_pred']))
pd.DataFrame(confusion_matrix(X_team_a['W'], X_team_a['orig_pred']), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.72      0.73      0.73     19445
          1       0.71      0.70      0.71     18206

avg / total       0.72      0.72      0.72     37651



Unnamed: 0,Pred -,Pred +
Act -,14286,5159
Act +,5435,12771


In [184]:
print(classification_report(X_team_a['W'], X_team_a['W_adj']))
pd.DataFrame(confusion_matrix(X_team_a['W'], X_team_a['W_adj']), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.73      0.74      0.73     19445
          1       0.72      0.71      0.71     18206

avg / total       0.72      0.72      0.72     37651



Unnamed: 0,Pred -,Pred +
Act -,14321,5124
Act +,5335,12871


## March Madness!

In [185]:
page1 = pd.read_csv('../data/marchmadness_2018/scores_pg1.csv', sep='\t')
page2 = pd.read_csv('../data/marchmadness_2018/scores_pg2.csv', sep='\t')

In [186]:
page1.reset_index(inplace=True)

In [187]:
print(page1.shape)
print(page2.shape)

(100, 12)
(35, 12)


In [188]:
page1.columns = ['rank', 'year', 'date', 'region', 'round', 'school_id', 'pts', 'opp_id',
                'opp_pts', 'ot', 'pts_df', 'location']

page2.columns = ['rank', 'year', 'date', 'region', 'round', 'school_id', 'pts', 'opp_id',
                'opp_pts', 'ot', 'pts_df', 'location']

In [189]:
page2.drop(20, inplace=True)

In [190]:
mm_2018 = pd.concat([page1, page2], ignore_index=True)

In [191]:
mm_2018.reset_index(drop=True, inplace=True)

In [192]:
schools_list = []
opp_list = []
school_wins = []

remove_digits = str.maketrans('', '', digits)
for i in mm_2018.index:
    school_dirty = mm_2018['school_id'][i]
    school_clean = school_dirty.translate(remove_digits).lstrip()
    
    opp_dirty = mm_2018['opp_id'][i]
    opp_clean = opp_dirty.translate(remove_digits).lstrip()
    
    schools_list.append(school_clean)
    opp_list.append(opp_clean)
    
    if mm_2018['pts'][i] > mm_2018['opp_pts'][i]:
        school_wins.append(1)
    else:
        school_wins.append(0)

In [193]:
print(len(schools_list))
print(len(opp_list))
print(len(school_wins))

134
134
134


In [194]:
mm_2018['school_id'] = schools_list
mm_2018['opp_id'] = opp_list
mm_2018['W'] = school_wins

In [195]:
mm_2018.head()

Unnamed: 0,rank,year,date,region,round,school_id,pts,opp_id,opp_pts,ot,pts_df,location,W
0,1,2018,"April 2, 2018",National,National Final,Villanova,79,Michigan,62,,17,"San Antonio, TX",1
1,2,2018,"April 2, 2018",National,National Final,Michigan,62,Villanova,79,,-17,"San Antonio, TX",0
2,3,2018,"March 31, 2018",National,National Semifinal,Villanova,95,Kansas,79,,16,"San Antonio, TX",1
3,4,2018,"March 31, 2018",National,National Semifinal,Michigan,69,Loyola (IL),57,,12,"San Antonio, TX",1
4,5,2018,"March 31, 2018",National,National Semifinal,Loyola (IL),57,Michigan,69,,-12,"San Antonio, TX",0


In [205]:
notin_list = []
for team in mm_2018['school_id']:
    if team not in df['school_id'].unique():
        notin_list.append(team)

In [206]:
notin_list

[]

In [207]:
mm_2018['rank'] = mm_2018['rank'].apply(lambda x: int(x))
mm_2018.sort_values('rank', inplace=True, ascending=False)
mm_2018.reset_index(drop=True, inplace=True)

In [209]:
mm_2018['date'] = pd.to_datetime(mm_2018['date'])

In [210]:
mm_2018.dtypes

rank                  int64
year                 object
date         datetime64[ns]
region               object
round                object
school_id            object
pts                  object
opp_id               object
opp_pts              object
ot                   object
pts_df               object
location             object
W                     int64
dtype: object

In [211]:
joined.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03fg2,ewm03fg2a,ewm03fg3,ewm03fg3a,ewm03ft,ewm03fta,ewm03orb,ewm03drb,ewm03ast,ewm03stl,...,rm30drbrm30heldto_drb,rm30astrm30heldto_ast,rm30stlrm30heldto_stl,rm30blkrm30heldto_blk,rm30tovrm30heldto_tov,rm30pfrm30heldto_pf,rm30ptsrm30heldto_pts,rm30game_scorerm30heldto_game_score,SOSopp_SOS,SRSopp_SRS
school_id,date_game,opp_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2011-11-18,Central Florida,14.449124,32.502785,6.527332,17.261717,10.273871,14.04575,6.731944,20.382396,8.311464,4.645682,...,-0.1,-1.866667,-0.1,-0.8,2.8,2.066667,0.1,-5.94,-6.44,-19.07
High Point,2011-11-22,Tennessee Tech,15.724562,33.751393,7.263666,18.130859,12.136936,18.022875,5.865972,20.691198,11.655732,4.322841,...,0.633333,-2.733333,0.233333,-0.433333,-0.433333,0.766667,-6.433333,-7.926667,1.3,-7.18
High Point,2011-11-26,Citadel,15.362281,35.875696,7.131833,20.565429,9.068468,15.011437,8.932986,19.845599,14.327866,5.66142,...,-0.9,-2.633333,0.866667,-0.9,4.833333,4.166667,-6.633333,-15.22,-1.63,-2.98
High Point,2011-11-29,Hampton,17.68114,39.937848,7.065916,17.782715,14.034234,23.005719,10.466493,25.9228,13.663933,7.33071,...,-3.966667,0.7,1.733333,-0.033333,0.166667,-0.233333,-0.1,1.423333,4.3,-7.62
High Point,2011-12-03,Campbell,12.84057,34.468924,10.032958,23.891357,11.517117,19.502859,12.733246,20.9614,11.831967,5.665355,...,-0.1,-2.3,-0.266667,-1.4,1.166667,-0.133333,-3.733333,-9.02,-0.2,-6.15


In [323]:
yourteam_ewm_df.head()

Unnamed: 0,date_game,ewm03heldto_fg2,ewm03heldto_fg2a,ewm03heldto_fg3,ewm03heldto_fg3a,ewm03heldto_ft,ewm03heldto_fta,ewm03heldto_orb,ewm03heldto_drb,ewm03heldto_ast,...,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W,heldto_opp_id,heldto_school_id
0,2010-11-15,,,,,,,,,,...,,,,,,,,,High Point,Old Dominion
1,2010-11-23,19.0,36.0,7.0,19.0,20.0,34.0,17.0,27.0,19.0,...,,,,,,,,,High Point,Citadel
2,2010-11-28,18.333333,40.0,5.666667,17.0,20.0,31.333333,15.666667,26.333333,17.0,...,,,,,,,,,High Point,Hampton
3,2010-12-02,16.428571,39.428571,4.714286,13.571429,24.571429,35.142857,10.714286,26.714286,11.285714,...,,,,,,,,,High Point,Gardner-Webb
4,2010-12-04,18.866667,38.666667,2.2,14.333333,23.2,34.533333,11.933333,26.333333,6.866667,...,,,,,,,,,High Point,North Carolina-Asheville


In [324]:
yourteam_ewm_df.set_index(['heldto_opp_id', 'date_game', 'heldto_school_id'], inplace=True)
yourteam_ewm_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ewm03heldto_fg2,ewm03heldto_fg2a,ewm03heldto_fg3,ewm03heldto_fg3a,ewm03heldto_ft,ewm03heldto_fta,ewm03heldto_orb,ewm03heldto_drb,ewm03heldto_ast,ewm03heldto_stl,...,rm30heldto_orb,rm30heldto_drb,rm30heldto_ast,rm30heldto_stl,rm30heldto_blk,rm30heldto_tov,rm30heldto_pf,rm30heldto_pts,rm30heldto_game_score,rm30heldto_W
heldto_opp_id,date_game,heldto_school_id,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
High Point,2010-11-15,Old Dominion,,,,,,,,,,,...,,,,,,,,,,
High Point,2010-11-23,Citadel,19.0,36.0,7.0,19.0,20.0,34.0,17.0,27.0,19.0,14.0,...,,,,,,,,,,
High Point,2010-11-28,Hampton,18.333333,40.0,5.666667,17.0,20.0,31.333333,15.666667,26.333333,17.0,6.666667,...,,,,,,,,,,
High Point,2010-12-02,Gardner-Webb,16.428571,39.428571,4.714286,13.571429,24.571429,35.142857,10.714286,26.714286,11.285714,5.142857,...,,,,,,,,,,
High Point,2010-12-04,North Carolina-Asheville,18.866667,38.666667,2.2,14.333333,23.2,34.533333,11.933333,26.333333,6.866667,6.133333,...,,,,,,,,,,


In [325]:
yourteam_ewm_df.shape

(87171, 64)

In [361]:
def row_maker(school, opp):
    school_max_date = joined.loc[school].index.max()[0]
    opp_max_date = joined.loc[opp].index.max()[0]
    last_school_opp_played = joined.loc[opp, opp_max_date].index[0]
    
    row = pd.concat([joined.loc[school, school_max_date][joined.columns[0:64]].reset_index(drop=True),
                     yourteam_ewm_df.loc[opp, opp_max_date][joined.columns[64:128]].reset_index(drop=True),
                     joined.loc[school, school_max_date][joined.columns[129:131]].reset_index(drop=True),
                     joined.loc[last_school_opp_played, opp_max_date][joined.columns[131:133]].reset_index(drop=True).rename(columns={'SRS':'opp_SRS', 'SOS':'opp_SOS'})], 
                    axis=1)
    
    return(row)

In [363]:
row_maker('Villanova', 'Gonzaga').shape



(1, 132)

In [453]:
first4_indices = np.arange(0, 8, 1)
firstround_indices = np.arange(8, 72, 1)
secondround_indices = np.arange(72, 104, 1)
sweet16_indices = np.arange(104, 120, 1)
regional_indices = np.arange(120, 128, 1)
final4_indices = np.arange(128, 132, 1)
final_indices = np.arange(132, 134, 1)

## First 4

In [454]:
# Predictions of 'First 4' Round
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in first4_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)



In [455]:
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50), index=mm_2018['school_id'][first4_indices])

In [456]:
temp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Long Island University,9.540127,241.171095,-36.459663,22.602504,-10.47083,64.245706,-44.876218,-11.743921,-60.719648,-0.056785,...,-8.429311,-1.013363,-5.856958,-12.958628,3.585506,8.333517,-7.469275,7.940514,-3.762565,-4.209224
Radford,50.857581,243.047903,-40.136786,39.2199,10.04182,57.303718,-36.892919,-26.522902,-57.621585,-13.013492,...,-9.475974,-1.129156,-5.103476,-13.399744,3.910504,6.094585,-4.982698,7.363737,-7.310075,-3.527716
St. Bonaventure,29.908289,287.566449,-39.272487,20.986849,-4.654294,68.352169,-32.831351,-22.561408,-58.80889,-16.91394,...,-10.184357,0.602011,-7.370795,-21.74655,-5.755606,7.720282,-8.748425,8.503292,-10.654689,-4.72686
UCLA,34.83011,276.652273,-27.84748,31.723937,-7.626652,53.760068,-44.773539,-31.627099,-65.048351,-8.86311,...,-7.399744,-3.543497,-6.143728,-18.145617,2.403135,7.618834,-3.792212,9.792928,-8.665316,-3.679824
Arizona State,-1.92223,246.725978,-31.70483,21.4038,-10.733528,62.944306,-39.74736,-13.14972,-48.969001,-9.376549,...,-9.159078,9.430197,-6.07099,-12.066518,0.625027,8.058898,-6.074781,6.768249,-0.736427,-5.667515
North Carolina Central,50.45311,262.663933,-51.633619,32.812932,21.500071,59.156246,-46.893089,-21.353568,-75.697525,-8.830596,...,-8.777475,-2.638728,-4.562545,-17.880203,-0.267073,6.301482,-3.685994,4.742119,-7.29957,-4.782469
Syracuse,60.804143,219.127619,-41.451305,32.644423,10.464009,40.06668,-31.324162,-47.0719,-48.5003,-14.544136,...,-6.336952,-3.779872,-4.816709,-6.666391,1.959645,5.999221,0.713604,2.45091,-0.409327,-1.371444
Texas Southern,29.838455,269.68333,-44.237627,28.663489,-4.521232,64.12984,-31.661811,-12.539519,-62.679467,-13.328357,...,-14.637477,0.268622,-5.312119,-13.429717,2.781479,7.628475,-7.585283,8.380859,-8.254904,-2.872846


In [457]:
temp_df['W'] = mm_2018['W'][first4_indices].values

In [459]:
temp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,W
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Long Island University,9.540127,241.171095,-36.459663,22.602504,-10.47083,64.245706,-44.876218,-11.743921,-60.719648,-0.056785,...,-1.013363,-5.856958,-12.958628,3.585506,8.333517,-7.469275,7.940514,-3.762565,-4.209224,0
Radford,50.857581,243.047903,-40.136786,39.2199,10.04182,57.303718,-36.892919,-26.522902,-57.621585,-13.013492,...,-1.129156,-5.103476,-13.399744,3.910504,6.094585,-4.982698,7.363737,-7.310075,-3.527716,1
St. Bonaventure,29.908289,287.566449,-39.272487,20.986849,-4.654294,68.352169,-32.831351,-22.561408,-58.80889,-16.91394,...,0.602011,-7.370795,-21.74655,-5.755606,7.720282,-8.748425,8.503292,-10.654689,-4.72686,1
UCLA,34.83011,276.652273,-27.84748,31.723937,-7.626652,53.760068,-44.773539,-31.627099,-65.048351,-8.86311,...,-3.543497,-6.143728,-18.145617,2.403135,7.618834,-3.792212,9.792928,-8.665316,-3.679824,0
Arizona State,-1.92223,246.725978,-31.70483,21.4038,-10.733528,62.944306,-39.74736,-13.14972,-48.969001,-9.376549,...,9.430197,-6.07099,-12.066518,0.625027,8.058898,-6.074781,6.768249,-0.736427,-5.667515,0
North Carolina Central,50.45311,262.663933,-51.633619,32.812932,21.500071,59.156246,-46.893089,-21.353568,-75.697525,-8.830596,...,-2.638728,-4.562545,-17.880203,-0.267073,6.301482,-3.685994,4.742119,-7.29957,-4.782469,0
Syracuse,60.804143,219.127619,-41.451305,32.644423,10.464009,40.06668,-31.324162,-47.0719,-48.5003,-14.544136,...,-3.779872,-4.816709,-6.666391,1.959645,5.999221,0.713604,2.45091,-0.409327,-1.371444,1
Texas Southern,29.838455,269.68333,-44.237627,28.663489,-4.521232,64.12984,-31.661811,-12.539519,-62.679467,-13.328357,...,0.268622,-5.312119,-13.429717,2.781479,7.628475,-7.585283,8.380859,-8.254904,-2.872846,1


In [460]:
temp_df['pred'] = logreg.predict(temp_df.drop('W', axis=1))

In [462]:
temp_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,42,43,44,45,46,47,48,49,W,pred
school_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Long Island University,9.540127,241.171095,-36.459663,22.602504,-10.47083,64.245706,-44.876218,-11.743921,-60.719648,-0.056785,...,-5.856958,-12.958628,3.585506,8.333517,-7.469275,7.940514,-3.762565,-4.209224,0,0
Radford,50.857581,243.047903,-40.136786,39.2199,10.04182,57.303718,-36.892919,-26.522902,-57.621585,-13.013492,...,-5.103476,-13.399744,3.910504,6.094585,-4.982698,7.363737,-7.310075,-3.527716,1,1
St. Bonaventure,29.908289,287.566449,-39.272487,20.986849,-4.654294,68.352169,-32.831351,-22.561408,-58.80889,-16.91394,...,-7.370795,-21.74655,-5.755606,7.720282,-8.748425,8.503292,-10.654689,-4.72686,1,0
UCLA,34.83011,276.652273,-27.84748,31.723937,-7.626652,53.760068,-44.773539,-31.627099,-65.048351,-8.86311,...,-6.143728,-18.145617,2.403135,7.618834,-3.792212,9.792928,-8.665316,-3.679824,0,1
Arizona State,-1.92223,246.725978,-31.70483,21.4038,-10.733528,62.944306,-39.74736,-13.14972,-48.969001,-9.376549,...,-6.07099,-12.066518,0.625027,8.058898,-6.074781,6.768249,-0.736427,-5.667515,0,1
North Carolina Central,50.45311,262.663933,-51.633619,32.812932,21.500071,59.156246,-46.893089,-21.353568,-75.697525,-8.830596,...,-4.562545,-17.880203,-0.267073,6.301482,-3.685994,4.742119,-7.29957,-4.782469,0,0
Syracuse,60.804143,219.127619,-41.451305,32.644423,10.464009,40.06668,-31.324162,-47.0719,-48.5003,-14.544136,...,-4.816709,-6.666391,1.959645,5.999221,0.713604,2.45091,-0.409327,-1.371444,1,0
Texas Southern,29.838455,269.68333,-44.237627,28.663489,-4.521232,64.12984,-31.661811,-12.539519,-62.679467,-13.328357,...,-5.312119,-13.429717,2.781479,7.628475,-7.585283,8.380859,-8.254904,-2.872846,1,1


# Round of 64

In [481]:
# Predictions of Round of 64
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in firstround_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)



In [482]:
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

In [487]:
temp_df['school_id'] = mm_2018['school_id'][firstround_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][firstround_indices].values

In [488]:
temp_df['W'] = mm_2018['W'][firstround_indices].values

In [490]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,43,44,45,46,47,48,49,school_id,opp_id,W
0,33.598022,245.350623,-38.389682,10.363170,-3.723563,55.159348,-38.368237,-30.748003,-51.478447,-3.982511,...,-14.788960,0.083271,5.500812,0.031697,3.093240,-2.192344,-2.572244,Alabama,Virginia Tech,1
1,27.786856,276.675822,-48.530684,30.255525,3.346603,49.772580,-38.946294,-26.380955,-67.998022,-12.386208,...,-15.571664,1.774205,7.551549,-10.551544,10.160955,-3.883171,-3.551728,Arizona,Buffalo,0
2,16.046644,280.148698,-37.322316,34.660246,4.466834,63.569757,-45.047400,-19.178693,-67.897782,-14.428414,...,-16.418540,-1.948364,8.187659,-13.363620,10.516425,-6.880219,-4.003940,Buffalo,Arizona,1
3,20.873416,272.122624,-17.747289,36.160316,-1.855437,59.184644,-43.962470,-29.796052,-59.123863,-12.781176,...,-18.203125,1.974573,5.367349,-13.825429,15.097246,-9.242317,-6.447683,Davidson,Kentucky,0
4,23.612909,267.853284,-36.275817,32.671254,12.561662,53.492959,-43.387755,-22.304488,-59.350779,-3.673833,...,-11.622888,3.104884,13.351715,-6.759811,10.877258,-1.566276,-5.478227,Duke,Iona,1
5,42.257265,262.349375,-32.534600,28.238489,-2.924802,45.341052,-42.078248,-34.861275,-60.214260,-17.241359,...,-20.067400,4.590065,5.036065,-2.520781,5.899493,-8.280858,-1.230133,Florida,St. Bonaventure,1
6,10.501977,228.039146,-25.277054,25.312086,-8.708946,41.862111,-39.364771,-18.296676,-53.571191,-3.265383,...,-6.722245,3.232300,9.046216,-7.383903,11.646085,3.337549,-2.715161,Gonzaga,North Carolina-Greensboro,1
7,15.327976,258.049299,-25.903061,28.080496,-8.933283,52.182516,-55.332472,-19.587852,-65.811553,-6.511329,...,-13.719305,3.326284,7.749321,-10.523000,9.339177,-4.005565,-7.390532,Houston,San Diego State,1
8,17.723289,238.299003,-29.679412,26.222909,8.241416,59.952865,-41.006155,-26.459420,-52.812561,-10.423449,...,-9.668124,5.053606,5.733318,-5.793857,12.309139,-2.297132,-0.558374,Iona,Duke,0
9,16.310095,268.360310,-29.466264,43.767863,8.213099,45.124887,-44.753045,-15.907371,-67.886048,-8.260722,...,-17.086880,1.478983,4.636068,-9.980502,12.881775,-5.424235,-3.812769,Kansas,Pennsylvania,1


In [491]:
temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

In [492]:
pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

In [493]:
pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])

In [494]:
temp_df['pred_proba'] = pred_proba_list

In [495]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,school_id,opp_id,W,pred,pred_proba
0,33.598022,245.350623,-38.389682,10.36317,-3.723563,55.159348,-38.368237,-30.748003,-51.478447,-3.982511,...,5.500812,0.031697,3.09324,-2.192344,-2.572244,Alabama,Virginia Tech,1,1,0.805134
1,27.786856,276.675822,-48.530684,30.255525,3.346603,49.77258,-38.946294,-26.380955,-67.998022,-12.386208,...,7.551549,-10.551544,10.160955,-3.883171,-3.551728,Arizona,Buffalo,0,1,0.999996
2,16.046644,280.148698,-37.322316,34.660246,4.466834,63.569757,-45.0474,-19.178693,-67.897782,-14.428414,...,8.187659,-13.36362,10.516425,-6.880219,-4.00394,Buffalo,Arizona,1,0,0.000599
3,20.873416,272.122624,-17.747289,36.160316,-1.855437,59.184644,-43.96247,-29.796052,-59.123863,-12.781176,...,5.367349,-13.825429,15.097246,-9.242317,-6.447683,Davidson,Kentucky,0,0,0.003928
4,23.612909,267.853284,-36.275817,32.671254,12.561662,53.492959,-43.387755,-22.304488,-59.350779,-3.673833,...,13.351715,-6.759811,10.877258,-1.566276,-5.478227,Duke,Iona,1,1,1.0


In [496]:
pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])

In [497]:
temp_df['pred_proba_opp'] = pred_proba_opp

In [500]:
W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)

In [501]:
temp_df['W_adj'] = W_adj

In [502]:
temp_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,33.598022,245.350623,-38.389682,10.36317,-3.723563,55.159348,-38.368237,-30.748003,-51.478447,-3.982511,...,3.09324,-2.192344,-2.572244,Alabama,Virginia Tech,1,1,0.805134,0.3773697,1
1,27.786856,276.675822,-48.530684,30.255525,3.346603,49.77258,-38.946294,-26.380955,-67.998022,-12.386208,...,10.160955,-3.883171,-3.551728,Arizona,Buffalo,0,1,0.999996,0.0005988045,1
2,16.046644,280.148698,-37.322316,34.660246,4.466834,63.569757,-45.0474,-19.178693,-67.897782,-14.428414,...,10.516425,-6.880219,-4.00394,Buffalo,Arizona,1,0,0.000599,0.9999957,0
3,20.873416,272.122624,-17.747289,36.160316,-1.855437,59.184644,-43.96247,-29.796052,-59.123863,-12.781176,...,15.097246,-9.242317,-6.447683,Davidson,Kentucky,0,0,0.003928,0.9999827,0
4,23.612909,267.853284,-36.275817,32.671254,12.561662,53.492959,-43.387755,-22.304488,-59.350779,-3.673833,...,10.877258,-1.566276,-5.478227,Duke,Iona,1,1,1.0,6.506739e-10,1


In [503]:
print(classification_report(temp_df['W'], temp_df['pred']))
pd.DataFrame(confusion_matrix(temp_df['W'], temp_df['pred']), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.76      0.69      0.72        32
          1       0.71      0.78      0.75        32

avg / total       0.74      0.73      0.73        64



Unnamed: 0,Pred -,Pred +
Act -,22,10
Act +,7,25


In [504]:
print(classification_report(temp_df['W'], temp_df['W_adj']))
pd.DataFrame(confusion_matrix(temp_df['W'], temp_df['W_adj']), columns=['Pred -', 'Pred +'], index=['Act -', 'Act +'])

             precision    recall  f1-score   support

          0       0.72      0.72      0.72        32
          1       0.72      0.72      0.72        32

avg / total       0.72      0.72      0.72        64



Unnamed: 0,Pred -,Pred +
Act -,23,9
Act +,9,23


In [515]:
incorrect_picks = []
for i in temp_df.index:
    if temp_df['W'][i] != temp_df['W_adj'][i]:
        incorrect_picks.append(temp_df['school_id'][i])

In [516]:
incorrect_picks

['Arizona',
 'Buffalo',
 'Loyola (IL)',
 'Miami (FL)',
 'Oklahoma',
 'Rhode Island',
 'Creighton',
 'Florida State',
 'Kansas State',
 'Marshall',
 'Maryland-Baltimore County',
 'Missouri',
 'Nevada',
 'Syracuse',
 'Texas',
 'Texas Christian',
 'Virginia',
 'Wichita State']

In [509]:
len(incorrect_picks)

17

In [517]:
correct_picks = []
for i in temp_df.index:
    if temp_df['W'][i] == temp_df['W_adj'][i]:
        correct_picks.append(temp_df['school_id'][i])

In [511]:
correct_picks

['Alabama',
 'Davidson',
 'Duke',
 'Florida',
 'Gonzaga',
 'Houston',
 'Iona',
 'Kansas',
 'Kentucky',
 'Michigan',
 'Montana',
 'North Carolina-Greensboro',
 'North Carolina State',
 'Ohio State',
 'Pennsylvania',
 'Radford',
 'San Diego State',
 'Seton Hall',
 'South Dakota State',
 'St. Bonaventure',
 'Stephen F. Austin',
 'Tennessee',
 'Texas Tech',
 'Villanova',
 'Virginia Tech',
 'Wright State',
 'Auburn',
 'Bucknell',
 'Butler',
 'Cal State Fullerton',
 'Cincinnati',
 'Clemson',
 'College of Charleston',
 'Florida State',
 'Georgia State',
 'Lipscomb',
 'Michigan State',
 'Murray State',
 'Nevada',
 'New Mexico State',
 'North Carolina',
 'Providence',
 'Purdue',
 'Texas A&M',
 'Texas Southern',
 'West Virginia',
 'Xavier']

In [451]:
len(correct_picks)

26

In [523]:
temp_df.loc[:30].sort_values('school_id')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,33.598022,245.350623,-38.389682,10.36317,-3.723563,55.159348,-38.368237,-30.748003,-51.478447,-3.982511,...,3.09324,-2.192344,-2.572244,Alabama,Virginia Tech,1,1,0.8051342,0.3773697,1
1,27.786856,276.675822,-48.530684,30.255525,3.346603,49.77258,-38.946294,-26.380955,-67.998022,-12.386208,...,10.160955,-3.883171,-3.551728,Arizona,Buffalo,0,1,0.9999957,0.0005988045,1
2,16.046644,280.148698,-37.322316,34.660246,4.466834,63.569757,-45.0474,-19.178693,-67.897782,-14.428414,...,10.516425,-6.880219,-4.00394,Buffalo,Arizona,1,0,0.0005988045,0.9999957,0
3,20.873416,272.122624,-17.747289,36.160316,-1.855437,59.184644,-43.96247,-29.796052,-59.123863,-12.781176,...,15.097246,-9.242317,-6.447683,Davidson,Kentucky,0,0,0.003928428,0.9999827,0
4,23.612909,267.853284,-36.275817,32.671254,12.561662,53.492959,-43.387755,-22.304488,-59.350779,-3.673833,...,10.877258,-1.566276,-5.478227,Duke,Iona,1,1,1.0,6.506739e-10,1
5,42.257265,262.349375,-32.5346,28.238489,-2.924802,45.341052,-42.078248,-34.861275,-60.21426,-17.241359,...,5.899493,-8.280858,-1.230133,Florida,St. Bonaventure,1,1,0.9999717,0.000286073,1
6,10.501977,228.039146,-25.277054,25.312086,-8.708946,41.862111,-39.364771,-18.296676,-53.571191,-3.265383,...,11.646085,3.337549,-2.715161,Gonzaga,North Carolina-Greensboro,1,1,0.9999997,2.085316e-05,1
7,15.327976,258.049299,-25.903061,28.080496,-8.933283,52.182516,-55.332472,-19.587852,-65.811553,-6.511329,...,9.339177,-4.005565,-7.390532,Houston,San Diego State,1,1,0.9979445,0.1575945,1
8,17.723289,238.299003,-29.679412,26.222909,8.241416,59.952865,-41.006155,-26.45942,-52.812561,-10.423449,...,12.309139,-2.297132,-0.558374,Iona,Duke,0,0,6.506739e-10,1.0,0
9,16.310095,268.36031,-29.466264,43.767863,8.213099,45.124887,-44.753045,-15.907371,-67.886048,-8.260722,...,12.881775,-5.424235,-3.812769,Kansas,Pennsylvania,1,1,1.0,1.115617e-08,1


In [524]:
temp_df.loc[30:].sort_values('school_id')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
32,19.034199,272.01719,-45.674736,33.95241,-0.7547,53.639402,-39.549414,-20.152532,-67.131176,-10.960147,...,7.647294,-8.333351,-0.009245,Arkansas,Butler,0,1,0.5530691,0.8725885,0
33,-5.857335,243.561283,-31.030772,27.608241,-14.555481,70.566153,-33.625843,-5.692327,-52.880285,-2.281428,...,6.860581,0.760534,-9.786588,Auburn,College of Charleston,1,1,1.0,1.303843e-06,1
34,9.247859,247.55221,-34.99781,32.647703,6.64545,68.934317,-32.403126,-26.905972,-53.495439,-9.142303,...,12.798469,-1.80937,0.467556,Bucknell,Michigan State,0,0,1.816885e-07,1.0,0
35,36.754307,270.397604,-34.373816,36.039749,-8.228027,43.106726,-49.266696,-26.38123,-71.347388,-17.056794,...,7.123927,-8.594701,-7.049437,Butler,Arkansas,1,1,0.8725885,0.5530691,1
36,11.124398,259.69026,-52.29705,25.681928,12.32107,59.468026,-40.873831,-21.991826,-66.959683,-17.01527,...,11.717236,-8.849849,-3.296954,Cal State Fullerton,Purdue,0,0,9.337662e-09,1.0,0
37,30.794095,274.136711,-38.035918,22.304743,-11.341491,39.976524,-42.792401,-18.987713,-59.902597,-14.650703,...,9.815255,-7.18195,-3.065342,Cincinnati,Georgia State,1,1,1.0,1.53179e-06,1
38,15.435164,238.60377,-29.077245,32.405517,-3.94697,55.213823,-38.488798,-27.156783,-56.895347,-4.036868,...,9.131422,-7.72859,-3.068499,Clemson,New Mexico State,1,1,0.9999694,9.681897e-05,1
39,62.350185,231.063667,-32.157456,29.627338,7.29719,36.871088,-28.308597,-40.238554,-54.343324,-20.538698,...,7.160599,-0.705753,0.298059,College of Charleston,Auburn,0,0,1.303843e-06,1.0,0
40,19.424078,267.274856,-18.986544,40.167526,0.34322,46.319277,-46.050271,-17.213118,-62.396659,-7.068279,...,10.360482,-9.258001,-4.421226,Creighton,Kansas State,0,1,0.9923149,0.3581282,1
41,21.212345,267.057444,-40.574477,25.207451,3.600777,59.555094,-39.559666,-25.876993,-59.236965,-12.072937,...,7.575166,-6.450962,-2.023261,Florida State,Missouri,1,1,0.6916185,0.8832138,0


# Round of 32

In [525]:
# Predictions of Round of 32
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in secondround_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)
    
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

temp_df['school_id'] = mm_2018['school_id'][secondround_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][secondround_indices].values

temp_df['W'] = mm_2018['W'][secondround_indices].values

temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])
    
temp_df['pred_proba'] = pred_proba_list

pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])
            
temp_df['pred_proba_opp'] = pred_proba_opp

W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)
        
temp_df['W_adj'] = W_adj



In [527]:
temp_df.sort_values('school_id')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,34.891347,247.33762,-42.801783,24.550919,12.466175,60.212652,-35.924351,-33.901742,-59.544153,-5.987835,...,3.683939,-6.866243,0.542426,Alabama,Villanova,0,0,0.0006260239,0.9999676,0
16,11.594534,261.629263,-34.324888,32.963164,-4.188293,69.552835,-37.322358,-23.521906,-53.367842,-8.59261,...,8.241296,-5.917579,-6.500452,Auburn,Clemson,0,1,0.6541137,0.7913606,0
1,14.265044,279.983288,-35.284532,29.221582,-3.290866,61.773187,-49.763723,-25.446612,-65.655504,-13.527697,...,12.793211,-7.516976,-3.144293,Buffalo,Kentucky,0,0,5.017814e-05,0.9999994,0
17,16.210816,255.125314,-30.618294,32.311283,15.761649,56.036197,-50.888728,-22.338526,-68.234408,-12.869612,...,8.100721,-8.828854,-6.646533,Butler,Purdue,0,0,0.06187574,0.9982403,0
18,19.951544,267.840788,-37.966557,26.798173,9.999493,51.180903,-47.682247,-25.242345,-60.282039,-13.693441,...,11.697221,-6.50381,-3.881775,Cincinnati,Nevada,0,1,0.9980895,0.379417,1
19,42.014775,261.773614,-32.214936,34.090473,0.052403,50.060643,-33.927499,-32.79578,-57.955406,-10.867818,...,5.003332,-7.989801,-4.027933,Clemson,Auburn,1,1,0.7913606,0.6541137,1
2,20.223965,263.957504,-40.322998,43.759276,2.772847,45.363138,-40.718012,-21.22721,-60.778413,-4.288906,...,11.191275,-6.211896,-4.084955,Duke,Rhode Island,1,1,0.9999985,2.78191e-05,1
3,26.721175,249.076941,-33.407671,33.813115,-2.952849,44.692467,-40.214675,-29.834277,-58.200784,-15.612255,...,6.854827,-7.376918,-1.17667,Florida,Texas Tech,0,0,0.413137,0.7055506,0
20,28.819349,275.442558,-38.68034,20.337549,9.403305,62.322342,-40.842588,-31.624242,-60.747234,-14.930192,...,8.828371,-3.892903,-4.608309,Florida State,Xavier,1,0,0.06190769,0.9989157,0
4,25.0878,243.383255,-28.215407,32.034597,13.17022,45.724709,-35.311044,-26.113212,-57.897288,-9.860836,...,12.622185,-3.279922,0.361202,Gonzaga,Ohio State,1,0,0.150688,0.9898725,0


# Round of 16

In [528]:
# Predictions of Round of 16
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in sweet16_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)
    
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

temp_df['school_id'] = mm_2018['school_id'][sweet16_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][sweet16_indices].values

temp_df['W'] = mm_2018['W'][sweet16_indices].values

temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])
    
temp_df['pred_proba'] = pred_proba_list

pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])
            
temp_df['pred_proba_opp'] = pred_proba_opp

W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)
        
temp_df['W_adj'] = W_adj



In [529]:
temp_df.sort_values('school_id')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
8,27.411341,251.843818,-27.73363,25.922121,14.751256,59.829083,-38.738957,-32.581335,-52.653502,-7.606262,...,7.750813,-6.827799,-4.394714,Clemson,Kansas,0,0,0.016423,0.999707,0
9,-3.910207,245.778545,-29.607792,19.96445,-3.988386,49.064522,-44.541351,-15.449034,-49.451594,-0.255483,...,12.076755,0.491368,-6.948341,Duke,Syracuse,1,1,0.999982,0.000164,1
0,-5.187039,243.707907,-39.045137,28.404942,-3.699891,61.299382,-41.01214,-17.779292,-60.12985,-5.034416,...,9.937122,-2.276776,-3.034265,Florida State,Gonzaga,1,1,0.840414,0.752799,1
1,39.112671,256.24833,-27.644105,27.685721,5.895023,43.607715,-38.173916,-36.183978,-60.656492,-14.527535,...,10.867649,-2.530688,-0.377451,Gonzaga,Florida State,0,1,0.752799,0.840414,0
10,7.808456,264.172316,-24.939386,33.369402,5.755351,44.673984,-47.845464,-20.280383,-61.113799,-9.585014,...,12.120209,-4.334679,-3.705435,Kansas,Clemson,1,1,0.999707,0.016423,1
2,34.577949,245.877142,-40.757076,21.870113,5.861324,47.767062,-46.136185,-35.03577,-60.323524,-18.632719,...,9.007951,-6.691496,-3.121068,Kansas State,Kentucky,1,0,0.026344,0.995479,0
3,10.956962,272.656571,-56.25599,27.658163,-3.722462,55.135853,-34.483239,-20.359834,-60.579679,-10.171694,...,8.926957,-9.206722,-1.238978,Kentucky,Kansas State,0,1,0.995479,0.026344,1
4,35.412547,248.719166,-26.189105,28.235847,12.905069,46.574445,-45.241436,-29.872254,-60.001923,-13.545275,...,10.829633,-6.263313,-4.458778,Loyola (IL),Nevada,1,0,0.055942,0.999942,0
5,30.494594,264.911599,-26.816276,31.765389,10.040929,56.578309,-34.194128,-29.149579,-54.011133,-12.629745,...,9.616036,-7.853363,-1.374671,Michigan,Texas A&M,1,1,0.946846,0.150015,1
6,-10.115708,266.300024,-28.214268,33.242697,-3.848119,58.895537,-45.81409,-9.790241,-60.135867,-5.892679,...,13.146476,-5.963628,-4.390291,Nevada,Loyola (IL),0,1,0.999942,0.055942,1


# Round of 8

In [530]:
# Predictions of Round of 8
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in regional_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)
    
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

temp_df['school_id'] = mm_2018['school_id'][regional_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][regional_indices].values

temp_df['W'] = mm_2018['W'][regional_indices].values

temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])
    
temp_df['pred_proba'] = pred_proba_list

pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])
            
temp_df['pred_proba_opp'] = pred_proba_opp

W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)
        
temp_df['W_adj'] = W_adj



In [531]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,4.545755,252.609041,-42.277064,35.212997,4.411493,61.975584,-40.530056,-23.941229,-63.722667,-7.402392,...,9.649949,-6.439969,-1.390712,Florida State,Michigan,0,0,0.100721,0.990064,0
1,15.400244,226.439555,-39.502852,22.414779,4.029245,49.062384,-45.608456,-18.677142,-58.487409,-6.833631,...,6.911732,-4.748267,-3.832295,Kansas State,Loyola (IL),0,1,0.999676,0.007863,1
2,33.306747,244.617911,-28.855386,33.815037,4.305123,40.150474,-41.972013,-22.144997,-58.145658,-11.258343,...,9.438082,-7.405861,-3.370219,Loyola (IL),Kansas State,1,0,0.007863,0.999676,0
3,32.765004,266.896078,-26.546088,31.521798,0.384553,54.083475,-35.684333,-33.345405,-57.28522,-15.029404,...,9.689747,-6.958064,-2.296922,Michigan,Florida State,1,1,0.990064,0.100721,1
4,15.778855,265.494231,-32.049374,24.628594,16.673671,54.551405,-44.629039,-28.59636,-55.214706,-7.289696,...,13.417213,-5.142625,-4.276287,Duke,Kansas,0,1,0.966472,0.887764,1
5,0.86891,259.582635,-23.038001,29.776988,11.877478,48.595969,-51.062454,-20.382182,-57.732268,-7.625576,...,14.455085,-5.242296,-3.218225,Kansas,Duke,1,1,0.887764,0.966472,0
6,28.80007,256.149649,-39.934985,27.529002,11.455059,60.923868,-37.933794,-29.553617,-59.61107,-12.611171,...,5.977779,-7.240281,-0.390252,Texas Tech,Villanova,0,0,0.031422,0.999233,0
7,12.943476,271.741753,-19.183942,41.739698,-5.57897,47.29529,-45.407564,-23.703844,-61.438021,-12.529322,...,14.313747,-9.647248,-5.765874,Villanova,Texas Tech,1,1,0.999233,0.031422,1


# Final Four

In [532]:
# Predictions of Round of 4
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in final4_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)
    
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

temp_df['school_id'] = mm_2018['school_id'][final4_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][final4_indices].values

temp_df['W'] = mm_2018['W'][final4_indices].values

temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])
    
temp_df['pred_proba'] = pred_proba_list

pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])
            
temp_df['pred_proba_opp'] = pred_proba_opp

W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)
        
temp_df['W_adj'] = W_adj



In [533]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,16.883808,273.535698,-24.810214,33.181261,15.420126,47.536831,-46.630429,-23.709661,-65.841689,-11.374628,...,12.263074,-8.103203,-1.977347,Kansas,Villanova,0,1,0.92629,0.988398,0
1,18.258165,233.585703,-27.914088,37.56379,11.500921,46.999198,-43.705404,-25.370502,-62.505591,-10.838394,...,11.644508,-6.031131,-3.446426,Loyola (IL),Michigan,0,0,0.000136,0.999997,0
2,6.214034,241.864475,-25.144905,32.1727,-0.613705,56.66098,-37.79646,-19.433314,-51.570993,-3.688645,...,10.345112,-5.056973,-2.476713,Michigan,Loyola (IL),1,1,0.999997,0.000136,1
3,13.051913,274.294232,-15.220803,32.304638,14.309577,57.543633,-48.680564,-24.29301,-58.70285,-9.514015,...,14.393634,-7.686023,-7.017939,Villanova,Kansas,1,1,0.988398,0.92629,1


# Championship!

In [534]:
# Predictions of Championship
temp_df = pd.DataFrame(columns=joined.columns.drop('W'))

for i in final_indices:
    temp_df = temp_df.append(row_maker(mm_2018['school_id'][i], mm_2018['opp_id'][i]))
    temp_df.reset_index(drop=True, inplace=True)
    
temp_df = pca.transform(temp_df)
temp_df = pd.DataFrame(temp_df, columns = np.arange(0,50))

temp_df['school_id'] = mm_2018['school_id'][final_indices].values
temp_df['opp_id'] = mm_2018['opp_id'][final_indices].values

temp_df['W'] = mm_2018['W'][final_indices].values

temp_df['pred'] = logreg.predict(temp_df.drop(['school_id', 'opp_id', 'W'], axis=1))

pred_proba = logreg.predict_proba(temp_df.drop(['school_id', 'opp_id', 'W', 'pred'], axis=1))

pred_proba_list = []
for i in pred_proba:
    pred_proba_list.append(i[1])
    
temp_df['pred_proba'] = pred_proba_list

pred_proba_opp = []
for i in temp_df.index:
    for j in temp_df.index:
        if temp_df['school_id'][i] == temp_df['opp_id'][j]:
            pred_proba_opp.append(temp_df['pred_proba'][j])
            
temp_df['pred_proba_opp'] = pred_proba_opp

W_adj = []
for i in temp_df.index:
    if temp_df['pred_proba'][i] > temp_df['pred_proba_opp'][i]:
        W_adj.append(1)
    else:
        W_adj.append(0)
        
temp_df['W_adj'] = W_adj



In [535]:
temp_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,47,48,49,school_id,opp_id,W,pred,pred_proba,pred_proba_opp,W_adj
0,25.856592,261.947926,-26.263474,33.511008,11.695543,58.080486,-34.7747,-30.264939,-57.760522,-11.724114,...,10.780422,-8.721158,-0.959086,Michigan,Villanova,0,0,0.063569,0.999275,0
1,0.168905,261.521601,-18.412798,44.799696,6.572667,55.754144,-46.960492,-20.783615,-64.696001,-6.89921,...,15.080518,-8.345387,-6.141499,Villanova,Michigan,1,1,0.999275,0.063569,1


# -------------- Graveyard Models -------------------------

## Random Forest Classifier

In [122]:
rfc = RandomForestClassifier(max_depth=20)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [123]:
scores = cross_val_score(rfc, X_train, y_train)
np.mean(scores)

0.665257406789609

In [None]:
feat_series = pd.Series(rfc.feature_importances_, index=joined.drop('W', axis=1).columns)
feat_series.head()

In [None]:
# SRS = Simple Rating System, takes into account SOS
# SOS = Strength of Schedule
feat_series.sort_values(ascending=False)

## GridCV

In [124]:
rf_params = {'n_estimators':[2,5,10,20],
             'criterion':['gini', 'entropy'],
             'max_depth':[5,10,20,None]}
             #'min_samples_split':[3,4,5]}

In [125]:
gridcv_rf = RandomizedSearchCV(RandomForestClassifier(), rf_params, cv=5)

In [126]:
gridcv_rf.fit(X_train, y_train)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_estimators': [2, 5, 10, 20], 'criterion': ['gini', 'entropy'], 'max_depth': [5, 10, 20, None]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [127]:
scores = gridcv_rf.score(X_train, y_train)
np.mean(scores)

0.779707287480094

In [129]:
scores = gridcv_rf.score(X_test, y_test)
np.mean(scores)

0.7055779183438758

In [134]:
gridcv_rf.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 20}

In [None]:
test = pd.read_csv('../data/test.csv', sep='\t', header=None)
test.columns = ['wk', 'date', 'rank', 'school', 'prev_rank', 'chnge', 'conf']

## KNN Classifier

In [118]:
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [119]:
scores = cross_val_score(knn, X_train, y_train)
print(np.mean(scores))

0.6579964848735292
