In [14]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from src.features.conferences import ConferenceFeatures
from src.features.coaches import CoachFeatures
from src.features.seeds import SeedFeatures
from src.utils import load_data_template
from src.features.games_detailed import GameDetailedFeatures
from src.features.games import GameFeatures
from src.features.rankings import RankingFeatures
from src.features.events import EventFeatures

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
data = load_data_template(season=False)
data.dropna(subset=['a_win'], inplace=True)
seed_feat = SeedFeatures()
data = seed_feat.per_team_wrapper(data, seed_feat.team_seeds, per_game=False, per_day=False, combine='subtract')
print('-- Seeds loaded --')
print(data.shape)
coach_feat = CoachFeatures(default_lags=0)
data = coach_feat.per_team_wrapper(data, coach_feat.coach_func, per_game=False, per_day=False)
print('-- Coach loaded --')
print(data.shape)
conf_feat = ConferenceFeatures(default_lags=0)
data = conf_feat.per_team_wrapper(data, conf_feat.conference_games, per_game=False, per_day=False)
data = data.fillna(0)
print('-- Conferences loaded --')
event_feat = EventFeatures(default_lags=1)
data = event_feat.per_team_wrapper(data, event_feat.steals_in_season)
data = data.fillna(0)
print(data.shape)
print('-- Events loaded --')
game_feat = GameFeatures()
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_in_season)
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_in_tourney)
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_against_opponent, per_game=True)
data = game_feat.per_team_wrapper(data, game_feat.games_won_in_tourney_against_opponent, per_game=True)
data = data.fillna(0)
print(data.shape)
print('-- Game Features loaded --')
game_detail_feat = GameDetailedFeatures(default_lags=2)
data = game_detail_feat.per_team_wrapper(data, game_detail_feat.detail_features_by_game, per_day=True)
data = data.dropna()
print(data.shape)
print('-- Game Detailed Features loaded --')
rank_feat = RankingFeatures(default_lags=0)
data = rank_feat.per_team_wrapper(data, rank_feat.pca_variables_rankings, per_game=False, per_day=False)
print(data.shape)
data = data.fillna(0)
data = rank_feat.per_team_wrapper(data, rank_feat.elos_season, per_game=False, per_day=False)
print('-- Rankings loaded --')
data = data.fillna(0)
print(data.shape)
data.sample()

-- Seeds loaded --
(2117, 8)
-- Coach loaded --
(2117, 10)
-- Conferences loaded --
(2117, 14)
-- Events loaded --
(2117, 40)
-- Game Features loaded --
(981, 92)
-- Game Detailed Features loaded --
(981, 132)
-- Rankings loaded --
(981, 134)


Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,seed_combined,coach_team_a,coach_team_b,...,PC13_team_b,PC14_team_b,PC15_team_b,PC16_team_b,PC17_team_b,PC18_team_b,PC19_team_b,PC20_team_b,elos_season_team_a,elos_season_team_b
10794,2011,1207,1433,False,0,False,137,-5,711,1190,...,31.627376,9.219678,4.414154,33.042765,-10.065105,-15.346422,23.224145,-0.374209,1882.15083,1726.520008


In [21]:
data.head()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,seed_combined,coach_team_a,coach_team_b,...,PC13_team_b,PC14_team_b,PC15_team_b,PC16_team_b,PC17_team_b,PC18_team_b,PC19_team_b,PC20_team_b,elos_season_team_a,elos_season_team_b
15,2014,1107,1196,True,0,False,136,15,1367,81,...,-3.411747,14.551677,-6.405917,3.843014,-16.9567,-17.820722,25.503101,0.781924,1464.338239,2068.363548
31,2014,1107,1291,True,0,True,134,0,1367,532,...,-14.213965,223.689241,-204.569299,-12.326235,38.104845,-19.633367,69.437288,20.847227,1464.338239,1421.011818
130,2014,1110,1458,True,0,False,136,13,907,91,...,-1.132329,10.936305,-10.870252,9.42024,-18.591782,-9.378375,2.721026,6.880661,1483.905737,1953.001984
148,2014,1112,1211,True,0,True,139,-7,1185,843,...,-1.632611,31.543231,-12.586882,-12.343174,-20.843797,1.913789,-9.611201,12.432687,1965.118682,1968.988326
176,2014,1112,1361,True,0,True,143,-3,1185,1219,...,-3.273371,23.031908,-24.749253,42.575242,-19.061185,-19.055921,8.556589,-14.230954,1965.118682,1926.433884


In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
y = data[['a_win']].astype(int)

kf = KFold(n_splits=15, shuffle=True)
metrics = []
for tr_i, t_i in kf.split(X):
    X_tr, y_tr = X.iloc[tr_i], y.iloc[tr_i]
    X_t, y_t = X.iloc[t_i], y.iloc[t_i]
    rf = RandomForestClassifier(n_estimators=1000)
    rf.fit(X_tr.values, y_tr.values.reshape(-1))
    preds = rf.predict_proba(X_t.values)
    metric = log_loss(y_t.values, preds)
    print(metric)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

0.264553451774
0.271582408101
0.248885354084
0.223363624745
0.260372012926
0.236985474013
0.289389446913
0.25921291548
0.24012077397
0.257365484997
0.250694969988
0.269302951985
0.237028065062
0.188413487593
0.234980291496
Metric Mean: 0.25 (0.02)
