In [30]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [31]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features.games import GameFeatures
from src.features.games_detailed import GameDetailedFeatures
from src.features.rankings import RankingFeatures
from src.features.seeds import SeedFeatures

In [32]:
# Load the data template and discard rows whose `a_win` label is missing.
data = load_data_template(season=False).dropna(subset=['a_win'])
data.shape

(2117, 7)

In [33]:
# Win-history features, all built with 3 default lags and `fillna=0`
# (presumably replaces missing lag values with 0 — confirm in GameFeatures).
# Each entry is (feature function, extra keyword arguments); order is preserved.
game_feat = GameFeatures(default_lags=3)
win_history_specs = [
    (game_feat.last_games_won_in_season, {}),
    (game_feat.last_games_won_in_tourney, {}),
    (game_feat.last_games_won_against_opponent, {'per_game': True}),
    (game_feat.games_won_in_tourney_against_opponent, {'per_game': True}),
]
for feature_fn, extra_kwargs in win_history_specs:
    data = game_feat.per_team_wrapper(data, feature_fn, fillna=0, **extra_kwargs)

# Per-game detailed statistics, also with 3 default lags.
game_detail_feat = GameDetailedFeatures(default_lags=3)
data = game_detail_feat.per_team_wrapper(
    data, game_detail_feat.detail_features_by_game, per_day=True
)

# Seed feature, combined across the two teams with combine='subtract'.
seed_feat = SeedFeatures(default_lags=0)
data = seed_feat.per_team_wrapper(data, seed_feat.team_seeds, combine='subtract')

# Keep only rows for which every feature is available.
data = data.dropna()
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,last_games_won_in_season_team_a,last_games_won_in_season_team_a_lag-1,last_games_won_in_season_team_a_lag-2,...,Stl_game_team_b_lag-1,Stl_game_team_b_lag-2,Stl_game_team_b_lag-3,Blk_game_team_b_lag-1,Blk_game_team_b_lag-2,Blk_game_team_b_lag-3,PF_game_team_b_lag-1,PF_game_team_b_lag-2,PF_game_team_b_lag-3,seed_team_combined
7969,2017,1245,1417,True,0,False,137,21,18.0,20.0,...,2.0,6.0,13.0,7.0,4.0,6.0,21.0,12.0,16.0,11


In [34]:
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
cat_cols = [c in ['team_a', 'team_b', 'game_set'] for c in X.columns]
y = data[['a_win', 'game_set']]

n_splits = 5
path = 'data/libfm/cv_{}-{:d}-of-X-shuffle.{}'.replace('X', str(n_splits))
kf = KFold(n_splits=n_splits, shuffle=True)
metrics = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    ncaa_true = y.iloc[test_index][y.iloc[test_index].game_set == 0]
    ncaa_pred = preds[y.iloc[test_index].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=72460	num_features=106	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=18225	num_features=106	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.655612	Test=0.431472	Test(ll)=0.673714
#Iter=  1	Train=0.769133	Test=0.436548	Test(ll)=0.645818
#Iter=  2	Train=0.807398	Test=0.461929	Test(ll)=0.640608
#Iter=  3	Train=0.835459	Test=0.461929	Test(ll)=0.62775
#Iter=  4	Train=0.860969	Test=0.477157	Test(ll)=0.616667
#It

#Iter=  7	Train=0.910828	Test=0.530612	Test(ll)=0.592629
#Iter=  8	Train=0.919745	Test=0.520408	Test(ll)=0.599206
#Iter=  9	Train=0.933758	Test=0.515306	Test(ll)=0.60504
#Iter= 10	Train=0.933758	Test=0.520408	Test(ll)=0.609635
#Iter= 11	Train=0.936306	Test=0.505102	Test(ll)=0.612548
#Iter= 12	Train=0.945223	Test=0.510204	Test(ll)=0.61625
#Iter= 13	Train=0.943949	Test=0.510204	Test(ll)=0.621674
#Iter= 14	Train=0.952866	Test=0.510204	Test(ll)=0.626882
#Iter= 15	Train=0.952866	Test=0.510204	Test(ll)=0.632148
#Iter= 16	Train=0.947771	Test=0.510204	Test(ll)=0.637242
#Iter= 17	Train=0.951592	Test=0.510204	Test(ll)=0.641474
#Iter= 18	Train=0.946497	Test=0.510204	Test(ll)=0.644763
#Iter= 19	Train=0.950318	Test=0.510204	Test(ll)=0.648235
#Iter= 20	Train=0.951592	Test=0.510204	Test(ll)=0.650851
#Iter= 21	Train=0.959236	Test=0.510204	Test(ll)=0.654804
#Iter= 22	Train=0.952866	Test=0.510204	Test(ll)=0.658696
#Iter= 23	Train=0.959236	Test=0.510204	Test(ll)=0.662612
#Iter= 24	Train=0.955414	Test=0.5

#Iter= 34	Train=0.971975	Test=0.479592	Test(ll)=0.67937
#Iter= 35	Train=0.970701	Test=0.479592	Test(ll)=0.680286
#Iter= 36	Train=0.969427	Test=0.479592	Test(ll)=0.681472
#Iter= 37	Train=0.966879	Test=0.479592	Test(ll)=0.682559
#Iter= 38	Train=0.971975	Test=0.479592	Test(ll)=0.682547
#Iter= 39	Train=0.961783	Test=0.479592	Test(ll)=0.681638
#Iter= 40	Train=0.963057	Test=0.479592	Test(ll)=0.682085
#Iter= 41	Train=0.964331	Test=0.479592	Test(ll)=0.682797
#Iter= 42	Train=0.969427	Test=0.479592	Test(ll)=0.683279
#Iter= 43	Train=0.965605	Test=0.479592	Test(ll)=0.683772
#Iter= 44	Train=0.965605	Test=0.484694	Test(ll)=0.68357
#Iter= 45	Train=0.968153	Test=0.484694	Test(ll)=0.68303
#Iter= 46	Train=0.964331	Test=0.484694	Test(ll)=0.683089
#Iter= 47	Train=0.974522	Test=0.479592	Test(ll)=0.68346
#Iter= 48	Train=0.975796	Test=0.479592	Test(ll)=0.684322
#Iter= 49	Train=0.975796	Test=0.479592	Test(ll)=0.685389
Metric Mean: 0.17 (0.04)


In [35]:
all_preds = []
        
for i, (train_index, test_index) in enumerate(kf.split(data)):
    
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    all_preds.append(preds)
    
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=72429	num_features=106	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=18256	num_features=106	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.679847	Test=0.477157	Test(ll)=0.627189
#Iter=  1	Train=0.758929	Test=0.477157	Test(ll)=0.634645
#Iter=  2	Train=0.822704	Test=0.467005	Test(ll)=0.633106
#Iter=  3	Train=0.84949	Test=0.451777	Test(ll)=0.646409
#Iter=  4	Train=0.858418	Test=0.461929	Test(ll)=0.650617
#It

#Iter=  8	Train=0.950318	Test=0.489796	Test(ll)=0.679218
#Iter=  9	Train=0.955414	Test=0.479592	Test(ll)=0.681711
#Iter= 10	Train=0.947771	Test=0.484694	Test(ll)=0.68399
#Iter= 11	Train=0.957962	Test=0.479592	Test(ll)=0.687437
#Iter= 12	Train=0.968153	Test=0.479592	Test(ll)=0.691362
#Iter= 13	Train=0.965605	Test=0.479592	Test(ll)=0.695883
#Iter= 14	Train=0.965605	Test=0.479592	Test(ll)=0.699137
#Iter= 15	Train=0.969427	Test=0.479592	Test(ll)=0.700364
#Iter= 16	Train=0.975796	Test=0.484694	Test(ll)=0.70266
#Iter= 17	Train=0.973248	Test=0.494898	Test(ll)=0.705558
#Iter= 18	Train=0.963057	Test=0.5	Test(ll)=0.709122
#Iter= 19	Train=0.968153	Test=0.5	Test(ll)=0.713428
#Iter= 20	Train=0.968153	Test=0.494898	Test(ll)=0.716919
#Iter= 21	Train=0.966879	Test=0.494898	Test(ll)=0.719902
#Iter= 22	Train=0.969427	Test=0.494898	Test(ll)=0.722215
#Iter= 23	Train=0.968153	Test=0.494898	Test(ll)=0.725667
#Iter= 24	Train=0.971975	Test=0.494898	Test(ll)=0.728582
#Iter= 25	Train=0.963057	Test=0.484694	Test

#Iter= 37	Train=0.974522	Test=0.5	Test(ll)=0.772483
#Iter= 38	Train=0.970701	Test=0.5	Test(ll)=0.773988
#Iter= 39	Train=0.973248	Test=0.5	Test(ll)=0.775645
#Iter= 40	Train=0.971975	Test=0.5	Test(ll)=0.777327
#Iter= 41	Train=0.971975	Test=0.5	Test(ll)=0.778722
#Iter= 42	Train=0.971975	Test=0.5	Test(ll)=0.780033
#Iter= 43	Train=0.978344	Test=0.5	Test(ll)=0.781532
#Iter= 44	Train=0.983439	Test=0.5	Test(ll)=0.783096
#Iter= 45	Train=0.975796	Test=0.5	Test(ll)=0.784391
#Iter= 46	Train=0.97707	Test=0.494898	Test(ll)=0.785838
#Iter= 47	Train=0.978344	Test=0.494898	Test(ll)=0.787321
#Iter= 48	Train=0.97707	Test=0.489796	Test(ll)=0.788844
#Iter= 49	Train=0.982166	Test=0.489796	Test(ll)=0.790495


Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.02721
2014_1107_1291,2014,1107,1291,134,0.947749
2014_1110_1458,2014,1110,1458,136,0.052643
2014_1112_1211,2014,1112,1211,139,0.997256
2014_1112_1361,2014,1112,1361,143,0.855952


In [36]:
# Build the output frame keyed by "<Season>_<team_a>_<team_b>".
# NOTE(review): this relies on np.concatenate(all_preds) being in the same row
# order as `data` — confirm against the code that fills `all_preds`.
model_out = data[['Season', 'team_a', 'team_b', 'DayNum']].copy(deep=True)
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = (
    model_out['Season'].map(str)
    + '_' + model_out['team_a'].map(str)
    + '_' + model_out['team_b'].map(str)
)
model_out = model_out.set_index('ID')
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.02721
2014_1107_1291,2014,1107,1291,134,0.947749
2014_1110_1458,2014,1110,1458,136,0.052643
2014_1112_1211,2014,1112,1211,139,0.997256
2014_1112_1361,2014,1112,1361,143,0.855952


In [37]:
# Persist the prediction frame (index included) as CSV.
model_out_csv = 'data/libfm_model_out.csv'
model_out.to_csv(model_out_csv)