In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local `src` package imported below) are picked up before
# each cell runs, without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features.games import GameFeatures
from src.features.games_detailed import GameDetailedFeatures
from src.features.rankings import RankingFeatures
from src.features.seeds import SeedFeatures

In [3]:
# Load the game template (called with season=False) and keep only the rows
# that have a known outcome in `a_win`; the rest cannot serve as labelled examples.
data = load_data_template(season=False).dropna(subset=['a_win'])
data.shape

(2117, 7)

In [4]:
# --- Win-history features -------------------------------------------------
# Each entry is (feature function, extra keyword arguments); every call also
# passes fillna=0 to the wrapper.
game_feat = GameFeatures(default_lags=3)
game_feature_specs = [
    (game_feat.last_games_won_in_season, {}),
    (game_feat.last_games_won_in_tourney, {}),
    (game_feat.last_games_won_against_opponent, {'per_game': True}),
    (game_feat.games_won_in_tourney_against_opponent, {'per_game': True}),
]
for feature_fn, extra_kwargs in game_feature_specs:
    data = game_feat.per_team_wrapper(data, feature_fn, fillna=0, **extra_kwargs)

# --- Detailed per-game statistics -----------------------------------------
game_detail_feat = GameDetailedFeatures(default_lags=3)
data = game_detail_feat.per_team_wrapper(
    data,
    game_detail_feat.detail_features_by_game,
    per_day=True,
)

# --- Seeds ----------------------------------------------------------------
# combine='subtract' is forwarded to the wrapper; the displayed frame shows a
# single `seed_team_combined` column as the result.
seed_feat = SeedFeatures()
data = seed_feat.per_team_wrapper(data, seed_feat.team_seeds, combine='subtract')

# Discard every row that still has a missing value in any column, then
# show one random row as a sanity check.
data = data.dropna()
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,last_games_won_in_season_team_a,last_games_won_in_season_team_a_lag-1,last_games_won_in_season_team_a_lag-2,...,Stl_game_team_b_lag-1,Stl_game_team_b_lag-2,Stl_game_team_b_lag-3,Blk_game_team_b_lag-1,Blk_game_team_b_lag-2,Blk_game_team_b_lag-3,PF_game_team_b_lag-1,PF_game_team_b_lag-2,PF_game_team_b_lag-3,seed_team_combined
10266,2003,1139,1280,False,0,True,137,24,23.0,21.0,...,11.0,7.0,7.0,2.0,4.0,2.0,19.0,24.0,10.0,7


In [5]:
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
cat_cols = [c in ['team_a', 'team_b', 'game_set'] for c in X.columns]
y = data[['a_win', 'game_set']]

n_splits = 5
path = 'data/libfm/cv_{}-{:d}-of-X-shuffle.{}'.replace('X', str(n_splits))
kf = KFold(n_splits=n_splits, shuffle=True)
metrics = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    ncaa_true = y.iloc[test_index][y.iloc[test_index].game_set == 0]
    ncaa_pred = preds[y.iloc[test_index].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=72499	num_features=106	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=18186	num_features=106	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.659439	Test=0.426396	Test(ll)=0.92223
#Iter=  1	Train=0.751276	Test=0.441624	Test(ll)=0.86427
#Iter=  2	Train=0.807398	Test=0.461929	Test(ll)=0.827356
#Iter=  3	Train=0.844388	Test=0.477157	Test(ll)=0.803625
#Iter=  4	Train=0.881378	Test=0.507614	Test(ll)=0.789892
#Ite

#Iter= 10	Train=0.919745	Test=0.561224	Test(ll)=0.580804
#Iter= 11	Train=0.926115	Test=0.556122	Test(ll)=0.586217
#Iter= 12	Train=0.93121	Test=0.540816	Test(ll)=0.592451
#Iter= 13	Train=0.929936	Test=0.530612	Test(ll)=0.598764
#Iter= 14	Train=0.933758	Test=0.520408	Test(ll)=0.604806
#Iter= 15	Train=0.938854	Test=0.520408	Test(ll)=0.610392
#Iter= 16	Train=0.935032	Test=0.520408	Test(ll)=0.615695
#Iter= 17	Train=0.93758	Test=0.510204	Test(ll)=0.620888
#Iter= 18	Train=0.93758	Test=0.510204	Test(ll)=0.623737
#Iter= 19	Train=0.949045	Test=0.510204	Test(ll)=0.626888
#Iter= 20	Train=0.952866	Test=0.510204	Test(ll)=0.630062
#Iter= 21	Train=0.956688	Test=0.5	Test(ll)=0.634023
#Iter= 22	Train=0.947771	Test=0.505102	Test(ll)=0.638415
#Iter= 23	Train=0.955414	Test=0.505102	Test(ll)=0.643032
#Iter= 24	Train=0.95414	Test=0.505102	Test(ll)=0.647281
#Iter= 25	Train=0.952866	Test=0.505102	Test(ll)=0.651381
#Iter= 26	Train=0.955414	Test=0.510204	Test(ll)=0.6542
#Iter= 27	Train=0.959236	Test=0.505102	Tes

#Iter= 32	Train=0.974522	Test=0.433673	Test(ll)=0.806088
#Iter= 33	Train=0.97707	Test=0.433673	Test(ll)=0.808203
#Iter= 34	Train=0.979618	Test=0.433673	Test(ll)=0.810381
#Iter= 35	Train=0.978344	Test=0.433673	Test(ll)=0.812632
#Iter= 36	Train=0.980892	Test=0.433673	Test(ll)=0.814939
#Iter= 37	Train=0.980892	Test=0.433673	Test(ll)=0.817423
#Iter= 38	Train=0.980892	Test=0.433673	Test(ll)=0.819816
#Iter= 39	Train=0.985987	Test=0.433673	Test(ll)=0.821925
#Iter= 40	Train=0.987261	Test=0.433673	Test(ll)=0.823835
#Iter= 41	Train=0.983439	Test=0.433673	Test(ll)=0.825657
#Iter= 42	Train=0.985987	Test=0.433673	Test(ll)=0.827614
#Iter= 43	Train=0.984713	Test=0.433673	Test(ll)=0.829648
#Iter= 44	Train=0.987261	Test=0.433673	Test(ll)=0.8315
#Iter= 45	Train=0.983439	Test=0.433673	Test(ll)=0.833382
#Iter= 46	Train=0.984713	Test=0.433673	Test(ll)=0.835314
#Iter= 47	Train=0.987261	Test=0.443878	Test(ll)=0.836868
#Iter= 48	Train=0.980892	Test=0.443878	Test(ll)=0.837116
#Iter= 49	Train=0.984713	Test=0.44

In [6]:
all_preds = []
        
for i, (train_index, test_index) in enumerate(kf.split(data)):
    
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    all_preds.append(preds)
    
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=72455	num_features=106	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=18230	num_features=106	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.628827	Test=0.482234	Test(ll)=0.74234
#Iter=  1	Train=0.73852	Test=0.482234	Test(ll)=0.671101
#Iter=  2	Train=0.82398	Test=0.472081	Test(ll)=0.659212
#Iter=  3	Train=0.844388	Test=0.492386	Test(ll)=0.66472
#Iter=  4	Train=0.867347	Test=0.502538	Test(ll)=0.673222
#Iter=

#Iter= 10	Train=0.905732	Test=0.418367	Test(ll)=0.768275
#Iter= 11	Train=0.913376	Test=0.418367	Test(ll)=0.770066
#Iter= 12	Train=0.924841	Test=0.418367	Test(ll)=0.771645
#Iter= 13	Train=0.93758	Test=0.423469	Test(ll)=0.770461
#Iter= 14	Train=0.936306	Test=0.423469	Test(ll)=0.769782
#Iter= 15	Train=0.932484	Test=0.423469	Test(ll)=0.771105
#Iter= 16	Train=0.932484	Test=0.413265	Test(ll)=0.770722
#Iter= 17	Train=0.942675	Test=0.433673	Test(ll)=0.770908
#Iter= 18	Train=0.950318	Test=0.433673	Test(ll)=0.771651
#Iter= 19	Train=0.941401	Test=0.433673	Test(ll)=0.77149
#Iter= 20	Train=0.945223	Test=0.438776	Test(ll)=0.772315
#Iter= 21	Train=0.947771	Test=0.433673	Test(ll)=0.773322
#Iter= 22	Train=0.936306	Test=0.433673	Test(ll)=0.773469
#Iter= 23	Train=0.946497	Test=0.428571	Test(ll)=0.773548
#Iter= 24	Train=0.949045	Test=0.423469	Test(ll)=0.774161
#Iter= 25	Train=0.95414	Test=0.423469	Test(ll)=0.774939
#Iter= 26	Train=0.950318	Test=0.428571	Test(ll)=0.775894
#Iter= 27	Train=0.952866	Test=0.43

#Iter= 32	Train=0.963057	Test=0.454082	Test(ll)=0.865131
#Iter= 33	Train=0.963057	Test=0.454082	Test(ll)=0.86539
#Iter= 34	Train=0.963057	Test=0.454082	Test(ll)=0.865266
#Iter= 35	Train=0.959236	Test=0.459184	Test(ll)=0.866177
#Iter= 36	Train=0.966879	Test=0.459184	Test(ll)=0.867323
#Iter= 37	Train=0.973248	Test=0.459184	Test(ll)=0.868049
#Iter= 38	Train=0.975796	Test=0.454082	Test(ll)=0.868562
#Iter= 39	Train=0.97707	Test=0.454082	Test(ll)=0.869104
#Iter= 40	Train=0.973248	Test=0.454082	Test(ll)=0.870142
#Iter= 41	Train=0.973248	Test=0.454082	Test(ll)=0.870996
#Iter= 42	Train=0.975796	Test=0.454082	Test(ll)=0.871714
#Iter= 43	Train=0.971975	Test=0.454082	Test(ll)=0.872122
#Iter= 44	Train=0.978344	Test=0.454082	Test(ll)=0.873037
#Iter= 45	Train=0.980892	Test=0.454082	Test(ll)=0.873515
#Iter= 46	Train=0.983439	Test=0.454082	Test(ll)=0.874071
#Iter= 47	Train=0.978344	Test=0.454082	Test(ll)=0.874591
#Iter= 48	Train=0.982166	Test=0.454082	Test(ll)=0.875077
#Iter= 49	Train=0.978344	Test=0.4

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.999986
2014_1107_1291,2014,1107,1291,134,0.925702
2014_1110_1458,2014,1110,1458,136,0.13864
2014_1112_1211,2014,1112,1211,139,0.000131
2014_1112_1361,2014,1112,1361,143,0.064078


In [7]:
# Assemble the output table: game identifiers plus the predicted value,
# indexed by a "<Season>_<team_a>_<team_b>" key.
model_out = data[['Season', 'team_a', 'team_b', 'DayNum']].copy()
model_out['Pred'] = np.concatenate(all_preds)

key_parts = [model_out[col].astype(str) for col in ('Season', 'team_a', 'team_b')]
model_out['ID'] = key_parts[0] + '_' + key_parts[1] + '_' + key_parts[2]

model_out = model_out.set_index('ID')
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.999986
2014_1107_1291,2014,1107,1291,134,0.925702
2014_1110_1458,2014,1110,1458,136,0.13864
2014_1112_1211,2014,1112,1211,139,0.000131
2014_1112_1361,2014,1112,1361,143,0.064078


In [8]:
# Write the prediction table (index included) to a CSV file.
model_out.to_csv(path_or_buf='data/libfm_model_out.csv')