In [2]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features.games import GameFeatures
from src.features.games_detailed import GameDetailedFeatures

In [4]:
# Load the template frame (season=False) and keep only the rows whose
# 'a_win' value is present; the cell then displays the resulting shape.
data = load_data_template(season=False).dropna(subset=['a_win'])
data.shape

(2117, 7)

In [5]:
# Feature builders are applied one after another, in a fixed order, each one
# receiving the frame returned by the previous call.
game_feat = GameFeatures(default_lags=1)

# Win-count features: missing values are filled with 0.
for win_feature in (game_feat.games_won_in_season,
                    game_feat.games_won_in_tourney):
    data = game_feat.per_team_wrapper(data, win_feature, fillna=0)

# "Against opponent" win-count features: same fill value, requested per game.
for versus_feature in (game_feat.games_won_in_season_against_opponent,
                       game_feat.games_won_in_tourney_against_opponent):
    data = game_feat.per_team_wrapper(data, versus_feature, fillna=0, per_game=True)

# Detailed per-game statistics, built with default_lags=3 and requested per day.
game_detail_feat = GameDetailedFeatures(default_lags=3)
data = game_detail_feat.per_team_wrapper(
    data,
    game_detail_feat.detail_features_by_game,
    per_day=True,
)

# Discard every row that still has a missing value in any column, then show
# one random row as a sanity check.
data = data.dropna()
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,games_won_in_season_team_a,games_won_in_season_team_a_lag-1,games_won_in_season_team_b,...,TO_game_team_b_lag-3,Stl_game_team_b_lag-1,Stl_game_team_b_lag-2,Stl_game_team_b_lag-3,Blk_game_team_b_lag-1,Blk_game_team_b_lag-2,Blk_game_team_b_lag-3,PF_game_team_b_lag-1,PF_game_team_b_lag-2,PF_game_team_b_lag-3
10573,2008,1277,1396,False,0,True,136,25,22.0,21,...,13.0,7.0,1.0,3.0,1.0,5.0,2.0,23.0,19.0,21.0


In [6]:
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
cat_cols = [c in ['team_a', 'team_b', 'game_set'] for c in X.columns]
y = data[['a_win', 'game_set']]

n_splits = 5
path = 'data/libfm/cv_{}-{:d}-of-X-shuffle.{}'.replace('X', str(n_splits))
kf = KFold(n_splits=n_splits, shuffle=True)
metrics = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    ncaa_true = y.iloc[test_index][y.iloc[test_index].game_set == 0]
    ncaa_pred = preds[y.iloc[test_index].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=66054	num_features=91	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=16588	num_features=91	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.572704	Test=0.456853	Test(ll)=0.772612
#Iter=  1	Train=0.663265	Test=0.461929	Test(ll)=0.689297
#Iter=  2	Train=0.752551	Test=0.472081	Test(ll)=0.651336
#Iter=  3	Train=0.803571	Test=0.472081	Test(ll)=0.631616
#Iter=  4	Train=0.82398	Test=0.497462	Test(ll)=0.618579
#Iter

#Iter= 10	Train=0.935032	Test=0.459184	Test(ll)=0.676461
#Iter= 11	Train=0.945223	Test=0.44898	Test(ll)=0.681087
#Iter= 12	Train=0.945223	Test=0.454082	Test(ll)=0.686573
#Iter= 13	Train=0.952866	Test=0.443878	Test(ll)=0.692845
#Iter= 14	Train=0.952866	Test=0.44898	Test(ll)=0.699987
#Iter= 15	Train=0.95414	Test=0.44898	Test(ll)=0.704151
#Iter= 16	Train=0.95414	Test=0.443878	Test(ll)=0.705498
#Iter= 17	Train=0.956688	Test=0.443878	Test(ll)=0.709716
#Iter= 18	Train=0.96051	Test=0.443878	Test(ll)=0.714202
#Iter= 19	Train=0.96051	Test=0.443878	Test(ll)=0.718517
#Iter= 20	Train=0.961783	Test=0.443878	Test(ll)=0.722673
#Iter= 21	Train=0.968153	Test=0.433673	Test(ll)=0.725808
#Iter= 22	Train=0.966879	Test=0.433673	Test(ll)=0.729097
#Iter= 23	Train=0.964331	Test=0.443878	Test(ll)=0.732913
#Iter= 24	Train=0.965605	Test=0.443878	Test(ll)=0.736487
#Iter= 25	Train=0.965605	Test=0.443878	Test(ll)=0.740382
#Iter= 26	Train=0.971975	Test=0.438776	Test(ll)=0.743843
#Iter= 27	Train=0.973248	Test=0.433673

#Iter= 31	Train=0.957962	Test=0.438776	Test(ll)=0.862063
#Iter= 32	Train=0.956688	Test=0.433673	Test(ll)=0.864394
#Iter= 33	Train=0.952866	Test=0.428571	Test(ll)=0.866958
#Iter= 34	Train=0.950318	Test=0.428571	Test(ll)=0.869354
#Iter= 35	Train=0.956688	Test=0.428571	Test(ll)=0.871215
#Iter= 36	Train=0.957962	Test=0.428571	Test(ll)=0.873286
#Iter= 37	Train=0.964331	Test=0.428571	Test(ll)=0.875056
#Iter= 38	Train=0.964331	Test=0.428571	Test(ll)=0.876272
#Iter= 39	Train=0.966879	Test=0.428571	Test(ll)=0.87809
#Iter= 40	Train=0.971975	Test=0.428571	Test(ll)=0.879857
#Iter= 41	Train=0.970701	Test=0.433673	Test(ll)=0.88195
#Iter= 42	Train=0.971975	Test=0.428571	Test(ll)=0.884007
#Iter= 43	Train=0.971975	Test=0.428571	Test(ll)=0.885703
#Iter= 44	Train=0.974522	Test=0.428571	Test(ll)=0.88718
#Iter= 45	Train=0.971975	Test=0.428571	Test(ll)=0.888609
#Iter= 46	Train=0.973248	Test=0.428571	Test(ll)=0.890192
#Iter= 47	Train=0.968153	Test=0.428571	Test(ll)=0.891799
#Iter= 48	Train=0.971975	Test=0.43

In [7]:
all_preds = []
        
for i, (train_index, test_index) in enumerate(kf.split(data)):
    
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    all_preds.append(preds)
    
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=784	num_values=66030	num_features=91	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=197	num_values=16612	num_features=91	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.575255	Test=0.42132	Test(ll)=1.06522
#Iter=  1	Train=0.692602	Test=0.411168	Test(ll)=0.95751
#Iter=  2	Train=0.746173	Test=0.436548	Test(ll)=0.899196
#Iter=  3	Train=0.77551	Test=0.441624	Test(ll)=0.877538
#Iter=  4	Train=0.8125	Test=0.436548	Test(ll)=0.854186
#Iter=  5	

#Iter=  8	Train=0.93758	Test=0.530612	Test(ll)=0.596291
#Iter=  9	Train=0.942675	Test=0.520408	Test(ll)=0.598904
#Iter= 10	Train=0.945223	Test=0.515306	Test(ll)=0.603803
#Iter= 11	Train=0.952866	Test=0.515306	Test(ll)=0.607309
#Iter= 12	Train=0.961783	Test=0.515306	Test(ll)=0.611634
#Iter= 13	Train=0.963057	Test=0.505102	Test(ll)=0.617316
#Iter= 14	Train=0.970701	Test=0.5	Test(ll)=0.622144
#Iter= 15	Train=0.961783	Test=0.505102	Test(ll)=0.626859
#Iter= 16	Train=0.956688	Test=0.5	Test(ll)=0.630743
#Iter= 17	Train=0.96051	Test=0.505102	Test(ll)=0.63446
#Iter= 18	Train=0.961783	Test=0.510204	Test(ll)=0.638648
#Iter= 19	Train=0.96051	Test=0.510204	Test(ll)=0.642241
#Iter= 20	Train=0.968153	Test=0.510204	Test(ll)=0.645725
#Iter= 21	Train=0.964331	Test=0.505102	Test(ll)=0.648754
#Iter= 22	Train=0.970701	Test=0.5	Test(ll)=0.651556
#Iter= 23	Train=0.961783	Test=0.5	Test(ll)=0.654209
#Iter= 24	Train=0.963057	Test=0.505102	Test(ll)=0.656947
#Iter= 25	Train=0.965605	Test=0.505102	Test(ll)=0.65928

#Iter= 32	Train=0.973248	Test=0.469388	Test(ll)=0.715109
#Iter= 33	Train=0.971975	Test=0.469388	Test(ll)=0.718068
#Iter= 34	Train=0.974522	Test=0.469388	Test(ll)=0.720598
#Iter= 35	Train=0.974522	Test=0.469388	Test(ll)=0.722896
#Iter= 36	Train=0.973248	Test=0.464286	Test(ll)=0.725355
#Iter= 37	Train=0.971975	Test=0.464286	Test(ll)=0.728136
#Iter= 38	Train=0.975796	Test=0.469388	Test(ll)=0.730425
#Iter= 39	Train=0.970701	Test=0.469388	Test(ll)=0.732477
#Iter= 40	Train=0.970701	Test=0.469388	Test(ll)=0.734219
#Iter= 41	Train=0.975796	Test=0.469388	Test(ll)=0.736074
#Iter= 42	Train=0.97707	Test=0.469388	Test(ll)=0.738165
#Iter= 43	Train=0.973248	Test=0.469388	Test(ll)=0.740373
#Iter= 44	Train=0.975796	Test=0.469388	Test(ll)=0.74281
#Iter= 45	Train=0.975796	Test=0.469388	Test(ll)=0.745003
#Iter= 46	Train=0.978344	Test=0.469388	Test(ll)=0.747223
#Iter= 47	Train=0.979618	Test=0.469388	Test(ll)=0.74932
#Iter= 48	Train=0.973248	Test=0.47449	Test(ll)=0.751491
#Iter= 49	Train=0.978344	Test=0.474

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,1.0
2014_1107_1291,2014,1107,1291,134,0.005842
2014_1110_1458,2014,1110,1458,136,0.97863
2014_1112_1211,2014,1112,1211,139,0.338499
2014_1112_1361,2014,1112,1361,143,1.0


In [8]:
# Build the output frame: identifying columns plus the predictions, indexed
# by an ID string of the form "<Season>_<team_a>_<team_b>".
# NOTE(review): 'Pred' is assigned by position, so this is only correct if
# np.concatenate(all_preds) is in the same row order as `data` - confirm.
model_out = data[['Season', 'team_a', 'team_b', 'DayNum']].copy(deep=True)
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = [
    '{}_{}_{}'.format(season, team_a, team_b)
    for season, team_a, team_b in zip(model_out['Season'],
                                      model_out['team_a'],
                                      model_out['team_b'])
]
model_out = model_out.set_index('ID')
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,1.0
2014_1107_1291,2014,1107,1291,134,0.005842
2014_1110_1458,2014,1110,1458,136,0.97863
2014_1112_1211,2014,1112,1211,139,0.338499
2014_1112_1361,2014,1112,1361,143,1.0


In [9]:
# Persist the predictions; the 'ID' index is written out as the first column.
model_out.to_csv(path_or_buf='data/libfm_model_out.csv', index=True)