In [189]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features.games import GameFeatures

In [317]:
# Load the game template (called with season=True) and keep only rows whose outcome
# column `a_win` is present. The frame is rebound rather than modified with
# `inplace=True`, so the object returned by `load_data_template` is never mutated
# and re-running this cell always starts from a fresh load.
data = load_data_template(season=True).dropna(subset=['a_win'])
data.shape

(152801, 7)

In [318]:
# Build the win-count features through `GameFeatures.per_team_wrapper`.
# Judging by the displayed columns, each feature is added once per team
# (`_team_a` / `_team_b`) together with `_lag-1` ... `_lag-5` variants.
game_feat = GameFeatures(default_lags=5)

# (feature function, extra keyword arguments) in the order they are applied.
feature_specs = [
    (game_feat.games_won_in_season, {}),
    (game_feat.games_won_in_tourney, {}),
    (game_feat.games_won_in_season_against_opponent, {'per_game': True}),
    (game_feat.games_won_in_tourney_against_opponent, {'per_game': True}),
]
for feature_fn, extra_kwargs in feature_specs:
    # Missing feature values are filled with 0 for every feature.
    data = game_feat.per_team_wrapper(data, feature_fn, fillna=0, **extra_kwargs)

# Show one random row as a quick sanity check of the new columns.
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,games_won_in_season_team_a,games_won_in_season_team_a_lag-1,games_won_in_season_team_a_lag-2,...,games_won_in_tourney_against_opponent_team_a_lag-1,games_won_in_tourney_against_opponent_team_a_lag-2,games_won_in_tourney_against_opponent_team_a_lag-3,games_won_in_tourney_against_opponent_team_a_lag-4,games_won_in_tourney_against_opponent_team_a_lag-5,games_won_in_tourney_against_opponent_team_b_lag-1,games_won_in_tourney_against_opponent_team_b_lag-2,games_won_in_tourney_against_opponent_team_b_lag-3,games_won_in_tourney_against_opponent_team_b_lag-4,games_won_in_tourney_against_opponent_team_b_lag-5
50851,1995,1169,1417,False,1,False,26.0,7.0,7.0,8.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [322]:
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
cat_cols = [c in ['team_a', 'team_b', 'game_set'] for c in X.columns]
y = data[['a_win', 'game_set']]

n_splits = 5
path = 'data/libfm/cv_{}-{:d}-of-X-shuffle.{}'.replace('X', str(n_splits))
kf = KFold(n_splits=n_splits, shuffle=True)
metrics = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    ncaa_true = y.iloc[test_index][y.iloc[test_index].game_set == 0]
    ncaa_pred = preds[y.iloc[test_index].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=122240	num_values=2170553	num_features=44	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=30561	num_values=539551	num_features=43	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.877626	Test=0.48395	Test(ll)=0.398704
#Iter=  1	Train=0.879188	Test=0.483787	Test(ll)=0.424503
#Iter=  2	Train=0.879761	Test=0.48575	Test(ll)=0.445249
#Iter=  3	Train=0.879998	Test=0.483754	Test(ll)=0.463181
#Iter=  4	Train=0.880031	Test=0.483819	Test(ll)=0.47881

#Iter=  7	Train=0.880916	Test=0.460929	Test(ll)=0.540384
#Iter=  8	Train=0.880768	Test=0.460929	Test(ll)=0.549835
#Iter=  9	Train=0.880891	Test=0.460406	Test(ll)=0.558538
#Iter= 10	Train=0.880899	Test=0.46034	Test(ll)=0.566508
#Iter= 11	Train=0.880875	Test=0.460046	Test(ll)=0.57405
#Iter= 12	Train=0.880736	Test=0.460373	Test(ll)=0.580874
#Iter= 13	Train=0.880654	Test=0.460275	Test(ll)=0.58721
#Iter= 14	Train=0.880801	Test=0.460406	Test(ll)=0.59321
#Iter= 15	Train=0.88085	Test=0.46034	Test(ll)=0.598801
#Iter= 16	Train=0.880695	Test=0.459751	Test(ll)=0.604037
#Iter= 17	Train=0.880613	Test=0.459653	Test(ll)=0.608948
#Iter= 18	Train=0.880539	Test=0.459522	Test(ll)=0.613697
#Iter= 19	Train=0.880507	Test=0.459293	Test(ll)=0.618115
#Iter= 20	Train=0.880621	Test=0.458704	Test(ll)=0.622398
#Iter= 21	Train=0.880621	Test=0.458541	Test(ll)=0.626462
#Iter= 22	Train=0.880523	Test=0.458246	Test(ll)=0.630308
#Iter= 23	Train=0.880646	Test=0.458835	Test(ll)=0.633946
#Iter= 24	Train=0.880588	Test=0.45873

#Iter= 27	Train=0.880417	Test=0.471826	Test(ll)=0.613401
#Iter= 28	Train=0.880196	Test=0.471728	Test(ll)=0.61682
#Iter= 29	Train=0.88004	Test=0.471826	Test(ll)=0.620126
#Iter= 30	Train=0.880474	Test=0.471793	Test(ll)=0.623318
#Iter= 31	Train=0.880433	Test=0.47176	Test(ll)=0.626478
#Iter= 32	Train=0.880515	Test=0.471433	Test(ll)=0.629489
#Iter= 33	Train=0.880457	Test=0.471466	Test(ll)=0.632415
#Iter= 34	Train=0.880155	Test=0.471433	Test(ll)=0.635237
#Iter= 35	Train=0.880098	Test=0.471204	Test(ll)=0.637983
#Iter= 36	Train=0.880261	Test=0.470975	Test(ll)=0.640628
#Iter= 37	Train=0.880237	Test=0.470812	Test(ll)=0.643152
#Iter= 38	Train=0.880204	Test=0.47091	Test(ll)=0.645578
#Iter= 39	Train=0.880179	Test=0.471008	Test(ll)=0.647935
#Iter= 40	Train=0.880277	Test=0.470746	Test(ll)=0.65022
#Iter= 41	Train=0.880286	Test=0.470713	Test(ll)=0.652428
#Iter= 42	Train=0.880196	Test=0.470812	Test(ll)=0.654535
#Iter= 43	Train=0.880065	Test=0.470975	Test(ll)=0.656584
#Iter= 44	Train=0.879991	Test=0.4711

In [325]:
all_preds = []
        
for i, (train_index, test_index) in enumerate(kf.split(data)):
    
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    all_preds.append(preds)
    
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014_1107_1196,2014,1107,1196,0.513852
2014_1107_1291,2014,1107,1291,0.215372
2014_1110_1458,2014,1110,1458,0.893795
2014_1112_1211,2014,1112,1211,0.821133
2014_1112_1361,2014,1112,1361,0.563801


In [327]:
# `model_out` (Season, team_a, team_b, DayNum, Pred, indexed by ID) is already
# built by the preceding cell; this cell used to repeat that construction
# verbatim, so it now only displays the existing frame.
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136.0,0.513852
2014_1107_1291,2014,1107,1291,134.0,0.215372
2014_1110_1458,2014,1110,1458,136.0,0.893795
2014_1112_1211,2014,1112,1211,139.0,0.821133
2014_1112_1361,2014,1112,1361,143.0,0.563801


In [328]:
# Persist the per-game predictions (ID index included) next to the LibFM files.
model_out_path = 'data/libfm/model_out.csv'
model_out.to_csv(model_out_path)