In [1]:
# Enable IPython's autoreload extension so that edits to imported modules
# (e.g. the local `src.*` packages imported below) are picked up without
# restarting the kernel. Mode 2 reloads all modules before each execution.
%load_ext autoreload
%autoreload 2

In [2]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features.games import GameFeatures
from src.features.games_detailed import GameDetailedFeatures
from src.features.rankings import RankingFeatures
from src.features.seeds import SeedFeatures

In [3]:
# Load the game template and keep only the rows whose outcome (`a_win`) is
# known; rows without a label cannot be used for training or evaluation.
data = load_data_template(season=False).dropna(subset=['a_win'])
data.shape

(2117, 7)

In [4]:
# Feature engineering. Every step runs a feature function through the
# builder's `per_team_wrapper` with combine='subtract' (presumably team A minus
# team B -- confirm against per_team_wrapper) and replaces missing values
# with 0 before the next step runs.

# 1) Win-history features, all with 3 lags.
game_feat = GameFeatures(default_lags=3)
for feature_fn, wrapper_kwargs in [
    (game_feat.last_games_won_in_season, {}),
    (game_feat.last_games_won_in_tourney, {}),
    (game_feat.last_games_won_against_opponent, {'per_game': True}),
    (game_feat.games_won_in_tourney_against_opponent, {'per_game': True}),
]:
    data = game_feat.per_team_wrapper(
        data, feature_fn, **wrapper_kwargs, combine='subtract'
    ).fillna(0)

# 2) Detailed per-game statistics, also with 3 lags.
game_detail_feat = GameDetailedFeatures(default_lags=3)
data = game_detail_feat.per_team_wrapper(
    data, game_detail_feat.detail_features_by_game, per_day=True, combine='subtract'
).fillna(0)

# 3) Seed difference.
seed_feat = SeedFeatures()
data = seed_feat.per_team_wrapper(
    data, seed_feat.team_seeds, combine='subtract'
).fillna(0)

# Safeguard only: after the fillna(0) calls above no missing values should
# remain, so this is expected to drop nothing.
data = data.dropna()
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,last_games_won_in_season_combined,last_games_won_in_season_combined_lag-1,last_games_won_in_season_combined_lag-2,...,Stl_game_combined_lag-1,Stl_game_combined_lag-2,Stl_game_combined_lag-3,Blk_game_combined_lag-1,Blk_game_combined_lag-2,Blk_game_combined_lag-3,PF_game_combined_lag-1,PF_game_combined_lag-2,PF_game_combined_lag-3,seed_combined
7650,2017,1211,1321,True,0,True,138,9,5.0,16.0,...,1.0,9.0,-3.0,7.0,4.0,2.0,-1.0,-7.0,-5.0,-7


In [5]:
from sklearn.datasets import dump_svmlight_file
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
cat_cols = [c in ['team_a', 'team_b', 'game_set'] for c in X.columns]
y = data[['a_win', 'game_set']]

n_splits = 5
path = 'data/libfm/cv_{}-{:d}-of-X-shuffle.{}'.replace('X', str(n_splits))
kf = KFold(n_splits=n_splits, shuffle=True)
metrics = []
for i, (train_index, test_index) in enumerate(kf.split(X)):
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    ncaa_true = y.iloc[test_index][y.iloc[test_index].game_set == 0]
    ncaa_pred = preds[y.iloc[test_index].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    metrics.append(metric)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=1693	num_values=39731	num_features=54	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=424	num_values=10094	num_features=54	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.776728	Test=0.535377	Test(ll)=0.394248
#Iter=  1	Train=0.809214	Test=0.528302	Test(ll)=0.413035
#Iter=  2	Train=0.815712	Test=0.518868	Test(ll)=0.421397
#Iter=  3	Train=0.820437	Test=0.535377	Test(ll)=0.427157
#Iter=  4	Train=0.817484	Test=0.540094	Test(ll)=0.435233
#It

#Iter= 16	Train=0.843566	Test=0.524823	Test(ll)=0.486572
#Iter= 17	Train=0.841204	Test=0.527187	Test(ll)=0.489395
#Iter= 18	Train=0.840614	Test=0.527187	Test(ll)=0.492514
#Iter= 19	Train=0.837072	Test=0.524823	Test(ll)=0.495106
#Iter= 20	Train=0.83353	Test=0.51773	Test(ll)=0.498789
#Iter= 21	Train=0.831169	Test=0.513002	Test(ll)=0.501369
#Iter= 22	Train=0.837072	Test=0.513002	Test(ll)=0.504396
#Iter= 23	Train=0.841204	Test=0.513002	Test(ll)=0.5064
#Iter= 24	Train=0.840614	Test=0.51773	Test(ll)=0.508506
#Iter= 25	Train=0.838253	Test=0.515366	Test(ll)=0.509782
#Iter= 26	Train=0.842975	Test=0.515366	Test(ll)=0.511086
#Iter= 27	Train=0.848878	Test=0.513002	Test(ll)=0.512529
#Iter= 28	Train=0.85183	Test=0.510638	Test(ll)=0.514333
#Iter= 29	Train=0.846517	Test=0.508274	Test(ll)=0.516131
#Iter= 30	Train=0.845336	Test=0.501182	Test(ll)=0.518178
#Iter= 31	Train=0.840024	Test=0.498818	Test(ll)=0.520091
#Iter= 32	Train=0.847698	Test=0.501182	Test(ll)=0.522404
#Iter= 33	Train=0.849469	Test=0.50118

#Iter= 39	Train=0.83412	Test=0.55792	Test(ll)=0.520528
#Iter= 40	Train=0.839433	Test=0.55792	Test(ll)=0.5216
#Iter= 41	Train=0.837072	Test=0.55792	Test(ll)=0.521884
#Iter= 42	Train=0.835891	Test=0.55792	Test(ll)=0.522586
#Iter= 43	Train=0.834711	Test=0.55792	Test(ll)=0.523057
#Iter= 44	Train=0.83294	Test=0.55792	Test(ll)=0.52415
#Iter= 45	Train=0.835301	Test=0.55792	Test(ll)=0.524997
#Iter= 46	Train=0.83294	Test=0.560284	Test(ll)=0.525903
#Iter= 47	Train=0.832349	Test=0.560284	Test(ll)=0.526701
#Iter= 48	Train=0.838253	Test=0.560284	Test(ll)=0.527583
#Iter= 49	Train=0.838843	Test=0.55792	Test(ll)=0.528292
Metric Mean: 0.37 (0.00)


In [6]:
all_preds = []
        
for i, (train_index, test_index) in enumerate(kf.split(data)):
    
    with open(path.format("train", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[train_index]),
                           y.iloc[train_index].a_win.astype(int), f)
        
    with open(path.format("test", i, "libfm"),'wb') as f:
        dump_svmlight_file(OneHotEncoder(categorical_features=cat_cols).fit_transform(X.iloc[test_index]),
                           0.5*np.ones(X.iloc[test_index].shape[0]), f)
        
    !dist/libfm/bin/LibFM\
        -task c\
        -test data/libfm/cv_test-{i}-of-{n_splits}-shuffle.libfm\
        -train data/libfm/cv_train-{i}-of-{n_splits}-shuffle.libfm \
        -verbosity 0\
        -out data/libfm/predictions.csv\
        -dim '1,1,8'\
        -iter 50\
        -init_stdev .01
        
    preds = np.loadtxt('data/libfm/predictions.csv')
    all_preds.append(preds)
    
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

----------------------------------------------------------------------------
libFM
  Version: 1.4.4
  Author:  Steffen Rendle, srendle@libfm.org
  WWW:     http://www.libfm.org/
This program comes with ABSOLUTELY NO WARRANTY; for details see license.txt.
This is free software, and you are welcome to redistribute it under certain
conditions; for details see license.txt.
----------------------------------------------------------------------------
Loading train...	
has x = 0
has xt = 1
num_rows=1693	num_values=39212	num_features=54	min_target=0	max_target=1
Loading test... 	
has x = 0
has xt = 1
num_rows=424	num_values=10613	num_features=54	min_target=0.5	max_target=0.5
#relations: 0
Loading meta data...	
#Iter=  0	Train=0.764914	Test=0.502358	Test(ll)=0.436036
#Iter=  1	Train=0.779681	Test=0.528302	Test(ll)=0.437277
#Iter=  2	Train=0.806261	Test=0.516509	Test(ll)=0.446619
#Iter=  3	Train=0.828706	Test=0.5	Test(ll)=0.458889
#Iter=  4	Train=0.834022	Test=0.5	Test(ll)=0.474953
#Iter=  5	Tra

#Iter= 22	Train=0.841204	Test=0.522459	Test(ll)=0.516242
#Iter= 23	Train=0.837662	Test=0.522459	Test(ll)=0.518377
#Iter= 24	Train=0.838253	Test=0.524823	Test(ll)=0.520615
#Iter= 25	Train=0.843566	Test=0.520095	Test(ll)=0.522284
#Iter= 26	Train=0.838253	Test=0.51773	Test(ll)=0.52309
#Iter= 27	Train=0.842385	Test=0.51773	Test(ll)=0.524531
#Iter= 28	Train=0.835891	Test=0.51773	Test(ll)=0.524983
#Iter= 29	Train=0.842385	Test=0.51773	Test(ll)=0.526094
#Iter= 30	Train=0.843566	Test=0.515366	Test(ll)=0.526798
#Iter= 31	Train=0.845336	Test=0.515366	Test(ll)=0.528012
#Iter= 32	Train=0.845336	Test=0.515366	Test(ll)=0.529615
#Iter= 33	Train=0.844156	Test=0.515366	Test(ll)=0.531132
#Iter= 34	Train=0.842975	Test=0.513002	Test(ll)=0.533039
#Iter= 35	Train=0.837072	Test=0.520095	Test(ll)=0.534557
#Iter= 36	Train=0.845336	Test=0.51773	Test(ll)=0.535755
#Iter= 37	Train=0.848878	Test=0.51773	Test(ll)=0.537319
#Iter= 38	Train=0.848288	Test=0.513002	Test(ll)=0.539052
#Iter= 39	Train=0.85183	Test=0.513002	

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.037748
2014_1107_1291,2014,1107,1291,134,0.97857
2014_1110_1458,2014,1110,1458,136,0.008551
2014_1112_1211,2014,1112,1211,139,0.873457
2014_1112_1361,2014,1112,1361,143,0.962323


In [7]:
# Build the output table: game identifiers plus the model prediction, indexed
# by an ID of the form "<Season>_<team_a>_<team_b>".
model_out = data[['Season', 'team_a', 'team_b', 'DayNum']].copy()
model_out['Pred'] = np.concatenate(all_preds)

season_str = model_out['Season'].map(str)
team_a_str = model_out['team_a'].map(str)
team_b_str = model_out['team_b'].map(str)
model_out['ID'] = season_str + '_' + team_a_str + '_' + team_b_str

model_out = model_out.set_index('ID')
model_out.head()

Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,0.037748
2014_1107_1291,2014,1107,1291,134,0.97857
2014_1110_1458,2014,1110,1458,136,0.008551
2014_1112_1211,2014,1112,1211,139,0.873457
2014_1112_1361,2014,1112,1361,143,0.962323


In [8]:
# Persist the prediction table (indexed by ID) for downstream use.
model_out_csv = 'data/libfm_model_out.csv'
model_out.to_csv(model_out_csv)