In [7]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.utils import load_data_template
from src.features import GameFeatures, GameDetailedFeatures, SeedFeatures

In [9]:
data = load_data_template(season=False)
data.dropna(subset=['a_win'], inplace=True)
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum
10303,2003,1120,1393,False,0,False,144


In [10]:
game_feat = GameFeatures(default_lags=3)
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_in_season, fillna=0, combine='subtract')
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_in_tourney, fillna=0, combine='subtract')
data = game_feat.per_team_wrapper(data, game_feat.last_games_won_against_opponent, fillna=0, per_game=True, combine='subtract')
data = game_feat.per_team_wrapper(data, game_feat.games_won_in_tourney_against_opponent, fillna=0, per_game=True, combine='subtract')
game_detail_feat = GameDetailedFeatures(default_lags=3)
data = game_detail_feat.per_team_wrapper(data, game_detail_feat.detail_features_by_game, per_day=True, combine='subtract')
seed_feat = SeedFeatures()
data = seed_feat.per_team_wrapper(data, seed_feat.team_seeds, combine='subtract')
data.dropna(inplace=True)
data.sample()

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,last_games_won_in_season_team_combined,last_games_won_in_season_team_combined_lag-1,last_games_won_in_season_team_combined_lag-2,...,Stl_game_team_combined_lag-1,Stl_game_team_combined_lag-2,Stl_game_team_combined_lag-3,Blk_game_team_combined_lag-1,Blk_game_team_combined_lag-2,Blk_game_team_combined_lag-3,PF_game_team_combined_lag-1,PF_game_team_combined_lag-2,PF_game_team_combined_lag-3,seed_team_combined
10357,2004,1338,1458,False,0,True,139,4,4.0,9.0,...,5.0,4.0,2.0,-1.0,-1.0,3.0,-6.0,-4.0,0.0,-3


In [11]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss


X = data.drop(['Season', 'team_a', 'team_b', 'a_win', 'in_target', 'DayNum'], axis=1)
y = data[['a_win', 'game_set']].astype({'a_win': int})

kf = KFold(n_splits=3, shuffle=True)
metrics = []
all_preds = []
for tr_i, t_i in kf.split(X):
    X_tr, y_tr = X.iloc[tr_i], y.iloc[tr_i].a_win
    X_t, y_t = X.iloc[t_i], y.iloc[t_i].a_win
    knn = KNeighborsClassifier(n_neighbors=15)
    knn.fit(X_tr.values, y_tr)
    preds = knn.predict_proba(X_t.values)[:,1]
    ncaa_true = y.iloc[t_i][y.iloc[t_i].game_set == 0]
    ncaa_pred = preds[y.iloc[t_i].reset_index().game_set == 0]
    metric = log_loss(ncaa_true.a_win.astype(int), ncaa_pred)
    print(metric)
    metrics.append(metric)
    all_preds.append(preds)
    
print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))
model_out = cp.deepcopy(data[['Season', 'team_a', 'team_b', 'DayNum']])
model_out['Pred'] = np.concatenate(all_preds)
model_out['ID'] = model_out['Season'].map(str) + '_' + model_out['team_a'].map(str) + '_' + model_out['team_b'].map(str)
model_out.set_index('ID', inplace=True)
model_out.head()

0.14880915237756837
0.1399712918550905
0.16927620568982418
Metric Mean: 0.15 (0.01)


Unnamed: 0_level_0,Season,team_a,team_b,DayNum,Pred
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014_1107_1196,2014,1107,1196,136,1.0
2014_1107_1291,2014,1107,1291,134,0.866667
2014_1110_1458,2014,1110,1458,136,0.8
2014_1112_1211,2014,1112,1211,139,0.6
2014_1112_1361,2014,1112,1361,143,0.0


In [12]:
model_out.to_csv('data/knn_model_out.csv')