In [None]:
import copy as cp
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
def create_target(submission_csv='data/SampleSubmissionStage1.csv',
                  results_csv='data/NCAATourneyCompactResults.csv'):
    """Build the table of matchups to predict, labelled with known tournament results.

    Parameters
    ----------
    submission_csv : str or file-like, optional
        CSV with an ``ID`` column of the form ``<Season>_<team_a>_<team_b>``
        and a ``Pred`` column (which is dropped).
    results_csv : str or file-like, optional
        CSV of tournament results with at least ``Season``, ``WTeamID`` and
        ``LTeamID`` columns.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``ID`` with string columns ``Season``, ``team_a``,
        ``team_b``, a constant ``in_target`` = True, and ``a_win``
        (True/False where a matching game exists in ``results_csv``,
        NaN otherwise).
    """
    target = pd.read_csv(submission_csv).set_index('ID').drop('Pred', axis=1)
    id_parts = [match_id.split('_') for match_id in target.index]
    target['Season'] = [parts[0] for parts in id_parts]
    target['team_a'] = [parts[1] for parts in id_parts]
    target['team_b'] = [parts[2] for parts in id_parts]
    target['in_target'] = True

    ncaa_games = pd.read_csv(results_csv)
    # team_a is the numerically lower id and team_b the higher one, so the ID
    # built below does not depend on who won. Vectorised min/max replaces the
    # row-wise apply that indexed a label-indexed Series by position.
    team_ids = ncaa_games[['WTeamID', 'LTeamID']].astype(int)
    ncaa_games['team_a'] = team_ids.min(axis=1)
    ncaa_games['team_b'] = team_ids.max(axis=1)
    ncaa_games['a_win'] = ncaa_games['WTeamID'] == ncaa_games['team_a']
    ncaa_games['ID'] = (
        ncaa_games['Season'].astype(str)
        + '_' + ncaa_games['team_a'].astype(str)
        + '_' + ncaa_games['team_b'].astype(str)
    )
    ncaa_games = ncaa_games.set_index('ID')

    # Left join keeps every requested matchup; games never played stay NaN.
    # NOTE(review): this only matches if submission IDs also list the lower
    # team id first - confirm against the submission file format.
    target = target.join(ncaa_games['a_win'], how='left')
    return target

In [None]:
# Build the labelled matchup table and sanity-check its size.
target = create_target()
print(target.shape)  # function-call form: the statement form is a SyntaxError on Python 3
target.head()

In [None]:
# Persist the labelled matchup table. The original referenced `target_2`,
# which is not defined anywhere in this notebook (NameError on a fresh kernel).
# NOTE(review): a later cell writes a filtered `target` to the same path - confirm
# which version of the file is the intended one.
target.to_csv('data/target.csv')

In [None]:
# Tournament results: ids and season are kept as text so they can be matched
# against the string keys parsed from the submission IDs; `diff` is the
# winner's scoring margin.
ncaa_games = (
    pd.read_csv('data/NCAATourneyCompactResults.csv')
    .astype({'LTeamID': str, 'WTeamID': str, 'Season': str})
    .assign(diff=lambda games: games['WScore'] - games['LScore'])
)
print(ncaa_games.shape)
ncaa_games.sample()

In [None]:
# Regular-season results, prepared the same way as the tournament frame:
# text ids/season plus the winner's scoring margin in `diff`.
regular_games = (
    pd.read_csv('data/RegularSeasonCompactResults.csv')
    .astype({'LTeamID': str, 'WTeamID': str, 'Season': str})
    .assign(diff=lambda games: games['WScore'] - games['LScore'])
)
print(regular_games.shape)
regular_games.sample()

In [None]:
# Assemble the modelling frame: historical tournament games (training rows)
# stacked on top of the matchups to predict (in_target rows), then enriched
# with lagged win-count features for each side of the matchup.


def _lagged_win_counts(games, count_keys, shift_keys, prefix, lag_values):
    """Count wins per `count_keys` group and return them as lagged columns.

    Parameters
    ----------
    games : pandas.DataFrame
        Game results containing the `count_keys` columns and a `diff` column
        (rows are counted through `diff`).
    count_keys : list of str
        Columns to group by when counting wins; they become the result index.
    shift_keys : str or list of str
        Index level(s) within which the counts are shifted.
    prefix : str
        Base column name; output columns are ``<prefix>_lag-<k>``.
    lag_values : iterable of int
        Shifts to materialise.

    Returns
    -------
    pandas.DataFrame
        Indexed by `count_keys`, one ``<prefix>_lag-<k>`` column per lag,
        missing history filled with 0.
    """
    counts = games.groupby(count_keys).count()[['diff']].rename(columns={'diff': prefix})
    for lag in lag_values:
        # NOTE(review): shift() moves by rows present in the grouped table, so
        # "lag-k" is the k-th previous season *with at least one win*, not
        # necessarily k calendar seasons back.
        counts['{}_lag-{}'.format(prefix, lag)] = (
            counts.groupby(shift_keys)[prefix].shift(lag).fillna(0)
        )
    return counts.drop(prefix, axis=1)


data = ncaa_games.copy()
winner_ids = data['WTeamID'].astype(int)
loser_ids = data['LTeamID'].astype(int)
# team_a is the numerically lower team id, team_b the higher one. Vectorised
# selection replaces a row-wise apply that indexed each row by position
# (t[0]/t[1]), which pandas deprecates for label-indexed Series.
data['team_a'] = data['WTeamID'].where(winner_ids < loser_ids, data['LTeamID'])
data['team_b'] = data['WTeamID'].where(winner_ids > loser_ids, data['LTeamID'])
data['a_win'] = data['WTeamID'] == data['team_a']
data = data[['Season', 'team_a', 'team_b', 'a_win']]
# Historical rows get in_target=False; target rows without a known result get a_win=False.
data = pd.concat([data, target.reset_index(drop=True)]).fillna(0).astype({'a_win': bool, 'in_target': bool})

lags = 5

# (source games, column base name, first lag, counted per opponent?)
# Tournament features start at lag 1 and regular-season features at lag 0
# - presumably so the tournament being predicted never feeds its own features.
feature_specs = [
    (ncaa_games, 'won_in_ncaa', 1, False),                      # games won in past tournaments
    (regular_games, 'won_in_season', 0, False),                 # games won in current/past seasons
    (ncaa_games, 'ncaa_wins_against_opponent', 1, True),        # tournament wins against this opponent
    (regular_games, 'season_wins_against_opponent', 0, True),   # season wins against this opponent
]

for team, opponent_team in [('a', 'b'), ('b', 'a')]:
    team_col = 'team_{}'.format(team)
    opponent_col = 'team_{}'.format(opponent_team)

    for games, base_name, first_lag, per_opponent in feature_specs:
        if per_opponent:
            count_keys = ['Season', 'WTeamID', 'LTeamID']
            shift_keys = ['WTeamID', 'LTeamID']
            merge_keys = ['Season', team_col, opponent_col]
        else:
            count_keys = ['WTeamID', 'Season']
            shift_keys = 'WTeamID'
            merge_keys = [team_col, 'Season']

        lagged = _lagged_win_counts(
            games, count_keys, shift_keys,
            '{}_{}'.format(base_name, team),
            range(first_lag, lags + 1),
        )
        # merge_keys are ordered to line up with the MultiIndex levels of `lagged`.
        data = pd.merge(data, lagged,
                        left_on=merge_keys, right_index=True,
                        how='left').fillna(0)

# Every merge above already fills missing values with 0, so this is only a safeguard.
data = data.dropna()
data.head(5)

In [None]:
data.shape

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss


# 8-fold cross-validated log loss of a default XGBoost classifier on the
# historical (non-target) rows.
CV_SEED = 42  # fixed so the shuffled folds, and therefore the scores, are reproducible
non_feature_cols = ['Season', 'team_a', 'team_b', 'a_win', 'in_target']

train_rows = data[~data.in_target.astype(bool)]
X = train_rows.drop(non_feature_cols, axis=1)
y = train_rows[['a_win']]

kf = KFold(n_splits=8, shuffle=True, random_state=CV_SEED)
metrics = []
for tr_i, t_i in kf.split(X):
    X_tr, y_tr = X.iloc[tr_i], y.iloc[tr_i]
    X_t, y_t = X.iloc[t_i], y.iloc[t_i]
    xgb = XGBClassifier()
    xgb.fit(X_tr.values, y_tr.values.reshape(-1))
    preds = xgb.predict_proba(X_t.values)
    # Explicit labels keep log_loss well-defined even if a fold happens to
    # contain a single class.
    metric = log_loss(y_t.values.reshape(-1), preds, labels=[False, True])
    print(metric)
    metrics.append(metric)

print('Metric Mean: {:.2f} ({:.2f})'.format(np.mean(metrics), np.std(metrics)))

In [None]:
from xgboost import plot_importance

# Feature importance of the model fitted on the last CV fold.
# plot_importance creates its own axes unless one is passed in, so the figure
# size has to be applied through an explicit `ax`.
fig, ax = plt.subplots(figsize=(20, 4))
plot_importance(xgb, max_num_features=15, ax=ax)
plt.show()

In [None]:
data.head()

In [None]:
# Fit on all historical rows, then score the matchups requested by the submission file.
submission_meta_cols = ['Season', 'team_a', 'team_b', 'a_win', 'in_target']
submission_rows = data[data.in_target.astype(bool)]
X_sub = submission_rows.drop(submission_meta_cols, axis=1)
y_sub = submission_rows[submission_meta_cols]

xgb = XGBClassifier()
xgb.fit(X.values, y.values.reshape(-1))
preds = xgb.predict_proba(X_sub.values)

sub = y_sub.copy()
sub['Pred'] = preds[:, 1]  # second probability column, i.e. P(a_win == True)
sub['ID'] = sub['Season'].map(str) + '_' + sub['team_a'].map(str) + '_' + sub['team_b'].map(str)
sub = sub.set_index('ID').drop(submission_meta_cols, axis=1)
sub.head()

In [None]:
# Preview only: displaying the whole frame floods the notebook output.
y_sub.head()

In [None]:
# Distribution of the predicted win probabilities in the submission.
fig, ax = plt.subplots(figsize=(20, 4))
ax.hist(sub['Pred'], bins=100)
plt.show()

In [None]:
# Write the XGBoost predictions (index ID, column Pred) as a submission file.
sub.to_csv('submissions/xgb_baseline_2_corrected_lags_and_features.csv')

In [None]:
### 

In [None]:
# Rebuild the matchup table from the sample submission: each ID is
# "<Season>_<team_a>_<team_b>", split here into its three string parts.
target = (
    pd.read_csv('data/SampleSubmissionStage1.csv')
    .set_index('ID')
    .drop('Pred', axis=1)
)
for part_pos, part_name in enumerate(['Season', 'team_a', 'team_b']):
    target[part_name] = target.index.map(lambda match_id, pos=part_pos: match_id.split('_')[pos])
target['in_target'] = True
target.head()

In [None]:
# Re-read the tournament results and key each game by "<Season>_<low id>_<high id>".
# NOTE: this rebinds `ncaa_games`, replacing the string-typed frame (with its
# `diff` column) created earlier in the notebook.
ncaa_games = pd.read_csv('data/NCAATourneyCompactResults.csv')
# Vectorised min/max replaces a row-wise apply that indexed each row by
# position (t[0]/t[1]), which pandas deprecates for label-indexed Series.
team_ids = ncaa_games[['WTeamID', 'LTeamID']].astype(int)
ncaa_games['team_a'] = team_ids.min(axis=1)
ncaa_games['team_b'] = team_ids.max(axis=1)
ncaa_games['a_win'] = ncaa_games['WTeamID'] == ncaa_games['team_a']
ncaa_games['ID'] = (
    ncaa_games['Season'].astype(str)
    + '_' + ncaa_games['team_a'].astype(str)
    + '_' + ncaa_games['team_b'].astype(str)
)
ncaa_games = ncaa_games.set_index('ID')
ncaa_games.head()

In [None]:
# NOTE(review): called without an array argument, so this cell raises as written.
# It looks like leftover scratch work - supply the intended operand or remove the cell.
np.nansum()

In [None]:
# Load the sample submission; a later cell merges it on its ID column and overwrites Pred.
submission = pd.read_csv('data/SampleSubmissionStage1.csv')
submission.head()

In [None]:
# Attach the known outcome to every matchup, keep only games that were actually
# played (rows without a result are dropped), and order them deterministically.
# Not idempotent: re-running fails because `in_target` has already been dropped.
target = (
    target
    .join(ncaa_games['a_win'])
    .drop('in_target', axis=1)
    .dropna()
    .sort_values(['Season', 'team_a', 'team_b'])
)

In [None]:
# Multiply by 1 to turn the True/False outcomes into 1/0 before writing to disk.
target['a_win'] = target['a_win'] * 1
target.to_csv('data/target.csv')

In [None]:
# "Ground truth" submission: Pred is the real outcome where the game was
# played and 0 for every matchup that never happened.
to_submit = (
    pd.merge(submission, target.reset_index(), how='left', on='ID')
    .assign(Pred=lambda merged: merged['a_win'])
    .fillna(0)[['ID', 'Pred']]
)
to_submit.head()

In [None]:
# Write the ground-truth submission without the positional index.
(to_submit*1).to_csv('data/ground_truth_submission.csv', index=False)

In [None]:
## Features

In [None]:
from src.features.games import GameFeatures

In [None]:
game_features = GameFeatures()

In [None]:
game_features.season_games

## Neural net

In [None]:
from keras import Sequential
from keras.layers import Dense

In [None]:
train_data = data.loc[~data['in_target']]
train_data.head()

In [None]:
# Number of training rows, then how many of them were won by team_a.
print(train_data['a_win'].shape)  # function-call form: the statement form is a SyntaxError on Python 3
sum(train_data['a_win'])

In [None]:
cols_to_drop = ['Season', 'a_win', 'in_target', 'team_a', 'team_b']

In [None]:
# Small fully-connected binary classifier: 12 -> 8 -> 1 units, sigmoid output.
# The input width is every column of `data` except the ones in cols_to_drop.
n_input_features = data.shape[1] - len(cols_to_drop)
model = Sequential([
    Dense(12, input_dim=n_input_features, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid'),
])
# Compile model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['binary_crossentropy'])

In [None]:
# Fit the model on the training rows only: features are every column except
# cols_to_drop, and the label is a_win multiplied by 1 to make it numeric.
model.fit(train_data.drop(cols_to_drop, axis=1), train_data['a_win']*1, epochs=10, batch_size=10)

In [None]:
predictions = model.predict(train_data.drop(cols_to_drop, axis=1))

In [None]:
pd.Series(predictions.reshape(-1)).describe()

In [None]:
# Distribution of the network's predicted probabilities on the training rows.
fig, ax = plt.subplots(figsize=(15, 5))
ax.hist(predictions, bins=100)
plt.show()

In [None]:
# Evaluate the model on the same feature columns it was built and fitted with.
# The original dropped only 'a_win', which fed the id columns and in_target to
# a network whose input_dim is data.shape[1] - len(cols_to_drop).
# NOTE(review): `data` also contains the in_target rows, whose a_win was filled
# with False when no result was known - confirm this is the intended evaluation set.
scores = model.evaluate(data.drop(cols_to_drop, axis=1), data['a_win']*1)

### TensorFlow neural network

In [246]:
%load_ext autoreload
%autoreload 2
from src.features.games import GameFeatures
from src.utils import load_data_template
from src.models.nn.ann import ANN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [247]:
# Start from the project's data template and add each win-count feature family
# for both teams; the "against opponent" variants are passed per_game=True.
game_feat = GameFeatures()
data = load_data_template()
feature_steps = [
    (game_feat.games_won_in_season, {}),
    (game_feat.games_won_in_tourney, {}),
    (game_feat.games_won_in_season_against_opponent, {'per_game': True}),
    (game_feat.games_won_in_tourney_against_opponent, {'per_game': True}),
]
for feature_fn, step_kwargs in feature_steps:
    data = game_feat.per_team_wrapper(data, feature_fn, fillna=0, **step_kwargs)
data = data.dropna()
data.sample(10)

Unnamed: 0,Season,team_a,team_b,in_target,game_set,a_win,DayNum,games_won_in_season_team_a,games_won_in_season_team_a_lag-1,games_won_in_season_team_a_lag-2,...,games_won_in_season_against_opponent_team_b,games_won_in_season_against_opponent_team_b_lag-1,games_won_in_season_against_opponent_team_b_lag-2,games_won_in_season_against_opponent_team_b_lag-3,games_won_in_tourney_against_opponent_team_a_lag-1,games_won_in_tourney_against_opponent_team_a_lag-2,games_won_in_tourney_against_opponent_team_a_lag-3,games_won_in_tourney_against_opponent_team_b_lag-1,games_won_in_tourney_against_opponent_team_b_lag-2,games_won_in_tourney_against_opponent_team_b_lag-3
10601,2008,1242,1424,False,0,True,138.0,30,30.0,24.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10481,2006,1133,1338,False,0,True,139.0,20,13.0,15.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7680,2017,1211,1462,True,0,True,145.0,32,25.0,31.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
9345,1988,1210,1350,False,0,False,139.0,21,16.0,24.0,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10945,2013,1235,1326,False,0,False,139.0,22,22.0,16.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9863,1996,1153,1280,False,0,False,146.0,25,21.0,22.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10387,2005,1261,1412,False,0,False,136.0,20,18.0,21.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10683,2009,1338,1462,False,0,True,143.0,27,26.0,27.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10368,2004,1400,1462,False,0,False,144.0,23,22.0,19.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3048,2015,1173,1344,True,0,True,137.0,25,23.0,17.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [248]:
print(data.shape)
print(data.dropna().shape)

(2117, 35)
(2117, 35)


In [249]:
# Split by the in_target flag: non-target rows train the model, target rows
# evaluate it. Labels are multiplied by 1 to make them numeric.
ann_non_feature_cols = ['Season', 'a_win', 'in_target', 'team_a', 'team_b', 'DayNum']
is_eval_row = data.in_target
X = data.loc[~is_eval_row].drop(ann_non_feature_cols, axis=1)
Y = data.loc[~is_eval_row, 'a_win']*1
X_eval = data.loc[is_eval_row].drop(ann_non_feature_cols, axis=1)
Y_eval = data.loc[is_eval_row, 'a_win']*1

In [250]:
sum(Y)/len(Y)

0.5116279069767442

In [251]:
# Hyper-parameters passed to the ANN constructor as keyword arguments;
# input_dim follows the width of the training matrix.
param_dict = dict(
    input_dim=X.shape[1],
    batch_size=50,
    hidden_units=[32],
    eta=0.01,
    num_epochs=500,
)
print(param_dict)

{'input_dim': 29, 'batch_size': 50, 'hidden_units': [32], 'eta': 0.01, 'num_epochs': 500}


In [254]:
ann = ANN(**param_dict)

In [255]:
ann.train(X, Y, X_eval, Y_eval, verbose=True)

Epoch: 000 train_loss=0.761 eval_loss=0.514
Epoch: 005 train_loss=0.382 eval_loss=0.455
Epoch: 010 train_loss=0.368 eval_loss=0.459
Epoch: 015 train_loss=0.399 eval_loss=0.407
Epoch: 020 train_loss=0.363 eval_loss=0.429
Epoch: 025 train_loss=0.350 eval_loss=0.447
Epoch: 030 train_loss=0.351 eval_loss=0.455
Epoch: 035 train_loss=0.361 eval_loss=0.505
Epoch: 040 train_loss=0.347 eval_loss=0.407
Epoch: 045 train_loss=0.343 eval_loss=0.407
Epoch: 050 train_loss=0.341 eval_loss=0.406
Epoch: 055 train_loss=0.340 eval_loss=0.404
Epoch: 060 train_loss=0.339 eval_loss=0.401
Epoch: 065 train_loss=0.339 eval_loss=0.397
Epoch: 070 train_loss=0.344 eval_loss=0.415
Epoch: 075 train_loss=0.338 eval_loss=0.442
Epoch: 080 train_loss=0.337 eval_loss=0.455
Epoch: 085 train_loss=0.337 eval_loss=0.464
Epoch: 090 train_loss=0.337 eval_loss=0.464
Epoch: 095 train_loss=0.340 eval_loss=0.470
Epoch: 100 train_loss=0.340 eval_loss=0.456
Epoch: 105 train_loss=0.345 eval_loss=0.451
Epoch: 110 train_loss=0.336 eval

(0.33330241501331331, 0.43034416)

In [56]:
X.shape

(11229, 29)

In [124]:
ann.layers

{'l_0': <tf.Tensor 'add:0' shape=(?, 32) dtype=float32>}

In [125]:
ann.weights

{'w_0': <tf.Variable 'w_0:0' shape=(28, 32) dtype=float32_ref>,
 'w_out': <tf.Variable 'w_out:0' shape=(32, 2) dtype=float32_ref>}

In [216]:
## Random search

In [272]:
from src.models.random_search import RandomSearch

In [273]:
rs = RandomSearch('nn', ANN, X, Y, X_eval, Y_eval, iterations=50)

In [None]:
performance = rs.search()

{'input_dim': 29, 'eta': 0.014844844844844845, 'dropout': 0.84684684684684686, 'hidden_units': [37, 113], 'batch_size': 108, 'num_epochs': 297, 'train_loss': 0.10907566712962258, 'eval_loss': 0.69390225}
{'input_dim': 29, 'eta': 0.045035035035035038, 'dropout': 0.03403403403403403, 'hidden_units': [107, 24], 'batch_size': 112, 'num_epochs': 401, 'train_loss': 1.1808460525103979, 'eval_loss': 7.878624}
{'input_dim': 29, 'eta': 0.034304304304304305, 'dropout': 0.073073073073073078, 'hidden_units': [24, 5], 'batch_size': 42, 'num_epochs': 429, 'train_loss': 0.72686848895890377, 'eval_loss': 0.69295365}
{'input_dim': 29, 'eta': 0.040070070070070067, 'dropout': 0.79379379379379378, 'hidden_units': [92], 'batch_size': 24, 'num_epochs': 359, 'train_loss': 25.268472939729698, 'eval_loss': 8.2394772}
{'input_dim': 29, 'eta': 0.021771771771771774, 'dropout': 0.24224224224224225, 'hidden_units': [42, 66], 'batch_size': 54, 'num_epochs': 223, 'train_loss': 0.43653480101514741, 'eval_loss': 0.69343

In [None]:
performance.head()