In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV
from subprocess import check_output
import re

In [None]:
# Directory the CSV inputs are read from (relative to the notebook).
data_dir = 'DataFiles/'
seeds_csv = data_dir + 'NCAATourneySeeds.csv'
tourney_csv = data_dir + 'NCAATourneyDetailedResults.csv'

# Load the seeds table and the detailed tournament results table.
df_seeds = pd.read_csv(seeds_csv)
df_tour = pd.read_csv(tourney_csv)

# Show (rows, columns) of the results table as the cell output.
df_tour.shape

In [None]:
# Build one record per (game, team): the winner-side columns and the loser-side
# columns of every game are renamed to a shared, prefix-free schema, stacked,
# and then averaged per team.

# Winner-side columns are the ones carrying the 'W' prefix.
w_keeps = [col for col in df_tour.columns if col.startswith('W')]

# Prefix-free names shared by both sides (e.g. 'WTeamID' -> 'TeamID').
# Other cells slice this list as new_names[3:], so its order must be preserved.
new_names = [name[1:] for name in w_keeps]

# Loser-side counterpart of every winner column, in the same order. A 'W'
# column that has no 'L' twin is reused as-is so that both selections always
# line up with `new_names`. (The previous substring test `'L' in col` only
# matched up by coincidence -- presumably via a column such as 'WLoc'; verify
# against the CSV schema.)
l_keeps = [
    'L' + base if ('L' + base) in df_tour.columns else 'W' + base
    for base in new_names
]

w_tour = df_tour[w_keeps].copy()
w_tour.columns = new_names
l_tour = df_tour[l_keeps].copy()
l_tour.columns = new_names

stack_tour = pd.concat([w_tour, l_tour])

# numeric_only=True: non-numeric columns cannot be averaged; pandas >= 2.0
# raises a TypeError instead of silently dropping them.
team_means = stack_tour.groupby(['TeamID'], as_index=False).mean(numeric_only=True)
team_means.head()

In [None]:
def seed_to_int(seed):
    """Return the integer formed by characters 1 and 2 of `seed`.

    The first character and anything after the third are ignored,
    e.g. 'W01' -> 1 and 'X16a' -> 16.
    """
    return int(seed[1:3])
# Derive the numeric seed, then discard the original string label.
df_seeds['seed_int'] = df_seeds['Seed'].apply(seed_to_int)
df_seeds = df_seeds.drop(columns=['Seed'])
df_seeds.head()

In [None]:
# Minimal per-game frame: the season plus the winning and losing team ids.
game_key_cols = ['Season', 'WTeamID', 'LTeamID']
df_min = df_tour[game_key_cols]
df_min.head()

In [None]:
def _team_features(seeds, means, prefix):
    """Join per-team mean stats onto the seeds table and prefix every column."""
    merged = pd.merge(left=seeds, right=means, how='left', on=['TeamID'])
    # 'Score' is not used as a feature. Because this is a left join, teams that
    # are absent from `means` keep NaN in the stat columns.
    merged = merged.drop(columns=['Score'])
    merged.columns = [prefix + col for col in merged.columns]
    return merged


# Winner-side and loser-side feature tables ('W...' / 'L...' column names).
df_wins = _team_features(df_seeds, team_means, 'W')
df_w_names = list(df_wins.columns)
df_losses = _team_features(df_seeds, team_means, 'L')
df_l_names = list(df_losses.columns)

# Attach the winner's features (left join), then the loser's (inner join).
df_dummy = df_min.merge(
    df_wins, how='left',
    left_on=['Season', 'WTeamID'], right_on=['WSeason', 'WTeamID'],
)
df_concat = df_dummy.merge(
    df_losses,
    left_on=['Season', 'LTeamID'], right_on=['LSeason', 'LTeamID'],
)

# Winner-minus-loser differences: seed first, then every stat in order.
df_concat['SeedDiff'] = df_concat['Wseed_int'] - df_concat['Lseed_int']
stat_names = new_names[3:]
for stat in stat_names:
    df_concat[stat + 'Diff'] = df_concat['W' + stat] - df_concat['L' + stat]

# Drop the raw per-side columns and the join keys now that the diffs exist.
consumed_cols = (
    ['W' + stat for stat in stat_names]
    + ['L' + stat for stat in stat_names]
    + ['Season', 'WSeason', 'LSeason', 'Wseed_int', 'Lseed_int']
)
df_concat = df_concat.drop(columns=consumed_cols)
df_concat.head()


In [None]:
# Winner's point of view: keep the diffs as computed and label the row 1.
df_win_pred = (
    df_concat
    .drop(columns=['LTeamID'])
    .rename(columns={'WTeamID': 'TeamID'})
)
df_win_pred['result'] = 1

# Loser's point of view: the same games with every diff sign-flipped, label 0.
df_loss_pred = (
    df_concat
    .drop(columns=['WTeamID'])
    .rename(columns={'LTeamID': 'TeamID'})
)
# Column 0 is TeamID; everything after it is a difference column.
df_loss_pred.iloc[:, 1:] = -df_loss_pred.iloc[:, 1:]
df_loss_pred['result'] = 0

# Stack both views; the team id itself is not a feature.
df_predictions = pd.concat((df_win_pred, df_loss_pred))
df_predictions = df_predictions.drop(columns=['TeamID'])
df_loss_pred.head()

In [None]:
# Display (rows, columns) of the stacked win/loss training frame.
df_predictions.shape

In [None]:
# Features are every column except the label; the label is the 0/1 'result'.
X_train = df_predictions.drop(columns=['result'])
y_train = df_predictions['result'].values

# Shuffle rows so winner-view and loser-view samples are interleaved.
# A fixed random_state keeps the row order (and therefore every downstream
# score) reproducible across Restart & Run All.
X_train, y_train = shuffle(X_train, y_train, random_state=42)

In [None]:
# Grid-search the inverse regularisation strength C over 1e-5 .. 1e3.
logreg = LogisticRegression()
params = {'C': np.logspace(start=-5, stop=3, num=9)}
clf = GridSearchCV(logreg, params, scoring='neg_log_loss', refit=True)
clf.fit(X_train, y_train)

# With scoring='neg_log_loss', best_score_ is the *negated* log loss (higher is
# better), so flip the sign to report the actual log loss under this label.
print('Best log_loss: {:.4}, with best C: {}'.format(-clf.best_score_, clf.best_params_['C']))

In [None]:
# Load the sample submission; one row per matchup to predict.
sample_sub_csv = data_dir + 'SampleSubmissionStage1.csv'
df_sample_sub = pd.read_csv(sample_sub_csv)
n_test_games = df_sample_sub.shape[0]

def get_year_t1_t2(ID):
    """Return a tuple with ints `year`, `team1` and `team2`.

    `ID` is an underscore-separated string such as '2014_1107_1110'; every
    field is converted with `int`, so a non-numeric field raises ValueError.
    """
    # Materialise as a real tuple (the docstring's contract) rather than a
    # one-shot generator, so the result can be indexed, compared and reused.
    return tuple(int(x) for x in ID.split('_'))

In [None]:
# Only the last expression of a cell is rendered, so a bare
# `df_sample_sub.head()` on its own line would be silently discarded.
# `display` is provided by the IPython kernel and renders both previews.
display(df_sample_sub.head(), df_concat.head())

In [None]:
# Build the test feature matrix with the same column layout as X_train:
# column 0 is the seed difference, columns 1.. are the per-stat mean
# differences (team1 minus team2) in the order of new_names[3:].
feature_names = new_names[3:]

# Index the lookup tables once instead of boolean-masking the full frames for
# every feature of every row.
seed_lookup = df_seeds.set_index(['Season', 'TeamID'])['seed_int'].sort_index()
stats_lookup = team_means.set_index('TeamID')[feature_names]

X_test = np.zeros(shape=(n_test_games, X_train.shape[1]))
# enumerate() gives a true positional row number; the index label yielded by
# iterrows() is only positional when the frame has a default RangeIndex.
for ii, game_id in enumerate(df_sample_sub['ID']):
    year, t1, t2 = get_year_t1_t2(game_id)
    # A missing (season, team) pair or team id raises KeyError here.
    X_test[ii, 0] = seed_lookup.loc[(year, t1)] - seed_lookup.loc[(year, t2)]
    # Row-wise Series subtraction replaces float(<single-element Series>),
    # which is deprecated in recent pandas releases.
    X_test[ii, 1:] = (stats_lookup.loc[t1] - stats_lookup.loc[t2]).to_numpy()

In [None]:
# Column 1 of predict_proba is the probability of the second class in
# clf.classes_, i.e. result == 1 for the 0/1 labels used in training.
preds = clf.predict_proba(X_test)[:, 1]

# Keep predictions away from 0 and 1 so a confident miss cannot blow up log loss.
clipped_preds = np.clip(preds, 0.05, 0.95)

# Item assignment always writes a column; attribute assignment (`df.Pred = ...`)
# would only set a Python attribute if the 'Pred' column were ever missing.
df_sample_sub['Pred'] = clipped_preds
df_sample_sub.head()

In [None]:
# Write the current contents of df_sample_sub to CSV, without the index column.
df_sample_sub.to_csv('logreg_extra_vars.csv', index=False)

In [None]:
import keras
from keras.layers import Dense
from keras import Sequential
from keras.callbacks import EarlyStopping
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

In [None]:
# Two-column target for the softmax output: column 0 is 'win', column 1 is
# its complement 'lose'.
y_class = pd.DataFrame({'win': y_train, 'lose': 1 - y_train})

# Early-stopping callback with a patience of 3 epochs.
early_stopping_monitor = EarlyStopping(patience=3)

def create_model():
    """Build and compile a fresh feed-forward classifier.

    The architecture is three ReLU hidden layers (30, 50, 30 units) followed
    by a 2-unit softmax output, compiled with the Adam optimizer and
    categorical cross-entropy loss. The input width is read from the
    module-level `X_train`.

    Returns:
        The compiled, untrained `Sequential` model.
    """
    model = Sequential()
    # Only the first layer declares the input shape; hidden layers take their
    # input size from the previous layer, so repeating it there has no effect.
    model.add(Dense(30, activation='relu', input_shape=(X_train.shape[1],)))
    model.add(Dense(50, activation='relu'))
    model.add(Dense(30, activation='relu'))
    model.add(Dense(2, activation='softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy')
    return model


# Stratified k-fold cross-validation: train one network per fold, score it on
# the held-out fold, and keep every trained model for later ensembling.
n_folds = 10
# A fixed random_state makes the fold assignment reproducible between runs.
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
models = []
scores = []
# Stratify on the 'win' column; `train` / `test` are positional row indices.
for i, (train, test) in enumerate(skf.split(X_train, y_class.win)):
    print('Running Fold', i+1, "/", n_folds)
    model = create_model()
    model.fit(X_train.iloc[train,], y_class.iloc[train,],
              epochs=20,
              validation_data=(X_train.iloc[test,], y_class.iloc[test,]),
              callbacks=[early_stopping_monitor],
              verbose=0)
    predictions_valid = model.predict(X_train.iloc[test,].astype('float32'), verbose=2)
    # Log loss of the held-out fold against the two-column (win, lose) target.
    score = log_loss(y_class.iloc[test,], predictions_valid)
    scores.append(score)
    print('Log loss: ', score)
    models.append(model)
print('Average log loss: ',np.mean(scores))

In [None]:
# Output column 0 corresponds to 'win' (the first column of y_class), so take
# it from every fold model and average across the ensemble.
pred_list = [fold_model.predict(X_test)[:, 0] for fold_model in models]
pred = np.mean(pred_list, 0)

# Keep predictions away from 0 and 1 so a confident miss cannot blow up log loss.
clipped_pred = np.clip(pred, 0.05, 0.95)

# Item assignment always writes a column; attribute assignment (`df.Pred = ...`)
# would only set a Python attribute if the 'Pred' column were ever missing.
df_sample_sub['Pred'] = clipped_pred
df_sample_sub.head()

In [None]:
# Write the current contents of df_sample_sub to CSV, without the index column.
df_sample_sub.to_csv('nn_extra_vars.csv', index=False)