In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import log_loss
from sklearn.metrics import r2_score
from sklearn.utils import resample 
from numpy.random import seed
import warnings; warnings.simplefilter('ignore')
import importlib
# Let pandas display up to 999 columns before it starts truncating frames.
pd.options.display.max_columns = 999

In [None]:
# Project-local helper module used by the cells below.
import bball_functions
# Re-import the module so that edits to its source are picked up without
# restarting the kernel; as the last expression of the cell, the reloaded
# module object is also what the cell displays.
importlib.reload(bball_functions)

In [None]:
# Directory (with trailing slash) holding the input CSV files.
data_dir = 'DataFiles/'

# Tournament seeds and detailed tournament game results.
dfs = pd.read_csv(f'{data_dir}NCAATourneySeeds.csv')
dft = pd.read_csv(f'{data_dir}NCAATourneyDetailedResults.csv')

# The full game set is just the tournament results; dfa is a second name
# for the same frame, not a copy.
dfa = dft

# Drop seeds from seasons earlier than the first season present in the games.
min_season = dfa['Season'].min()
dfs = dfs.loc[dfs['Season'] >= min_season]

# Value forwarded as mod_type to the bball_functions helpers.
model_type = 'gbm'

In [None]:
# Team-level frame computed from a copy of the game results.
df_teams = bball_functions.team_stats(dfa.copy())

# Training frame assembled from the season/team/score columns, the seeds and
# the team-level frame; copies are passed so the loaded frames stay untouched.
df_train = bball_functions.build_data(
    dfa.loc[:, ['Season', 'WTeamID', 'LTeamID', 'WScore', 'LScore']].copy(),
    dfs.copy(),
    df_teams,
)

In [None]:
# K-fold cross-validation of the bootstrapped model ensemble.
# Seeding numpy's global RNG fixes the shuffled fold assignment, because KFold
# is created without its own random_state.
seed(12358)
n_folds = 5
kf = KFold(n_splits=n_folds, shuffle=True)
all_models, all_scores = [], []
for fold_no, (train_idx, test_idx) in enumerate(kf.split(df_train), start=1):
    print('Running Fold', fold_no, "/", n_folds)
    models = None
    models, scores, score2s = bball_functions.bootstrap(df_train.iloc[train_idx], mod_type=model_type)

    # Held-out rows: column 2 is the target, columns 3+ are the model inputs.
    held_out = df_train.iloc[test_idx]
    fold_preds = np.vstack([m.predict(held_out.iloc[:, 3:]) for m in models]).mean(axis=0)
    # Keep the ensemble average away from 0 and 1 before scoring with log loss.
    fold_preds = np.clip(fold_preds, 0.02, 0.98)

    # A positive target counts as class 1, anything else as class 0.
    score = log_loss((held_out.iloc[:, 2] > 0).astype(int), fold_preds, labels=[0, 1])
    all_scores.append(score)
    print('Mean R2: ', np.mean(score2s))
    print('Log loss: ', score)
    all_models.extend(models)

In [None]:
# Report the log loss returned by year_score for each of the 2014-2017 seasons.
seed(12358)
for season in (2014, 2015, 2016, 2017):
    season_loss = bball_functions.year_score(dfa.copy(), dfs.copy(), season, mod_type=model_type)
    print(f'{season} Log Loss: ', season_loss)

In [None]:
# Sample submission file: supplies the rows that need a prediction.
df_sample_sub = pd.read_csv(f'{data_dir}SampleSubmissionStage1.csv')

# Team-level frame recomputed from the game results for the final fit.
df_teams_fin = bball_functions.team_stats(dfa.copy())

# Frame to predict on, built from the sample submission, the seeds and the
# team-level frame.
output = bball_functions.generate_output_df(
    df_sample_sub.copy(),
    dfs.copy(),
    df_teams_fin.copy(),
)

In [None]:
# Final fit: train the bootstrapped ensemble on the full training frame and
# predict every row of the submission frame.
seed(12358)

# Keep the frame returned by build_data; previously the return value was
# discarded and the final team statistics never reached the training step.
df_train_fin = bball_functions.build_data(
    dfa[['Season', 'WTeamID', 'LTeamID', 'WScore', 'LScore']].copy(),
    dfs.copy(),
    df_teams_fin,
)
final_models, final_scores, final_score2s = bball_functions.bootstrap(df_train_fin, mod_type=model_type)

# Predict with the models trained just above. The earlier code iterated over
# `models`, the leftover ensemble from the last cross-validation fold, which
# was fitted on only part of the data and made the final fit pointless.
final_predictions = np.vstack([m.predict(output.iloc[:, 3:]) for m in final_models])

# Average the ensemble, then keep the values away from 0 and 1 so a confident
# miss cannot blow up the log loss.
final_total_predictions = final_predictions.mean(axis=0)
final_total_predictions = np.clip(final_total_predictions, 0.025, 0.975)
df_sample_sub['Pred'] = final_total_predictions

In [None]:
# Write the submission (sample rows plus the Pred column) without the index.
df_sample_sub.to_csv(path_or_buf='bootstrap_gbm_1.csv', index=False)