In [1]:
import pandas as pd #dataframes
import matplotlib.pyplot as plt 
import numpy as np # n-dim object support
# do ploting inline instead of in a separate window
%matplotlib inline
import random

In [2]:
df_school = pd.read_csv("../Scraper/school_records.csv")
df_ps_game = pd.read_csv("../Scraper/post_season_game_records.csv")

In [3]:
df_school.shape

(3125, 21)

In [4]:
df_ps_game.shape

(441, 7)

In [5]:
df_school.head(5)

Unnamed: 0,year,team_name,fg_pg,ft_pg,three_pt_pg,orb_pg,drb_pg,ast_pg,stl_pg,blk_pg,...,pf_pg,pt_pg,opnt_pt_pg,fg_pct,three_p_pct,ft_pct,wl_pct,conf_wl_pct,srs,sos
0,2010.0,Air Force,20.387097,10.741935,5.677419,7.096774,27.222685,12.548387,5.0,1.645161,...,17.645161,57.193548,63.129032,0.443,0.313,0.635,0.323,0.0625,-4.9,3.13
1,2010.0,Akron,25.057143,13.8,6.714286,13.342857,35.875918,13.514286,6.085714,3.257143,...,19.485714,70.628571,65.514286,0.433,0.339,0.657,0.686,0.75,2.82,-1.5
2,2010.0,Alabama A&M,22.185185,17.481481,4.0,13.925926,36.66941,10.666667,9.222222,5.296296,...,20.37037,65.851852,69.666667,0.382,0.291,0.635,0.407,0.444444,-20.19,-13.71
3,2010.0,Alabama-Birmingham,22.441176,16.852941,5.205882,12.352941,36.342561,11.470588,6.558824,2.676471,...,17.970588,66.941176,60.382353,0.422,0.311,0.694,0.735,0.6875,9.46,2.9
4,2010.0,Alabama State,21.516129,15.290323,6.129032,12.903226,35.099896,12.903226,7.354839,4.16129,...,20.451613,64.451613,65.903226,0.404,0.324,0.641,0.516,0.666667,-14.41,-12.02


In [6]:
df_ps_game.head(5)

Unnamed: 0,year,team_1_name,team_1_score,team_1_seed,team_2_name,team_2_score,team_2_seed
0,2011,UTSA,46.0,16.0,Ohio State,75.0,1.0
1,2011,George Mason,61.0,8.0,Villanova,57.0,9.0
2,2011,Clemson,76.0,12.0,West Virginia,84.0,5.0
3,2011,Princeton,57.0,13.0,Kentucky,59.0,4.0
4,2011,Marquette,66.0,11.0,Xavier,55.0,6.0


In [7]:
df_school.isnull().values.any()

False

In [8]:
df_ps_game.isnull().values.any()

False

In [9]:
def resolve_team_name(team_name):
    """Translate a known alternate/abbreviated school name to its corrected spelling.

    The corrections are hard-coded below. Any name without an entry is
    returned unchanged.
    """
    corrections = {
        'Colorado-Colorado Springs': 'Colorado',
        'Colorado College': 'Colorado',
        'UNC': 'North Carolina',
        'UConn': 'Connecticut',
        'LIU-Brooklyn': 'Long Island University',
        'UTSA': 'Texas-San Antonio',
        'Pitt': 'Pittsburgh',
        'BYU': 'Brigham Young',
        "St. Peter's": "Saint Peter's",
        'VCU': 'Virginia Commonwealth',
        'Southern Miss': 'Southern Mississippi',
        'Detroit': 'Detroit Mercy',
        'UNLV': 'Nevada-Las Vegas',
        'Ole Miss': 'Mississippi',
        "St. Joseph's": "Saint Joseph's",
        'UCSB': 'UC-Santa Barbara',
        'SMU': 'Southern Methodist',
        'USC': 'South Carolina',
        'LSU': 'Louisiana State',
        'UMass': 'Massachusetts',
        'ETSU': 'East Tennessee State',
    }
    # TODO: for V2 add more corrections to the table above
    return corrections.get(team_name, team_name)
def get_school_stats(year, team_name):
    """Return the rows of the module-level ``df_school`` frame for one team in one year.

    The result is a DataFrame that is empty when no row matches; callers use
    ``len`` on it to detect a missing team.
    """
    is_year = df_school['year'] == year
    is_team = df_school['team_name'] == team_name
    return df_school[is_year & is_team]
# Per-team columns copied into every matchup row, in output column order.
# The three dict builders below share this list so they cannot drift apart.
_TEAM_STAT_COLUMNS = (
    'team_name', 'fg_pg', 'ft_pg',
    'three_pt_pg', 'orb_pg', 'drb_pg',
    'ast_pg', 'stl_pg', 'blk_pg',
    'tov_pg', 'pf_pg', 'pt_pg',
    'opnt_pt_pg', 'fg_pct', 'three_p_pct',
    'ft_pct', 'wl_pct', 'conf_wl_pct',
    'srs', 'sos',
)


def get_vals(t_stats_list, key):
    """Return ``t_stat[key].squeeze()`` for every frame in ``t_stats_list``.

    For the single-row frames produced by ``get_school_stats`` the squeeze
    collapses the one-element column to a scalar.
    """
    return [t_stat[key].squeeze() for t_stat in t_stats_list]


def _team_columns(t_stats, suffix):
    """Build ``{'<stat>_<suffix>': [values...]}`` for every column in ``_TEAM_STAT_COLUMNS``."""
    return {'{0}_{1}'.format(col, suffix): get_vals(t_stats, col)
            for col in _TEAM_STAT_COLUMNS}


def get_team_stats_dict(t1_stats, t2_stats):
    """Return the column dict for a list of matchups.

    Parameters
    ----------
    t1_stats, t2_stats : list of DataFrame
        One stats frame per matchup for team 1 and team 2 respectively.

    Returns
    -------
    dict
        All team-1 columns (suffix ``_1``) followed by all team-2 columns
        (suffix ``_2``); insertion order is the resulting DataFrame column order.
    """
    stats = _team_columns(t1_stats, 1)
    stats.update(_team_columns(t2_stats, 2))
    return stats


def get_team_stats_dict_with_t1_win(t1_stats, t2_stats, t1_wins):
    """Same as ``get_team_stats_dict`` with a trailing ``t1_win`` column holding ``t1_wins``."""
    stats = get_team_stats_dict(t1_stats, t2_stats)
    stats['t1_win'] = t1_wins
    return stats


def get_team_stats_dict_ps(t1_stats, t2_stats, t1_seeds, t2_seeds):
    """Same as ``get_team_stats_dict`` with trailing ``team_1_seed`` / ``team_2_seed`` columns."""
    stats = get_team_stats_dict(t1_stats, t2_stats)
    stats['team_1_seed'] = t1_seeds
    stats['team_2_seed'] = t2_seeds
    return stats
def create_team_stats_df_w_t1_win(indeces_w_stats, t1_stats_list, t2_stats_list,t1_wins):
    """Build the matchup DataFrame, including the column for whether team 1 wins.

    Assumes all list arguments have the same length; ``indeces_w_stats`` is
    used as the index of the resulting frame.
    """
    columns = get_team_stats_dict_with_t1_win(t1_stats_list, t2_stats_list, t1_wins)
    return pd.DataFrame(columns, index=indeces_w_stats)
def create_team_stats_df(indeces_w_stats, t1_stats_list, t2_stats_list):
    """Build the matchup DataFrame from the two per-team stats lists.

    Assumes all list arguments have the same length; ``indeces_w_stats`` is
    used as the index of the resulting frame.
    """
    columns = get_team_stats_dict(t1_stats_list, t2_stats_list)
    return pd.DataFrame(columns, index=indeces_w_stats)
def create_team_stats_df_ps(indeces_w_stats, t1_stats_list, t2_stats_list, t1_seeds, t2_seeds):
    """Build the post-season matchup DataFrame, which also carries both teams' seeds.

    Assumes all list arguments have the same length; ``indeces_w_stats`` is
    used as the index of the resulting frame.
    """
    columns = get_team_stats_dict_ps(t1_stats_list, t2_stats_list, t1_seeds, t2_seeds)
    return pd.DataFrame(columns, index=indeces_w_stats)
def get_team_stats_df(game_df, should_print=False):
    """Attach both teams' school stats to every game for which both can be found.

    Parameters
    ----------
    game_df : DataFrame
        Needs the columns ``year``, ``team_1_name``, ``team_2_name``,
        ``team_1_score`` and ``team_2_score``.
    should_print : bool
        When true, print the year and the unmatched team name(s) for every
        game that is skipped.

    Returns
    -------
    DataFrame
        One row per kept game (indexed like ``game_df``) with the team stats
        and a ``t1_win`` column. The number of kept games is always printed.
    """
    kept_indices, first_team_stats, second_team_stats, first_team_won = [], [], [], []
    for idx, game in game_df.iterrows():
        season = game['year']
        name_1, name_2 = game['team_1_name'], game['team_2_name']
        score_1, score_2 = game['team_1_score'], game['team_2_score']
        stats_1 = get_school_stats(season, resolve_team_name(name_1))
        stats_2 = get_school_stats(season, resolve_team_name(name_2))

        missing_1 = len(stats_1) < 1
        missing_2 = len(stats_2) < 1
        if missing_1 or missing_2:
            # Skip games where either side has no stats row, optionally reporting which.
            if should_print:
                print(season)
                if missing_1:
                    print(name_1)
                if missing_2:
                    print(name_2)
            continue

        kept_indices.append(idx)
        first_team_stats.append(stats_1)
        second_team_stats.append(stats_2)
        first_team_won.append(score_1 > score_2)

    print(len(kept_indices))
    return create_team_stats_df_w_t1_win(kept_indices, first_team_stats, second_team_stats, first_team_won)
        

In [10]:
ps_team_stats_df = get_team_stats_df(df_ps_game, True)

441


In [11]:
ps_game_w_team_stats = pd.concat([df_ps_game, ps_team_stats_df], axis=1, join='inner')

In [12]:
ps_game_w_team_stats.head(3)

Unnamed: 0,year,team_1_name,team_1_score,team_1_seed,team_2_name,team_2_score,team_2_seed,team_name_1,fg_pg_1,ft_pg_1,...,pt_pg_2,opnt_pt_pg_2,fg_pct_2,three_p_pct_2,ft_pct_2,wl_pct_2,conf_wl_pct_2,srs_2,sos_2,t1_win
0,2011,UTSA,46.0,16.0,Ohio State,75.0,1.0,Texas-San Antonio,23.588235,16.058824,...,77.135135,59.675676,0.494,0.423,0.701,0.919,0.888889,25.84,8.38,False
1,2011,George Mason,61.0,8.0,Villanova,57.0,9.0,George Mason,25.764706,14.558824,...,72.242424,65.424242,0.438,0.348,0.757,0.636,0.5,15.05,8.23,True
2,2011,Clemson,76.0,12.0,West Virginia,84.0,5.0,Clemson,23.823529,14.5,...,69.787879,64.666667,0.429,0.337,0.711,0.636,0.611111,16.15,11.03,False


In [13]:
ps_game_w_team_stats.shape

(441, 48)

## Check team 1 winning true/false ratio

In [14]:
# Encode the outcome as 1/0. The mapping is skipped when the column is already an
# integer column, so re-running this cell cannot re-map (and possibly blank out)
# values that were encoded by a previous run.
t1_win_map = {True:1, False:0}
if ps_game_w_team_stats['t1_win'].dtype.kind != 'i':
    ps_game_w_team_stats['t1_win'] = ps_game_w_team_stats['t1_win'].map(t1_win_map)
# Count against the encoded values rather than True/False.
num_true = int((ps_game_w_team_stats['t1_win'] == 1).sum())
num_false = int((ps_game_w_team_stats['t1_win'] == 0).sum())
num_games = num_true + num_false
print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true/num_games)*100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_games)*100))

Number of True cases: 231 (52.38%)
Number of False cases: 210 (47.62%)


In [15]:
import sklearn.model_selection
# Model input columns: both seeds, then the per-team stat columns
# (suffix _1 = team 1, suffix _2 = team 2). The order of this list is the
# column order of the feature matrix built from it.
ps_feature_col_names = ['team_1_seed', 'team_2_seed','fg_pg_1','ft_pg_1',
            'three_pt_pg_1','orb_pg_1','drb_pg_1',
            'ast_pg_1','stl_pg_1','blk_pg_1',
            'tov_pg_1','pf_pg_1', 'pt_pg_1',
            'opnt_pt_pg_1','fg_pct_1','three_p_pct_1',
            'ft_pct_1','wl_pct_1','conf_wl_pct_1',
            'srs_1','sos_1',
            'fg_pg_2','ft_pg_2',
            'three_pt_pg_2','orb_pg_2','drb_pg_2',
            'ast_pg_2','stl_pg_2','blk_pg_2',
            'tov_pg_2','pf_pg_2', 'pt_pg_2',
            'opnt_pt_pg_2','fg_pct_2','three_p_pct_2',
            'ft_pct_2','wl_pct_2','conf_wl_pct_2',
            'srs_2','sos_2'
            ]
# Target column: the encoded "team 1 won" flag.
ps_predict_class_names = ['t1_win']




In [16]:
def scale_features(data, col_names):
    """Standardise the given columns of ``data`` in place (zero mean, unit variance).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are overwritten with their standardised values.
    col_names : iterable of str
        Columns to standardise.

    Returns
    -------
    dict
        ``{col_name: [mean, std]}`` with the values that were actually used, so
        the same transform can be re-applied to other data. ``std`` is the
        population standard deviation (NumPy default, ``ddof=0``). For a
        constant column the stored ``std`` is ``1.0``: the column becomes all
        zeros instead of NaN/inf from a division by zero.
    """
    scaled_features = {}
    for col_name in col_names:
        values = data[col_name].values
        mean, std = values.mean(), values.std()
        if std == 0:
            # Constant column: avoid 0/0; centring alone already yields zeros.
            std = 1.0
        scaled_features[col_name] = [mean, std]
        # Replace the column rather than writing through .loc, so an integer
        # column can become float without an in-place dtype-changing set.
        data[col_name] = (values - mean) / std
    return scaled_features


In [35]:
# Standardise the features in place and keep the per-column [mean, std], so the same
# transform can be applied to any data scored later (for example the bracket matchups).
# NOTE(review): the statistics are computed on the full data set before it is split,
# so test/validation rows influence the scaling — consider fitting on the training rows only.
ps_feature_scaling = scale_features(ps_game_w_team_stats, ps_feature_col_names)
ps_x = ps_game_w_team_stats[ps_feature_col_names].values
ps_y = ps_game_w_team_stats[ps_predict_class_names].values
print(type(ps_x))
# First hold out 25% as the test set ...
split_test_size = 0.25
ps_x_train, ps_x_test, ps_y_train, ps_y_test = sklearn.model_selection.train_test_split(ps_x, ps_y, test_size=split_test_size, random_state=42)
# ... then split a third of the remainder off as the validation set.
split_valid_size = 0.333
ps_x_train, ps_x_val, ps_y_train, ps_y_val = sklearn.model_selection.train_test_split(ps_x_train, ps_y_train, test_size=split_valid_size, random_state=1)


<class 'numpy.ndarray'>


In [36]:
# Report the share of games that ended up in each split.
total_games = len(ps_game_w_team_stats.index)
print("{0:0.2f}% in training set".format((len(ps_x_train)/total_games)*100))
print("{0:0.2f}% in test set".format((len(ps_x_test)/total_games)*100))
# This line reports the validation split; it was previously mislabelled "test set".
print("{0:0.2f}% in validation set".format((len(ps_x_val)/total_games)*100))

49.89% in training set
25.17% in test set
24.94% in test set


In [37]:
import sklearn.impute

# Impute exact-zero readings with the column mean.
# The imputer is fitted on the training split only and then re-used for the test and
# validation splits, so held-out rows never contribute to the imputation statistics.
# NOTE(review): the features were already standardised in an earlier cell, so a 0 here
# means "equal to the column mean" rather than a missing raw reading — confirm that
# this imputation step is still wanted at this point.
fill_0 = sklearn.impute.SimpleImputer(missing_values=0, strategy="mean")

ps_x_train = fill_0.fit_transform(ps_x_train)
ps_x_test = fill_0.transform(ps_x_test)
ps_x_val = fill_0.transform(ps_x_val)
# TODO: impute incorrect negative values, i.e. negatives in any column other than SOS and SRS

In [20]:
print(len(ps_x_train[0]))

40


In [21]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
dropout = 0.3
# Size the input layer from the training matrix instead of hard-coding the feature count.
n_features = ps_x_train.shape[1]
# Three hidden ReLU layers with dropout, followed by a single sigmoid output.
model = nn.Sequential(nn.Linear(n_features, 30),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(30, 30),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(30, 10),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(10, 1),
                     nn.Sigmoid())
trainset = TensorDataset(torch.from_numpy(ps_x_train).float(), torch.from_numpy(ps_y_train).float())
trainloader = DataLoader(trainset, batch_size=3, shuffle=True)
# Evaluation loaders are not shuffled: the reported loss is an average over batches,
# so a fixed batch composition keeps it reproducible between runs.
testset = TensorDataset(torch.from_numpy(ps_x_test).float(), torch.from_numpy(ps_y_test).float())
testloader = DataLoader(testset, batch_size=3, shuffle=False)
valset = TensorDataset(torch.from_numpy(ps_x_val).float(), torch.from_numpy(ps_y_val).float())
valloader = DataLoader(valset, batch_size=3, shuffle=False)
print(next(iter(testloader)))

[tensor([[-0.8663, -1.2717,  1.0014, -0.4597,  1.3055, -0.4140,  0.4707,  0.8538,
         -0.2324,  2.0430, -0.0753, -0.9153,  0.9982,  0.1564,  0.9477,  0.7227,
          0.0420,  1.0910,  1.2749,  0.8389,  0.5079,  2.0011, -0.1542,  1.5838,
          0.3544,  1.0425,  1.1607,  0.2799,  0.3289,  0.6034,  0.2011,  1.9187,
          1.5560,  1.2947,  1.6882, -0.9417,  1.1696,  1.2119,  1.2497,  1.3320],
        [-0.8663,  1.6681, -0.9790,  0.2877, -2.0177,  0.3602, -1.2628, -0.2302,
         -0.1468, -0.4616,  1.1402, -0.2907, -1.2397, -0.5344, -0.1032, -2.3768,
          0.5640, -0.0244,  0.4275, -0.0752,  0.5506, -0.6995,  0.7169,  0.5353,
         -1.3976, -1.6311, -0.1627, -0.8204, -1.0138, -0.6475,  0.1611, -0.1383,
         -0.5670,  0.0914,  0.3067,  2.5229,  0.1497,  1.6310, -1.1775, -2.1604],
        [-0.6509,  1.4420, -1.8601,  0.9976,  0.3973, -1.3084, -0.1459, -1.9353,
         -1.4899,  0.2428,  0.3753, -0.3868, -1.0327, -0.3662, -0.8475,  0.5751,
          1.3181,  0.5928

In [22]:
import torch.optim as optim
# Binary cross-entropy on probabilities; pairs with the Sigmoid output layer of the model.
criterion = nn.BCELoss()
# Stochastic gradient descent with a fixed learning rate (no other options are set).
optimizer = optim.SGD(model.parameters(), lr=0.001)

In [23]:
# check if CUDA is available
use_cuda = torch.cuda.is_available()
print ('Using CUDA: {}'.format(use_cuda))
# Move the model to the GPU up front; the train/test helpers move each batch themselves.
if use_cuda:
    model = model.cuda()

Using CUDA: True


In [24]:
def train(n_epochs, train_loader, model, optimizer, criterion, use_cuda, save_path):
    """Train ``model`` for ``n_epochs`` passes over ``train_loader`` and return it.

    Parameters
    ----------
    n_epochs : int
        Number of full passes over ``train_loader``.
    train_loader : iterable
        Yields ``(data, target)`` batches.
    model, optimizer, criterion
        The network, its optimizer and the loss function.
    use_cuda : bool
        When true, the model and every batch are moved to the GPU.
    save_path : str
        The model's ``state_dict`` is written here after every epoch
        (the file is overwritten each time).

    Returns
    -------
    The trained model.

    Notes
    -----
    No validation pass is performed; only the running mean of the training
    loss is tracked and printed.
    """
    # TODO: fix target tensors always zero for some reason
    print_loss_count = 40
    cuda_refresh_count = 5
    if use_cuda:
        model = model.cuda()
    for epoch in range(1, n_epochs + 1):
        # running mean of the batch losses within this epoch
        train_loss = 0.0
        model.train()
        for i, (data, target) in enumerate(train_loader):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            # incremental mean: new_mean = old_mean + (x - old_mean) / n
            train_loss = train_loss + ((1 / (i + 1)) * (loss.item() - train_loss))
            if i % print_loss_count == 0:
                print('Epoch %d, Batch %d Loss %.6f' % (epoch, i + 1, train_loss))
            # only touch the CUDA allocator when the GPU is actually in use
            if use_cuda and i % cuda_refresh_count == 0:
                torch.cuda.empty_cache()
        torch.save(model.state_dict(), save_path)

    # return trained model
    return model

In [25]:
model = train(140, trainloader, model, optimizer, criterion,
                      use_cuda, 'model_nn1.pt')

Epoch 1, Batch 1 Loss 0.656183
Epoch 1, Batch 21 Loss 0.696060
Epoch 1, Batch 41 Loss 0.683744
Epoch 1, Batch 61 Loss 0.695740
Epoch 1, Batch 81 Loss 0.695903
Epoch 1, Batch 101 Loss 0.689590
Epoch 2, Batch 1 Loss 0.670350
Epoch 2, Batch 21 Loss 0.681764
Epoch 2, Batch 41 Loss 0.683111
Epoch 2, Batch 61 Loss 0.686538
Epoch 2, Batch 81 Loss 0.689973
Epoch 2, Batch 101 Loss 0.686561
Epoch 3, Batch 1 Loss 0.752274
Epoch 3, Batch 21 Loss 0.711087
Epoch 3, Batch 41 Loss 0.699204
Epoch 3, Batch 61 Loss 0.699687
Epoch 3, Batch 81 Loss 0.696100
Epoch 3, Batch 101 Loss 0.691675
Epoch 4, Batch 1 Loss 0.588947
Epoch 4, Batch 21 Loss 0.669261
Epoch 4, Batch 41 Loss 0.678836
Epoch 4, Batch 61 Loss 0.688483
Epoch 4, Batch 81 Loss 0.691241
Epoch 4, Batch 101 Loss 0.690208
Epoch 5, Batch 1 Loss 0.659773
Epoch 5, Batch 21 Loss 0.687240
Epoch 5, Batch 41 Loss 0.689732
Epoch 5, Batch 61 Loss 0.695063
Epoch 5, Batch 81 Loss 0.688919
Epoch 5, Batch 101 Loss 0.689564
Epoch 6, Batch 1 Loss 0.722019
Epoch 6, 

Epoch 42, Batch 101 Loss 0.678259
Epoch 43, Batch 1 Loss 0.710078
Epoch 43, Batch 21 Loss 0.682415
Epoch 43, Batch 41 Loss 0.682605
Epoch 43, Batch 61 Loss 0.686627
Epoch 43, Batch 81 Loss 0.681415
Epoch 43, Batch 101 Loss 0.679869
Epoch 44, Batch 1 Loss 0.654303
Epoch 44, Batch 21 Loss 0.676725
Epoch 44, Batch 41 Loss 0.673338
Epoch 44, Batch 61 Loss 0.672356
Epoch 44, Batch 81 Loss 0.672844
Epoch 44, Batch 101 Loss 0.676941
Epoch 45, Batch 1 Loss 0.636916
Epoch 45, Batch 21 Loss 0.676980
Epoch 45, Batch 41 Loss 0.677438
Epoch 45, Batch 61 Loss 0.686261
Epoch 45, Batch 81 Loss 0.685718
Epoch 45, Batch 101 Loss 0.681750
Epoch 46, Batch 1 Loss 0.747935
Epoch 46, Batch 21 Loss 0.678408
Epoch 46, Batch 41 Loss 0.677965
Epoch 46, Batch 61 Loss 0.676074
Epoch 46, Batch 81 Loss 0.676535
Epoch 46, Batch 101 Loss 0.678015
Epoch 47, Batch 1 Loss 0.707553
Epoch 47, Batch 21 Loss 0.687424
Epoch 47, Batch 41 Loss 0.686503
Epoch 47, Batch 61 Loss 0.680731
Epoch 47, Batch 81 Loss 0.679981
Epoch 47, 

Epoch 84, Batch 21 Loss 0.630964
Epoch 84, Batch 41 Loss 0.639658
Epoch 84, Batch 61 Loss 0.652089
Epoch 84, Batch 81 Loss 0.648112
Epoch 84, Batch 101 Loss 0.648277
Epoch 85, Batch 1 Loss 0.563326
Epoch 85, Batch 21 Loss 0.651121
Epoch 85, Batch 41 Loss 0.645383
Epoch 85, Batch 61 Loss 0.645806
Epoch 85, Batch 81 Loss 0.648356
Epoch 85, Batch 101 Loss 0.649668
Epoch 86, Batch 1 Loss 0.605701
Epoch 86, Batch 21 Loss 0.645764
Epoch 86, Batch 41 Loss 0.643341
Epoch 86, Batch 61 Loss 0.638013
Epoch 86, Batch 81 Loss 0.639607
Epoch 86, Batch 101 Loss 0.644252
Epoch 87, Batch 1 Loss 0.658130
Epoch 87, Batch 21 Loss 0.646890
Epoch 87, Batch 41 Loss 0.646056
Epoch 87, Batch 61 Loss 0.642771
Epoch 87, Batch 81 Loss 0.647529
Epoch 87, Batch 101 Loss 0.645295
Epoch 88, Batch 1 Loss 0.615119
Epoch 88, Batch 21 Loss 0.629598
Epoch 88, Batch 41 Loss 0.639721
Epoch 88, Batch 61 Loss 0.648607
Epoch 88, Batch 81 Loss 0.640286
Epoch 88, Batch 101 Loss 0.646607
Epoch 89, Batch 1 Loss 0.688011
Epoch 89, 

Epoch 125, Batch 1 Loss 0.611813
Epoch 125, Batch 21 Loss 0.565911
Epoch 125, Batch 41 Loss 0.548222
Epoch 125, Batch 61 Loss 0.548310
Epoch 125, Batch 81 Loss 0.552598
Epoch 125, Batch 101 Loss 0.556805
Epoch 126, Batch 1 Loss 0.755248
Epoch 126, Batch 21 Loss 0.516311
Epoch 126, Batch 41 Loss 0.556062
Epoch 126, Batch 61 Loss 0.545460
Epoch 126, Batch 81 Loss 0.543198
Epoch 126, Batch 101 Loss 0.552148
Epoch 127, Batch 1 Loss 0.692938
Epoch 127, Batch 21 Loss 0.567518
Epoch 127, Batch 41 Loss 0.541455
Epoch 127, Batch 61 Loss 0.542675
Epoch 127, Batch 81 Loss 0.558530
Epoch 127, Batch 101 Loss 0.556083
Epoch 128, Batch 1 Loss 0.447227
Epoch 128, Batch 21 Loss 0.552419
Epoch 128, Batch 41 Loss 0.556843
Epoch 128, Batch 61 Loss 0.552673
Epoch 128, Batch 81 Loss 0.548692
Epoch 128, Batch 101 Loss 0.547460
Epoch 129, Batch 1 Loss 0.412357
Epoch 129, Batch 21 Loss 0.544755
Epoch 129, Batch 41 Loss 0.566181
Epoch 129, Batch 61 Loss 0.548254
Epoch 129, Batch 81 Loss 0.559784
Epoch 129, Batc

In [26]:
def test(loader, model, criterion, use_cuda, num_classes = 2):
    if use_cuda:
        model = model.cuda()
    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0.
    total = 0.
    model.eval()
    load_iter = iter(loader)
    for i in range(len(loader)):
        data, target = next(load_iter)
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        
        # calculate the loss
        loss = criterion(output, target)
        # update average test loss 
        test_loss = test_loss + ((1 / (i + 1)) * (loss.data - test_loss))
        # compare predictions to true label
        for j, tensor in enumerate(output):
            if (tensor.item() > .5 and target[j] == 1) or (tensor.item() <= .5 and target[j] == 0):
                correct += 1
        total += data.size(0)
       
    print('Test Loss: {:.6f}\n'.format(test_loss))

    print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
        100. * correct / total, correct, total))

In [27]:
test(testloader, model, criterion, use_cuda)

Test Loss: 0.533384


Test Accuracy: 72% (81/111)


In [28]:
# Set up this year's bracket regions.
# Each region is a list of (school name, seed) tuples in first-round pairing order:
# consecutive entries (1st/2nd, 3rd/4th, ...) play each other.
# TODO: automate this with the data received from the scraper
school_names_south = [
    # south region
    ('Virginia', 1),('Maryland-Baltimore County', 16),
    ('Creighton', 8), ('Kansas State', 9),
    ('Kentucky',5), ('Davidson', 12),
    ('Arizona',4), ('Buffalo', 13),
    ('Miami (FL)', 6), ('Loyola (IL)', 11),
    ('Tennessee',3), ('Wright State',14),
    ('Nevada',7),('Texas',10),
    ('Cincinnati',2), ('Georgia State',15)
    ]
school_names_west = [
    # west region
    ('Xavier', 1),('North Carolina Central',16), #or 'Texas Southern',
    ('Missouri', 8),('Florida State', 9),
    ('Ohio State',5), ('South Dakota State', 12),
    ('Gonzaga',4), ('North Carolina-Greensboro',13),
    ('Houston',6),('San Diego State',11),
    ('Michigan', 3),('Montana', 14),
    ('Texas A&M',7),('Providence',10),
    ('North Carolina',2),('Lipscomb',15)
    ]
school_names_east = [
    # east region
    ('Villanova',1),('Long Island University',16), # or 'Radford',
    ('Virginia Tech',8), ('Alabama',9),
    ('West Virginia',5), ('Murray State',12),
    ('Wichita State',4), ('Marshall',13),
    ('Florida',6), ('St. Bonaventure',11), # or 'UCLA',
    ('Texas Tech',3), ('Stephen F. Austin',14),
    ('Arkansas',7), ('Butler',10),
    ('Purdue', 2), ('Cal State Fullerton',15)
    ]
school_names_midwest = [
    # mid-west region
    ('Kansas', 1), ('Pennsylvania',16),
    ('Seton Hall', 8), ('North Carolina State',9),
    ('Clemson', 5), ('New Mexico State',12),
    ('Auburn',4), ('College of Charleston',13),
    ('Texas Christian',6), ('Arizona State',11), # or 'Syracuse',
    ('Michigan State',3), ('Bucknell',14),
    ('Rhode Island',7), ('Oklahoma',10),
    ('Duke', 2), ('Iona' ,15) 
    ]

In [29]:
'''
    Helpers for predicting the winners of individual matchups and of whole sub-brackets (a region or the Final Four).
    To change the predictive model, pass a different model to "evaluate_winner(schools, sub_bracket_name, model, use_cuda)",
    which is called later in the notebook.
'''
def get_matchups_stats(schools, post_season, year=2018):
    """Build the stats frame for the matchups formed by consecutive entries of ``schools``.

    Parameters
    ----------
    schools : sequence of (name, seed)
        Entries 0/1 form the first matchup, 2/3 the second, and so on. The
        length must be a power of two and at least 2.
    post_season : bool
        When true, the seed columns are included in the result.
    year : int
        Year whose school stats are looked up (defaults to 2018, the value
        that used to be hard-coded).

    Returns
    -------
    DataFrame with one row per matchup (indexed 0..n-1), or ``False`` after
    printing an error when the number of schools is invalid.

    NOTE(review): a school with no stats row for ``year`` is not detected
    here — confirm every bracket name matches the school records.
    """
    # A single entry passes the power-of-two check but cannot form a matchup.
    if(not is_power_of_two(len(schools)) or len(schools) < 2):
        print('ERROR: invalid number of school names')
        return False

    t1_stats, t2_stats, t1_seeds, t2_seeds = [], [], [], []
    # Walk the list pairwise: even positions are team 1, odd positions team 2.
    for (t1_name, t1_seed), (t2_name, t2_seed) in zip(schools[0::2], schools[1::2]):
        t1_seeds.append(t1_seed)
        t2_seeds.append(t2_seed)
        t1_stats.append(get_school_stats(year, t1_name))
        t2_stats.append(get_school_stats(year, t2_name))

    matchup_index = range(len(schools) // 2)
    if(post_season):
        matchup_stats = create_team_stats_df_ps(matchup_index, t1_stats, t2_stats, t1_seeds, t2_seeds)
    else:
        matchup_stats = create_team_stats_df(matchup_index, t1_stats, t2_stats)
    return matchup_stats
def is_power_of_two(num):
    """Return whether the integer ``num`` is a power of two (1, 2, 4, 8, ...).

    Zero and negative values are not powers of two.
    """
    # A power of two has exactly one bit set, so clearing the lowest set bit leaves 0.
    lowest_bit_cleared = num & (num - 1)
    single_or_no_bit = lowest_bit_cleared == 0
    return single_or_no_bit and num != 0
def get_matchup_winners(matchup_stats, schools, model, post_season, use_cuda):
    """Predict the winner of every matchup and return the winning ``(name, seed)`` tuples.

    Parameters
    ----------
    matchup_stats : DataFrame
        One row per matchup; must contain every column in ``ps_feature_col_names``.
    schools : sequence of (name, seed)
        The teams behind ``matchup_stats``; entries 2*i and 2*i+1 belong to row i.
    model
        Callable network returning one probability per row that team 1 wins.
    post_season : bool
        Unused; kept so existing calls keep working.
    use_cuda : bool
        When true, the inputs are moved to the GPU before the forward pass.

    Returns
    -------
    list of (name, seed), one per matchup, in matchup order. Each matchup is
    also printed. Team 1 wins when the model output is > 0.5.

    NOTE(review): the training features were standardised by ``scale_features``
    but nothing here applies that transform to ``matchup_stats`` — confirm the
    model is meant to see raw values at prediction time.
    """
    x_tourney = torch.from_numpy(matchup_stats[ps_feature_col_names].values).float()
    if use_cuda:
        x_tourney = x_tourney.cuda()
    # Inference only: switch off dropout and skip building the autograd graph.
    model.eval()
    with torch.no_grad():
        y_tourney = model(x_tourney)

    winners = []
    for matchup_no, y_val in enumerate(y_tourney):
        t1_name, t1_seed = schools[2 * matchup_no]
        t2_name, t2_seed = schools[2 * matchup_no + 1]
        t1_won = y_val.item() > .5
        print(t1_name,t1_seed,' vs. ', t2_name,t2_seed,'(team 1 won=', t1_won,')')
        if(t1_won):
            winners.append((t1_name,t1_seed))
        else:
            winners.append((t2_name, t2_seed))
    return winners
def evaluate_winner(schools,sub_bracket_name, model, use_cuda):
    """Play out a sub-bracket round by round and return the predicted winner.

    Parameters
    ----------
    schools : sequence of (name, seed)
        Teams in pairing order; consecutive entries meet in the first round.
    sub_bracket_name : str
        Label used in the printed round headers.
    model
        Predictive model handed on to ``get_matchup_winners``.
    use_cuda : bool
        Passed through to ``get_matchup_winners``.

    Returns
    -------
    The ``(name, seed)`` tuple of the last remaining team.

    Raises
    ------
    ValueError
        If a round cannot be formed because ``get_matchups_stats`` rejects the
        number of remaining teams.
    """
    # Seeds are part of the model's feature columns, so post-season frames are always used.
    post_season_stats = True
    remaining_teams = schools
    round_no = 1
    while(len(remaining_teams) > 1):
        print("---",sub_bracket_name," round ",round_no,"---")
        matchup_stats = get_matchups_stats(remaining_teams, post_season_stats)
        if matchup_stats is False:
            # get_matchups_stats signals an invalid team count by returning False.
            raise ValueError('cannot form matchups from {0} teams'.format(len(remaining_teams)))
        remaining_teams = get_matchup_winners(matchup_stats,remaining_teams, model, post_season_stats, use_cuda)
        round_no = round_no + 1
    winner = remaining_teams[0]
    print('Winner of ',sub_bracket_name,':',winner)
    return winner

In [30]:
# Get predicted final four

# Play out the four regions in order (South, West, East, MidWest);
# each regional winner takes one Final Four slot.
final_four = [
    evaluate_winner(region_schools, region_name, model, use_cuda)
    for region_schools, region_name in (
        (school_names_south, "South"),
        (school_names_west, "West"),
        (school_names_east, "East"),
        (school_names_midwest, "MidWest"),
    )
]

--- South  round  1 ---
Virginia 1  vs.  Maryland-Baltimore County 16 (team 1 won= True )
Creighton 8  vs.  Kansas State 9 (team 1 won= False )
Kentucky 5  vs.  Davidson 12 (team 1 won= True )
Arizona 4  vs.  Buffalo 13 (team 1 won= True )
Miami (FL) 6  vs.  Loyola (IL) 11 (team 1 won= True )
Tennessee 3  vs.  Wright State 14 (team 1 won= True )
Nevada 7  vs.  Texas 10 (team 1 won= False )
Cincinnati 2  vs.  Georgia State 15 (team 1 won= True )
--- South  round  2 ---
Virginia 1  vs.  Kansas State 9 (team 1 won= True )
Kentucky 5  vs.  Arizona 4 (team 1 won= True )
Miami (FL) 6  vs.  Tennessee 3 (team 1 won= False )
Texas 10  vs.  Cincinnati 2 (team 1 won= False )
--- South  round  3 ---
Virginia 1  vs.  Kentucky 5 (team 1 won= True )
Tennessee 3  vs.  Cincinnati 2 (team 1 won= False )
--- South  round  4 ---
Virginia 1  vs.  Cincinnati 2 (team 1 won= True )
Winner of  South : ('Virginia', 1)
--- West  round  1 ---
Xavier 1  vs.  North Carolina Central 16 (team 1 won= True )
Missouri 8

In [31]:
final_four

[('Virginia', 1), ('North Carolina', 2), ('Purdue', 2), ('Duke', 2)]

In [32]:
champ = evaluate_winner(final_four, "FinalFour", model, use_cuda)

--- FinalFour  round  1 ---
Virginia 1  vs.  North Carolina 2 (team 1 won= False )
Purdue 2  vs.  Duke 2 (team 1 won= False )
--- FinalFour  round  2 ---
North Carolina 2  vs.  Duke 2 (team 1 won= False )
Winner of  FinalFour : ('Duke', 2)
