In [1]:
import pandas as pd #dataframes
import matplotlib.pyplot as plt 
import numpy as np # n-dim object support
# do ploting inline instead of in a separate window
%matplotlib inline
import random

In [2]:
df_school = pd.read_csv("../Scraper/school_records.csv")
df_ps_game = pd.read_csv("../Scraper/post_season_game_records.csv")

In [3]:
df_school.shape

(3478, 21)

In [4]:
df_ps_game.shape

(504, 7)

In [5]:
df_school.head(5)

Unnamed: 0,year,team_name,fg_pg,ft_pg,three_pt_pg,orb_pg,drb_pg,ast_pg,stl_pg,blk_pg,...,pf_pg,pt_pg,opnt_pt_pg,fg_pct,three_p_pct,ft_pct,wl_pct,conf_wl_pct,srs,sos
0,2010,Air Force,20.387097,10.741935,5.677419,7.096774,27.222685,12.548387,5.0,1.645161,...,17.645161,57.193548,63.129032,0.443,0.313,0.635,0.323,0.0625,-4.9,3.13
1,2010,Akron,25.057143,13.8,6.714286,13.342857,35.875918,13.514286,6.085714,3.257143,...,19.485714,70.628571,65.514286,0.433,0.339,0.657,0.686,0.75,2.82,-1.5
2,2010,Alabama A&M,22.185185,17.481481,4.0,13.925926,36.66941,10.666667,9.222222,5.296296,...,20.37037,65.851852,69.666667,0.382,0.291,0.635,0.407,0.444444,-20.19,-13.71
3,2010,Alabama-Birmingham,22.441176,16.852941,5.205882,12.352941,36.342561,11.470588,6.558824,2.676471,...,17.970588,66.941176,60.382353,0.422,0.311,0.694,0.735,0.6875,9.46,2.9
4,2010,Alabama State,21.516129,15.290323,6.129032,12.903226,35.099896,12.903226,7.354839,4.16129,...,20.451613,64.451613,65.903226,0.404,0.324,0.641,0.516,0.666667,-14.41,-12.02


In [6]:
df_ps_game.head(5)

Unnamed: 0,year,team_1_name,team_1_score,team_1_seed,team_2_name,team_2_score,team_2_seed
0,2011,UTSA,46,16,Ohio State,75,1
1,2011,George Mason,61,8,Villanova,57,9
2,2011,Clemson,76,12,West Virginia,84,5
3,2011,Princeton,57,13,Kentucky,59,4
4,2011,Marquette,66,11,Xavier,55,6


In [7]:
df_school.isnull().values.any()

False

In [8]:
df_ps_game.isnull().values.any()

False

In [9]:
def resolve_team_name(team_name):
    #Apply hard-coded corrections to team names
    team_name_dict = {'Colorado-Colorado Springs':'Colorado',
                     'Colorado College': 'Colorado',
                     'UNC':'North Carolina',
                     'UConn':'Connecticut',
                     'LIU-Brooklyn':'Long Island University',
                     'UTSA':'Texas-San Antonio',
                     'Pitt':'Pittsburgh',
                     'BYU':'Brigham Young',
                     "St. Peter's": "Saint Peter's",
                     'VCU':'Virginia Commonwealth',
                     'Southern Miss':'Southern Mississippi',
                     'Detroit': 'Detroit Mercy',
                     'UNLV':'Nevada-Las Vegas',
                     'Ole Miss':'Mississippi',
                     "St. Joseph's":"Saint Joseph's",
                     'UCSB':'UC-Santa Barbara',
                     'SMU': 'Southern Methodist',
                     'USC':'South Carolina',
                     'LSU':'Louisiana State',
                     'UMass':'Massachusetts',
                     'ETSU':'East Tennessee State'}
    # TODO: for V2 add more corrections to the team_name_dict
    if(team_name in team_name_dict):
        return team_name_dict[team_name]
    return team_name
def get_school_stats(year, team_name):
    return df_school[(df_school['year'] == year) & (df_school['team_name'] == team_name)]
def get_vals(t_stats_list, key):
    ret = []
    for t_stat in t_stats_list:
        ret.append(t_stat[key].squeeze())
    return ret
def get_team_stats_dict_with_t1_win(t1_stats, t2_stats, t1_wins):
    return {'team_name_1':get_vals(t1_stats,'team_name'),'fg_pg_1':get_vals(t1_stats,'fg_pg'),'ft_pg_1':get_vals(t1_stats,'ft_pg'),
            'three_pt_pg_1':get_vals(t1_stats,'three_pt_pg'),'orb_pg_1':get_vals(t1_stats,'orb_pg'),'drb_pg_1':get_vals(t1_stats,'drb_pg'),
            'ast_pg_1':get_vals(t1_stats,'ast_pg'),'stl_pg_1':get_vals(t1_stats,'stl_pg'),'blk_pg_1':get_vals(t1_stats,'blk_pg'),
            'tov_pg_1':get_vals(t1_stats,'tov_pg'),'pf_pg_1':get_vals(t1_stats,'pf_pg'), 'pt_pg_1':get_vals(t1_stats,'pt_pg'),
            'opnt_pt_pg_1':get_vals(t1_stats,'opnt_pt_pg'),'fg_pct_1':get_vals(t1_stats,'fg_pct'),'three_p_pct_1':get_vals(t1_stats,'three_p_pct'),
            'ft_pct_1':get_vals(t1_stats,'ft_pct'),'wl_pct_1':get_vals(t1_stats,'wl_pct'),'conf_wl_pct_1':get_vals(t1_stats,'conf_wl_pct'),
            'srs_1':get_vals(t1_stats,'srs'),'sos_1':get_vals(t1_stats,'sos'),
            'team_name_2':get_vals(t2_stats,'team_name'),'fg_pg_2':get_vals(t2_stats,'fg_pg'),'ft_pg_2':get_vals(t2_stats,'ft_pg'),
            'three_pt_pg_2':get_vals(t2_stats,'three_pt_pg'),'orb_pg_2':get_vals(t2_stats,'orb_pg'),'drb_pg_2':get_vals(t2_stats,'drb_pg'),
            'ast_pg_2':get_vals(t2_stats,'ast_pg'),'stl_pg_2':get_vals(t2_stats,'stl_pg'),'blk_pg_2':get_vals(t2_stats,'blk_pg'),
            'tov_pg_2':get_vals(t2_stats,'tov_pg'),'pf_pg_2':get_vals(t2_stats,'pf_pg'), 'pt_pg_2':get_vals(t2_stats,'pt_pg'),
            'opnt_pt_pg_2':get_vals(t2_stats,'opnt_pt_pg'),'fg_pct_2':get_vals(t2_stats,'fg_pct'),'three_p_pct_2':get_vals(t2_stats,'three_p_pct'),
            'ft_pct_2':get_vals(t2_stats,'ft_pct'),'wl_pct_2':get_vals(t2_stats,'wl_pct'),'conf_wl_pct_2':get_vals(t2_stats,'conf_wl_pct'),
            'srs_2':get_vals(t2_stats,'srs'),'sos_2':get_vals(t2_stats,'sos'),
            't1_win':t1_wins}
def get_team_stats_dict(t1_stats, t2_stats):
    return {'team_name_1':get_vals(t1_stats,'team_name'),'fg_pg_1':get_vals(t1_stats,'fg_pg'),'ft_pg_1':get_vals(t1_stats,'ft_pg'),
            'three_pt_pg_1':get_vals(t1_stats,'three_pt_pg'),'orb_pg_1':get_vals(t1_stats,'orb_pg'),'drb_pg_1':get_vals(t1_stats,'drb_pg'),
            'ast_pg_1':get_vals(t1_stats,'ast_pg'),'stl_pg_1':get_vals(t1_stats,'stl_pg'),'blk_pg_1':get_vals(t1_stats,'blk_pg'),
            'tov_pg_1':get_vals(t1_stats,'tov_pg'),'pf_pg_1':get_vals(t1_stats,'pf_pg'), 'pt_pg_1':get_vals(t1_stats,'pt_pg'),
            'opnt_pt_pg_1':get_vals(t1_stats,'opnt_pt_pg'),'fg_pct_1':get_vals(t1_stats,'fg_pct'),'three_p_pct_1':get_vals(t1_stats,'three_p_pct'),
            'ft_pct_1':get_vals(t1_stats,'ft_pct'),'wl_pct_1':get_vals(t1_stats,'wl_pct'),'conf_wl_pct_1':get_vals(t1_stats,'conf_wl_pct'),
            'srs_1':get_vals(t1_stats,'srs'),'sos_1':get_vals(t1_stats,'sos'),
            'team_name_2':get_vals(t2_stats,'team_name'),'fg_pg_2':get_vals(t2_stats,'fg_pg'),'ft_pg_2':get_vals(t2_stats,'ft_pg'),
            'three_pt_pg_2':get_vals(t2_stats,'three_pt_pg'),'orb_pg_2':get_vals(t2_stats,'orb_pg'),'drb_pg_2':get_vals(t2_stats,'drb_pg'),
            'ast_pg_2':get_vals(t2_stats,'ast_pg'),'stl_pg_2':get_vals(t2_stats,'stl_pg'),'blk_pg_2':get_vals(t2_stats,'blk_pg'),
            'tov_pg_2':get_vals(t2_stats,'tov_pg'),'pf_pg_2':get_vals(t2_stats,'pf_pg'), 'pt_pg_2':get_vals(t2_stats,'pt_pg'),
            'opnt_pt_pg_2':get_vals(t2_stats,'opnt_pt_pg'),'fg_pct_2':get_vals(t2_stats,'fg_pct'),'three_p_pct_2':get_vals(t2_stats,'three_p_pct'),
            'ft_pct_2':get_vals(t2_stats,'ft_pct'),'wl_pct_2':get_vals(t2_stats,'wl_pct'),'conf_wl_pct_2':get_vals(t2_stats,'conf_wl_pct'),
            'srs_2':get_vals(t2_stats,'srs'),'sos_2':get_vals(t2_stats,'sos')}
def get_team_stats_dict_ps(t1_stats, t2_stats, t1_seeds, t2_seeds):
    return {'team_name_1':get_vals(t1_stats,'team_name'),'fg_pg_1':get_vals(t1_stats,'fg_pg'),'ft_pg_1':get_vals(t1_stats,'ft_pg'),
            'three_pt_pg_1':get_vals(t1_stats,'three_pt_pg'),'orb_pg_1':get_vals(t1_stats,'orb_pg'),'drb_pg_1':get_vals(t1_stats,'drb_pg'),
            'ast_pg_1':get_vals(t1_stats,'ast_pg'),'stl_pg_1':get_vals(t1_stats,'stl_pg'),'blk_pg_1':get_vals(t1_stats,'blk_pg'),
            'tov_pg_1':get_vals(t1_stats,'tov_pg'),'pf_pg_1':get_vals(t1_stats,'pf_pg'), 'pt_pg_1':get_vals(t1_stats,'pt_pg'),
            'opnt_pt_pg_1':get_vals(t1_stats,'opnt_pt_pg'),'fg_pct_1':get_vals(t1_stats,'fg_pct'),'three_p_pct_1':get_vals(t1_stats,'three_p_pct'),
            'ft_pct_1':get_vals(t1_stats,'ft_pct'),'wl_pct_1':get_vals(t1_stats,'wl_pct'),'conf_wl_pct_1':get_vals(t1_stats,'conf_wl_pct'),
            'srs_1':get_vals(t1_stats,'srs'),'sos_1':get_vals(t1_stats,'sos'),
            'team_name_2':get_vals(t2_stats,'team_name'),'fg_pg_2':get_vals(t2_stats,'fg_pg'),'ft_pg_2':get_vals(t2_stats,'ft_pg'),
            'three_pt_pg_2':get_vals(t2_stats,'three_pt_pg'),'orb_pg_2':get_vals(t2_stats,'orb_pg'),'drb_pg_2':get_vals(t2_stats,'drb_pg'),
            'ast_pg_2':get_vals(t2_stats,'ast_pg'),'stl_pg_2':get_vals(t2_stats,'stl_pg'),'blk_pg_2':get_vals(t2_stats,'blk_pg'),
            'tov_pg_2':get_vals(t2_stats,'tov_pg'),'pf_pg_2':get_vals(t2_stats,'pf_pg'), 'pt_pg_2':get_vals(t2_stats,'pt_pg'),
            'opnt_pt_pg_2':get_vals(t2_stats,'opnt_pt_pg'),'fg_pct_2':get_vals(t2_stats,'fg_pct'),'three_p_pct_2':get_vals(t2_stats,'three_p_pct'),
            'ft_pct_2':get_vals(t2_stats,'ft_pct'),'wl_pct_2':get_vals(t2_stats,'wl_pct'),'conf_wl_pct_2':get_vals(t2_stats,'conf_wl_pct'),
            'srs_2':get_vals(t2_stats,'srs'),'sos_2':get_vals(t2_stats,'sos'), 'team_1_seed':t1_seeds,
            'team_2_seed':t2_seeds}
def create_team_stats_df_w_t1_win(indeces_w_stats, t1_stats_list, t2_stats_list,t1_wins):
    # Adds column for wether team 1 wins or not
    # Assumes all lists are of the same length
    return pd.DataFrame(get_team_stats_dict_with_t1_win(t1_stats_list, t2_stats_list,t1_wins), index = indeces_w_stats)
def create_team_stats_df(indeces_w_stats, t1_stats_list, t2_stats_list):
    # Assumes all lists are of the same length
    return pd.DataFrame(get_team_stats_dict(t1_stats_list, t2_stats_list), index = indeces_w_stats)
def create_team_stats_df_ps(indeces_w_stats, t1_stats_list, t2_stats_list, t1_seeds, t2_seeds):
    # Only uses post season stats => inclu
    # Assumes all lists are of the same length
    return pd.DataFrame(get_team_stats_dict_ps(t1_stats_list, t2_stats_list, t1_seeds, t2_seeds), index = indeces_w_stats)
def get_team_stats_df(game_df, should_print=False):
    indeces_w_stats = []
    t1_stats_list = []
    t2_stats_list = []
    t1_wins_list = []
    for index, row in game_df.iterrows():
        year = row['year']
        team_1 = row['team_1_name']
        team_2 = row['team_2_name']
        team_1_score = row['team_1_score']
        team_2_score = row['team_2_score']
        t1_stats = get_school_stats(year, resolve_team_name(team_1))
        t2_stats = get_school_stats(year, resolve_team_name(team_2))

        if(len(t1_stats) > 0 and len(t2_stats) > 0):  
            indeces_w_stats.append(index)
            t1_stats_list.append(t1_stats)
            t2_stats_list.append(t2_stats)
            t1_wins_list.append(team_1_score > team_2_score)
        else:         
            if(should_print):
                print(year)
                if(len(t1_stats) < 1):
                    print(team_1)
                if(len(t2_stats) < 1):
                    print(team_2)
            
    print(len(indeces_w_stats))
    team_stats_df = create_team_stats_df_w_t1_win(indeces_w_stats, t1_stats_list, t2_stats_list, t1_wins_list)
    return team_stats_df
        

In [10]:
ps_team_stats_df = get_team_stats_df(df_ps_game, True)

2018
UNC Greensboro
2018
Penn
2018
NC State
2018
TCU
2018
UMBC
2018
UMBC
498


In [11]:
ps_game_w_team_stats = pd.concat([df_ps_game, ps_team_stats_df], axis=1, join='inner')

In [12]:
ps_game_w_team_stats.head(3)

Unnamed: 0,year,team_1_name,team_1_score,team_1_seed,team_2_name,team_2_score,team_2_seed,team_name_1,fg_pg_1,ft_pg_1,...,pt_pg_2,opnt_pt_pg_2,fg_pct_2,three_p_pct_2,ft_pct_2,wl_pct_2,conf_wl_pct_2,srs_2,sos_2,t1_win
0,2011,UTSA,46,16,Ohio State,75,1,Texas-San Antonio,23.588235,16.058824,...,77.135135,59.675676,0.494,0.423,0.701,0.919,0.888889,25.84,8.38,False
1,2011,George Mason,61,8,Villanova,57,9,George Mason,25.764706,14.558824,...,72.242424,65.424242,0.438,0.348,0.757,0.636,0.5,15.05,8.23,True
2,2011,Clemson,76,12,West Virginia,84,5,Clemson,23.823529,14.5,...,69.787879,64.666667,0.429,0.337,0.711,0.636,0.611111,16.15,11.03,False


In [13]:
ps_game_w_team_stats.shape

(498, 48)

## Check team 1 winning true/false ratio

In [14]:
t1_win_map = {True:1, False:0}
ps_game_w_team_stats['t1_win'] = ps_game_w_team_stats['t1_win'].map(t1_win_map)
num_true = len(ps_game_w_team_stats.loc[ps_game_w_team_stats['t1_win'] == True])
num_false = len(ps_game_w_team_stats.loc[ps_game_w_team_stats['t1_win'] == False])
print("Number of True cases: {0} ({1:2.2f}%)".format(num_true, (num_true/(num_true+num_false))*100))
print("Number of False cases: {0} ({1:2.2f}%)".format(num_false, (num_false/(num_true+num_false))*100))

Number of True cases: 259 (52.01%)
Number of False cases: 239 (47.99%)


In [15]:
import sklearn.model_selection
ps_feature_col_names = ['team_1_seed', 'team_2_seed','fg_pg_1','ft_pg_1',
            'three_pt_pg_1','orb_pg_1','drb_pg_1',
            'ast_pg_1','stl_pg_1','blk_pg_1',
            'tov_pg_1','pf_pg_1', 'pt_pg_1',
            'opnt_pt_pg_1','fg_pct_1','three_p_pct_1',
            'ft_pct_1','wl_pct_1','conf_wl_pct_1',
            'srs_1','sos_1',
            'fg_pg_2','ft_pg_2',
            'three_pt_pg_2','orb_pg_2','drb_pg_2',
            'ast_pg_2','stl_pg_2','blk_pg_2',
            'tov_pg_2','pf_pg_2', 'pt_pg_2',
            'opnt_pt_pg_2','fg_pct_2','three_p_pct_2',
            'ft_pct_2','wl_pct_2','conf_wl_pct_2',
            'srs_2','sos_2'
            ]
ps_predict_class_names = ['t1_win']




In [16]:
def scale_features(data, col_names):
    scaled_features = {}
    for col_name in col_names:
        mean, std = data[col_name].values.mean(), data[col_name].values.std()
        scaled_features[col_name] = [mean, std]
        data.loc[:, col_name] = (data[col_name].values - mean)/std
    return scaled_features


In [19]:
scale_features(ps_game_w_team_stats, ps_feature_col_names)
ps_x = ps_game_w_team_stats[ps_feature_col_names].values
ps_y = ps_game_w_team_stats[ps_predict_class_names].values
print(type(ps_x))
split_test_size = 0.2
ps_x_train, ps_x_test, ps_y_train, ps_y_test = sklearn.model_selection.train_test_split(ps_x, ps_y, test_size=split_test_size, random_state=42)
split_valid_size = 0.25
ps_x_train, ps_x_val, ps_y_train, ps_y_val = sklearn.model_selection.train_test_split(ps_x_train, ps_y_train, test_size=split_valid_size, random_state=1)


<class 'numpy.ndarray'>


In [20]:
print("{0:0.2f}% in training set".format((len(ps_x_train)/len(ps_game_w_team_stats.index))*100))
print("{0:0.2f}% in test set".format((len(ps_x_test)/len(ps_game_w_team_stats.index))*100))
print("{0:0.2f}% in test set".format((len(ps_x_val)/len(ps_game_w_team_stats.index))*100))

59.84% in training set
20.08% in test set
20.08% in test set


In [21]:
import sklearn.impute

#Impute with mean all 0 readings
fill_0 = sklearn.impute.SimpleImputer(missing_values=0, strategy="mean")

ps_x_train = fill_0.fit_transform(ps_x_train)
ps_x_test = fill_0.fit_transform(ps_x_test)
ps_x_val = fill_0.fit_transform(ps_x_val)
# TODO : impute incorrect negative values such anything other than (SOS and SRS)

In [22]:
print(len(ps_x_train[0]))

40


In [45]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
dropout = 0.5
model = nn.Sequential(nn.Linear(40, 30),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(30, 30),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(30, 10),
                     nn.ReLU(),
                     nn.Dropout(dropout),
                     nn.Linear(10, 1),
                     nn.Sigmoid())
trainset = TensorDataset(torch.from_numpy(ps_x_train).float(), torch.from_numpy(ps_y_train).float())
trainloader = DataLoader(trainset, batch_size=3, shuffle=True)
testset = TensorDataset(torch.from_numpy(ps_x_test).float(), torch.from_numpy(ps_y_test).float())
testloader = DataLoader(testset, batch_size=3, shuffle=True)
valset = TensorDataset(torch.from_numpy(ps_x_val).float(), torch.from_numpy(ps_y_val).float())
valloader = DataLoader(valset, batch_size=3, shuffle=True)
print(next(iter(testloader)))

[tensor([[-0.8784,  1.0063, -0.2218, -0.5439, -0.0466, -0.0116, -0.0256,  0.8330,
         -0.4402,  0.0745, -0.9502,  0.2407, -0.3926, -0.7599, -0.5942,  0.1154,
          0.0996,  0.2147,  0.4424,  0.6467,  0.5807, -0.9921, -0.1811,  1.0188,
         -0.2898, -1.1671,  0.1403,  1.3164, -0.5491, -0.3529,  0.5443, -0.5698,
          0.3269, -1.2511,  0.2916,  0.0995, -0.5304, -0.4341, -1.0058, -0.5689],
        [-1.0949, -1.2675,  0.4426,  0.1311, -1.3617,  0.1437, -0.3711,  0.1399,
          0.3467, -0.7537, -0.1964, -0.6685,  0.0373, -1.1872,  0.6528, -1.0004,
         -0.0763,  0.5408,  0.0234,  1.2681,  0.6608,  0.5074, -0.8850, -0.5554,
          0.7176, -1.3220,  0.5173,  2.0758,  2.4618, -0.9123, -1.0215, -0.0759,
         -0.8781,  0.2606, -0.5362, -0.3867,  1.7935,  1.6683,  0.8729,  0.3712],
        [-1.0949,  1.9159,  0.1650, -0.2320,  0.3129,  1.1011,  1.2250,  1.0161,
          0.5051,  1.3189, -0.0982, -0.8924,  0.1357, -1.7539, -0.2502,  0.0410,
         -0.6041,  1.4103

In [46]:
import torch.optim as optim
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [47]:
# check if CUDA is available
use_cuda = torch.cuda.is_available()
print ('Using CUDA: {}'.format(use_cuda))
if use_cuda:
    model = model.cuda()

Using CUDA: True


In [48]:
def train(n_epochs, train_loader, val_loader, model, optimizer, criterion, use_cuda, save_path):
    """returns trained model"""
    # initialize tracker for minimum validation loss
    # TODO: fix target tensors always zero for some reason
    valid_loss_min = np.Inf 
    print_loss_count = 40
    cuda_refresh_count = 5
    if use_cuda:
        model = model.cuda()
    for epoch in range(1, n_epochs+1):
        # initialize variables to monitor training and validation loss
        train_loss = 0.0
        valid_loss = 0.0
        ###################
        # train the model #
        ###################
        model.train()
        train_load_iter = iter(train_loader)
        for i in range(len(train_loader)):
            data, target = next(train_load_iter)
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()

            optimizer.zero_grad()    
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()
                
            train_loss = train_loss + ((1 / (i + 1)) * (loss.data - train_loss))

        model.eval()
        val_load_iter = iter(val_loader)
        for i in range(len(val_loader)):
            data, target = next(val_load_iter)
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            output = model(data)
            loss = criterion(output, target)
            valid_loss = valid_loss + ((1 / (i + 1)) * (loss.data - valid_loss))
        # print training/validation statistics 
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch, 
            train_loss,
            valid_loss
            ))
        ## save the model if validation loss has decreased
        if valid_loss < valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
            torch.save(model.state_dict(), save_path)
            valid_loss_min = valid_loss

    # return trained model
    return model

In [49]:
model = train(120, trainloader, valloader, model, optimizer, criterion,
                      use_cuda, 'model_nn1.pt')

Epoch: 1 	Training Loss: 0.694864 	Validation Loss: 0.694825
Validation loss decreased (inf --> 0.694825).  Saving model ...
Epoch: 2 	Training Loss: 0.696691 	Validation Loss: 0.696323
Epoch: 3 	Training Loss: 0.695185 	Validation Loss: 0.694812
Validation loss decreased (0.694825 --> 0.694812).  Saving model ...
Epoch: 4 	Training Loss: 0.691325 	Validation Loss: 0.693610
Validation loss decreased (0.694812 --> 0.693610).  Saving model ...
Epoch: 5 	Training Loss: 0.692976 	Validation Loss: 0.690144
Validation loss decreased (0.693610 --> 0.690144).  Saving model ...
Epoch: 6 	Training Loss: 0.688731 	Validation Loss: 0.691535
Epoch: 7 	Training Loss: 0.688258 	Validation Loss: 0.690295
Epoch: 8 	Training Loss: 0.687805 	Validation Loss: 0.686997
Validation loss decreased (0.690144 --> 0.686997).  Saving model ...
Epoch: 9 	Training Loss: 0.683537 	Validation Loss: 0.685116
Validation loss decreased (0.686997 --> 0.685116).  Saving model ...
Epoch: 10 	Training Loss: 0.685419 	Valida

Epoch: 99 	Training Loss: 0.360873 	Validation Loss: 0.686525
Epoch: 100 	Training Loss: 0.408990 	Validation Loss: 0.696803
Epoch: 101 	Training Loss: 0.324089 	Validation Loss: 0.725288
Epoch: 102 	Training Loss: 0.388927 	Validation Loss: 0.655730
Epoch: 103 	Training Loss: 0.398857 	Validation Loss: 0.646196
Epoch: 104 	Training Loss: 0.355109 	Validation Loss: 0.700692
Epoch: 105 	Training Loss: 0.368273 	Validation Loss: 0.710731
Epoch: 106 	Training Loss: 0.354027 	Validation Loss: 0.724133
Epoch: 107 	Training Loss: 0.345299 	Validation Loss: 0.742154
Epoch: 108 	Training Loss: 0.383961 	Validation Loss: 0.739921
Epoch: 109 	Training Loss: 0.342146 	Validation Loss: 0.743813
Epoch: 110 	Training Loss: 0.330957 	Validation Loss: 0.763999
Epoch: 111 	Training Loss: 0.286655 	Validation Loss: 0.735209
Epoch: 112 	Training Loss: 0.351697 	Validation Loss: 0.669973
Epoch: 113 	Training Loss: 0.341263 	Validation Loss: 0.721137
Epoch: 114 	Training Loss: 0.287619 	Validation Loss: 0.

In [50]:
def test(loader, model, criterion, use_cuda, num_classes = 2):
    if use_cuda:
        model = model.cuda()
    # monitor test loss and accuracy
    test_loss = 0.
    correct = 0.
    total = 0.
    model.eval()
    load_iter = iter(loader)
    for i in range(len(loader)):
        data, target = next(load_iter)
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        
        # calculate the loss
        loss = criterion(output, target)
        # update average test loss 
        test_loss = test_loss + ((1 / (i + 1)) * (loss.data - test_loss))
        # compare predictions to true label
        for j, tensor in enumerate(output):
            if (tensor.item() > .5 and target[j] == 1) or (tensor.item() <= .5 and target[j] == 0):
                correct += 1
        total += data.size(0)
       
    print('Test Loss: {:.6f}\n'.format(test_loss))

    print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
        100. * correct / total, correct, total))

In [51]:
test(testloader, model, criterion, use_cuda)

Test Loss: 0.832023


Test Accuracy: 70% (70/100)


In [52]:
# Setup This years bracket regions
# TODO: automate this with the data received from the scraper
school_names_south = [
    # south region
    ('Virginia', 1),('Maryland-Baltimore County', 16),
    ('Creighton', 8), ('Kansas State', 9),
    ('Kentucky',5), ('Davidson', 12),
    ('Arizona',4), ('Buffalo', 13),
    ('Miami (FL)', 6), ('Loyola (IL)', 11),
    ('Tennessee',3), ('Wright State',14),
    ('Nevada',7),('Texas',10),
    ('Cincinnati',2), ('Georgia State',15)
    ]
school_names_west = [
    # west region
    ('Xavier', 1),('North Carolina Central',16), #or 'Texas Southern',
    ('Missouri', 8),('Florida State', 9),
    ('Ohio State',5), ('South Dakota State', 12),
    ('Gonzaga',4), ('North Carolina-Greensboro',13),
    ('Houston',6),('San Diego State',11),
    ('Michigan', 3),('Montana', 14),
    ('Texas A&M',7),('Providence',10),
    ('North Carolina',2),('Lipscomb',15)
    ]
school_names_east = [
    # east region
    ('Villanova',1),('Long Island University',16), # or 'Radford',
    ('Virginia Tech',8), ('Alabama',9),
    ('West Virginia',5), ('Murray State',12),
    ('Wichita State',4), ('Marshall',13),
    ('Florida',6), ('St. Bonaventure',11), # or 'UCLA',
    ('Texas Tech',3), ('Stephen F. Austin',14),
    ('Arkansas',7), ('Butler',10),
    ('Purdue', 2), ('Cal State Fullerton',15)
    ]
school_names_midwest = [
    # mid-west region
    ('Kansas', 1), ('Pennsylvania',16),
    ('Seton Hall', 8), ('North Carolina State',9),
    ('Clemson', 5), ('New Mexico State',12),
    ('Auburn',4), ('College of Charleston',13),
    ('Texas Christian',6), ('Arizona State',11), # or 'Syracuse',
    ('Michigan State',3), ('Bucknell',14),
    ('Rhode Island',7), ('Oklahoma',10),
    ('Duke', 2), ('Iona' ,15) 
    ]

In [53]:
'''
    Methods to add evaluating the predicted winners of matchups and subbrackets (A region or Final Four)
    To change the predictive model used, just change the model handed to "evaluate_winner(schools,sub_bracket_name, model)"
    found later in the notebook
'''
def get_matchups_stats(schools, post_season):    
    
    i = 0 
    t1_stats = []
    t2_stats = []
    t1_seeds = []
    t2_seeds = []
    if(not is_power_of_two(len(schools))):
        print('ERROR: invalid number of school names')
        return False
    while i < len(schools):
        t1_name, t1_seed = schools[i]
        t2_name, t2_seed = schools[i + 1]
        t1_seeds.append(t1_seed)
        t2_seeds.append(t2_seed)
        #print(t1_name, t2_name
        t1_stats.append(get_school_stats(2018, t1_name))
        t2_stats.append(get_school_stats(2018, t2_name))
        i = i + 2
    if(post_season):
        matchup_stats = create_team_stats_df_ps(range(0,int(len(schools)/2)), t1_stats, t2_stats, t1_seeds, t2_seeds)
    else:
        matchup_stats = create_team_stats_df(range(0,int(len(schools)/2)), t1_stats, t2_stats)
    return matchup_stats
def is_power_of_two(num):
    return ((num & (num - 1)) == 0) and num != 0
def get_matchup_winners(matchup_stats, schools, model, post_season, use_cuda):

    x_tourney = matchup_stats[ps_feature_col_names].values
    x_tourney = torch.from_numpy(x_tourney).float()
    # print(x_tourney)
    if use_cuda:
        x_tourney = x_tourney.cuda()
    y_tourney = model(x_tourney)
    #print(y_tourney)
    i = 0
    winners = []
    for y_val in y_tourney:
        t1_name, t1_seed = schools[i]
        t2_name, t2_seed = schools[i + 1]
        t1_won = y_val.item() > .5
        print(t1_name,t1_seed,' vs. ', t2_name,t2_seed,'(team 1 won=', t1_won,')')
        if(t1_won):
            winners.append((t1_name,t1_seed))
        else:
            winners.append((t2_name, t2_seed))
        i = i + 2
    return winners
def evaluate_winner(schools,sub_bracket_name, model, use_cuda):        
    remaining_teams = schools
    i = 1
    while(len(remaining_teams) > 1):
        #Add a random factor
        rand = random.randrange(0,1)
        post_season_stats = True
        print("---",sub_bracket_name," round ",i,"---")
        matchup_stats = get_matchups_stats(remaining_teams, post_season_stats)
        remaining_teams = get_matchup_winners(matchup_stats,remaining_teams, model, post_season_stats, use_cuda)
        i = i + 1
    winner = remaining_teams[0]
    print('Winner of ',sub_bracket_name,':',winner)
    return winner

In [54]:
# Get predicted final four

final_four = [evaluate_winner(school_names_south, "South",model, use_cuda), evaluate_winner(school_names_west,"West",model, use_cuda),
              evaluate_winner(school_names_east, "East", model, use_cuda), evaluate_winner(school_names_midwest, "MidWest",model, use_cuda)]

--- South  round  1 ---
Virginia 1  vs.  Maryland-Baltimore County 16 (team 1 won= True )
Creighton 8  vs.  Kansas State 9 (team 1 won= False )
Kentucky 5  vs.  Davidson 12 (team 1 won= True )
Arizona 4  vs.  Buffalo 13 (team 1 won= True )
Miami (FL) 6  vs.  Loyola (IL) 11 (team 1 won= True )
Tennessee 3  vs.  Wright State 14 (team 1 won= True )
Nevada 7  vs.  Texas 10 (team 1 won= False )
Cincinnati 2  vs.  Georgia State 15 (team 1 won= True )
--- South  round  2 ---
Virginia 1  vs.  Kansas State 9 (team 1 won= True )
Kentucky 5  vs.  Arizona 4 (team 1 won= True )
Miami (FL) 6  vs.  Tennessee 3 (team 1 won= False )
Texas 10  vs.  Cincinnati 2 (team 1 won= True )
--- South  round  3 ---
Virginia 1  vs.  Kentucky 5 (team 1 won= True )
Tennessee 3  vs.  Texas 10 (team 1 won= False )
--- South  round  4 ---
Virginia 1  vs.  Texas 10 (team 1 won= True )
Winner of  South : ('Virginia', 1)
--- West  round  1 ---
Xavier 1  vs.  North Carolina Central 16 (team 1 won= True )
Missouri 8  vs.  Fl

In [55]:
final_four

[('Virginia', 1), ('North Carolina', 2), ('Purdue', 2), ('Duke', 2)]

In [56]:
champ = evaluate_winner(final_four, "FinalFour", model, use_cuda)

--- FinalFour  round  1 ---
Virginia 1  vs.  North Carolina 2 (team 1 won= True )
Purdue 2  vs.  Duke 2 (team 1 won= False )
--- FinalFour  round  2 ---
Virginia 1  vs.  Duke 2 (team 1 won= False )
Winner of  FinalFour : ('Duke', 2)
