In [1]:
## You'll just need to execute this once.

# Standard
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

# Time a lengthy computation
from time import time

# A tool for getting data
import pandas as pd

# A sparse matrix representation
from scipy.sparse import dok_matrix

# An eigenvalue computer for sparse matrices
from scipy.sparse.linalg import eigs

# We'll need to optimize our procedure.
from scipy.optimize import minimize


In [2]:
## You'll just need to execute this once.

# Turn the teams table into a list of plain records, one per CSV row.
# 'team_idx' is the DataFrame row label that iterrows() yields for that row.
teams_df = pd.read_csv('DataFiles/Teams.csv')
teams = []
for row_label, team_row in teams_df.iterrows():
    teams.append({
        'team_idx': row_label,
        'team_id': team_row['TeamID'],
        'team_name': team_row['TeamName'],
    })

FileNotFoundError: File b'DataFiles/Teams.csv' does not exist

In [None]:
## Read in and set up the season results data for a particular year.
## Note that this is the one and only place we set the year. You'll
## need to rerun this every time through.

year = 2018

# Load the regular season data. Rows are narrowed to `year` with a vectorized
# mask *before* the row-by-row conversion, so the slow Python-level iterrows()
# loop only touches the season we care about rather than every row in the file.
results_df = pd.read_csv('DataFiles/RegularSeasonCompactResults.csv')
season_results_df = results_df[results_df['Season'] == year]
games = [
    {
        'day':int(row['DayNum']),
        'win_id':row['WTeamID'],
        'win_score':int(row['WScore']),
        'lose_id':row['LTeamID'],
        'lose_score':int(row['LScore']),
        'win_loc':row['WLoc'],
        'num_ot':row['NumOT']
    }
    for (idx,row) in season_results_df.iterrows()
]
# Without this check an unknown year surfaces as the opaque
# "min() arg is an empty sequence" error on the next line.
if not games:
    raise ValueError('No regular season games found for season {}'.format(year))
min_day = min(game['day'] for game in games)
max_day = max(game['day'] for game in games)

# Load the tournament results data - for scoring purposes.
# Each entry is a (winning team ID, losing team ID) pair for the chosen year.
tourney_results_df = pd.read_csv('DataFiles/NCAATourneyCompactResults.csv')
season_tourney_df = tourney_results_df[tourney_results_df['Season'] == year]
tourney_results = [
    (row['WTeamID'], row['LTeamID'])
    for (idx,row) in season_tourney_df.iterrows()
]

In [None]:
## You'll need to execute this each time you try a different year -
## including running the final version for 2018!

# Count how many regular-season games each team played. Every team ID that
# appears in `games` (as winner or loser) gets an entry.
team_dict = {}
for game in games:
    for team_id in (game['win_id'], game['lose_id']):
        if team_id in team_dict:
            team_dict[team_id]['num_games'] += 1
        else:
            team_dict[team_id] = {'num_games': 1}

# NOTE: despite the name, this is the number of distinct teams, not games.
# The name is kept unchanged in case other cells refer to it.
n_games = len(team_dict)

# Build the ID -> name lookup once, instead of rebuilding and scanning a list
# of every team ID for each team. setdefault keeps the first record for an ID,
# which is the one list.index() would have found.
team_name_by_id = {}
for team_record in teams:
    team_name_by_id.setdefault(team_record['team_id'], team_record['team_name'])

# Give each team its row/column position in the game matrix (following
# team_dict's iteration order) and attach its display name.
for matrix_idx, team in enumerate(team_dict):
    if team not in team_name_by_id:
        raise ValueError('Team ID {} is not in the teams list'.format(team))
    team_dict[team]['matrix_idx'] = matrix_idx
    team_dict[team]['name'] = team_name_by_id[team]

# Inverse mapping: matrix position -> team ID.
reverse_team_dict = {info['matrix_idx']: team for team, info in team_dict.items()}

In [None]:
## You'll need to rerun this every time you perform optimization for a given year.

def scale(s,x):
    """Reshape a value x in [0, 1] with a power curve mirrored around 0.5.

    Values at or below 0.5 map to 0.5*(2x)**(1/s); values above 0.5 use the
    mirrored formula, so scale(s, 1-x) == 1 - scale(s, x) up to rounding.
    The points 0, 0.5 and 1 are left fixed. With s < 1 the exponent 1/s
    exceeds 1, which pushes values away from 0.5 toward 0 or 1.
    """
    if x<=0.5:
        base, mirrored = 2*x, False
    else:
        base, mirrored = 2*(1-x), True
    half_power = 0.5*base**(1/s)
    return 1-half_power if mirrored else half_power
def run_trial(parameters, extra_info=False):
    """Score one parameter choice against the season's tournament results.

    Builds a weighted game matrix from the module-level ``games`` and
    ``team_dict``, uses the eigenvector of its largest-magnitude eigenvalue as
    the team strengths, and returns the mean negative log of the probability
    the model gave to each actual winner in ``tourney_results`` (lower is
    better).

    Parameters
    ----------
    parameters : sequence of at least 6 numbers
        ``[aw, sw, ww, p, day_weight, s]`` where
        aw -- multiplier on the winner's credit when ``win_loc`` is 'A' and on
              the loser's credit when ``win_loc`` is 'H' (presumably the
              visiting team in both cases);
        sw -- weight of the share-of-total-points term;
        ww -- weight of the flat bonus credited to the winner;
        p  -- exponent applied to a team's game count, which then divides
              that team's credit;
        day_weight -- weight of a game played on ``min_day``; the weight rises
              linearly to 1 for a game played on ``max_day``;
        s  -- shape parameter passed to ``scale``.
    extra_info : bool, optional
        When truthy, also return a dict of the intermediate results.

    Returns
    -------
    float, or (float, dict) when ``extra_info`` is truthy
        The dict has keys 'ranking_vector' (strength per matrix index),
        'order' (matrix indices, strongest first), 'p_function'
        (``prob1beats2(team1, team2)``) and 'ranking' (team names, strongest
        first).

    Raises
    ------
    ValueError
        If ``tourney_results`` is empty, since there is nothing to score.
    """
    if not tourney_results:
        raise ValueError('tourney_results is empty; there is nothing to score against')

    aw = parameters[0]
    sw = parameters[1]
    ww = parameters[2]
    p = parameters[3]
    first_day_weight = parameters[4]
    s = parameters[5]
    day_span = max_day - min_day

    def dw(day):
        # This closure must read a name that the game loop never rebinds.
        # Previously the loop stored each game's weight back into the same
        # variable read here, so after the first game the configured starting
        # weight was lost and the weights compounded toward 1.
        if day_span == 0:
            # Every game is on the same day: use a uniform weight. Rescaling
            # the whole matrix by a constant does not change its eigenvectors.
            return 1.0
        return first_day_weight + (1-first_day_weight)*(day-min_day)/day_span

    n_teams = len(team_dict)
    M = dok_matrix((n_teams, n_teams))
    for game in games:
        game_weight = dw(game['day'])
        win_team = game['win_id']
        w_num_games = team_dict[win_team]['num_games']**p
        win_score = int(game['win_score'])
        win_index = team_dict[win_team]['matrix_idx']
        lose_team = game['lose_id']
        l_num_games = team_dict[lose_team]['num_games']**p
        lose_score = int(game['lose_score'])
        lose_index = team_dict[lose_team]['matrix_idx']
        total_score = win_score+lose_score
        if game['win_loc'] == 'H':
            whw = 1
            lhw = aw
        elif game['win_loc'] == 'A':
            whw = aw
            lhw = 1
        else:
            whw = 1
            lhw = 1
        # Row = team receiving credit, column = its opponent. The winner gets
        # a flat bonus plus its share of the points; the loser only its share.
        M[win_index,lose_index] = M[win_index,lose_index] + whw*game_weight*ww/w_num_games + \
            (whw*game_weight*sw*win_score/total_score)/w_num_games
        M[lose_index,win_index] = M[lose_index,win_index] +  \
            (lhw*game_weight*sw*lose_score/total_score)/l_num_games

    value, vector = eigs(M, which = 'LM', k=1)
    # eigs returns a complex column vector; keep the real part and take
    # absolute values so every strength is non-negative.
    vector = abs(np.ndarray.flatten(vector.real))
    order = list(vector.argsort())
    order.reverse()

    def prob1beats2(team1,team2):
        strength1 = vector[team_dict[team1]['matrix_idx']]
        strength2 = vector[team_dict[team2]['matrix_idx']]
        return scale(s, strength1/(strength1+strength2))

    # Each result is (winner, loser): average the log-probability the model
    # assigned to what actually happened, negated so that lower is better.
    total = 0
    for result in tourney_results:
        total = total + np.log(prob1beats2(result[0],result[1]))
    score = -total/len(tourney_results)

    if extra_info:
        extra_info_dict = {
            'ranking_vector': vector,
            'order': order,
            'p_function': prob1beats2,
            'ranking': [team_dict[reverse_team_dict[k]]['name'] for k in order]
        }
        return score, extra_info_dict
    return score

In [None]:
## Here's where we actually find parameters to optimize the procedure!!

# Starting guess, listed in the order run_trial reads its parameters.
aw = 1.3
sw = 1
ww = 1.5
p = 1
dw = 0.75
s = 0.1
initial_guess = [aw, sw, ww, p, dw, s]

# One (lower, upper) pair per parameter; None leaves that side unbounded.
parameter_bounds = [(1,None), (0.1, None), (0, None), (0,None), (0,1), (0.01,1)]

# Time the search; the elapsed seconds are shown as the cell's output.
t = time()
min_result = minimize(run_trial, initial_guess, bounds=parameter_bounds)
time()-t

In [None]:
## This is the important output that you want from the optimization!!
## It displays the object returned by `minimize`; its 'x' entry holds the
## optimized parameter values that are passed back into run_trial.

min_result

In [None]:
## This doesn't need to be run, but examining the top 10 might
## help ensure that your results are sensible

# Re-run the model at the optimum, this time asking for the ranking details.
best_parameters = min_result['x']
result, info = run_trial(best_parameters, extra_info = True)
info['ranking'][:10]

In [None]:
# Collect the seed number of every team seeded in the chosen year.
seeds_df = pd.read_csv('DataFiles/NCAATourneySeeds.csv')
seed_dict = {}
for _, seed_row in seeds_df.iterrows():
    if seed_row['Season'] != year:
        continue
    # Characters 1-2 of the seed string are read as the numeric seed.
    seed_dict[seed_row['TeamID']] = int(seed_row['Seed'][1:3])

# Every unordered pair of seeded teams, lower team ID first.
teams_in = sorted(seed_dict)
pairs = []
for position, first_team in enumerate(teams_in):
    for second_team in teams_in[position+1:]:
        pairs.append((first_team, second_team))

In [None]:
def scale(s,x):
    """Apply a power curve, mirrored around 0.5, to a value x in [0, 1].

    The lower half uses 0.5*(2x)**(1/s) and the upper half uses the
    reflection 1 - 0.5*(2(1-x))**(1/s), so 0, 0.5 and 1 stay fixed.
    """
    in_lower_half = x<=0.5
    if in_lower_half:
        return 0.5*(2*x)**(1/s)
    return 1-0.5*(2*(1-x))**(1/s)
def run_it(parameters):
    """Rank every team for a fixed parameter choice.

    Builds the weighted game matrix from the module-level ``games`` and
    ``team_dict`` and uses the eigenvector of its largest-magnitude eigenvalue
    as the team strengths. No tournament results are needed.

    Side effect: stores each team's strength under the 'rating' key of its
    entry in the module-level ``team_dict``.

    Parameters
    ----------
    parameters : sequence of at least 6 numbers
        ``[aw, sw, ww, p, day_weight, s]`` where
        aw -- multiplier on the winner's credit when ``win_loc`` is 'A' and on
              the loser's credit when ``win_loc`` is 'H';
        sw -- weight of the share-of-total-points term;
        ww -- weight of the flat bonus credited to the winner;
        p  -- exponent applied to a team's game count, which then divides
              that team's credit;
        day_weight -- weight of a game played on ``min_day``; the weight rises
              linearly to 1 for a game played on ``max_day``;
        s  -- shape parameter passed to ``scale``.

    Returns
    -------
    dict
        'ranking_vector' (strength per matrix index), 'order' (matrix indices,
        strongest first), 'p_function' (``prob1beats2(team1, team2)``) and
        'rankings' (the ``team_dict`` entries, strongest first).
    """
    aw = parameters[0]
    sw = parameters[1]
    ww = parameters[2]
    p = parameters[3]
    first_day_weight = parameters[4]
    s = parameters[5]
    day_span = max_day - min_day

    def dw(day):
        # This closure must read a name that the game loop never rebinds.
        # Previously the loop stored each game's weight back into the same
        # variable read here, so after the first game the configured starting
        # weight was lost and the weights compounded toward 1.
        if day_span == 0:
            # Every game is on the same day: use a uniform weight. Rescaling
            # the whole matrix by a constant does not change its eigenvectors.
            return 1.0
        return first_day_weight + (1-first_day_weight)*(day-min_day)/day_span

    n_teams = len(team_dict)
    M = dok_matrix((n_teams, n_teams))
    for game in games:
        game_weight = dw(game['day'])
        win_team = game['win_id']
        w_num_games = team_dict[win_team]['num_games']**p
        win_score = int(game['win_score'])
        win_index = team_dict[win_team]['matrix_idx']
        lose_team = game['lose_id']
        l_num_games = team_dict[lose_team]['num_games']**p
        lose_score = int(game['lose_score'])
        lose_index = team_dict[lose_team]['matrix_idx']
        total_score = win_score+lose_score
        if game['win_loc'] == 'H':
            whw = 1
            lhw = aw
        elif game['win_loc'] == 'A':
            whw = aw
            lhw = 1
        else:
            whw = 1
            lhw = 1
        # Row = team receiving credit, column = its opponent. The winner gets
        # a flat bonus plus its share of the points; the loser only its share.
        M[win_index,lose_index] = M[win_index,lose_index] + whw*game_weight*ww/w_num_games + \
            (whw*game_weight*sw*win_score/total_score)/w_num_games
        M[lose_index,win_index] = M[lose_index,win_index] +  \
            (lhw*game_weight*sw*lose_score/total_score)/l_num_games

    value, vector = eigs(M, which = 'LM', k=1)
    # eigs returns a complex column vector; keep the real part and take
    # absolute values so every strength is non-negative.
    vector = abs(np.ndarray.flatten(vector.real))
    order = list(vector.argsort())
    order.reverse()

    def prob1beats2(team1,team2):
        strength1 = vector[team_dict[team1]['matrix_idx']]
        strength2 = vector[team_dict[team2]['matrix_idx']]
        return scale(s, strength1/(strength1+strength2))

    # Strongest-first list of the team_dict entries; record each team's
    # strength on its entry by indexing the vector directly, rather than
    # re-sorting the whole vector on every iteration.
    rankings = [team_dict[reverse_team_dict[k]] for k in order]
    for k, team_info in zip(order, rankings):
        team_info['rating'] = vector[k]

    return {
        'ranking_vector': vector,
        'order': order,
        'p_function': prob1beats2,
        'rankings': rankings
    }

In [None]:
# Fixed parameter values, in the order run_it reads them:
# [aw, sw, ww, p, day_weight, s].
tuned_parameters = [1.0, 2.77510649, 0.0, 0.81164999, 1.0, 0.03604771]
attempt = run_it(tuned_parameters)

# Names of the ten strongest teams, as a quick sanity check.
[entry['name'] for entry in attempt['rankings'][:10]]

In [None]:
# Write one "<year>_<team1>_<team2>,<probability team1 beats team2>" line per
# pair of seeded teams, under an "id,pred" header.
p_fun = attempt['p_function']
# The context manager closes the file even if a prediction raises part-way
# through, which the earlier explicit close() call could not guarantee.
with open('submit1.csv', 'w') as file_handle:
    file_handle.write("id,pred\n")
    for pair in pairs:
        line = str(year) + "_" + str(pair[0]) + "_" + str(pair[1]) + ","
        # Kept in its own name so the global `p` is not overwritten with a string.
        prediction = p_fun(pair[0], pair[1])
        line = line + str(prediction) + "\n"
        file_handle.write(line)

In [None]:
# The sixteen team names to send to the bracket visualization. Each name is
# looked up by exact match among the ranked teams' 'name' values (which come
# from the TeamName column of Teams.csv), so the spelling must match.
# NOTE(review): presumably listed in bracket order, two per line as
# opponents - confirm against the visualization page.
sweet_sixteen = [
 'Villanova','West Virginia',
 'Texas Tech','Purdue',
 'Kansas','Clemson',
 'Syracuse','Duke',
 'Kansas St','Kentucky',
 'Loyola-Chicago','Nevada',
 'Florida St','Gonzaga',
 'Michigan','Texas A&M'
]

In [None]:
# Ranked team entries (strongest first) and their names in the same order.
ranked_teams = attempt['rankings']
team_names = [entry['name'] for entry in ranked_teams]
def get_rating(t):
    """Return the 'rating' stored for the team whose name is exactly `t`.

    Raises ValueError (from list.index) when no ranked team has that name.
    """
    position = team_names.index(t)
    return ranked_teams[position]['rating']

# Build the "?team=...&rating=...&team=...&rating=..." query string.
# Values are percent-encoded first: 'Texas A&M' contains '&', which would
# otherwise be read as a parameter separator and cut the name short.
# safe='' makes quote() encode every reserved character, and spaces become
# %20 rather than '+'.
from urllib.parse import quote

query_parts = []
for team_name in sweet_sixteen:
    query_parts.append('team=' + quote(team_name, safe=''))
    query_parts.append('rating=' + quote(str(get_rating(team_name)), safe=''))
q = "?" + "&".join(query_parts)
q

In [None]:
import webbrowser

# Open the bracket visualization page with the query string `q` appended.
bracket_url = 'https://marksmath.org/visualization/eigenbrackets/tourney_from_rankings.html' + q
webbrowser.open(bracket_url)

My identifier for NumericalBracket is half_descent.

Our goal is to predict who will win or lose each game of the Sweet Sixteen basketball competition by minimizing the LogLoss function that Kaggle uses to rank submissions. Python code is used to find the parameters that optimize the score, and we then use those parameters to make predictions with the lowest LogLoss score.