In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

In [2]:
K = 20.
HOME_ADVANTAGE = 100.

In [3]:
rs = pd.read_csv("data/RegularSeasonCompactResults.csv")
rs.head(3)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT
0,1985,20,1228,81,1328,64,N,0
1,1985,25,1106,77,1354,70,H,0
2,1985,25,1112,63,1223,56,H,0


In [4]:
team_ids = set(rs.WTeamID).union(set(rs.LTeamID))
len(team_ids)

364

In [5]:
# This dictionary will be used as a lookup for current
# scores while the algorithm is iterating through each game
elo_dict = dict(zip(list(team_ids), [1500] * len(team_ids)))

In [6]:
# New columns to help us iteratively update elos
rs['margin'] = rs.WScore - rs.LScore
rs['w_elo'] = None
rs['l_elo'] = None

In [7]:
def elo_pred(elo1, elo2):
    return(1. / (10. ** (-(elo1 - elo2) / 400.) + 1.))

def expected_margin(elo_diff):
    return((7.5 + 0.006 * elo_diff))

def elo_update(w_elo, l_elo, margin):
    elo_diff = w_elo - l_elo
    pred = elo_pred(w_elo, l_elo)
    mult = ((margin + 3.) ** 0.8) / expected_margin(elo_diff)
    update = K * mult * (1 - pred)
    return(pred, update)

In [8]:
# I'm going to iterate over the games dataframe using 
# index numbers, so want to check that nothing is out
# of order before I do that.
assert np.all(rs.index.values == np.array(range(rs.shape[0]))), "Index is out of order."

In [9]:
preds = []

# Loop over all rows of the games dataframe
for i in range(rs.shape[0]):
    
    # Get key data from current row
    w = rs.at[i, 'WTeamID']
    l = rs.at[i, 'LTeamID']
    margin = rs.at[i, 'margin']
    wloc = rs.at[i, 'WLoc']
    
    # Does either team get a home-court advantage?
    w_ad, l_ad, = 0., 0.
    if wloc == "H":
        w_ad += HOME_ADVANTAGE
    elif wloc == "A":
        l_ad += HOME_ADVANTAGE
    
    # Get elo updates as a result of the game
    pred, update = elo_update(elo_dict[w] + w_ad,
                              elo_dict[l] + l_ad, 
                              margin)
    elo_dict[w] += update
    elo_dict[l] -= update
    preds.append(pred)

    # Stores new elos in the games dataframe
    rs.loc[i, 'w_elo'] = elo_dict[w]
    rs.loc[i, 'l_elo'] = elo_dict[l]

In [10]:
rs.tail(10)

Unnamed: 0,Season,DayNum,WTeamID,WScore,LTeamID,LScore,WLoc,NumOT,margin,w_elo,l_elo
150674,2017,131,1436,56,1107,53,H,0,3,1652.42,1542.46
150675,2017,131,1437,74,1166,60,N,0,14,2097.4,1864.51
150676,2017,131,1458,76,1321,48,N,0,28,1953.23,1770.25
150677,2017,131,1463,73,1217,71,N,0,2,1591.42,1581.21
150678,2017,132,1246,82,1116,65,N,0,17,2070.61,1809.06
150679,2017,132,1276,71,1458,56,N,0,15,1904.71,1936.38
150680,2017,132,1343,71,1463,59,N,0,12,1710.38,1584.06
150681,2017,132,1348,70,1433,63,N,0,7,1790.62,1841.22
150682,2017,132,1374,71,1153,56,N,0,15,1965.93,1906.94
150683,2017,132,1407,59,1402,53,N,0,6,1447.51,1449.7


In [11]:
def final_elo_per_season(df, team_id):
    d = df.copy()
    d = d.loc[(d.WTeamID == team_id) | (d.LTeamID == team_id), :]
    d.sort_values(['Season', 'DayNum'], inplace=True)
    d.drop_duplicates(['Season'], keep='last', inplace=True)
    w_mask = d.WTeamID == team_id
    l_mask = d.LTeamID == team_id
    d['season_elo'] = None
    d.loc[w_mask, 'season_elo'] = d.loc[w_mask, 'w_elo']
    d.loc[l_mask, 'season_elo'] = d.loc[l_mask, 'l_elo']
    out = pd.DataFrame({
        'team_id': team_id,
        'season': d.Season,
        'season_elo': d.season_elo
    })
    return(out)

In [12]:
df_list = [final_elo_per_season(rs, i) for i in team_ids]
season_elos = pd.concat(df_list)

In [13]:
season_elos.sample(10)

Unnamed: 0,season,season_elo,team_id
134471,2014,1809.85,1234
134538,2014,1401.71,1169
123846,2012,1747.61,1268
64642,2000,1595.26,1103
134486,2014,1432.47,1408
11147,1987,1190.19,1159
39526,1994,1380.95,1407
128979,2013,1305.66,1237
123330,2012,1039.76,1136
47815,1996,1354.42,1168


In [14]:
season_elos.to_csv("season_elos.csv", index=None)

--------

--------


In [16]:
# TrueSkill is a rating system based on Bayesian inference, estimating each players skill as a gaussian like Elo rating.
# See trueskill.org for more.

import pandas as pd, numpy as np
from trueskill import TrueSkill, Rating, rate_1vs1

ts = TrueSkill(draw_probability=0.01) # 0.01 is arbitary small number
beta = 25 / 6  # default value

def win_probability(p1, p2):
    delta_mu = p1.mu - p2.mu
    sum_sigma = p1.sigma * p1.sigma + p2.sigma * p2.sigma
    denom = np.sqrt(2 * (beta * beta) + sum_sigma)
    return ts.cdf(delta_mu / denom)
    
submit = pd.read_csv('data/SampleSubmissionStage1.csv')
submit[['Season', 'Team1', 'Team2']] = submit.apply(lambda r:pd.Series([int(t) for t in r.ID.split('_')]), axis=1)

df_tour = pd.read_csv('data/RegularSeasonCompactResults.csv')
teamIds = np.unique(np.concatenate([df_tour.WTeamID.values, df_tour.LTeamID.values]))
ratings = { tid:ts.Rating() for tid in teamIds }

def feed_season_results(season):
    print("season = {}".format(season))
    df1 = df_tour[df_tour.Season == season]
    for r in df1.itertuples():
        ratings[r.WTeamID], ratings[r.LTeamID] = rate_1vs1(ratings[r.WTeamID], ratings[r.LTeamID])

def update_pred(season):
    beta = np.std([r.mu for r in ratings.values()]) 
    print("beta = {}".format(beta))
    submit.loc[submit.Season==season, 'Pred'] = submit[submit.Season==season].apply(lambda r:win_probability(ratings[r.Team1], ratings[r.Team2]), axis=1)

for season in sorted(df_tour.Season.unique())[:-4]: # exclude last 4 years
    feed_season_results(season)

update_pred(2014)
feed_season_results(2014)
update_pred(2015)
feed_season_results(2015)
update_pred(2016)
feed_season_results(2016)
update_pred(2017)

submit.drop(['Season', 'Team1', 'Team2'], axis=1, inplace=True)
submit.to_csv('trueskill_estimation.csv', index=None)

season = 1985
season = 1986
season = 1987
season = 1988
season = 1989
season = 1990
season = 1991
season = 1992
season = 1993
season = 1994
season = 1995
season = 1996
season = 1997
season = 1998
season = 1999
season = 2000
season = 2001
season = 2002
season = 2003
season = 2004
season = 2005
season = 2006
season = 2007
season = 2008
season = 2009
season = 2010
season = 2011
season = 2012
season = 2013
beta = 5.173598443091853
season = 2014
beta = 5.220324159361539
season = 2015
beta = 5.175775424826819
season = 2016
beta = 5.158745585314824
