This notebook computes an Elo rating for each team in each year, treating every team-year as a separate team. The Elo rating gives a baseline win rate for each team: the expected odds of a team beating an opponent are $10^{(\text{myRate} - \text{oppRate}) / 400} : 1$.

In [82]:
import numpy as np
import pandas as pd
import itertools as itr
from tqdm import tqdm
import random
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
# Load the game log, parse the raw YYYYMMDD `game_date` field into a proper
# datetime column called `date`, and keep only the columns this notebook uses.
df = (
    pd.read_csv('.data/datathon_2024_dataset_corrected.csv')
    .assign(date=lambda games: pd.to_datetime(games["game_date"], format='%Y%m%d'))
    .loc[:, ['date', 'home_team', 'away_team', 'home_score', 'away_score', 'venue_name']]
)
print(df.head())

        date home_team away_team  home_score  away_score        venue_name
0 2000-03-29       NYN       CHN           3           5        Tokyo Dome
1 2000-03-30       CHN       NYN           1           5        Tokyo Dome
2 2000-04-03       ATL       COL           2           0      Turner Field
3 2000-04-03       CIN       MIL           3           3     Cinergy Field
4 2000-04-03       FLO       SFN           6           4  Sun Life Stadium


In [61]:
# Make list of team names, where each year is considered a different team.
# Names have form [3CHARNAME][YEAR] and are listed team-by-team, in order of
# each team's first home game.
team_names = [
    f'{team}{yr}'
    for team in df['home_team'].unique()
    for yr in df.loc[df['home_team'] == team, 'date'].dt.year.unique()
]

# For each game in the given dataset, store the game data in a list as two 3-tuples
# (one from each side's point of view) with format:
# [0] team1; [1] team2; [2] team1's record, where 1 is a win, 0 is a loss, and 0.5 is a tie
record = []
# Walk the needed columns in lockstep instead of doing repeated df.loc scalar
# lookups per row, which is far slower on a frame of this size.
for home, away, game_date, home_score, away_score in zip(
    df['home_team'], df['away_team'], df['date'], df['home_score'], df['away_score']
):
    season = str(game_date.year)
    home_team = home + season
    away_team = away + season

    # Result from the home team's point of view
    if home_score == away_score:
        home_result = 0.5
    elif home_score > away_score:
        home_result = 1
    else:
        home_result = 0

    # The away team's result is the complement (a tie stays 0.5)
    record.append((home_team, away_team, home_result))
    record.append((away_team, home_team, 1 - home_result))

# A team-season that only ever appears as the away side is missing from the
# home-derived list above, and any later lookup keyed by team name would fail
# for it. Every participant shows up as element [0] of some record entry, so
# append any such names after the home-derived ones (dict.fromkeys keeps first-seen order).
team_names = list(dict.fromkeys(team_names + [game[0] for game in record]))

print(len(record))

113550


In [97]:
# Compute Elo ratings by replaying the entire baseball record many times and
# applying an Elo adjustment after every game.
rating = {team : 1000 for team in team_names}  # every team-season starts at 1000
learning_rate = 5  # size of each Elo adjustment (the "K-factor")

# Use a dedicated, seeded generator so the fit is reproducible, and shuffle a
# private copy so `record` keeps its original order: re-running this cell then
# always starts from the same state and yields the same ratings.
shuffler = random.Random(0)
games = list(record)

# Do 1000 epochs
for _ in tqdm(range(1000)):
    shuffler.shuffle(games)
    for team, opponent, outcome in games:

        # Stochastic gradient descent to compute Elo ratings: compare the actual
        # outcome with the win probability implied by the current rating gap
        # (400 rating points correspond to 10:1 odds).
        expected_outcome = 1 / (1 + 10 ** ((rating[opponent] - rating[team]) / 400))
        # Only `team` is updated here; the opponent's update comes from the
        # mirrored entry that `record` holds for the same game.
        rating[team] += learning_rate * (outcome - expected_outcome)

100%|█| 1000/1000 [01

{'NYN2000': 1038.0171879274176, 'NYN2001': 1005.5927962948316, 'NYN2002': 987.9410454812129, 'NYN2003': 966.3415351579171, 'NYN2004': 957.9563354986802, 'NYN2005': 995.9016099298883, 'NYN2006': 1032.7270517573481, 'NYN2007': 1033.2404137253002, 'NYN2008': 1014.7257759985199, 'NYN2009': 921.2328616072465, 'NYN2010': 988.5009087387754, 'NYN2011': 992.1197362430328, 'NYN2012': 989.6214228157154, 'NYN2013': 971.1448808374098, 'NYN2014': 972.8407178904542, 'NYN2015': 1017.9032851118099, 'NYN2016': 997.8333052788828, 'NYN2017': 946.3621408094305, 'NYN2018': 980.2387789625908, 'NYN2019': 1042.2784441586414, 'NYN2020': 951.3914208992927, 'NYN2021': 968.8221391385408, 'NYN2022': 1067.3549882785876, 'NYN2023': 982.3396681645046, 'CHN2000': 917.6788279071648, 'CHN2001': 1026.9292460377253, 'CHN2002': 927.8098475086097, 'CHN2003': 1029.857520724318, 'CHN2004': 1039.4450995696657, 'CHN2005': 977.7112716622862, 'CHN2006': 894.1197107639641, 'CHN2007': 1008.6358861098014, 'CHN2008': 1059.758743174999




In [98]:
# Reshape the {team-year name: rating} mapping into a year-by-team table
# (rows = years, columns = teams) ready to be written to csv.
seasons = df['date'].dt.year.unique()
franchises = df['home_team'].unique()
rating_df = pd.DataFrame(data=np.nan, index=seasons, columns=franchises)

for key, value in rating.items():
    # Keys look like "NYN2000": the first three characters are the team code,
    # the remainder is the year.
    rating_df.loc[int(key[3:]), key[:3]] = value

print(rating_df)

              NYN          CHN          ATL          CIN          FLO  \
2000  1038.017188   917.678828  1042.674724   997.844659   966.687400   
2001  1005.592796  1026.929246  1018.058941   924.181293   978.822684   
2002   987.941045   927.809848  1082.079387   977.270124   993.957073   
2003   966.341535  1029.857521  1099.373832   953.859508  1062.951834   
2004   957.956335  1039.445100  1051.894527   995.447553  1005.446032   
2005   995.901610   977.711272  1026.133140   945.295578   996.390545   
2006  1032.727052   894.119711   956.044331   937.502719   945.249210   
2007  1033.240414  1008.635886  1004.422622   938.766276   967.811389   
2008  1014.725776  1059.758743   953.251005   973.413114   997.702517   
2009   921.232862   986.900365   998.928771   961.851770  1006.640961   
2010   988.500909   969.631451  1035.256582  1020.457468   991.991370   
2011   992.119736   941.178291  1036.307378   969.924258   961.250943   
2012   989.621423   918.157847  1063.643518  1040.7

In [99]:
# Save the pivoted rating table (rows = years, columns = teams) to CSV.
rating_df.to_csv('.data/ratings.csv')