This notebook processes all the data in *datathon_2024_dataset_corrected.csv* into a dataset that relates the outcome of each game to the team's short-term travel history preceding the game.

In [157]:
import numpy as np
import pandas as pd
from geopy.distance import distance as geodist
from tqdm import tqdm
import itertools as itr

In [2]:
df = pd.read_csv('.data/datathon_2024_dataset_corrected.csv')
df["date"] = pd.to_datetime(df["game_date"], format='%Y%m%d')
df = df[['date', 'home_team', 'away_team', 'is_day_game', 'home_score', 'away_score', 'venue_name']]
print(df.head())

   Unnamed: 0  game_date home_team away_team  is_day_game  home_score  \
0           0   20000329       NYN       CHN        False           3   
1           1   20000330       CHN       NYN        False           1   
2           2   20000403       ATL       COL         True           2   
3           3   20000403       CIN       MIL         True           3   
4           4   20000403       FLO       SFN        False           6   

   away_score  venue        venue_name        city  ... home_pa  home_1b  \
0           5  TOK01        Tokyo Dome       Tokyo  ...      38        5   
1           5  TOK01        Tokyo Dome       Tokyo  ...      44        5   
2           0  ATL02      Turner Field     Atlanta  ...      32        5   
3           3  CIN08     Cinergy Field  Cincinnati  ...      20        3   
4           4  MIA01  Sun Life Stadium       Miami  ...      38        9   

   home_2b  home_3b  home_hr  home_fo  home_so  home_bb  home_hbp       date  
0        1        0      

In [129]:
venue_df = pd.read_csv('.data/venue.csv', index_col='venue_name')
print(venue_df.head())

                     lat     long
venue_name                       
Tokyo Dome        35.705  139.752
Turner Field      33.735  -84.389
Cinergy Field     39.548  -84.303
Sun Life Stadium  25.959  -80.240
Stade Olympique   45.558  -73.552


In [4]:
years = df['date'].dt.year.unique()
years.sort()
print(years)

[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023]


In [5]:
teams = df['home_team'].unique()
teams.sort()
print(teams)

['ANA' 'ARI' 'ATL' 'BAL' 'BOS' 'CHA' 'CHN' 'CIN' 'CLE' 'COL' 'DET' 'FLO'
 'HOU' 'KCA' 'LAN' 'MIA' 'MIL' 'MIN' 'MON' 'NYA' 'NYN' 'OAK' 'PHI' 'PIT'
 'SDN' 'SEA' 'SFN' 'SLN' 'TBA' 'TEX' 'TOR' 'WAS']


In [9]:
opening_day = {yr : df['date'][df['date'].dt.year == yr].min() for yr in years}

df['day'] = 0
for ind in tqdm(df.index):
    df.loc[ind, 'day'] = (df.loc[ind, 'date'] - opening_day[df.loc[ind, 'date'].year]).days

print(df.head())

100%|█| 56775/56775 [

        date home_team away_team  is_day_game  home_score  away_score  \
0 2000-03-29       NYN       CHN        False           3           5   
1 2000-03-30       CHN       NYN        False           1           5   
2 2000-04-03       ATL       COL         True           2           0   
3 2000-04-03       CIN       MIL         True           3           3   
4 2000-04-03       FLO       SFN        False           6           4   

         venue_name  day  
0        Tokyo Dome    0  
1        Tokyo Dome    1  
2      Turner Field    5  
3     Cinergy Field    5  
4  Sun Life Stadium    5  





In [95]:
homes = pd.DataFrame(data="", index=years, columns=teams)
for ind in tqdm(df.index):
    home_team = df.loc[ind, 'home_team']
    yr = df.loc[ind, 'date'].year
    venue = df.loc[ind, 'venue_name']
    homes.loc[yr, home_team] = venue
print(homes)

100%|█| 56775/56775 [

                           ANA          ARI           ATL  \
2000  Angel Stadium of Anaheim  Chase Field  Turner Field   
2001  Angel Stadium of Anaheim  Chase Field  Turner Field   
2002  Angel Stadium of Anaheim  Chase Field  Turner Field   
2003  Angel Stadium of Anaheim  Chase Field  Turner Field   
2004  Angel Stadium of Anaheim  Chase Field  Turner Field   
2005  Angel Stadium of Anaheim  Chase Field  Turner Field   
2006  Angel Stadium of Anaheim  Chase Field  Turner Field   
2007  Angel Stadium of Anaheim  Chase Field  Turner Field   
2008  Angel Stadium of Anaheim  Chase Field  Turner Field   
2009  Angel Stadium of Anaheim  Chase Field  Turner Field   
2010  Angel Stadium of Anaheim  Chase Field  Turner Field   
2011  Angel Stadium of Anaheim  Chase Field  Turner Field   
2012  Angel Stadium of Anaheim  Chase Field  Turner Field   
2013  Angel Stadium of Anaheim  Chase Field  Turner Field   
2014  Angel Stadium of Anaheim  Chase Field  Turner Field   
2015  Angel Stadium of A




In [115]:
days = list(np.arange(0, df['day'].max()+1, 1))
print(days)

extended_days = [-1] + list(days) + [days[-1]+1]
print(extended_days)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193]
[-1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 

In [183]:
location = {team:
                {year:
                    {day:
                         {'home': 0,
                          'away': 0,
                          'travel': 0,
                          'miles': 0,
                          'venue': None,
                          'won': 0,
                          'odds': -1}
                     for day in extended_days
                    }
                for year in years
                }
            for team in teams
            }

for team in teams:
    for yr in years:
        home_venue = homes.loc[yr, team]
        if home_venue is None:
            print(team, yr)
            del location[team][yr]
        else:
            location[team][yr][-1]['venue'] = home_venue
            location[team][yr][days[-1]+1]['venue'] = home_venue

In [184]:
for ind in tqdm(df.index):
    home_team = df.loc[ind, 'home_team']
    away_team = df.loc[ind, 'away_team']
    home_score = df.loc[ind, 'home_score']
    away_score = df.loc[ind, 'away_score']
    yr = df.loc[ind, 'date'].year
    day = df.loc[ind, 'day']
    venue = df.loc[ind, 'venue_name']
    location[home_team][yr][day]['home'] = 1
    location[away_team][yr][day]['away'] = 1    
    location[home_team][yr][day]['venue'] = venue
    location[away_team][yr][day]['venue'] = venue
    if home_score == away_score:
        location[home_team][yr][day]['won'] = 0.5
        location[away_team][yr][day]['won'] = 0.5
    elif home_score > away_score:
        location[home_team][yr][day]['won'] = 1
    else:
        location[away_team][yr][day]['won'] = 1

100%|█| 56775/56775 [


In [185]:
for team in teams:
    for yr in years:
        for day in reversed(days):
            if location[team][yr][day]['venue'] is None:
                location[team][yr][day]['venue'] = location[team][yr][day+1]['venue']
        for day in extended_days[1:]:
            yesterday_venue = location[team][yr][day-1]['venue']
            today_venue = location[team][yr][day]['venue']
            if yesterday_venue != today_venue:
                location[team][yr][day]['travel'] = 1
                location[team][yr][day]['miles'] = geodist(venue_df.loc[yesterday_venue].tolist(), venue_df.loc[today_venue].tolist()).miles
        if sum([location[team][yr][dummy_day]['won'] for dummy_day in days]) == 0:
            del location[team][yr]

In [221]:
MAX_DELAY = 14

var_names = ['win', 'baseline_odds'] + [f'{var}{delay}' for var, delay in itr.product(['home', 'away', 'travel', 'miles'], range(MAX_DELAY))]

ind_names = []
for team in teams:
    for yr in location[team].keys():
        for day in days:
            ind_names.append(f'{team}{yr}{day}')

In [222]:
ml_dict = {ind: {} for ind in ind_names}
for team in tqdm(teams):
    for yr in location[team].keys():
        for day in days:
            ind = f'{team}{yr}{day}'
            ml_dict[ind]['won'] = location[team][yr][day]['won']
            ml_dict[ind]['baseline_odds'] = location[team][yr][day]['odds']
            for delay_day in range(day, min(day + MAX_DELAY, days[-1]+1)):
                delay_ind = f'{team}{yr}{delay_day}'
                for var in ['home', 'away', 'travel', 'miles']:
                    var_name = f'{var}{delay_day-day}'
                    ml_dict[delay_ind][var_name] = location[team][yr][day][var]
ml_df = pd.DataFrame(ml_dict).transpose()
ml_df = ml_df[ml_df['won'].notna()].dropna()
print(ml_df.iloc[:20, ])
print(ml_df.shape)

100%|█| 32/32 [00:04<


           won  baseline_odds  home0  away0  travel0       miles0  home1  \
ANA200013  1.0           -1.0    1.0    0.0      0.0     0.000000    1.0   
ANA200014  0.0           -1.0    1.0    0.0      0.0     0.000000    1.0   
ANA200015  0.0           -1.0    0.0    0.0      1.0  1734.576143    1.0   
ANA200016  0.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200017  1.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200018  1.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200019  0.0           -1.0    0.0    1.0      1.0   441.403955    0.0   
ANA200020  1.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200021  0.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200022  0.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200023  1.0           -1.0    0.0    1.0      1.0  1093.319254    0.0   
ANA200024  0.0           -1.0    0.0    1.0      0.0     0.000000    0.0   
ANA200025  0

In [223]:
ml_df.to_csv('.data/logregression_data.csv')