In [46]:
# imports 
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import seaborn as sns
from scipy.stats import ttest_ind
import datetime

from dataclasses import dataclass
from math import radians, cos, sin, asin, sqrt
# from ydata_profiling import ProfileReport

In [47]:
# Show every column when a wide frame is displayed instead of eliding the middle ones.
pd.options.display.max_columns = None

In [48]:
# Path to the raw game-level CSV, relative to the notebook's directory
filepath = "./../data/raw/datathon_2024_dataset.csv"
astro_df = pd.read_csv(filepath, index_col=0)

# game_date is stored as a YYYYMMDD value. Parse it in one vectorised call with an
# explicit format so no per-row string slicing is needed and a malformed value
# raises instead of being silently reinterpreted.
astro_df["game_date"] = pd.to_datetime(astro_df["game_date"].astype(str), format="%Y%m%d")
astro_df["game_date"].dtypes

dtype('<M8[ns]')

In [49]:
# Accumulators for the per-game features built by the loop in this cell.
# Each list receives one entry per row of astro_df, in row order, and is then
# attached to the frame as a column:
#   homeOPS / awayOPS                                 -> home_ops / away_ops
#   homeConsecutiveRestDays / awayConsecutiveRestDays -> home_restdays / away_restdays
homeOPS, awayOPS = [], []
homeConsecutiveRestDays, awayConsecutiveRestDays = [], []

# Date of the most recent game seen so far, keyed by team code
teamLastGameDate = dict()
def calculateOPS(prefix, row):
    """Return an on-base-plus-slugging figure for one side of a game.

    Args:
        prefix: Column prefix selecting the side, e.g. "home_" or "away_".
        row: Mapping (DataFrame row or dict) holding the counting stats
            "<prefix>1b", "<prefix>2b", "<prefix>3b", "<prefix>hr",
            "<prefix>bb", "<prefix>hbp" and "<prefix>pa".

    Returns:
        (hits + bb + hbp) / pa  +  total_bases / (pa - bb - hbp)

    Note:
        The on-base term divides by plate appearances, and the slugging term
        uses pa - bb - hbp as its denominator (presumably a stand-in for
        at-bats, which the row does not carry). No guard is applied when either
        denominator is zero.
    """
    def stat(name):
        return row[prefix + name]

    singles, doubles, triples, homers = stat("1b"), stat("2b"), stat("3b"), stat("hr")
    walks, hit_by_pitch, plate_apps = stat("bb"), stat("hbp"), stat("pa")

    hits = singles + doubles + triples + homers
    on_base = (hits + walks + hit_by_pitch) / plate_apps

    total_bases = singles + doubles * 2 + triples * 3 + homers * 4
    slugging = total_bases / (plate_apps - walks - hit_by_pitch)

    return on_base + slugging

def _rest_days_before(team, game_date):
    """Return the idle days `team` had before `game_date`, then record the game.

    A team's first appearance, a game on the day after its previous game and a
    second game on the same day all count as 0 rest days; one skipped calendar
    day counts as 1, two as 2, and so on. The previous version reported the raw
    day gap for anything other than back-to-back days, so a single day off was
    recorded as 2.

    Reads and updates the module-level `teamLastGameDate` mapping.
    NOTE(review): assumes astro_df rows are in chronological order - confirm.
    The mapping is never reset, so the first game after a long break (e.g. a
    new season) carries the whole gap as rest days.
    """
    last_game = teamLastGameDate.get(team)
    teamLastGameDate[team] = game_date
    if last_game is None:
        return 0
    # Clamp at 0 so same-day games (gap of 0 days) do not produce -1.
    return max((game_date - last_game).days - 1, 0)


for index, row in astro_df.iterrows():
    # OPS for both sides of the game
    homeOPS.append(calculateOPS("home_", row))
    awayOPS.append(calculateOPS("away_", row))

    # Consecutive rest days for both sides of the game
    homeConsecutiveRestDays.append(_rest_days_before(row['home_team'], row['game_date']))
    awayConsecutiveRestDays.append(_rest_days_before(row['away_team'], row['game_date']))

astro_df['home_ops'] = homeOPS
astro_df['away_ops'] = awayOPS
astro_df['home_restdays'] = homeConsecutiveRestDays
astro_df['away_restdays'] = awayConsecutiveRestDays
astro_df

Unnamed: 0,game_date,home_team,away_team,is_day_game,home_score,away_score,venue,venue_name,city,state,home_pa,home_1b,home_2b,home_3b,home_hr,home_fo,home_so,home_bb,home_hbp,away_pa,away_1b,away_2b,away_3b,away_hr,away_fo,away_so,away_bb,away_hbp,home_ops,away_ops,home_restdays,away_restdays
0,2000-03-29,NYN,CHN,False,3,5,TOK01,Tokyo Dome,Tokyo,JAP,45,9,1,0,2,17,5,10,1,38,5,1,0,1,24,4,3,0,1.069935,0.577444,0,0
1,2000-03-30,CHN,NYN,False,1,5,TOK01,Tokyo Dome,Tokyo,JAP,48,3,2,0,1,28,5,8,1,44,5,0,0,0,24,9,6,0,0.594551,0.381579,0,0
2,2000-04-03,ATL,COL,True,2,0,ATL02,Turner Field,Atlanta,GA,34,4,2,0,0,19,7,2,0,32,5,0,0,2,17,6,1,1,0.485294,0.714583,0,0
3,2000-04-03,CIN,MIL,True,3,3,CIN08,Cinergy Field,Cincinnati,OH,27,6,1,0,0,14,1,5,0,20,3,1,0,1,10,4,1,0,0.808081,0.773684,0,0
4,2000-04-03,FLO,SFN,False,6,4,MIA01,Sun Life Stadium,Miami,FL,36,5,2,2,1,17,8,1,0,38,9,3,0,0,17,7,1,1,0.848413,0.785088,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56770,2023-10-01,CHA,SDN,True,1,2,CHI12,Guaranteed Rate Field;U.S. Cellular Field,Chicago,IL,44,6,1,0,0,22,8,7,0,44,7,0,0,0,17,16,3,1,0.534398,0.425000,0,0
56771,2023-10-01,DET,CLE,True,5,2,DET05,Comerica Park,Detroit,MI,34,3,1,0,1,20,8,1,0,38,4,4,0,1,16,7,6,0,0.449198,0.894737,0,0
56772,2023-10-01,KCA,NYA,True,5,2,KAN06,Kauffman Stadium,Kansas City,MO,35,4,3,0,0,18,7,3,0,35,7,0,1,3,15,8,1,0,0.598214,0.989916,0,0
56773,2023-10-01,SEA,TEX,True,1,0,SEA03,Safeco Field,Seattle,WA,30,3,1,0,0,15,11,0,0,30,4,0,0,0,14,10,1,1,0.300000,0.342857,0,0


In [50]:
# code required to find distance between cities
@dataclass
class Location:
    """Geographic coordinates of a city, in decimal degrees.

    Field order is (longitude, latitude); prefer keyword arguments when
    constructing instances to avoid swapping the two.
    """

    # East-west position in degrees; the western-hemisphere entries in this notebook are negative.
    longitude: float
    # North-south position in degrees; southern-hemisphere entries are negative.
    latitude: float

# Coordinates (decimal degrees) for every value that appears in astro_df['city'].
#
# Several entries previously pointed at a different place with the same name
# (apparently an unconstrained geocoder lookup), which made the computed travel
# distances meaningless for those cities. The entries marked "corrected" below
# now use approximate coordinates of the intended city or ballpark.
# NOTE(review): Kansas City and Tokyo look to be roughly 10-20 miles from the
# ballparks actually used; left unchanged, verify if that precision matters.
locations = {
    # corrected: longitude was positive (eastern hemisphere); Dyersville, Iowa is west
    "Dyersville": Location(latitude=42.4836, longitude=-91.1141),
    # corrected: was a copy of Seattle's coordinates; now Washington, D.C.
    "Washington": Location(longitude=-77.0074, latitude=38.8730),
    "Philadelphia": Location(longitude=-75.16483547778323, latitude=39.996116349999994),
    "Baltimore": Location(longitude=-76.61870875574147, latitude=39.2886044),
    "Los Angeles": Location(longitude=-118.28802864848308, latitude=34.0139923),
    "Lake Buena Vista": Location(longitude=-81.5753182375872, latitude=28.34372875),
    "Denver": Location(longitude=-105.0214597, latitude=39.7400878),
    # corrected: was a point in western New York state; now Boston, Massachusetts
    "Boston": Location(longitude=-71.0972, latitude=42.3467),
    # corrected: was a point in Minnesota; now Detroit, Michigan
    "Detroit": Location(longitude=-83.0485, latitude=42.3390),
    "Cleveland": Location(longitude=-81.69971184525946, latitude=41.5060559),
    # corrected: was a point in Georgia; now Houston, Texas
    "Houston": Location(longitude=-95.3555, latitude=29.7573),
    # corrected: was a point in England; now New York City
    "New York": Location(longitude=-74.0060, latitude=40.7128),
    "Cincinnati": Location(longitude=-84.51371634509104, latitude=39.1301459),
    "Kansas City": Location(longitude=-94.54036478875975, latitude=39.291054),
    "Chicago": Location(longitude=-87.6742028, latitude=41.8814716),
    # corrected: was a point in southern California; now San Juan, Puerto Rico
    "San Juan": Location(longitude=-66.1057, latitude=18.4655),
    "London": Location(longitude=-0.016623759002470637, latitude=51.53862065),
    "Pittsburgh": Location(longitude=-80.01575493349428, latitude=40.44671645),
    "Sydney": Location(longitude=151.00000746396432, latitude=-33.8087178),
    "San Francisco": Location(longitude=-122.45600968359278, latitude=37.766867),
    "Anaheim": Location(longitude=-117.88174262759796, latitude=33.8002599),
    "Omaha": Location(longitude=-95.94227635668676, latitude=41.21148395),
    "Miami": Location(longitude=-80.18794351626137, latitude=25.781359549999998),
    "Tokyo": Location(longitude=139.52707517816015, latitude=35.66418645),
    "Williamsport": Location(longitude=-77.07740738423433, latitude=41.24249025),
    "Oakland": Location(longitude=-122.20293232639798, latitude=37.75027335),
    "Seattle": Location(longitude=-122.32718290060447, latitude=47.59111075),
    # corrected: was St. Petersburg, Russia; now St. Petersburg, Florida
    "St. Petersburg": Location(longitude=-82.6534, latitude=27.7683),
    # corrected: was Fort Bragg, California; now Fort Bragg, North Carolina
    "Fort Bragg": Location(longitude=-79.0060, latitude=35.1390),
    "Dunedin": Location(longitude=-82.7866619541428, latitude=28.0038021),
    # corrected: was a point in Texas; now Atlanta, Georgia
    "Atlanta": Location(longitude=-84.3880, latitude=33.7490),
    # corrected: was a point in Texas; now Buffalo, New York
    "Buffalo": Location(longitude=-78.8784, latitude=42.8864),
    "Montreal": Location(longitude=-73.55164580516643, latitude=45.55779835),
    "Toronto": Location(longitude=-79.3995096343725, latitude=43.630656349999995),
    # corrected: was a point in England; now Arlington, Texas
    "Arlington": Location(longitude=-97.0828, latitude=32.7513),
    "Minneapolis": Location(longitude=-93.2231906824895, latitude=44.97653965),
    "Monterrey": Location(longitude=-100.31599383508116, latitude=25.7006158),
    "Phoenix": Location(longitude=-111.95422089528068, latitude=33.4492717),
    "San Diego": Location(longitude=-117.120158, latitude=32.7809702),
    "Milwaukee": Location(longitude=-87.91695198984257, latitude=43.0422882),
    "St. Louis": Location(longitude=-90.1945093, latitude=38.6235105)
}

# haversine function
def distance(from_location: "Location", to_location: "Location", radius_of_earth: float = 3958.8) -> float:
    """Great-circle distance between two points using the haversine formula.

    Args:
        from_location: Start point; must expose `longitude` and `latitude`
            attributes in decimal degrees.
        to_location: End point, same requirements as `from_location`.
        radius_of_earth: Sphere radius; the result is in the same unit.
            Defaults to 3958.8 (Earth's mean radius in miles), so existing
            callers keep getting miles.

    Returns:
        The distance along the sphere's surface between the two points.
    """
    change_in_longitude = radians(to_location.longitude - from_location.longitude)
    change_in_latitude = radians(to_location.latitude - from_location.latitude)

    angle = (
        sin(change_in_latitude / 2) ** 2
        + cos(radians(from_location.latitude))
        * cos(radians(to_location.latitude))
        * sin(change_in_longitude / 2) ** 2
    )
    # Floating-point rounding can push the haversine term a hair above 1 for
    # (near-)antipodal points, which would make asin raise a math domain error.
    angle = min(1.0, angle)

    curvature = 2 * asin(sqrt(angle))
    return radius_of_earth * curvature

In [51]:
# Adds the home_distancetravelled and away_distancetravelled columns: for each
# game, how far each team is from the city of its own previous game
# (0 for a team's first game or when it stays in the same city).
home_distancetravelled = []
away_distancetravelled = []

# City of the most recent game seen so far, keyed by team code
teamLastLocation = {}


def _distance_from_last_city(team, city):
    """Return how far `city` is from the city of `team`'s previous game, then record `city`.

    Returns 0 when the team has no previous game or is still in the same city;
    otherwise the value of distance() between the two entries in `locations`.
    Raises KeyError if either city is missing from `locations`.

    Reads and updates the module-level `teamLastLocation` mapping.
    NOTE(review): assumes astro_df rows are in chronological order - confirm.
    """
    last_city = teamLastLocation.get(team)
    teamLastLocation[team] = city
    if last_city is None or last_city == city:
        return 0
    return distance(locations[last_city], locations[city])


for index, row in astro_df.iterrows():
    home_distancetravelled.append(_distance_from_last_city(row['home_team'], row['city']))
    # The away team is compared against its OWN previous city. The earlier code
    # compared against the home team's previous city, which zeroed out nearly
    # every away trip.
    away_distancetravelled.append(_distance_from_last_city(row['away_team'], row['city']))

astro_df['home_distancetravelled'] = home_distancetravelled
# Column name previously had a trailing space ('away_distancetravelled ');
# any downstream code that relied on the misspelt name must be updated.
astro_df['away_distancetravelled'] = away_distancetravelled

astro_df



Unnamed: 0,game_date,home_team,away_team,is_day_game,home_score,away_score,venue,venue_name,city,state,home_pa,home_1b,home_2b,home_3b,home_hr,home_fo,home_so,home_bb,home_hbp,away_pa,away_1b,away_2b,away_3b,away_hr,away_fo,away_so,away_bb,away_hbp,home_ops,away_ops,home_restdays,away_restdays,home_distancetravelled,away_distancetravelled
0,2000-03-29,NYN,CHN,False,3,5,TOK01,Tokyo Dome,Tokyo,JAP,45,9,1,0,2,17,5,10,1,38,5,1,0,1,24,4,3,0,1.069935,0.577444,0,0,0.0,0.0
1,2000-03-30,CHN,NYN,False,1,5,TOK01,Tokyo Dome,Tokyo,JAP,48,3,2,0,1,28,5,8,1,44,5,0,0,0,24,9,6,0,0.594551,0.381579,0,0,0.0,0.0
2,2000-04-03,ATL,COL,True,2,0,ATL02,Turner Field,Atlanta,GA,34,4,2,0,0,19,7,2,0,32,5,0,0,2,17,6,1,1,0.485294,0.714583,0,0,0.0,0.0
3,2000-04-03,CIN,MIL,True,3,3,CIN08,Cinergy Field,Cincinnati,OH,27,6,1,0,0,14,1,5,0,20,3,1,0,1,10,4,1,0,0.808081,0.773684,0,0,0.0,0.0
4,2000-04-03,FLO,SFN,False,6,4,MIA01,Sun Life Stadium,Miami,FL,36,5,2,2,1,17,8,1,0,38,9,3,0,0,17,7,1,1,0.848413,0.785088,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56770,2023-10-01,CHA,SDN,True,1,2,CHI12,Guaranteed Rate Field;U.S. Cellular Field,Chicago,IL,44,6,1,0,0,22,8,7,0,44,7,0,0,0,17,16,3,1,0.534398,0.425000,0,0,0.0,0.0
56771,2023-10-01,DET,CLE,True,5,2,DET05,Comerica Park,Detroit,MI,34,3,1,0,1,20,8,1,0,38,4,4,0,1,16,7,6,0,0.449198,0.894737,0,0,0.0,0.0
56772,2023-10-01,KCA,NYA,True,5,2,KAN06,Kauffman Stadium,Kansas City,MO,35,4,3,0,0,18,7,3,0,35,7,0,1,3,15,8,1,0,0.598214,0.989916,0,0,0.0,0.0
56773,2023-10-01,SEA,TEX,True,1,0,SEA03,Safeco Field,Seattle,WA,30,3,1,0,0,15,11,0,0,30,4,0,0,0,14,10,1,1,0.300000,0.342857,0,0,0.0,0.0


In [52]:
# Persist the frame with the engineered feature columns; the index is written as the first column.
astro_df.to_csv(path_or_buf="./../data/processed/astro_df_with_features.csv")