In [None]:
import sys
sys.path.append('..')
from ift6758.features import games_to_table, team_side, trigonometry
import pandas as pd
pd.set_option('display.max_columns', None)
import wandb
import os
import json
from tqdm import tqdm

# When True, the cells below read previously saved CSV files instead of
# recomputing (and re-saving) the tables.
USE_CACHE = True

In [None]:
# Load the preprocessed game table: either rebuild it with games_to_table and
# refresh the CSV cache, or read the CSV written by a previous run.
PREPROCESSED_CSV_PATH = "../ift6758/data/preprocessed_data/preprocessed_game_data_2016_to_2023.csv"

if not USE_CACHE:
    # range(2016, 2024) is handed to games_to_table as-is
    # (presumably the season start years -- confirm against the helper).
    df = pd.DataFrame(games_to_table(list(range(2016, 2024))))
    df.to_csv(PREPROCESSED_CSV_PATH)
else:
    # to_csv above stores the row index as the first, unnamed column. Read it
    # back as the index so the cached frame has the same columns as a fresh
    # build instead of gaining a spurious "Unnamed: 0" column.
    df = pd.read_csv(PREPROCESSED_CSV_PATH, index_col=0)


In [None]:
# Show the loaded table (notebook rich display) for a quick visual check.
df

# Compute the home team's defending side

In [None]:
# Either reuse the cached table that already carries the home team's defending
# side, or recompute it with the team_side helpers and refresh the cache.
TEAM_SIDE_CSV_PATH = "../ift6758/data/preprocessed_data/preprocessed_game_data__regular_season_2016_to_2023_team_side.csv"

if USE_CACHE:
    df = pd.read_csv(TEAM_SIDE_CSV_PATH)
else:
    # Containers handed to populate_home_team_defending_side on every call
    # (presumably filled in by it -- they are reported at the end of the cell).
    home_team_defending_side_dict = {}
    home_team_defending_side_mismatches = []
    home_team_defending_side_not_found = []

    # Visit every (game, period) pair present in the table.
    for game_id in tqdm(df['id'].unique()):
        game_events = df[df['id'] == game_id]
        for period_number in game_events['periodDescriptor_number'].unique():
            team_side.populate_home_team_defending_side(
                game_id,
                period_number,
                df,
                home_team_defending_side_dict,
                home_team_defending_side_mismatches,
                home_team_defending_side_not_found,
            )

    df = team_side.update_home_team_defending_side(df, home_team_defending_side_dict)
    df.to_csv(TEAM_SIDE_CSV_PATH, index=False)

    print("Home team defending side mismatches:", home_team_defending_side_mismatches)
    print("Home team defending side not found:", home_team_defending_side_not_found)

# Compute distance from the net

In [None]:
# Apply trigonometry.compute_distance_from_net to each row and store the
# result in the `distance_from_net` column, then display the table.
net_distances = df.apply(trigonometry.compute_distance_from_net, axis='columns')
df['distance_from_net'] = net_distances
df


In [None]:
# Draw the distances histogram via the project's trigonometry helper
# (presumably based on the `distance_from_net` column -- confirm in the helper).
trigonometry.plot_distances_histogram(df)


# Upload to Weights & Biases (wandb)

In [None]:
# Publish the table to Weights & Biases as one dataset artifact that holds a
# separate wandb.Table per season.
api_key = os.getenv("WANDB_API_KEY") # requires wandb API key in WANDB_API_KEY environment variable
# NOTE(review): api_key is never passed to wandb below; presumably wandb picks
# up WANDB_API_KEY from the environment on its own -- confirm, then use or drop it.
with wandb.init(project="IFT6758-2024-B05", job_type="load-data", config={"seasons": "2016-2023"}, tags=["team_side", "regular_season", "distance_from_net"], name="change distance column name") as run:

    artifact = wandb.Artifact('shot_events', type='dataset', description='regular 2016-2023 seasons data with team side')

    # One table per season: the `season` column holds the two years glued
    # together (e.g. 20162017) and the table is stored under "2016-2017".
    for start_year in range(2016, 2024):
        end_year = start_year + 1
        season_id = int(f"{start_year}{end_year}")
        season_table = wandb.Table(dataframe=df[df["season"] == season_id])
        artifact.add(season_table, f"{start_year}-{end_year}")

    run.log_artifact(artifact)



# Sanity checks

In [None]:
# Sanity check: list rows that exactly repeat an earlier row.
df.loc[df.duplicated()]

In [None]:
# Sanity check: distinct values found in the `type` column.
unique_event_types = df.loc[:, 'type'].unique()
unique_event_types

In [None]:
# Sanity check: events whose `zoneCode` is missing.
events_with_no_zone_code = df.loc[df['zoneCode'].isnull()]
events_with_no_zone_code

In [None]:
# Sanity check: events whose `shotType` is missing.
events_with_no_shot_type = df.loc[pd.isna(df['shotType'])]
events_with_no_shot_type

In [None]:
# Sanity check: events with no `goalieInNetId`.
# Open question: empty net, or goalie simply not recorded?
events_with_no_goalie = df.loc[df['goalieInNetId'].isnull()]
events_with_no_goalie

In [None]:
# Sanity check: events with no `xCoord` (possibly data errors?).
events_with_no_x_coord = df.loc[pd.isna(df['xCoord'])]
events_with_no_x_coord

In [None]:
# Sanity check: events with no `yCoord`.
events_with_no_y_coord = df.loc[df['yCoord'].isnull()]
events_with_no_y_coord