In [None]:
import wandb
from ift6758.data import get_wandb_data
import pandas as pd
pd.set_option('display.max_columns', None)
import ift6758.features.trigonometry as trigonometry
import os
import sys
sys.path.append('..')
from ift6758.visualizations import debugging_tool

In [24]:
# Train/val: seasons that are loaded and stacked into the train/validation frame.
train_val_seasons = [
    "2016-2017",
    "2017-2018",
    "2018-2019",
    "2019-2020",
]
# Season loaded on its own as the test set.
test_season = "2020-2021"
# Name of the source artifact the per-season dataframes are read from.
artifact_name = "shot_events"

In [9]:
# Download the source artifact, pinned to version "v4" so re-runs read the same data.
get_wandb_data.download_artifact(artifact_name=artifact_name, artifact_version="v4")

wandb: Downloading large artifact shot_events:v4, 108.93MB. 8 files... 
wandb:   8 of 8 files downloaded.  
Done. 0:0:0.3


Artifact downloaded to: ../ift6758/data/wandb_artifacts//shot_events


In [22]:
def preprocess_feature_eng_1(df):
    """Build the "feature engineering 1" table from a raw shot-events frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw events. Must provide the columns ``season``, ``id``, ``eventId``,
        ``type``, ``goalieInNetId`` and ``distance_from_net``, plus whatever
        ``trigonometry.compute_angle_from_net`` reads from each row.

    Returns
    -------
    pandas.DataFrame
        Columns ``season``, ``game_id``, ``event_id``, ``angle``, ``is_goal``,
        ``empty_net`` and ``distance``, in that order. Rows whose ``angle`` or
        ``distance`` is NaN are removed; the surviving rows keep the index
        labels of ``df`` (the index is not reset).
    """
    new_df = pd.DataFrame({
        'season': df['season'],
        'game_id': df['id'],
        'event_id': df['eventId'],
        # Project helper applied row by row; its inputs and units are defined
        # outside this notebook.
        'angle': df.apply(trigonometry.compute_angle_from_net, axis=1),
        # Vectorized 0/1 flags instead of a Python lambda per element:
        # 1 when the event type is exactly 'Goal' (missing types compare False).
        'is_goal': (df['type'] == 'Goal').astype('int64'),
        # 1 when no goalie id is recorded for the event.
        'empty_net': df['goalieInNetId'].isna().astype('int64'),
        'distance': df['distance_from_net'],
    })
    # Both geometric features are required downstream; drop incomplete rows.
    return new_df.dropna(subset=['angle', 'distance'])

In [27]:
def check_nan_values(df):
    """Print how many NaN entries the ``distance`` and ``angle`` columns hold.

    Both counts are computed before anything is printed, so a missing column
    raises before any output is produced. Returns ``None``.
    """
    missing_counts = {
        column: df[column].isna().sum() for column in ('distance', 'angle')
    }
    for column, n_missing in missing_counts.items():
        print(f"Number of NaN values in {column}: {n_missing}")

In [25]:
# Collect one raw dataframe per train/validation season.
df_list = []
for season in train_val_seasons:
    df_list.append(
        get_wandb_data.load_season_dataframe(artifact_name=artifact_name, season=season)
    )

# Stack the seasons under a fresh 0..n-1 index, then derive the model features.
df_train_val = preprocess_feature_eng_1(pd.concat(df_list, ignore_index=True))
df_train_val

Unnamed: 0,season,game_id,event_id,angle,is_goal,empty_net,distance
0,20162017,2016020001,8,22.619865,0,0,13.000000
1,20162017,2016020001,11,77.005383,0,0,13.341664
2,20162017,2016020001,15,29.931512,0,0,76.157731
3,20162017,2016020001,16,14.995079,0,0,57.974132
4,20162017,2016020001,24,26.980231,0,0,61.717096
...,...,...,...,...,...,...,...
305809,20192020,2019021082,773,62.447188,0,0,25.942244
305810,20192020,2019021082,774,71.113913,0,0,40.162171
305811,20192020,2019021082,537,45.000000,0,0,14.142136
305812,20192020,2019021082,777,41.185925,0,0,10.630146


In [28]:
# Sanity check: preprocess_feature_eng_1 drops rows with NaN angle/distance, so both counts should be 0.
check_nan_values(df_train_val)

Number of NaN values in distance: 0
Number of NaN values in angle: 0


In [29]:
# Load the test season and run it through the same feature pipeline as train/val.
df_test = preprocess_feature_eng_1(
    get_wandb_data.load_season_dataframe(artifact_name=artifact_name, season=test_season)
)
df_test

Unnamed: 0,season,game_id,event_id,angle,is_goal,empty_net,distance
0,20202021,2020020001,53,62.650124,0,0,32.649655
1,20202021,2020020001,9,32.005383,0,0,47.169906
2,20202021,2020020001,55,39.957549,0,0,48.270074
3,20202021,2020020001,62,40.292148,0,0,60.307545
4,20202021,2020020001,65,12.528808,0,0,36.878178
...,...,...,...,...,...,...,...
52403,20202021,2020020868,490,41.633539,0,0,24.083189
52404,20202021,2020020868,498,33.690068,0,0,25.238859
52405,20202021,2020020868,500,18.434949,0,0,34.785054
52406,20202021,2020020868,751,20.556045,0,0,17.088007


In [30]:
# Sanity check: preprocess_feature_eng_1 drops rows with NaN angle/distance, so both counts should be 0.
check_nan_values(df_test)

Number of NaN values in distance: 0
Number of NaN values in angle: 0


In [31]:
# Boolean mask: True for every row that exactly repeats an earlier row (all columns compared).
duplicates = df_train_val.duplicated(keep="first")
print(f"Number of duplicate rows: {int(duplicates.sum())}")

Number of duplicate rows: 0


# publish train_val

In [32]:
# requires wandb API key in WANDB_API_KEY environment variable
# (the value is only looked up here; nothing below passes `api_key` explicitly)
api_key = os.getenv("WANDB_API_KEY")

project_name = "IFT6758-2024-B05"
artifact_name = "train_val_feature_eng_1"
run_name = "rename distance and angle columns"  # goal of the run

with wandb.init(
    name=run_name,
    project=project_name,
    job_type="load-data",
    config={"seasons": "2016-2019"},
    tags=["feature_eng_1", "train_val"],
) as run:
    artifact = wandb.Artifact(
        artifact_name,
        type="dataset",
        description="train val data with feature engineering 1",
    )
    # One table per season, stored under the entry name "train_val_<season>".
    for season in train_val_seasons:
        formatted_season = int(season.replace("-", ""))  # 2016-2017 -> 20162017
        is_this_season = df_train_val["season"] == formatted_season
        train_val_table = wandb.Table(dataframe=df_train_val[is_this_season])
        artifact.add(train_val_table, f"train_val_{season}")
    run.log_artifact(artifact)

# get artifact and add tags
api = wandb.Api()
artifact = api.artifact(f"{project_name}/{artifact_name}:latest")

# change tags if necessary
artifact.tags = ["distance_from_net", "angle_from_net", "empty_net", "is_goal"]
artifact.save()

VBox(children=(Label(value='1.333 MB of 21.010 MB uploaded\r'), FloatProgress(value=0.06343498966617991, max=1…

# publish test

In [33]:
# requires wandb API key in WANDB_API_KEY environment variable
# (the value is only looked up here; nothing below passes `api_key` explicitly)
api_key = os.getenv("WANDB_API_KEY")

project_name = "IFT6758-2024-B05"
artifact_name = "test_set_feature_eng_1"
run_name = "rename distance and angle columns"  # goal of the run

with wandb.init(
    name=run_name,
    project=project_name,
    job_type="load-data",
    config={"seasons": "2020"},
    tags=["feature_eng_1", "test_set"],
) as run:
    artifact = wandb.Artifact(
        artifact_name,
        type="dataset",
        description="test set data with feature engineering 1",
    )
    formatted_season = int(test_season.replace("-", ""))  # 2020-2021 -> 20202021
    test_table = wandb.Table(dataframe=df_test[df_test["season"] == formatted_season])
    # Name the entry after `test_season` explicitly. `season` is only a loop
    # variable left over from earlier cells and holds a train/val season, which
    # would mislabel the test table (e.g. "test_2019-2020").
    # NOTE: the entry is therefore "test_2020-2021"; update any consumer that
    # still reads the old, mislabelled entry name.
    artifact.add(test_table, f"test_{test_season}")
    run.log_artifact(artifact)

# get artifact and add tags
api = wandb.Api()
artifact = api.artifact(f"{project_name}/{artifact_name}:latest")

# change tags if necessary
artifact.tags = ["distance_from_net", "angle_from_net", "empty_net", "is_goal"]
artifact.save()

VBox(children=(Label(value='3.145 MB of 3.591 MB uploaded\r'), FloatProgress(value=0.8758537906712415, max=1.0…