In [None]:
import wandb
from ift6758.data import get_wandb_data
import pandas as pd
pd.set_option('display.max_columns', None)
import ift6758.features.trigonometry as trigonometry
import os
import sys
sys.path.append('..')
from ift6758.visualizations import debugging_tool

In [None]:
# Seasons pooled for training/validation: '2016-2017' through '2019-2020'.
train_val_seasons = [f"{start}-{start + 1}" for start in range(2016, 2020)]
# Single season kept aside for the test set.
test_season = "2020-2021"

In [None]:
# Fetch the "shot_events:v4" W&B artifact through the project helper
# (presumably the raw data that load_season_dataframe reads below — confirm).
get_wandb_data.download_artifact(artifact_name="shot_events:v4")

In [None]:
def preprocess_feature_eng_1(df):
    """Build the "feature engineering 1" table from raw shot events.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw events. Must provide the columns ``season``, ``id``, ``eventId``,
        ``type``, ``goalieInNetId`` and ``distance_from_net``, plus whatever
        ``trigonometry.compute_angle_from_net`` reads from each row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``season``, ``game_id``, ``event_id``,
        ``angle_from_net``, ``is_goal``, ``empty_net`` and
        ``distance_from_net``, in that order. ``df`` itself is not modified.
    """
    return pd.DataFrame(
        {
            'season': df['season'],
            'game_id': df['id'],
            'event_id': df['eventId'],
            # The angle helper works on one event (row) at a time.
            'angle_from_net': df.apply(trigonometry.compute_angle_from_net, axis=1),
            # 1 when the event type is exactly 'Goal', 0 otherwise (missing -> 0).
            'is_goal': (df['type'] == 'Goal').astype('int64'),
            # 1 when no goalie id is recorded for the event, 0 otherwise.
            'empty_net': df['goalieInNetId'].isna().astype('int64'),
            'distance_from_net': df['distance_from_net'],
        }
    )

In [None]:
# Load every training/validation season, stack the frames into one, then
# derive the model features from the combined frame.
df_list = []
for season in train_val_seasons:
    season_frame = get_wandb_data.load_season_dataframe(season)
    df_list.append(season_frame)

df_train_val = preprocess_feature_eng_1(pd.concat(df_list, ignore_index=True))
df_train_val

In [None]:
# Inspect the rows flagged as empty-net (empty_net == 1).
df_train_val.loc[df_train_val['empty_net'].eq(1)]

In [None]:
# Inspect the rows labelled as goals (is_goal == 1).
df_train_val.loc[df_train_val['is_goal'].eq(1)]


In [None]:
# Build the test frame from its single season, using the same preprocessing
# function as the train/validation data.
df_test = preprocess_feature_eng_1(
    get_wandb_data.load_season_dataframe(test_season)
)
df_test

In [None]:
# Boolean mask marking rows that repeat an earlier row in every column.
duplicates = df_train_val.duplicated()
duplicate_count = duplicates.sum()
print(f"Number of duplicate rows: {duplicate_count}")

In [None]:
# Inspect test rows whose angle_from_net exceeds 85
# (presumably degrees, i.e. shots from nearly along the goal line — confirm).
df_test.loc[df_test['angle_from_net'].gt(85)]

In [None]:
# Plot the angle histogram for the train/validation frame; the plotting
# details live in the project's trigonometry module and are not shown here.
trigonometry.plot_angles_histogram(df_train_val)

In [None]:
# Open the project's debugging view for one specific game id
# (presumably used here to eyeball the events behind the features — confirm).
debugging_tool.display_game_id("2016020006")

# publish train_val

In [None]:
# WANDB_API_KEY must be set in the environment. The value read here is not
# passed to any call below; the wandb client picks the variable up itself.
api_key = os.getenv("WANDB_API_KEY")

project_name = "IFT6758-2024-B05"
artifact_name = "train_val_feature_eng_1"
run_name = "add distance from net"  # what this run is for

with wandb.init(
    name=run_name,
    project=project_name,
    job_type="load-data",
    config={"seasons": "2016-2019"},
    tags=["feature_eng_1", "train_val"],
) as run:
    artifact = wandb.Artifact(
        artifact_name,
        type='dataset',
        description='train val data with feature engineering 1',
    )
    # One table per season inside the same artifact.
    for season in train_val_seasons:
        # The 'season' column holds integers: '2016-2017' -> 20162017.
        season_code = int(season.replace('-', ''))
        season_rows = df_train_val[df_train_val["season"] == season_code]
        artifact.add(wandb.Table(dataframe=season_rows), f"train_val_{season}")
    run.log_artifact(artifact)

# Once the run has closed, fetch the logged artifact back through the public
# API so that tags can be attached to it.
api = wandb.Api()
artifact = api.artifact(f"{project_name}/{artifact_name}:latest")

# Adjust the tag list if the feature set changes.
artifact.tags = ["distance_from_net", "angle_from_net", "empty_net", "is_goal"]
artifact.save()

# publish test

In [None]:
# WANDB_API_KEY must be set in the environment. The value read here is not
# passed to any call below; the wandb client picks the variable up itself.
api_key = os.getenv("WANDB_API_KEY")

project_name = "IFT6758-2024-B05"
artifact_name = "test_set_feature_eng_1"
run_name = "reduce number of columns"  # goal of the run

with wandb.init(
    name=run_name,
    project=project_name,
    job_type="load-data",
    config={"seasons": "2020"},
    tags=["feature_eng_1", "test_set"],
) as run:
    artifact = wandb.Artifact(
        artifact_name,
        type='dataset',
        description='test set data with feature engineering 1',
    )
    # The 'season' column holds integers: '2020-2021' -> 20202021.
    formatted_season = int(test_season.replace('-', ''))
    test_table = wandb.Table(dataframe=df_test[df_test["season"] == formatted_season])
    # Name the entry after test_season, not `season`: that name is only a loop
    # variable leaked by earlier cells and holds a training season, so using
    # it would store the test table under the wrong season label.
    # NOTE(review): any consumer that looked the table up under the old,
    # mislabelled entry name needs to be updated.
    artifact.add(test_table, f"test_{test_season}")
    run.log_artifact(artifact)

# Once the run has closed, fetch the logged artifact back through the public
# API so that tags can be attached to it.
api = wandb.Api()
artifact = api.artifact(f"{project_name}/{artifact_name}:latest")

artifact.tags = ["distance_from_net", "angle_from_net", "empty_net", "is_goal"]  # change tags if necessary
artifact.save()