In [1]:
import wandb
from ift6758.data import wandb_handler
import pandas as pd
pd.set_option('display.max_columns', None)
import ift6758.features.trigonometry as trigonometry
import os
import sys
sys.path.append('..')
from ift6758.visualizations import debugging_tool

In [5]:
def preprocess_feature_eng_1(df):
    """Build the first feature-engineering table from a raw shot-events frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``season``, ``id``, ``eventId``, ``type``,
        ``goalieInNetId`` and ``distance_from_net``, plus whatever columns
        ``trigonometry.compute_angle_from_net`` reads from a row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns ``season``, ``game_id``, ``event_id``,
        ``angle``, ``is_goal``, ``empty_net`` and ``distance`` (in that
        order). Rows whose ``angle`` or ``distance`` is missing are dropped;
        the surviving rows keep their original index labels.
    """
    features = pd.DataFrame()

    # Identifier columns are copied over under their output names.
    features['season'] = df['season']
    features['game_id'] = df['id']
    features['event_id'] = df['eventId']

    # The angle is delegated to the project helper, evaluated once per row.
    features['angle'] = df.apply(trigonometry.compute_angle_from_net, axis=1)

    # Vectorized 0/1 flags (instead of per-element Python lambdas):
    #   is_goal   -> 1 when the event type is exactly 'Goal'
    #   empty_net -> 1 when no goalie id is recorded for the event
    features['is_goal'] = (df['type'] == 'Goal').astype('int64')
    features['empty_net'] = df['goalieInNetId'].isna().astype('int64')

    features['distance'] = df['distance_from_net']

    # Drop rows that lack either geometric feature.
    return features.dropna(subset=['angle', 'distance'])

In [6]:
def check_nan_values(df):
    """Print how many missing values the ``distance`` and ``angle`` columns hold.

    Both counts are computed before anything is printed, so a missing column
    raises before any output is produced. Returns ``None``.
    """
    missing_distance, missing_angle = [
        df[column].isna().sum() for column in ('distance', 'angle')
    ]
    print(f"Number of NaN values in distance: {missing_distance}")
    print(f"Number of NaN values in angle: {missing_angle}")

In [2]:
# Season splits: four seasons for train/validation, one season for the test set.
train_val_seasons = [
    '2016-2017',
    '2017-2018',
    '2018-2019',
    '2019-2020',
]
test_season = ['2020-2021']

# Name of the artifact the shot events are loaded from.
artifact_name = "shot_events"

In [None]:
# Loader for the team project; later cells reuse it for loading and uploading.
data_loader = wandb_handler.DataLoader(project_name="IFT6758-2024-B05")

# Load the train/validation seasons and turn them into the feature table in one step.
df_train_val = preprocess_feature_eng_1(
    data_loader.load_seasons_dataframe(
        artifact_name=artifact_name,
        artifact_version="v4",
        seasons=train_val_seasons,
    )
)
df_train_val

In [8]:
# Load the test season and turn it into the feature table in one step.
df_test = preprocess_feature_eng_1(
    data_loader.load_seasons_dataframe(
        artifact_name=artifact_name,
        artifact_version="v4",
        seasons=test_season,
    )
)
df_test

wandb: Downloading large artifact shot_events:v4, 108.93MB. 8 files... 
wandb:   8 of 8 files downloaded.  
Done. 0:0:0.3


Artifact downloaded to: ..\ift6758\data\wandb_artifacts\shot_events


Unnamed: 0,season,game_id,event_id,angle,is_goal,empty_net,distance
0,20202021,2020020001,53,62.650124,0,0,32.649655
1,20202021,2020020001,9,32.005383,0,0,47.169906
2,20202021,2020020001,55,39.957549,0,0,48.270074
3,20202021,2020020001,62,40.292148,0,0,60.307545
4,20202021,2020020001,65,12.528808,0,0,36.878178
...,...,...,...,...,...,...,...
52403,20202021,2020020868,490,41.633539,0,0,24.083189
52404,20202021,2020020868,498,33.690068,0,0,25.238859
52405,20202021,2020020868,500,18.434949,0,0,34.785054
52406,20202021,2020020868,751,20.556045,0,0,17.088007


In [9]:
# Boolean mask marking rows that repeat an earlier row across every column.
duplicates = df_train_val.duplicated()
duplicate_count = duplicates.sum()
print(f"Number of duplicate rows: {duplicate_count}")

Number of duplicate rows: 0


# Publish the train/validation feature set

In [10]:
# Upload the train/validation feature table as its own artifact.
data_loader.upload_dataset_to_wandb(
    df_train_val,
    seasons=train_val_seasons,
    artifact_name="train_val_feature_eng_1",
    run_name="rename files",
    tags=[
        "distance_from_net",
        "angle_from_net",
        "empty_net",
        "is_goal",
    ],
)

wandb: Currently logged in as: andre-diler (IFT6758-2024-B05). Use `wandb login --relogin` to force relogin


# Publish the test feature set

In [12]:
# Upload the test-season feature table as its own artifact.
data_loader.upload_dataset_to_wandb(
    df_test,
    seasons=test_season,
    artifact_name="test_set_feature_eng_1",
    run_name="rename files",
    tags=[
        "distance_from_net",
        "angle_from_net",
        "empty_net",
        "is_goal",
    ],
)