In [9]:
import numpy as np
import pandas as pd 
from typing import Any, Dict, List, Optional
import datetime as dt

import glob 
import os

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

## Data Source: https://www.hockey-reference.com/leagues/NHL_2022.html ##
path = '/Users/rschraeder/Desktop/Projects/StanleyCupPredictions/data/'

def file_access(path: str): 
    dataframes = []
    filenames = []
    for root, dirs, files in os.walk(path, topdown=False):
        for name in files:
            filenames.append(name)
            df = pd.read_csv(os.path.join(root, name))
            dataframes.append(df) 
    
    return dataframes

team_stats_df = file_access(path)[0] 
games_df = file_access(path)[1]

# Null value counts
# games_df.isna().sum() 
# team_stats_df.isna().sum()

In [10]:
games_df['date'] = games_df['date'].astype('datetime64[ns]')

# Creating Column for Total Goals 

team_stats_df['G'] = team_stats_df.GF + team_stats_df.GA 

# Creating Column for Total Power-Play Goals 

team_stats_df['PPG'] = team_stats_df.PP + team_stats_df.PPA

# Creating Column for Total Games in Shootouts

team_stats_df['SHOOTOUTS'] = team_stats_df.SOW + team_stats_df.SOL

# Output to CSV
team_stats_df.to_csv(os.path.join(path, 'team_stats.csv'), index=False)


In [11]:
def encoding_game_outcome(dataset, away_output_colname: str, away_colname: str, home_colname: str) -> List[int]: 
    dataset.away_output_colname = dataset[f'{away_colname}'] - dataset[f'{home_colname}'].apply(
        lambda x: 'away' if x > 0 else 'home'
    ).map(lambda x: 1 if x == 'away' else 0)
    
    def encoding_home_outcome(dataset, away_output_colname, home_output_colname) -> List[int]: 
        home_win_mask = (dataset.away_output_colname == 0)
        dataset.home_output_colname = np.where(home_win_mask, 1, 0)
        
    return dataset
        
games_df = encoding_game_outcome(games_df, 
                      away_output_colname = 'away_outcome', 
                      away_colname = 'away_goals', 
                      home_colname = 'home_goals')

  dataset.away_output_colname = dataset[f'{away_colname}'] - dataset[f'{home_colname}'].apply(


In [12]:
# Add the Stanley Cup Final Schedule
def add_fake_data(
    data, date, away_team, home_team, 
    away_goals, home_goals, away_result, home_result, game_length
): 
    fake_data = pd.DataFrame({
        'date': date, 
        'away_team': away_team, 
        'home_team': home_team, 
        'away_goals': away_goals,
        'home_goals': home_goals,
        'outcome': away_result,
        'home_wins': home_result, 
        'length_of_game_min': game_length
    })
   
    result = pd.concat([data, fake_data])
    return result

dates = ['2022-06-15', '2022-06-18', '2022-06-20', '2022-06-22', '2022-06-24', '2022-06-26']
dates_list = [dt.datetime.strptime(date, '%Y-%m-%d') for date in dates]

stanley_cup_df = add_fake_data(
    data = games_df,
    date = dates_list,
    away_team = [
        'Tampa Bay Lightning', 
        'Tampa Bay Lightning', 
        'Colorado Avalanche', 
        'Colorado Avalanche', 
        'Tampa Bay Lightning', 
        'Tampa Bay Lightning'
    ],
    home_team = [
        'Colorado Avalanche', 
        'Colorado Avalanche', 
        'Tampa Bay Lightning', 
        'Tampa Bay Lightning', 
        'Colorado Avalanche', 
        'Colorado Avalanche'
    ],
    away_goals = [games_df['away_goals'].mean() for i in range(6)],
    home_goals = [games_df['home_goals'].mean() for i in range(6)],
    away_result = [games_df['outcome'].mean() for i in range(6)],
    home_result = [games_df['home_wins'].mean() for i in range(6)], 
    game_length = [games_df['length_of_game_min'].mean() for i in range(6)]
)
# View preview
stanley_cup_df


Unnamed: 0,date,away_team,away_goals,home_team,home_goals,length_of_game_min,outcome,home_wins
0,2021-10-12,Pittsburgh Penguins,6.000000,Tampa Bay Lightning,2.000000,153.000000,1.000000,0.000000
1,2021-10-12,Seattle Kraken,3.000000,Vegas Golden Knights,4.000000,145.000000,0.000000,1.000000
2,2021-10-13,Winnipeg Jets,1.000000,Anaheim Ducks,4.000000,149.000000,0.000000,1.000000
3,2021-10-13,Chicago Blackhawks,2.000000,Colorado Avalanche,4.000000,152.000000,0.000000,1.000000
4,2021-10-13,Vancouver Canucks,2.000000,Edmonton Oilers,3.000000,162.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...
1,2022-06-18,Tampa Bay Lightning,3.021341,Colorado Avalanche,3.268293,148.640244,0.463415,0.536585
2,2022-06-20,Colorado Avalanche,3.021341,Tampa Bay Lightning,3.268293,148.640244,0.463415,0.536585
3,2022-06-22,Colorado Avalanche,3.021341,Tampa Bay Lightning,3.268293,148.640244,0.463415,0.536585
4,2022-06-24,Tampa Bay Lightning,3.021341,Colorado Avalanche,3.268293,148.640244,0.463415,0.536585


In [13]:
# Create full dataframe 
raw_cup_data = stanley_cup_df.merge(team_stats_df, how='outer', left_on='away_team', right_on='Team').drop(columns={'away_goals', 'home_goals', 'Team'}) # drop goals, join column
raw_cup_data = raw_cup_data.round(0)

# Output dataset with team names to a separate CSV
raw_cup_data.to_csv(os.path.join(path, 'categorical_teams_set_cup.csv'), index=False)

# View preview
raw_cup_data

Unnamed: 0,date,away_team,home_team,length_of_game_min,outcome,home_wins,Rk,AvAge,GP,W,...,PIM/G,oPIM/G,S,S%,SA,SV%,SO,G,PPG,SHOOTOUTS
0,2021-10-12,Pittsburgh Penguins,Tampa Bay Lightning,153.0,1.0,0.0,12,30.0,82,46,...,7.0,8.0,2849,9.0,2576,1.0,7,491,83,10
1,2021-10-14,Pittsburgh Penguins,Florida Panthers,160.0,0.0,1.0,12,30.0,82,46,...,7.0,8.0,2849,9.0,2576,1.0,7,491,83,10
2,2021-11-09,Pittsburgh Penguins,Chicago Blackhawks,161.0,0.0,1.0,12,30.0,82,46,...,7.0,8.0,2849,9.0,2576,1.0,7,491,83,10
3,2021-11-13,Pittsburgh Penguins,Ottawa Senators,142.0,0.0,1.0,12,30.0,82,46,...,7.0,8.0,2849,9.0,2576,1.0,7,491,83,10
4,2021-11-14,Pittsburgh Penguins,Washington Capitals,141.0,0.0,1.0,12,30.0,82,46,...,7.0,8.0,2849,9.0,2576,1.0,7,491,83,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1313,2022-04-14,New Jersey Devils,Colorado Avalanche,140.0,0.0,1.0,28,26.0,82,27,...,8.0,8.0,2562,10.0,2540,1.0,2,547,77,8
1314,2022-04-16,New Jersey Devils,Seattle Kraken,160.0,0.0,1.0,28,26.0,82,27,...,8.0,8.0,2562,10.0,2540,1.0,2,547,77,8
1315,2022-04-18,New Jersey Devils,Vegas Golden Knights,144.0,1.0,0.0,28,26.0,82,27,...,8.0,8.0,2562,10.0,2540,1.0,2,547,77,8
1316,2022-04-26,New Jersey Devils,Ottawa Senators,158.0,0.0,1.0,28,26.0,82,27,...,8.0,8.0,2562,10.0,2540,1.0,2,547,77,8


# Classification of Numerical Variables

> Encoding categorical values
> Some features are still categorical and will be converted if deemed necessary, then I'll join the dataframes together to complete the dataset for my train and testing split

In [14]:
def encoding_full(
    df, home_team: str, away_team: str, date_col: str
): 
    
    # Encode non-numeric variables
    df.home_team = df.home_team.astype('category').cat.codes 
    df.away_team = df.away_team.astype('category').cat.codes
    
    # Encode date as a day of week to avoid time series implication.
    df.rename(columns={f'{date_col}': 'day_of_week'}, inplace=True)
    df['day_of_week'] = df['day_of_week'].dt.dayofweek
    
    return df

classification_df = encoding_full(
    raw_cup_data, 'home_team', 'away_team', 'date'
)
classification_df = classification_df.iloc[:,0:].astype(int)

# Output dataset with all encoded variables to training set. Will be used for training model. 
classification_df.to_csv(os.path.join(path, "encoded_variables.csv"), index=False)

# View preview
classification_df

Unnamed: 0,day_of_week,away_team,home_team,length_of_game_min,outcome,home_wins,Rk,AvAge,GP,W,...,PIM/G,oPIM/G,S,S%,SA,SV%,SO,G,PPG,SHOOTOUTS
0,1,22,26,153,1,0,12,30,82,46,...,7,8,2849,9,2576,1,7,491,83,10
1,3,22,12,160,0,1,12,30,82,46,...,7,8,2849,9,2576,1,7,491,83,10
2,1,22,6,161,0,1,12,30,82,46,...,7,8,2849,9,2576,1,7,491,83,10
3,5,22,20,142,0,1,12,30,82,46,...,7,8,2849,9,2576,1,7,491,83,10
4,6,22,30,141,0,1,12,30,82,46,...,7,8,2849,9,2576,1,7,491,83,10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1313,3,17,7,140,0,1,28,26,82,27,...,8,8,2562,10,2540,1,2,547,77,8
1314,5,17,24,160,0,1,28,26,82,27,...,8,8,2562,10,2540,1,2,547,77,8
1315,0,17,29,144,1,0,28,26,82,27,...,8,8,2562,10,2540,1,2,547,77,8
1316,1,17,20,158,0,1,28,26,82,27,...,8,8,2562,10,2540,1,2,547,77,8


In [7]:
# TODO: 

"""Find a way to extract data from the website and save it via this pipeline into the /data directory. There is a link on the website to save CSV files, so this can be accessed with selenium and exported to the directory OR be ideally fetched via API if available and stored as parquet in an S3 bucket. This is once I can get this on AWS. Clean this up and store the separate pieces of data in functions, maybe create dataclasses."""

'Find a way to extract data from the website and save it via this pipeline into the /data directory. There is a link on the website to save CSV files, so this can be accessed with selenium and exported to the directory OR be ideally fetched via API if available and stored as parquet in an S3 bucket. This is once I can get this on AWS. Clean this up and store the separate pieces of data in functions, maybe create dataclasses.'