In [16]:
import numpy as np
import pandas as pd 
from typing import Any, Dict, List, Optional

import glob 
import os

import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

## Data Source: https://www.hockey-reference.com/leagues/NHL_2022.html ##
# Directory the input CSVs are read from; the CSV exports in this notebook are
# written to the same directory.
# Set the STANLEY_CUP_DATA_DIR environment variable to run on another machine;
# when it is unset the original local path is used.
path = os.environ.get(
    'STANLEY_CUP_DATA_DIR',
    '/Users/rschraeder/Desktop/Projects/StanleyCupPredictions/data/',
)

def file_access(path: str) -> List[pd.DataFrame]:
    """Load every CSV file found under ``path`` into a DataFrame.

    The directory tree is walked bottom-up with ``os.walk``. Files whose name
    does not end in ``.csv`` (case-insensitive) are skipped, so stray files
    such as ``.DS_Store`` do not abort the load.

    Parameters
    ----------
    path : str
        Root directory to search.

    Returns
    -------
    list of pandas.DataFrame
        One frame per CSV file, in the order ``os.walk`` lists them. That
        order comes from the directory listing and is not sorted here.
    """
    dataframes = []
    for root, _dirs, files in os.walk(path, topdown=False):
        for name in files:
            # Only CSV files can be parsed by read_csv; ignore everything else.
            if not name.lower().endswith('.csv'):
                continue
            dataframes.append(pd.read_csv(os.path.join(root, name)))

    return dataframes

# Read the data directory once and reuse the result; calling file_access()
# separately for each frame parses every file in the directory twice.
# NOTE(review): the positions below depend on the order in which os.walk lists
# the files, which is not guaranteed -- confirm that [0] is the team statistics
# table and [1] is the game log.
loaded_frames = file_access(path)
team_stats_df = loaded_frames[0]
games_df = loaded_frames[1]

# Add the Stanley Cup Final Schedule
def add_fake_data(
    data: pd.DataFrame, date: Optional[List[Any]], away_team: List[Any],
    home_team: List[Any], away_goals: List[Any], home_goals: List[Any],
    away_result: List[Any], home_result: List[Any], game_length: List[Any]
) -> pd.DataFrame:
    """Append placeholder game rows to ``data`` and return the combined frame.

    Each argument after ``data`` supplies one column of the new rows:

    * ``date``        -> ``'date'``
    * ``away_team``   -> ``'away_team'``
    * ``home_team``   -> ``'home_team'``
    * ``away_goals``  -> ``'away_goals'``
    * ``home_goals``  -> ``'home_goals'``
    * ``away_result`` -> ``'outcome'``
    * ``home_result`` -> ``'home_wins'``
    * ``game_length`` -> ``'length_of_game_min'``

    Parameters
    ----------
    data : pandas.DataFrame
        Existing games; it is not modified.
    date, away_team, home_team, away_goals, home_goals, away_result,
    home_result, game_length
        Column values for the rows to append, one entry per new row.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the new rows, with a fresh 0..n-1 index.
    """
    fake_data = pd.DataFrame({
        'date': date,
        'away_team': away_team,
        'home_team': home_team,
        'away_goals': away_goals,
        'home_goals': home_goals,
        'outcome': away_result,
        'home_wins': home_result,
        'length_of_game_min': game_length
    })

    # ignore_index avoids duplicate labels: without it the appended rows would
    # restart at 0 and collide with the labels already present in `data`.
    result = pd.concat([data, fake_data], ignore_index=True)
    return result


# Stanley Cup Final schedule as (date, away team, home team), one entry per game.
# Keeping the three values together stops the lists drifting out of sync.
cup_final_schedule = [
    ('2022-06-15', 'Tampa Bay Lightning', 'Colorado Avalanche'),
    ('2022-06-18', 'Tampa Bay Lightning', 'Colorado Avalanche'),
    ('2022-06-20', 'Colorado Avalanche', 'Tampa Bay Lightning'),
    ('2022-06-22', 'Colorado Avalanche', 'Tampa Bay Lightning'),
    ('2022-06-24', 'Tampa Bay Lightning', 'Colorado Avalanche'),
    ('2022-06-26', 'Tampa Bay Lightning', 'Colorado Avalanche'),
]
n_cup_games = len(cup_final_schedule)
cup_dates, cup_away_teams, cup_home_teams = (
    list(column) for column in zip(*cup_final_schedule)
)

# The numeric columns of the placeholder rows are filled with the mean of the
# matching games_df column. Each mean is computed once and repeated per game.
season_means = {
    column: games_df[column].mean()
    for column in ('away_goals', 'home_goals', 'outcome', 'home_wins', 'length_of_game_min')
}

stanley_cup_df = add_fake_data(
    data=games_df,
    date=cup_dates,
    away_team=cup_away_teams,
    home_team=cup_home_teams,
    away_goals=[season_means['away_goals']] * n_cup_games,
    home_goals=[season_means['home_goals']] * n_cup_games,
    away_result=[season_means['outcome']] * n_cup_games,
    home_result=[season_means['home_wins']] * n_cup_games,
    game_length=[season_means['length_of_game_min']] * n_cup_games,
)
stanley_cup_df

Unnamed: 0,date,away_team,away_goals,home_team,home_goals,length_of_game_min,outcome,home_wins
0,2021-10-12,Pittsburgh Penguins,6.000000,Tampa Bay Lightning,2.000000,153.000000,1.000000,0.000000
1,2021-10-12,Seattle Kraken,3.000000,Vegas Golden Knights,4.000000,145.000000,0.000000,1.000000
2,2021-10-13,Winnipeg Jets,1.000000,Anaheim Ducks,4.000000,149.000000,0.000000,1.000000
3,2021-10-13,Chicago Blackhawks,2.000000,Colorado Avalanche,4.000000,152.000000,0.000000,1.000000
4,2021-10-13,Vancouver Canucks,2.000000,Edmonton Oilers,3.000000,162.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...
1,2022-06-18,Tampa Bay Lightning,3.021341,Colorado Avalanche,3.268293,148.640244,0.463415,0.536585
2,2022-06-20,Colorado Avalanche,3.021341,Tampa Bay Lightning,3.268293,148.640244,0.463415,0.536585
3,2022-06-22,Colorado Avalanche,3.021341,Tampa Bay Lightning,3.268293,148.640244,0.463415,0.536585
4,2022-06-24,Tampa Bay Lightning,3.021341,Colorado Avalanche,3.268293,148.640244,0.463415,0.536585


In [17]:
# Missing-data check used while exploring: df.isna().sum()

# Each derived column is the sum of a pair of existing team-stat columns.
derived_totals = {
    'G': ('GF', 'GA'),            # total goals
    'PPG': ('PP', 'PPA'),         # total power-play goals
    'SHOOTOUTS': ('SOW', 'SOL'),  # total games in shootouts
}
for new_col, (left_col, right_col) in derived_totals.items():
    team_stats_df[new_col] = team_stats_df[left_col] + team_stats_df[right_col]

# Persist the extended team statistics next to the other data files.
team_stats_csv = os.path.join(path, 'team_stats.csv')
team_stats_df.to_csv(team_stats_csv, index=False)


In [18]:
def _encode_game_results(frame):
    """Add result flags and a parsed date to a games frame, in place.

    * ``outcome``   -- 1 when ``away_goals - home_goals`` is greater than 0, else 0.
    * ``home_wins`` -- 1 when ``outcome`` is 0, else 0.
    * ``date``      -- converted with ``pd.to_datetime``.
    """
    away_won = (frame['away_goals'] - frame['home_goals']).gt(0)
    frame['outcome'] = away_won.astype('int64')
    frame['home_wins'] = np.where(frame['outcome'] == 0, 1, 0)
    # pd.to_datetime replaces astype('datetime64'): casting to the unit-less
    # 'datetime64' dtype raises a TypeError on pandas >= 2.0.
    frame['date'] = pd.to_datetime(frame['date'])


# convert length of game to minutes. only needed for original file.
# new_vals = []
# for i, j in enumerate(games_df.length_of_game_min.str.split(':')):
#     j = (int(j[0])*60) + int(j[1])
#     new_vals.append(j)

# games_df.length_of_game_min = new_vals

# Apply the same encoding to the regular-season games and the cup schedule.
_encode_game_results(games_df)
_encode_game_results(stanley_cup_df)

# output to csv
games_df.to_csv(os.path.join(path, 'regular_season.csv'), index=False)
stanley_cup_df.to_csv(os.path.join(path, 'stanley_cup.csv'), index=False)


# Classification of Numerical Variables

> Encoding categorical values
> Some features are still categorical and will be converted to numeric codes where necessary. I'll also join the dataframes together to complete the dataset for my train/test split.

In [19]:
# Join each games table to the team statistics: the game's away_team is matched
# against the Team column, and the outer join keeps unmatched rows from both sides.
merge_options = dict(how='outer', left_on='away_team', right_on='Team')
classification_df = pd.merge(games_df, team_stats_df, **merge_options)
cup_data = pd.merge(stanley_cup_df, team_stats_df, **merge_options)

# Save the joined tables while the team names are still readable strings.
classification_df.to_csv(os.path.join(path, 'categorical_teams_set.csv'), index=False)
cup_data.to_csv(os.path.join(path, 'categorical_teams_set_cup.csv'), index=False)

for joined in (classification_df, cup_data):
    # Replace team names with integer category codes; every column of every
    # frame is encoded on its own.
    for team_col in ('home_team', 'away_team'):
        joined[team_col] = joined[team_col].astype('category').cat.codes

    # Datetime feature: day of the week of the game date.
    joined['day_code'] = joined['date'].dt.dayofweek

# Drop the date (not usable for training) and Team (redundant with the team
# codes) columns; all other columns keep their order.
dropped_columns = ['date', 'Team']
classification_df = classification_df.loc[:, ~classification_df.columns.isin(dropped_columns)]
cup_data = cup_data.loc[:, ~cup_data.columns.isin(dropped_columns)]

# Fully encoded datasets; these are the files used for training the model.
classification_df.to_csv(os.path.join(path, 'encoded_variables.csv'), index=False)
cup_data.to_csv(os.path.join(path, 'stanley_cup_encoded_variables.csv'), index=False)

In [20]:
# Summarise the encoded cup dataset: column names, non-null counts and dtypes.
cup_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1318 entries, 0 to 1317
Data columns (total 42 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   away_team           1318 non-null   int8   
 1   away_goals          1318 non-null   float64
 2   home_team           1318 non-null   int8   
 3   home_goals          1318 non-null   float64
 4   length_of_game_min  1318 non-null   float64
 5   outcome             1318 non-null   int64  
 6   home_wins           1318 non-null   int64  
 7   Rk                  1318 non-null   int64  
 8   AvAge               1318 non-null   float64
 9   GP                  1318 non-null   int64  
 10  W                   1318 non-null   int64  
 11  L                   1318 non-null   int64  
 12  OL                  1318 non-null   int64  
 13  PTS                 1318 non-null   int64  
 14  PTS%                1318 non-null   float64
 15  GF                  1318 non-null   int64  
 16  GA    

In [21]:
# TODO: 

"""
Find a way to extract data from the website and save it via this pipeline into the /data directory. 
There is a link on the website to save CSV files, so this can be accessed with selenium 
and exported to the directory OR be ideally fetched via API if available and stored as parquet in an S3 bucket. 
This is once I can get this on AWS. 

Clean this up and store the separate pieces of data in functions, maybe create dataclasses. 
"""

'\nFind a way to extract data from the website and save it via this pipeline into the /data directory. \nThere is a link on the website to save CSV files, so this can be accessed with selenium \nand exported to the directory OR be ideally fetched via API if available and stored as parquet in an S3 bucket. \nThis is once I can get this on AWS. \n\nClean this up and store the separate pieces of data in functions, maybe create dataclasses. \n'