Created by: [SmirkyGraphs](https://smirkygraphs.github.io/). Code: [Github](https://github.com/SmirkyGraphs/Python-Notebooks). Source: [NHL-API](https://gitlab.com/dword4/nhlapi), [NHL-Stats](https://www.nhl.com/stats/), [NHL-Records](https://records.nhl.com/records/skater-records).
<hr>

# Bergeron Faceoff Stats

This notebook pulls faceoff data for Bergeron and other centers from the NHL stats and records pages for multiple seasons. The data is then cleaned and combined using pandas. For more detailed stats, data is also pulled from the NHL API dataset, which provides faceoff-by-faceoff data as opposed to the aggregated information from the stats and records data (for details on the queries, look in `/data/queries/`).
<hr>

In [1]:
import requests
import pandas as pd
import numpy as np

import math
from pathlib import Path

# Player id used to build the /people/{pid}/stats URLs and to flag
# faceoffs won by this player in the row-level query results.
pid = 8470638 # Bergeron's player_id
# Root of the stats API; the game-log and year-by-year requests are built from it.
BASE_URL = 'https://statsapi.web.nhl.com/api/v1'
# Records endpoint queried for career faceoff totals.
RECORDS_URL = 'https://records.nhl.com/site/api/skater-real-time-stats-career'
# Skater faceoff-wins endpoint; queried with `params` for season and per-game totals.
STATS_API_URL = "https://api.nhle.com/stats/rest/en/skater/faceoffwins?"
# Team season endpoint; queried with `params` for team faceoff stats.
TEAM_API_URL = "https://records.nhl.com/site/api/team-season-record-and-scoring?"

In [2]:
# Query parameters shared by every request to the faceoff-wins endpoint.
# "cayenneExp", "factCayenneExp" and "start" are overwritten per request.
data = {
    "isAggregate": "false",
    "isGame": "false",
    "limit": 100,
    "start": 0
}

# make sure the output folders exist before anything is written to them
raw_dir = Path('./data/raw/season-totals-faceoffs')
clean_dir = Path('./data/clean')
raw_dir.mkdir(parents=True, exist_ok=True)
clean_dir.mkdir(parents=True, exist_ok=True)

# loop over seasons and get all players with enough faceoffs taken:
# >= 150 for game type 2, >= 30 for game type 3 (playoffs)
for game_type in [2, 3]:
    faceoffs = (150 if game_type == 2 else 30)

    for season in range(2000, 2023):
        data.update({"cayenneExp": f"gameTypeId={game_type} and seasonId=={season}{season + 1}"})
        data.update({"factCayenneExp": f"gamesPlayed>=1 and totalFaceoffs>={faceoffs}"})

        # Page through the results `limit` rows at a time. The loop stops as
        # soon as the reported total has been covered, so no request is made
        # past the last page.
        frames = []
        start = 0
        while True:
            data.update({"start": start})
            r = requests.get(STATS_API_URL, params=data, timeout=30)
            r.raise_for_status()  # fail loudly instead of parsing an error body
            payload = r.json()

            rows = payload['data']
            if not rows:
                break

            df = pd.DataFrame(rows)
            df['game_type'] = game_type
            frames.append(df)

            start += data['limit']
            if start >= payload['total']:
                break

        # add season (seasons that returned no rows produce no file)
        if frames:
            df = pd.concat(frames)
            df.to_csv(raw_dir / f'{season}{season + 1}-{game_type}.csv', index=False)

# combine data; files are read in sorted order so the output is reproducible
data = [pd.read_csv(f) for f in sorted(raw_dir.glob('*.csv'))]
df = pd.concat(data)

cols = ['seasonId', 'skaterFullName', 'totalFaceoffWins', 'totalFaceoffLosses', 'totalFaceoffs', 'gamesPlayed', 'game_type']
df = df[cols].copy()
df['percent_won'] = df['totalFaceoffWins'] / df['totalFaceoffs']

# dense ranks within each (season, game type); rank 1 is the highest value
by_season = df.groupby(['seasonId', 'game_type'])
df['season_rank_pct'] = by_season['percent_won'].rank(method='dense', ascending=False)
df['season_rank_tot'] = by_season['totalFaceoffWins'].rank(method='dense', ascending=False)

df.to_csv(clean_dir / 'season-totals-faceoffs.csv', index=False)

In [3]:
# Query parameters for per-game ("isGame": "true") rows from the faceoff-wins
# endpoint. "cayenneExp" and "start" are overwritten per request.
data = {
    "isAggregate": "false",
    "isGame": "true",
    "limit": 100,
    "start": 0,
    "factCayenneExp": "gamesPlayed>=1"
}

# make sure the output folders exist before anything is written to them
raw_dir = Path('./data/raw/bergeron-all-games')
clean_dir = Path('./data/clean')
raw_dir.mkdir(parents=True, exist_ok=True)
clean_dir.mkdir(parents=True, exist_ok=True)

# loop over seasons and get all game totals for bergeron
player = 'skaterFullName like "%Patrice Bergeron%"'
for game_type in [2, 3]:
    for season in range(2000, 2023):
        data.update({"cayenneExp": f'gameTypeId={game_type} and seasonId=={season}{season + 1} and {player}'})

        # Page through the results `limit` rows at a time. The loop stops as
        # soon as the reported total has been covered, so no request is made
        # past the last page.
        frames = []
        start = 0
        while True:
            data.update({"start": start})
            r = requests.get(STATS_API_URL, params=data, timeout=30)
            r.raise_for_status()  # fail loudly instead of parsing an error body
            payload = r.json()

            rows = payload['data']
            if not rows:
                break

            df = pd.DataFrame(rows)
            df['game_type'] = game_type
            frames.append(df)

            start += data['limit']
            if start >= payload['total']:
                break

        # add season (seasons that returned no rows produce no file)
        if frames:
            df = pd.concat(frames)
            df.to_csv(raw_dir / f'{season}{season + 1}-{game_type}.csv', index=False)

# combine data; files are read in sorted order so the output is reproducible
data = [pd.read_csv(f) for f in sorted(raw_dir.glob('*.csv'))]
df = pd.concat(data)

df.to_csv(clean_dir / 'bergeron-gamelog-faceoffs.csv', index=False)

In [4]:
# Team-level faceoff stats: one request and one raw CSV per game type,
# limited to the columns listed in `cols` plus the game type itself.
cols = ['teamName', 'seasonId', 'faceoffWinPctg', 'faceoffsLost', 'faceoffsWon', 'faceoffsTaken']

data = {}
for game_type in (2, 3):
    data["cayenneExp"] = f'gameTypeId={game_type} and seasonId >= 20042005'
    r = requests.get(TEAM_API_URL, params=data)

    team_rows = r.json()['data']
    # keep the wanted columns and tag every row with the game type queried
    df = pd.DataFrame(team_rows)[cols].assign(game_type=game_type)
    df.to_csv(f'./data/raw/all-teams-faceoffs/all-teams-{game_type}.csv', index=False)

In [5]:
# Career faceoff totals from the records endpoint. The URL-encoded filter
# decodes to: gameTypeId = 2 and faceoffsTaken >= 500 and franchiseId=null
url = f'{RECORDS_URL}?cayenneExp=gameTypeId%20=%202%20and%20faceoffsTaken%20%3E=%20500%20and%20franchiseId=null'

career_rows = requests.get(url).json()['data']
df = pd.DataFrame(career_rows)
df.to_csv('./data/clean/records-api-top-faceoffs-totals.csv', index=False)

In [6]:
# Game log for the player: one request per season (start years 2003-2022),
# each season's splits flattened into a frame and all frames stacked.
data = []
for season in range(2003, 2023):
    url = f'{BASE_URL}/people/{pid}/stats?stats=gameLog&season={season}{season+1}'
    splits = requests.get(url).json()['stats'][0]['splits']
    data.append(pd.json_normalize(splits))

df = pd.concat(data)
df.to_csv('./data/clean/gamelog-faceoff-percent.csv', index=False)

In [7]:
# Year-by-year stat totals for the player, keeping only the rows whose
# league name is 'National Hockey League'.
url = f'{BASE_URL}/people/{pid}/stats?stats=yearByYear'
splits = requests.get(url).json()['stats'][0]['splits']

df = pd.json_normalize(splits)
is_nhl = df['league.name'] == 'National Hockey League'
df = df[is_nhl]
df.to_csv('./data/clean/year-by-year-season-stats.csv', index=False)

In [61]:
# Row-level faceoff data exported from the sql database.
df = pd.read_csv('./data/raw/query_results-2023-08-22_52557.csv')

# team_name holds the opponent: whichever side of the game is not Boston
boston_home = df['home_team'] == 'Boston Bruins'
boston_away = df['away_team'] == 'Boston Bruins'
df.loc[boston_home, 'team_name'] = df['away_team']
df.loc[boston_away, 'team_name'] = df['home_team']

# Ages in whole years at the time of each faceoff, using 365.25 days per year.
berg_dob = pd.to_datetime('July 24, 1985')
event_time = pd.to_datetime(df['about_datetime'])
player_days = (event_time - pd.to_datetime(df['birth_date'])).dt.days
berg_days = (event_time - berg_dob).dt.days
df['age'] = np.floor(player_days / 365.25).astype(int)
df['berg_age'] = np.floor(berg_days / 365.25).astype(int)
df['player_age_diff_from_berg'] = df['age'] - df['berg_age']

# Running win streaks. Every non-win starts a new run id, and the cumulative
# sum of the win flag inside a run is the streak length at that row.
df['berg_win_bool'] = df['winner'] == pid
df = df.sort_values(by=['game_id', 'event_key'])
won = df['berg_win_bool']
run_id = (~won).cumsum()
df['win_cnt'] = won.groupby(run_id).cumsum()  # streak carries across games
df['win_streak'] = won.groupby([df['game_id'], run_id]).cumsum()  # restarts every game

In [62]:
# Columns left out of the saved file; every other column is written in its
# existing order.
drop_cols = ['about_eventid', 'about_eventidx', 'height', 'weight', 'active', 'result_description', 'result_event',
             'first_name', 'last_name', 'primary_number', 'birth_city', 'rookie', 'rosterstatus', 'primary_position_name',
             'primary_position_type', 'primary_position_abbreviation', 'about_periodtime', 'about_goals_home',
             'about_goals_away', 'home_team', 'away_team', 'win_cnt', 'win_streak']

wanted = ~df.columns.isin(drop_cols)
cols = df.columns[wanted].tolist()
df[cols].to_csv('./data/clean/query-clean.csv', index=False)