In [1]:
import unicodedata
import requests
import numpy as np
import re
from bs4 import BeautifulSoup

API_URL = "https://statsapi.web.nhl.com/api/v1/"

play_id_dict = {'PGSTR': 'GAME_SCHEDULED', 'PGEND': 'PERIOD_READY', 
'PSTR': 'PERIOD_START', 'FAC': 'FACEOFF', 'HIT': 'HIT', 'STOP': 'STOP', 
'SHOT': 'SHOT', 'TAKE': 'TAKEAWAY', 'BLOCK': 'BLOCKED_SHOT', 
'MISS': 'MISSED_SHOT', 'GIVE': 'GIVEAWAY', 'PEND': 'PERIOD_END', 
'GOAL': 'GOAL', 'PENL': 'PENALTY', 'GEND': 'GAME_END',
}

In [2]:
def search_for_games(start, end=None):
  """
  Gets the list of Game IDs for games played during the requested time period.

  Dates should be in YYYY-MM-DD format as string, with time period requested.
  For same day, enter a single date. Returns a list of all games played during 
  that time period.
  """
  web_request = requests.get(API_URL + f"schedule?startDate={start}&endDate={end if end else start}")
  data_json = web_request.json()
  if end == None:
    gameList = data_json["dates"][0]["games"]
  else:
    gameList = []
    for day in data_json["dates"]:
      for game in day["games"]:
        gameList.append(game)
  return gameList

def get_game_info(gameID):
  """
  Gets game information from the NHL API. 
  
  Takes a single parameter, gameID, in string format.

  Game ID is in the format YYYYTT#### where YYYY is start year of season. 
  TT is the game type: 01 Preseason, 02 Regular Season, 03 Playoffs, and 
  04 All Star. #### is the game number start at 0001. In the playoffs, the 
  second digit indicates the round, the third digit indicates the matchup, 
  and the final digit indicates the game number.

  Returns JSON 
  """
  web_request = requests.get(API_URL + f"game/{gameID}/feed/live")
  data_json = web_request.json()
  if len(data_json) == 2:
    return None
  else:
    return data_json


In [3]:
gameID = '2021020001'

In [4]:
plays = get_game_info(gameID)['liveData']['plays']['allPlays']

In [5]:
season = f"{gameID[:4]}{int(gameID[:4]) + 1}"
web_request = requests.get(f"http://www.nhl.com/scores/htmlreports/{season}/PL{gameID[4:]}.HTM")
soup = BeautifulSoup(web_request.content, 'html.parser')

In [63]:
def clean_play_by_play(pages):
  """
  Takes the raw html from the NHL play by play page and parses out each 
  of the plays.

  Takes a single parameter, 'pages' which is a list of elements from the
  NHL Play by Play site with the class of page. It then parses out each
  of the plays and does an initial cleaning of the data.

  Returns a list of plays, with each of those being a list. 
  """
  allPlays = []

  for page in pages:
    tableRows = [row for row in page.table.children if row != "\n"]
    data = []
    for row in tableRows:
      rowCells = [block for block in row.children if block != "\n"]
      cleanRow = []
      for row in rowCells:
        cleanRow.append(unicodedata.normalize('NFKD', row.get_text()).strip().replace("\n",""))
      data.append(cleanRow)
    for play in data:
      cleanData = [play for play in data if len(play) == 8]
    for play in cleanData:
      if play[0] == '#':
        continue
      else:
        play[3] = re.findall(r'(\d{1,2}:\d{2})', play[3])
        if len(play[3][0]) == 4:
          play[3][0] = '0' + play[3][0]
        homePlayers = {}
        visitPlayers = {}
        if 'Ice' not in play[6]:
          for player in play[6].split():
            if (player[-1] == 'D') & ('D1' in homePlayers):
              homePlayers['D2'] = player[:-1]
            elif player[-1] == 'D':
              homePlayers['D1'] = player[:-1]
            else:
              homePlayers[player[-1]] = player[:-1]
          for player in play[7].split():
            if (player[-1] == 'D') & ('D1' in visitPlayers):
              visitPlayers['D2'] = player[:-1]
            elif player[-1] == 'D':
              visitPlayers['D1'] = player[:-1]
            else:
              visitPlayers[player[-1]] = player[:-1]
          play[6] = homePlayers
          play[7] = visitPlayers
        allPlays.append(play)
  return allPlays


In [70]:
pxp_plays = clean_play_by_play(soup.select('.page'))
test_play = pxp_plays[5]

Play ID, Period, Strength, [Time Elapsed, Time Remaining], Event, Description, Visiting Players on Ice, Home Players on Ice

Should be able to match the period, event, and time elapsed to play in the main play list

Period to {play}.about.period<br />
Event to {play}.result.eventTypeId<br />
Time Elapsed to {play}.about.periodTime<br />

In [103]:
def clean_main_plays(raw_play_data):
  """
  Takes a single paramater of the json for the raw play data from the NHL API
  and then cleans it up for the required info, leaving spaces to be filled in
  by the NHL play sheet.
  
  Returns a list of plays in the following format: [playId, period, strength,
  [Time Elapsed, Time Remaining], Event, Descripition, Away Team Players on
  Ice, Home Team Players on Ice, ...1-4 players involved in the play, dateTime,
  Coordinates]
  """
  playList = []
  for play in raw_play_data:
    newPlay = [
      str(play['about']['eventId']), 
      str(play['about']['period']), 
      '', 
      play['about']['periodTime'], 
      play['about']['periodTimeRemaining'],
      play['result']['eventTypeId'],
      play['result']['description'],
      {},
      {},
      '',
      '',
      '',
      '',
      play['about']['dateTime'],
      play['coordinates']
    ]
    if 'players' in play.keys():
      step = 0
      for player in play['players']:
        newPlay[9 + step] = player['player']['fullName']
        step += 1
    playList.append(newPlay)
  return playList
    
test_main_plays = clean_main_plays(plays)
print(test_main_plays[0])

['1', '1', '', '00:00', '20:00', 'GAME_SCHEDULED', 'Game Scheduled', {}, {}, '', '', '', '', '2021-10-12T22:45:48Z', {}]


In [104]:
print(test_main_plays[4])

['8', '1', '', '00:18', '19:42', 'HIT', 'Ondrej Palat hit Jeff Carter', {}, {}, 'Ondrej Palat', 'Jeff Carter', '', '', '2021-10-12T23:44:24Z', {'x': 46.0, 'y': 40.0}]


In [105]:
print(test_play)

['6', '1', 'EV', ['00:18', '19:42'], 'HIT', 'TBL #18 PALAT HIT PIT #77 CARTER, Off. Zone', {'C': '77', 'R': '17', 'L': '43', 'D1': '8', 'D2': '58', 'G': '35'}, {'C': '21', 'R': '86', 'L': '18', 'D1': '27', 'D2': '81', 'G': '88'}]


play_id (main), period (main), strength (test), period_time (main), remain_time (main), play_code (main), play_description (main), visit_player_on_ice (test), home_player_on_ice (test), player_one (main), player_two (main), player_three (main), player_four (main), date_time (main), coordinate (main)

In [118]:
def combine_play_information(main_info, player_info):
  """
  Takes two parameters. The first is a list of plays from the NHL API parsed
  from the JSON and cleaned, taking the required information. The second is
  The NHL Play by Play HTML report that has been parsed.  It then executes an 
  inner join based on the period, time into the period, and the event type.

  Returns a list of plays in the following format: [playId, period, strength,
  Time Elapsed, Time Remaining, Event, Description, Away Team Players on
  Ice, Home Team Players on Ice, ...1-4 players involved in the play, dateTime,
  Coordinates]
  """
  
  print(main_info[2], player_info[0])
  print(player_info[0][1], main_info[2][1], player_info[0][1] != main_info[2][1])
  print(player_info[0][3][0], main_info[2][3], player_info[0][3][0] != main_info[2][3])
  print(player_info[0][4], player_info[0][4] not in play_id_dict.keys())
  print(player_info[0][4], main_info[2][5])
  play_index = 0
  pxp_total = len(player_info)
  for main_play in main_info:
    for x in range(play_index, pxp_total):
      if player_info[x][1] != main_play[1]:
        continue
      if player_info[x][3][0] != main_play[3]:
        continue
      if player_info[x][4] not in play_id_dict.keys():
        continue
      else:
        if play_id_dict[player_info[x][4]] != main_play[5]:
          play_index += 1
        else:
          play_index += 1
          print("match found")
      

In [119]:
combine_play_information(test_main_plays, pxp_plays)

['51', '1', '', '00:00', '20:00', 'PERIOD_START', 'Period Start', {}, {}, '', '', '', '', '2021-10-12T23:44:06Z', {}] ['1', '1', '', ['00:00', '20:00'], 'PGSTR', '', {}, {}]
1 1 False
00:00 00:00 False
PGSTR True
PGSTR PERIOD_START
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
match found
