In [10]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import streamlit as st
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

# Page to scrape (QuantHockey, nhl-players-stats). The scraping cell further
# down opens this address in a browser and reads the table from its HTML.
url = "https://www.quanthockey.com/nhl/seasons/nhl-players-stats.html"

In [2]:
# Function to click on a specific page number
def click_page_number(page_number, timeout=1):
    """Click the pagination link for ``page_number`` in the open browser.

    Uses the module-level ``driver``. The function scrolls the pagination bar
    into view, waits for the link whose text is the page number to become
    clickable, clicks it, and then waits until no element with class
    ``loading-spinner`` is visible.

    Parameters
    ----------
    page_number : int
        Page to navigate to; matched against the link text.
    timeout : float, optional
        Seconds to wait for each of the two conditions (default 1, the value
        that was previously hard-coded).

    Returns
    -------
    bool
        True when the click and the follow-up wait succeeded, False when
        Selenium reported any failure (missing pagination bar, link not found
        in time, click rejected, ...). Callers use False as the signal that
        there are no more pages.
    """
    # Selenium is already a dependency of this notebook; imported here so the
    # function stays self-contained.
    from selenium.common.exceptions import WebDriverException

    try:
        # Scroll to page number links so that they're not obstructed.
        # This lookup lives inside the try so that a missing pagination bar
        # yields False like every other Selenium failure instead of raising.
        pagination_element = driver.find_element(By.CLASS_NAME, "pagination")
        driver.execute_script("arguments[0].scrollIntoView();", pagination_element)

        # Find the element by link text (the page number)
        page_link = WebDriverWait(driver, timeout).until(
            EC.element_to_be_clickable((By.LINK_TEXT, str(page_number)))
        )
        page_link.click()

        # Wait for the new content to load (if needed)
        WebDriverWait(driver, timeout).until(
            EC.invisibility_of_element_located((By.CLASS_NAME, 'loading-spinner'))
        )

        return True
    except WebDriverException as e:
        # Only Selenium errors are treated as "no such page"; programming
        # errors (e.g. ``driver`` not defined) still surface to the caller.
        print(f"Error clicking on page {page_number}: {e}")
        return False

In [3]:
def extract_data(html):
    """Parse one page of HTML into a list of table rows.

    Parameters
    ----------
    html : str
        Page source to parse.

    Returns
    -------
    list[list[str]]
        One list per kept ``<tr>``: the text of cell 0 (rank), the text of
        cell 2 (player name), then the text of every cell from index 3 on.
        Cell 1 is dropped.
    """
    soup = BeautifulSoup(html, "html.parser")
    stats = []

    # Skip the first two rows
    for row in soup.find_all('tr')[2:]:
        columns = row.find_all(['th', 'td'])

        # A row with fewer than three cells has no rank/name pair to read;
        # skip it rather than fail with IndexError on columns[2].
        if len(columns) < 3:
            continue

        rank = columns[0].text
        player_name = columns[2].text

        # Rank and name first, then every remaining stat cell in page order.
        stats.append([rank, player_name, *(column.text for column in columns[3:])])

    return stats

In [5]:
# Open the browser. Everything that uses it runs inside try/finally so the
# Firefox process is always closed, even when scraping fails part-way.
driver = webdriver.Firefox()
try:
    # driver.get() returns None, so there is nothing worth keeping from it.
    driver.get(url)

    # Initial extraction for the first set of rows
    html = driver.page_source

    # Extract Columns Header
    soup = BeautifulSoup(html, "html.parser")
    row = soup.find('tr', {'role': 'row', 'class': 'orange'})
    if row is None:
        # Fail with a clear message instead of AttributeError on None below.
        raise RuntimeError("Header row (tr role='row' class='orange') not found in page source")
    header_cells = row.find_all('th', {'role': 'columnheader'})
    # Keep only header cells that actually carry text.
    header = [text for text in (th.get_text(strip=True) for th in header_cells) if text]

    # Get Data
    stats_total = extract_data(html)

    print("page 1")
    page_number = 2

    # Loop to load and extract additional rows; click_page_number returns
    # False once the requested page cannot be reached, which ends the loop.
    while click_page_number(page_number):
        print("page " + str(page_number))
        page_number += 1
        html = driver.page_source
        stats_total += extract_data(html)
finally:
    driver.quit()

page = 1
page = 2
page = 3
page = 4
page = 5
page = 6
page = 7
page = 8
page = 9
page = 10
page = 11
page = 12
page = 13
page = 14
page = 15
page = 16
page = 17
Error clicking on page 18: Message: 
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:189:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:507:5
dom.find/</<@chrome://remote/content/shared/DOM.sys.mjs:132:16



In [35]:
# Build one table from all scraped rows, using the scraped header texts as
# column names; the bare `df` on the last line displays it.
df = pd.DataFrame(stats_total, columns=header)
df

Unnamed: 0,Rk,Name,Team,Age,Pos,GP,G,A,P,PIM,...,G/GP,A/GP,P/GP,SHOTS,SH%,HITS,BS,FOW,FOL,FO%
0,1,Nikita Kucherov,TBL,30,F,24,15,25,40,8,...,0.625,1.042,1.667,117,12.8%,13,9,0,2,0.00%
1,2,David Pastrnak,BOS,27,F,24,14,22,36,12,...,0.583,0.917,1.500,114,12.3%,24,4,0,2,0.00%
2,3,J.T. Miller,VAN,30,F,25,13,23,36,18,...,0.520,0.920,1.440,56,23.2%,52,19,270,228,54.22%
3,4,Artemi Panarin,NYR,32,F,23,15,20,35,6,...,0.652,0.870,1.522,89,16.9%,1,3,0,3,0.00%
4,5,Quinn Hughes,VAN,24,D,25,9,25,34,14,...,0.360,1.000,1.360,73,12.3%,11,19,0,0,0.00%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,828,Connor Hellebuyck,WPG,30,G,18,0,0,0,0,...,0.000,0.000,0.000,,,,,,,
828,829,Mackenzie Blackwood,SJS,27,G,18,0,0,0,2,...,0.000,0.000,0.000,,,,,,,
829,830,Juuse Saros,NSH,28,G,19,0,0,0,0,...,0.000,0.000,0.000,,,,,,,
830,831,Jonas Johansson,TBL,28,G,19,0,0,0,0,...,0.000,0.000,0.000,,,,,,,


In [36]:
def clean(df):
    """Return a cleaned copy of the scraped player table.

    Steps, in order:

    1. Drop goalie rows (``Pos == "G"``) and the ``Rk`` and ``Team`` columns.
    2. Convert the ``TOI``, ``ES``, ``PP`` and ``SH`` columns from
       ``"minutes:seconds"`` strings (e.g. ``"20:12"``) to whole seconds.
    3. Encode ``Pos`` as 0 for ``"D"`` and 1 for ``"F"``; any other value
       becomes missing.
    4. Strip the ``%`` character from ``PPP%``, ``FO%`` and ``SH%``. The
       values stay strings; no numeric conversion is done here.

    Missing or blank cells in the time and percentage columns are passed
    through as missing instead of raising.

    The input frame is not modified. ``Name`` is kept as a regular column:
    the previous version ended with a bare ``df.set_index("Name")`` whose
    result was discarded, so the returned frame never had ``Name`` as index.
    Call ``.set_index("Name")`` on the result if that is wanted.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the transformations applied.
    """
    time_columns = ['TOI', 'ES', 'PP', 'SH']
    percent_columns = ['PPP%', 'FO%', 'SH%']

    # Convert time stats (e.g., 20:12) to seconds.
    def time2sec(time):
        # Missing or blank cell: nothing to convert.
        if pd.isna(time) or (isinstance(time, str) and not time.strip()):
            return float("nan")
        m, s = time.split(':')
        return 60 * int(m) + int(s)

    # Remove '%' symbol; non-string cells (e.g. missing values) pass through.
    def perc_rem(string):
        if isinstance(string, str):
            return string.replace('%', '')
        return string

    # Remove goalie entries and unnecessary columns. The explicit copy makes
    # the column assignments below act on an independent frame, not a slice.
    skaters = df[df['Pos'] != "G"].drop(['Rk', 'Team'], axis=1).copy()

    for column in time_columns:
        skaters[column] = skaters[column].apply(time2sec)

    # Convert Position to boolean (0 = D, 1 = F)
    skaters['Pos'] = skaters['Pos'].map({"D": 0, "F": 1})

    for column in percent_columns:
        skaters[column] = skaters[column].apply(perc_rem)

    return skaters

In [39]:
# Load the table from disk; this rebinds `df`.
# NOTE(review): no visible cell writes data.csv — presumably it holds the
# output of clean() saved in a step not shown here; confirm before relying
# on Restart & Run All.
df = pd.read_csv("data.csv")

In [41]:
# Keep the player names as a plain array, one entry per row of df
names = df.Name.values

In [42]:
# Display only: set_index returns a new frame that is shown but not assigned,
# so `df` itself still has Name as an ordinary column afterwards.
df.set_index("Name")

Unnamed: 0_level_0,Age,Pos,GP,G,A,P,PIM,+/-,TOI,ES,...,G/GP,A/GP,P/GP,SHOTS,SH%,HITS,BS,FOW,FOL,FO%
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Nikita Kucherov,30,1,24,15,25,40,8,-3,1295,1049,...,0.625,1.042,1.667,117,12.8,13,9,0,2,0.00
David Pastrnak,27,1,24,14,22,36,12,8,1170,934,...,0.583,0.917,1.500,114,12.3,24,4,0,2,0.00
J.T. Miller,30,1,25,13,23,36,18,11,1200,842,...,0.520,0.920,1.440,56,23.2,52,19,270,228,54.22
Artemi Panarin,32,1,23,15,20,35,6,3,1167,974,...,0.652,0.870,1.522,89,16.9,1,3,0,3,0.00
Quinn Hughes,24,0,25,9,25,34,14,18,1486,1156,...,0.360,1.000,1.360,73,12.3,11,19,0,0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Chris Tierney,29,1,12,0,0,0,2,-4,487,461,...,0.000,0.000,0.000,1,0.0,5,1,32,23,58.18
Samuel Bolduc,23,0,13,0,0,0,2,-5,635,629,...,0.000,0.000,0.000,10,0.0,11,10,0,0,0.00
Marc-Edouard Vlasic,36,0,14,0,0,0,4,-8,844,756,...,0.000,0.000,0.000,10,0.0,6,20,0,0,0.00
Waltteri Merelä,25,1,14,0,0,0,4,-1,634,567,...,0.000,0.000,0.000,16,0.0,18,3,0,1,0.00


In [50]:
def prepare_X(df):
    """Return the frame's values as an array, with missing entries set to 0.

    The input frame is left untouched; ``fillna`` produces a new frame whose
    underlying values are returned.
    """
    return df.fillna(0).values

In [59]:
# Feature matrix: missing values filled with 0, then the first column dropped.
# NOTE(review): which column is first depends on how df was loaded — confirm
# that this slice removes the intended (non-feature) column.
X_pred = prepare_X(df)[:, 1:]

In [62]:
# Preview X_pred without its first column (display only; X_pred is unchanged)
X_pred[:, 1:]

array([[30, 1, 24, ..., 0, 2, 0.0],
       [27, 1, 24, ..., 0, 2, 0.0],
       [30, 1, 25, ..., 270, 228, 54.22],
       ...,
       [36, 0, 14, ..., 0, 0, 0.0],
       [25, 1, 14, ..., 0, 1, 0.0],
       [26, 0, 19, ..., 0, 0, 0.0]], dtype=object)

In [85]:
# Sample float values used to try out rounding
y = [2.3, 2.6]

In [88]:
# Round each value to zero decimals, then convert the result to int.
# Python's round() sends exact .5 ties to the even neighbour.
a = [int(round(x, 0)) for x in y]

In [89]:
# Show the rounded values
a

[2, 3]