# Data Scraping Using Python Selenium and BeautifulSoup

By Tanat Metmaolee

## References
    Nick's Niche: https://www.youtube.com/watch?v=GLhRlY-3QwE&t=1269s
    NBA: https://www.nba.com/stats/players
    ESPN: https://www.espn.com/nba/stats/player/_/season/2023/seasontype/2

In [25]:
import time
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.service import Service
import pandas as pd

# Reached only when every import above succeeded; acts as a quick environment check.
print("Successfully imported.")

Successfully imported.


In [26]:
# Chrome options object for the browser session (no extra arguments are set here).
options = webdriver.ChromeOptions()
print(options)

# Selenium 4-style construction (`service=` keyword). webdriver_manager supplies the
# chromedriver binary, so it does not have to be updated by hand to keep up with Chrome.
# `driver` is a notebook-level global used by the scraping functions below.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options= options)

<selenium.webdriver.chrome.options.Options object at 0x000001C2C7C9CEF0>


## Things that need to be done
    1. Scrape data from NBA website.
    2. Scrape player's position from ESPN website (or NBA website with automation).
Personally, I will scrape the positions from the ESPN website and join the two DataFrames later.

In [27]:
def get_dataset(season, season_type, sleep_time):
    """
    Scrape the NBA.com stats-leaders table for one season and export it to CSV.

    Loads the page with the notebook-level Selenium ``driver``, switches the
    pagination drop-down to "All", parses the rendered table with BeautifulSoup
    and writes it to ``raw_player_stats_{season}.csv`` (relative path).

    :param  season (str): Season we would like to store data from. Format is 'xxxx-xx' eg. '2022-23'.
            season_type (str): Type of season. Format =  ['Regular', 'Preseason', 'Playoffs'].
            sleep_time (int): Extra pause (seconds) after requesting the page, for slow machines. Otherwise, use 0.
    :raises KeyError: if ``season_type`` is not one of the supported keys.
            TimeoutException: if the pagination drop-down does not appear in time.
            ValueError: if the header row or table body cannot be found in the page.
    """

    ss_type = {'Regular': 'Regular+Season',
               'Preseason': 'Pre+Season',
               'Playoffs': 'Playoffs'}

    # Seconds to wait for the pagination drop-down before giving up.
    dropdown_timeout = 10
    # Absolute path to the page-size <select>; brittle if the page layout changes.
    dropdown_xpath = "/html/body/div/div/div/div/section/div/div/div/div/div/div/label/div/select"

    # URL that we are going to navigate to
    # url = f'https://www.nba.com/stats/players/traditional?PerMode=Totals&sort=PTS&dir=-1&SeasonType={ss_type[season_type]}&Season={season}'
    url = f'https://www.nba.com/stats/leaders?SeasonType={ss_type[season_type]}&PerMode=Totals&Season={season}'
    driver.get(url)

    # Optional caller-controlled pause in case the pc is running slow
    time.sleep(sleep_time)

    # Wait explicitly for the drop-down instead of relying on the fixed pause alone,
    # so the function also works with sleep_time=0 on a slow page load.
    dropdown = WebDriverWait(driver, dropdown_timeout).until(
        EC.presence_of_element_located((By.XPATH, dropdown_xpath))
    )

    # Select "All" to show all the players at once
    Select(dropdown).select_by_visible_text("All")

    # Use BeautifulSoup to scrape the rendered page
    soup = BeautifulSoup(driver.page_source, "html.parser")

    header_row = soup.find('tr', attrs={'class': 'Crom_headers__mzI_m'})
    table_body = soup.find('tbody', attrs={'class': 'Crom_body__UYOcU'})
    if header_row is None or table_body is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"Could not locate the stats table at {url}; "
            "the page layout or CSS class names may have changed."
        )

    # We slice the headers because 0 is the index & 30 or above are rank columns for each category.
    headers = [th.get_text() for th in header_row.find_all('th')[1:30]]

    # One list of cell texts per player row; skip the index cell and keep exactly
    # as many cells as there are headers so columns and headers always line up.
    stats = [
        [td.get_text() for td in row.find_all('td')[1:len(headers) + 1]]
        for row in table_body.find_all('tr')
    ]

    player_stats = pd.DataFrame(stats, columns=headers)

    player_stats.to_csv(f"raw_player_stats_{season}.csv", index=False)
    print("NBA Stats' CSV is succesfully exported.")

In [28]:
def get_player_position(season, season_type):
    """
    Scrape player names and positions from ESPN's NBA player-stats page and export them to CSV.

    Loads the page with the notebook-level Selenium ``driver``, clicks the
    "load more" link until it is no longer available, parses the rendered
    tables with BeautifulSoup and writes ``raw_player_pos_{season}.csv``
    (relative path) with the columns ``Player`` and ``POS``.

    :param  season (str): Season in ESPN's format 'xxxx' e.g. '2023'.
            season_type (str): Type of season. Format = ['regular', 'post'].
    :raises KeyError: if ``season_type`` is not one of the supported keys.
            ValueError: if the expected tables are missing or their row counts differ.
    """

    parts = {'regular' : '2', 'post' : '3'}

    url = f'https://www.espn.com/nba/stats/player/_/season/{season}/seasontype/{parts[season_type]}'
    driver.get(url)

    # Single locator reused for both the click and the follow-up wait.
    load_more = (By.XPATH, '//a[@class="AnchorLink loadMore__link"]')
    wait = WebDriverWait(driver, 5)

    # Keep expanding the table. The loop ends when the link is gone
    # (NoSuchElementException) or does not become clickable again within
    # the wait window (TimeoutException).
    while True:
        try:
            driver.find_element(*load_more).click()
            wait.until(EC.element_to_be_clickable(load_more))
        except (TimeoutException, NoSuchElementException):
            break

    soup = BeautifulSoup(driver.page_source, "html.parser")

    # The first table body holds the player names; the right-aligned table holds the positions.
    name_body = soup.find('tbody', attrs = {'class' : 'Table__TBODY'})
    stats_table = soup.find('table', attrs = {'class' : 'Table Table--align-right'})
    stats_body = None
    if stats_table is not None:
        stats_body = stats_table.find('tbody', attrs = {'class' : 'Table__TBODY'})

    if name_body is None or stats_body is None:
        # Fail with a clear message instead of an AttributeError on None.
        raise ValueError(
            f"Could not locate the player tables at {url}; "
            "the page layout or CSS class names may have changed."
        )

    # Find the player names, in table order.
    players = [
        row.find('a', attrs = {'class' : 'AnchorLink'}).get_text()
        for row in name_body.find_all('tr')
    ]

    # Find each player's position, in the same row order.
    positions = [
        row.find('div', attrs = {'class' : 'position'}).get_text()
        for row in stats_body.find_all('tr')
    ]

    if len(players) != len(positions):
        raise ValueError(
            f"Row count mismatch: {len(players)} player names vs {len(positions)} positions."
        )

    # Map each player to their position.
    raw_player_pos_df = pd.DataFrame({'Player' : players, 'POS' : positions})

    raw_player_pos_df.to_csv(f"raw_player_pos_{season}.csv", index = False)
    print("NBA Player Positions' CSV exported successfully.")

In [32]:
# Season selectors for the two sites; each site uses its own format.
# ESPN scraper arguments
espn_season = '2024'
espn_season_type = 'regular'

# NBA.com scraper arguments
nba_season = '2023-24'
nba_season_type = 'Regular'

In [30]:
# Run the ESPN position scraper for the configured season (writes the positions CSV)
get_player_position(season=espn_season, season_type=espn_season_type)

NBA Player Positions' CSV exported successfully.


In [33]:
# Run the NBA.com stats scraper for the configured season, pausing 5 seconds after page load
get_dataset(season=nba_season, season_type=nba_season_type, sleep_time=5)

NBA Stats' CSV is succesfully exported.


In [34]:
# Reload the two CSVs written by get_dataset and get_player_position above;
# both scraping cells must have run for these files to exist.
player_stats = pd.read_csv(f'raw_player_stats_{nba_season}.csv')
player_positions = pd.read_csv(f'raw_player_pos_{espn_season}.csv')

In [None]:
# Preview the first rows of the NBA.com stats frame
player_stats.head()

Unnamed: 0,Player,Team,Age,GP,W,L,Min,PTS,FGM,FGA,...,REB,AST,TOV,STL,BLK,PF,FP,DD2,TD3,+/-
0,Jayson Tatum,BOS,25,74,52,22,2732.2,2225,727,1559,...,649,342,213,78,51,160,3691,31,1,470
1,Joel Embiid,PHI,29,66,43,23,2284.1,2183,728,1328,...,670,274,226,66,112,205,3706,39,1,424
2,Luka Doncic,DAL,24,66,33,33,2390.5,2138,719,1449,...,569,529,236,90,33,166,3747,36,10,128
3,Shai Gilgeous-Alexander,OKC,24,68,33,35,2416.0,2135,704,1381,...,329,371,192,112,65,192,3425,3,0,149
4,Giannis Antetokounmpo,MIL,28,63,47,16,2023.6,1959,707,1278,...,742,359,246,52,51,197,3451,46,6,341


In [35]:
# Preview the first rows of the ESPN positions frame
player_positions.head()

Unnamed: 0,Player,POS
0,Luka Doncic,PG
1,Giannis Antetokounmpo,PF
2,Shai Gilgeous-Alexander,PG
3,Jalen Brunson,PG
4,Kevin Durant,PF
