In [1]:
from selenium import webdriver
from datetime import datetime
import pandas as pd
import numpy as np
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException


# Set up the Selenium WebDriver (make sure to have the appropriate webdriver installed)
driver = webdriver.Edge(executable_path=r"C:\Program Files (x86)\msedgedriver.exe")


# URL of the webpage containing the dropdown menu
url = f"https://www.basketball-reference.com/leagues/NBA_2024_per_game.html"
driver.get(url)

# Player Ratings

In [8]:
# Assuming driver is your WebDriver instance
wait = WebDriverWait(driver, 10)

# Find the element
hide_partial_row_button = wait.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="per_game_stats_toggle_partial_table"]')))

# Scroll into view if needed
driver.execute_script("arguments[0].scrollIntoView();", hide_partial_row_button)

# Click the element
hide_partial_row_button.click()

print('Partial rows hidden.')

ElementClickInterceptedException: Message: element click intercepted: Element <button class="tooltip" tip="" type="button" id="per_game_stats_toggle_partial_table" style="background-color: rgb(255, 246, 85);">...</button> is not clickable at point (73, 7). Other element would receive the click: <a href="/leagues/NBA_2024.html">...</a>
  (Session info: MicrosoftEdge=122.0.2365.66)


In [9]:
# Get table with data
table = driver.find_element(By.XPATH,'//*[@id="per_game_stats"]')

# Extract table header
headers = table.find_element(By.XPATH, '//*[@id="per_game_stats"]/thead').text.split(' ')

# Extract data from table
tbody = table.find_element(By.XPATH, '//*[@id="per_game_stats"]/tbody')

In [10]:
# Get table data
player_rows = [row.text.split(' ') for row in tbody.find_elements(By.CLASS_NAME, 'full_table')]
player_rows[:2]

[['1',
  'Precious',
  'Achiuwa',
  'PF-C',
  '24',
  'TOT',
  '53',
  '14',
  '22.5',
  '3.3',
  '6.8',
  '.490',
  '0.4',
  '1.5',
  '.250',
  '2.9',
  '5.3',
  '.559',
  '.518',
  '1.0',
  '1.6',
  '.627',
  '2.7',
  '3.9',
  '6.6',
  '1.4',
  '0.6',
  '0.8',
  '1.1',
  '1.9',
  '8.0'],
 ['2',
  'Bam',
  'Adebayo',
  'C',
  '26',
  'MIA',
  '49',
  '49',
  '34.6',
  '7.7',
  '15.1',
  '.514',
  '0.0',
  '0.2',
  '.083',
  '7.7',
  '14.8',
  '.521',
  '.514',
  '4.9',
  '6.2',
  '.778',
  '2.2',
  '8.3',
  '10.5',
  '4.1',
  '1.1',
  '1.0',
  '2.5',
  '2.5',
  '20.3']]

In [11]:
player_data = []
for row in player_rows:   
    # Find rows where the player name has three parts 'Greg Brown III'
    if len(row) > 31:
        row.pop(1)
        row.pop(1)
        row.pop(1)
    # Find rows where the player name has two parts 'Kobe Brown'
    else:
        row.pop(1)
        row.pop(1)
    player_data.append(row)
    
player_data[:2]

[['1',
  'PF-C',
  '24',
  'TOT',
  '53',
  '14',
  '22.5',
  '3.3',
  '6.8',
  '.490',
  '0.4',
  '1.5',
  '.250',
  '2.9',
  '5.3',
  '.559',
  '.518',
  '1.0',
  '1.6',
  '.627',
  '2.7',
  '3.9',
  '6.6',
  '1.4',
  '0.6',
  '0.8',
  '1.1',
  '1.9',
  '8.0'],
 ['2',
  'C',
  '26',
  'MIA',
  '49',
  '49',
  '34.6',
  '7.7',
  '15.1',
  '.514',
  '0.0',
  '0.2',
  '.083',
  '7.7',
  '14.8',
  '.521',
  '.514',
  '4.9',
  '6.2',
  '.778',
  '2.2',
  '8.3',
  '10.5',
  '4.1',
  '1.1',
  '1.0',
  '2.5',
  '2.5',
  '20.3']]

In [12]:
player_names = []

# Extract player names from table
names = tbody.find_elements(By.CLASS_NAME, 'left')

# Extract names and add to array
for name in names:
    if len(name.text) > 3:
        player_names.append(name.text)

# Close the browser
driver.close()
player_names[:4]

['Precious Achiuwa', 'Bam Adebayo', 'Ochai Agbaji', 'Santi Aldama']

In [13]:
# Add respective team names to corresponding table data rows
for i in range(len(player_data)):
    player_data[i].insert(1, player_names[i])
player_data[:2]

[['1',
  'Precious Achiuwa',
  'PF-C',
  '24',
  'TOT',
  '53',
  '14',
  '22.5',
  '3.3',
  '6.8',
  '.490',
  '0.4',
  '1.5',
  '.250',
  '2.9',
  '5.3',
  '.559',
  '.518',
  '1.0',
  '1.6',
  '.627',
  '2.7',
  '3.9',
  '6.6',
  '1.4',
  '0.6',
  '0.8',
  '1.1',
  '1.9',
  '8.0'],
 ['2',
  'Bam Adebayo',
  'C',
  '26',
  'MIA',
  '49',
  '49',
  '34.6',
  '7.7',
  '15.1',
  '.514',
  '0.0',
  '0.2',
  '.083',
  '7.7',
  '14.8',
  '.521',
  '.514',
  '4.9',
  '6.2',
  '.778',
  '2.2',
  '8.3',
  '10.5',
  '4.1',
  '1.1',
  '1.0',
  '2.5',
  '2.5',
  '20.3']]

In [14]:
# Create a pandas Data Frame
player_df = pd.DataFrame(data=player_data, columns=headers)
player_df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,PF-C,24,TOT,53,14,22.5,3.3,6.8,...,0.627,2.7,3.9,6.6,1.4,0.6,0.8,1.1,1.9,8.0
1,2,Bam Adebayo,C,26,MIA,49,49,34.6,7.7,15.1,...,0.778,2.2,8.3,10.5,4.1,1.1,1.0,2.5,2.5,20.3
2,3,Ochai Agbaji,SG,23,TOT,59,10,18.9,2.1,4.8,...,0.778,0.7,1.7,2.3,0.9,0.5,0.5,0.7,1.3,5.4
3,4,Santi Aldama,PF,23,MEM,48,22,25.0,3.9,9.1,...,0.597,1.2,4.4,5.6,2.1,0.7,0.7,1.2,1.5,10.2
4,5,Nickeil Alexander-Walker,SG,25,MIN,60,16,23.1,2.6,6.1,...,0.767,0.4,1.6,2.0,2.6,0.8,0.6,1.0,1.8,7.1


In [15]:
player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rk      548 non-null    object
 1   Player  548 non-null    object
 2   Pos     548 non-null    object
 3   Age     548 non-null    object
 4   Tm      548 non-null    object
 5   G       548 non-null    object
 6   GS      548 non-null    object
 7   MP      548 non-null    object
 8   FG      548 non-null    object
 9   FGA     548 non-null    object
 10  FG%     548 non-null    object
 11  3P      548 non-null    object
 12  3PA     548 non-null    object
 13  3P%     548 non-null    object
 14  2P      548 non-null    object
 15  2PA     548 non-null    object
 16  2P%     548 non-null    object
 17  eFG%    548 non-null    object
 18  FT      548 non-null    object
 19  FTA     548 non-null    object
 20  FT%     548 non-null    object
 21  ORB     548 non-null    object
 22  DRB     548 non-null    ob

In [16]:
player_df.loc[player_df['Player'] == 'Ivica Zubac']

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
547,548,Ivica Zubac,C,26,LAC,46,46,26.7,5.1,7.8,...,3.0,6.4,9.5,1.3,0.2,1.3,1.2,2.8,12.0,


In [17]:
# Convert columns with data type object to float
player_df = player_df.apply(pd.to_numeric, errors='ignore', downcast='float')
player_df.dtypes

Rk        float32
Player     object
Pos        object
Age        object
Tm         object
G          object
GS        float32
MP        float32
FG        float32
FGA       float32
FG%       float32
3P        float32
3PA       float32
3P%       float32
2P        float32
2PA       float32
2P%       float32
eFG%      float32
FT        float32
FTA       float32
FT%       float32
ORB       float32
DRB       float32
TRB       float32
AST       float32
STL       float32
BLK       float32
TOV       float32
PF        float32
PTS       float32
dtype: object

In [18]:
# player_df = player_df.dropna()

player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      548 non-null    float32
 1   Player  548 non-null    object 
 2   Pos     548 non-null    object 
 3   Age     548 non-null    object 
 4   Tm      548 non-null    object 
 5   G       548 non-null    object 
 6   GS      548 non-null    float32
 7   MP      548 non-null    float32
 8   FG      548 non-null    float32
 9   FGA     548 non-null    float32
 10  FG%     548 non-null    float32
 11  3P      548 non-null    float32
 12  3PA     548 non-null    float32
 13  3P%     548 non-null    float32
 14  2P      548 non-null    float32
 15  2PA     548 non-null    float32
 16  2P%     548 non-null    float32
 17  eFG%    548 non-null    float32
 18  FT      548 non-null    float32
 19  FTA     548 non-null    float32
 20  FT%     548 non-null    float32
 21  ORB     548 non-null    float32
 22  DR

In [19]:
player_df.to_csv('player_ratings.csv', sep=',', encoding='utf-8', index=False)

# Team Ratings

In [20]:
from selenium import webdriver
from datetime import datetime
import pandas as pd
import numpy as np
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException

# Set up the Selenium WebDriver (make sure to have the appropriate webdriver installed)
driver = webdriver.Edge(executable_path=r"C:\Program Files (x86)\msedgedriver.exe")

# URL of the webpage containing the dropdown menu
url = f"https://www.basketball-reference.com/leagues/NBA_2024_ratings.html"
driver.get(url)

In [21]:
# Extract table
table = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="ratings"]'))
)

# Get headers used in table
thead = table.find_element(By.TAG_NAME, 'thead')
headers = [header.text.split() for header in thead.find_elements(By.TAG_NAME,'tr')][1]

# Get data from table
tbody = table.find_element(By.XPATH, '//*[@id="ratings"]/tbody')
team_rows = [row.text.split() for row in tbody.find_elements(By.TAG_NAME, 'tr')]

In [22]:
team_data = []

for row in team_rows:
    # Find rows where the team name has two parts 'Boston Celtics'
    if len(row) == 16:
        row.pop(1)
        row.pop(1)
    # Find rows where the team name has three parts 'Oklahoma City Thunder'
    elif len(row) == 17:
        row.pop(1)
        row.pop(1)
        row.pop(1)
    elif row[0] == 'Unadjusted' or row[0] == 'Rk':
        continue  # Skip headers and other irrelevant rows
    team_data.append(row)
    
team_data[:2]

[['1',
  'E',
  'A',
  '47',
  '12',
  '.797',
  '10.73',
  '123.55',
  '112.50',
  '11.05',
  '10.73',
  '123.47',
  '112.42',
  '11.05'],
 ['2',
  'W',
  'NW',
  '41',
  '18',
  '.695',
  '7.92',
  '121.46',
  '113.62',
  '7.84',
  '7.75',
  '121.65',
  '113.97',
  '7.68']]

In [23]:
team_names = []

# Extract team names from table
names = tbody.find_elements(By.CLASS_NAME, 'left')

# Extract team names and add to array
for name in names:
    if name.text == 'Team':
        continue
    else:
        team_names.append(name.text)
# Close the browser
driver.close()

team_names

['Boston Celtics',
 'Oklahoma City Thunder',
 'Minnesota Timberwolves',
 'Los Angeles Clippers',
 'New Orleans Pelicans',
 'Cleveland Cavaliers',
 'Denver Nuggets',
 'New York Knicks',
 'Milwaukee Bucks',
 'Philadelphia 76ers',
 'Golden State Warriors',
 'Phoenix Suns',
 'Orlando Magic',
 'Indiana Pacers',
 'Dallas Mavericks',
 'Sacramento Kings',
 'Miami Heat',
 'Houston Rockets',
 'Los Angeles Lakers',
 'Chicago Bulls',
 'Brooklyn Nets',
 'Atlanta Hawks',
 'Utah Jazz',
 'Toronto Raptors',
 'Memphis Grizzlies',
 'San Antonio Spurs',
 'Portland Trail Blazers',
 'Detroit Pistons',
 'Washington Wizards',
 'Charlotte Hornets']

In [24]:
# Add respective team names to corresponding table data rows
for i in range(len(team_data)):
    team_data[i].insert(1, team_names[i])

In [25]:
team_data[:3]

[['1',
  'Boston Celtics',
  'E',
  'A',
  '47',
  '12',
  '.797',
  '10.73',
  '123.55',
  '112.50',
  '11.05',
  '10.73',
  '123.47',
  '112.42',
  '11.05'],
 ['2',
  'Oklahoma City Thunder',
  'W',
  'NW',
  '41',
  '18',
  '.695',
  '7.92',
  '121.46',
  '113.62',
  '7.84',
  '7.75',
  '121.65',
  '113.97',
  '7.68'],
 ['3',
  'Minnesota Timberwolves',
  'W',
  'NW',
  '42',
  '18',
  '.700',
  '7.05',
  '116.81',
  '109.55',
  '7.26',
  '6.85',
  '116.77',
  '109.73',
  '7.03']]

In [26]:
team_ratings = pd.DataFrame(data=team_data, columns=headers)
team_ratings

Unnamed: 0,Rk,Team,Conf,Div,W,L,W/L%,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A
0,1,Boston Celtics,E,A,47,12,0.797,10.73,123.55,112.5,11.05,10.73,123.47,112.42,11.05
1,2,Oklahoma City Thunder,W,NW,41,18,0.695,7.92,121.46,113.62,7.84,7.75,121.65,113.97,7.68
2,3,Minnesota Timberwolves,W,NW,42,18,0.7,7.05,116.81,109.55,7.26,6.85,116.77,109.73,7.03
3,4,Los Angeles Clippers,W,P,38,20,0.655,4.9,121.07,116.16,4.91,4.74,121.21,116.46,4.75
4,5,New Orleans Pelicans,W,SW,36,25,0.59,4.54,118.61,113.88,4.73,4.38,118.62,114.06,4.55
5,6,Cleveland Cavaliers,E,C,39,20,0.661,5.07,117.18,111.99,5.2,4.32,116.57,112.11,4.46
6,7,Denver Nuggets,W,NW,41,19,0.683,4.17,119.1,114.78,4.32,4.2,119.06,114.74,4.32
7,8,New York Knicks,E,A,35,25,0.583,3.63,118.94,115.16,3.78,3.49,118.81,115.17,3.64
8,9,Milwaukee Bucks,E,C,40,21,0.656,4.43,120.53,116.13,4.41,3.49,120.15,116.67,3.48
9,10,Philadelphia 76ers,E,A,34,25,0.576,3.36,119.4,116.02,3.38,2.82,119.0,116.13,2.87


In [27]:
team_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rk      30 non-null     object
 1   Team    30 non-null     object
 2   Conf    30 non-null     object
 3   Div     30 non-null     object
 4   W       30 non-null     object
 5   L       30 non-null     object
 6   W/L%    30 non-null     object
 7   MOV     30 non-null     object
 8   ORtg    30 non-null     object
 9   DRtg    30 non-null     object
 10  NRtg    30 non-null     object
 11  MOV/A   30 non-null     object
 12  ORtg/A  30 non-null     object
 13  DRtg/A  30 non-null     object
 14  NRtg/A  30 non-null     object
dtypes: object(15)
memory usage: 3.6+ KB


In [28]:
# Convert selected columns from object to float
team_ratings = team_ratings.apply(pd.to_numeric, errors='ignore', downcast='float')
team_ratings.dtypes

Rk        float32
Team       object
Conf       object
Div        object
W         float32
L         float32
W/L%      float32
MOV       float32
ORtg      float32
DRtg      float32
NRtg      float32
MOV/A     float32
ORtg/A    float32
DRtg/A    float32
NRtg/A    float32
dtype: object

In [29]:
team_ratings.to_csv('team_ratings.csv', sep=',', encoding='utf-8', index=False)