In [1]:
from selenium import webdriver
from datetime import datetime
import pandas as pd
import numpy as np
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException


# Set up the Selenium WebDriver (make sure to have the appropriate webdriver installed)
driver = webdriver.Edge(executable_path=r"C:\Program Files (x86)\msedgedriver.exe")


# URL of the webpage containing the dropdown menu
url = f"https://www.basketball-reference.com/leagues/NBA_2024_per_game.html"
driver.get(url)

# Player Ratings

In [3]:
hide_partial_row_button = driver.find_element_by_xpath('//*[@id="per_game_stats_toggle_partial_table"]')

driver.execute_script("arguments[0].scrollIntoView();", hide_partial_row_button)

hide_partial_row_button.click()


# # Scroll into view
driver.execute_script("arguments[0].scrollIntoView(true);", hide_partial_row_button)

# # Click the element
hide_partial_row_button.click()

print('Hide rows with partial data')

Hide rows with partial data


In [4]:
# Get table with data
table = driver.find_element(By.XPATH,'//*[@id="per_game_stats"]')

# Extract table header
headers = table.find_element(By.XPATH, '//*[@id="per_game_stats"]/thead').text.split(' ')

# Extract data from table
tbody = table.find_element(By.XPATH, '//*[@id="per_game_stats"]/tbody')

In [5]:
# Get table data
player_rows = [row.text.split(' ') for row in tbody.find_elements(By.CLASS_NAME, 'full_table')]
player_rows[:2]

[['1',
  'Precious',
  'Achiuwa',
  'C-PF',
  '24',
  'TOT',
  '48',
  '9',
  '21.3',
  '3.3',
  '6.7',
  '.491',
  '0.4',
  '1.5',
  '.268',
  '2.9',
  '5.2',
  '.554',
  '.520',
  '0.9',
  '1.3',
  '.641',
  '2.6',
  '3.7',
  '6.3',
  '1.4',
  '0.6',
  '0.8',
  '1.0',
  '1.8',
  '7.8'],
 ['4',
  'Bam',
  'Adebayo',
  'C',
  '26',
  'MIA',
  '45',
  '45',
  '34.6',
  '7.6',
  '15.0',
  '.510',
  '0.0',
  '0.2',
  '.091',
  '7.6',
  '14.8',
  '.517',
  '.510',
  '4.9',
  '6.3',
  '.784',
  '2.3',
  '8.4',
  '10.6',
  '4.2',
  '1.1',
  '1.0',
  '2.4',
  '2.5',
  '20.2']]

In [6]:
player_data = []
for row in player_rows:   
    # Find rows where the player name has three parts 'Greg Brown III'
    if len(row) > 31:
        row.pop(1)
        row.pop(1)
        row.pop(1)
    # Find rows where the player name has two parts 'Kobe Brown'
    else:
        row.pop(1)
        row.pop(1)
    player_data.append(row)
    
player_data[:2]

[['1',
  'C-PF',
  '24',
  'TOT',
  '48',
  '9',
  '21.3',
  '3.3',
  '6.7',
  '.491',
  '0.4',
  '1.5',
  '.268',
  '2.9',
  '5.2',
  '.554',
  '.520',
  '0.9',
  '1.3',
  '.641',
  '2.6',
  '3.7',
  '6.3',
  '1.4',
  '0.6',
  '0.8',
  '1.0',
  '1.8',
  '7.8'],
 ['4',
  'C',
  '26',
  'MIA',
  '45',
  '45',
  '34.6',
  '7.6',
  '15.0',
  '.510',
  '0.0',
  '0.2',
  '.091',
  '7.6',
  '14.8',
  '.517',
  '.510',
  '4.9',
  '6.3',
  '.784',
  '2.3',
  '8.4',
  '10.6',
  '4.2',
  '1.1',
  '1.0',
  '2.4',
  '2.5',
  '20.2']]

In [7]:
player_names = []

# Extract player names from table
names = tbody.find_elements(By.CLASS_NAME, 'left')

# Extract names and add to array
for name in names:
    if len(name.text) > 3:
        player_names.append(name.text)

# Close the browser
driver.close()
player_names[:4]

['Precious Achiuwa', 'Precious Achiuwa', 'Precious Achiuwa', 'Bam Adebayo']

In [8]:
# Add respective team names to corresponding table data rows
for i in range(len(player_data)):
    player_data[i].insert(1, player_names[i])
player_data[:2]

[['1',
  'Precious Achiuwa',
  'C-PF',
  '24',
  'TOT',
  '48',
  '9',
  '21.3',
  '3.3',
  '6.7',
  '.491',
  '0.4',
  '1.5',
  '.268',
  '2.9',
  '5.2',
  '.554',
  '.520',
  '0.9',
  '1.3',
  '.641',
  '2.6',
  '3.7',
  '6.3',
  '1.4',
  '0.6',
  '0.8',
  '1.0',
  '1.8',
  '7.8'],
 ['4',
  'Precious Achiuwa',
  'C',
  '26',
  'MIA',
  '45',
  '45',
  '34.6',
  '7.6',
  '15.0',
  '.510',
  '0.0',
  '0.2',
  '.091',
  '7.6',
  '14.8',
  '.517',
  '.510',
  '4.9',
  '6.3',
  '.784',
  '2.3',
  '8.4',
  '10.6',
  '4.2',
  '1.1',
  '1.0',
  '2.4',
  '2.5',
  '20.2']]

In [9]:
# Create a pandas Data Frame
player_df = pd.DataFrame(data=player_data, columns=headers)
player_df.head()

Unnamed: 0,Rk,Player,Pos,Age,Tm,G,GS,MP,FG,FGA,...,FT%,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS
0,1,Precious Achiuwa,C-PF,24,TOT,48,9,21.3,3.3,6.7,...,0.641,2.6,3.7,6.3,1.4,0.6,0.8,1.0,1.8,7.8
1,4,Precious Achiuwa,C,26,MIA,45,45,34.6,7.6,15.0,...,0.784,2.3,8.4,10.6,4.2,1.1,1.0,2.4,2.5,20.2
2,5,Precious Achiuwa,SG,23,TOT,54,10,19.4,2.1,4.9,...,0.739,0.7,1.7,2.4,0.9,0.5,0.5,0.7,1.3,5.3
3,8,Bam Adebayo,PF,23,MEM,44,18,24.8,3.9,9.2,...,0.597,1.1,4.4,5.5,2.2,0.7,0.7,1.1,1.5,10.4
4,9,Ochai Agbaji,SG,25,MIN,55,16,23.1,2.5,5.9,...,0.763,0.4,1.5,1.9,2.5,0.9,0.5,1.0,1.9,7.0


In [10]:
player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 540 entries, 0 to 539
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rk      540 non-null    object
 1   Player  540 non-null    object
 2   Pos     540 non-null    object
 3   Age     540 non-null    object
 4   Tm      540 non-null    object
 5   G       540 non-null    object
 6   GS      540 non-null    object
 7   MP      540 non-null    object
 8   FG      540 non-null    object
 9   FGA     540 non-null    object
 10  FG%     540 non-null    object
 11  3P      540 non-null    object
 12  3PA     540 non-null    object
 13  3P%     540 non-null    object
 14  2P      540 non-null    object
 15  2PA     540 non-null    object
 16  2P%     540 non-null    object
 17  eFG%    540 non-null    object
 18  FT      540 non-null    object
 19  FTA     540 non-null    object
 20  FT%     540 non-null    object
 21  ORB     540 non-null    object
 22  DRB     540 non-null    ob

In [11]:
# Convert columns with data type object to float
player_df = player_df.apply(pd.to_numeric, errors='ignore', downcast='float')
player_df.dtypes

Rk        float32
Player     object
Pos        object
Age        object
Tm         object
G          object
GS        float32
MP        float32
FG        float32
FGA       float32
FG%       float32
3P        float32
3PA       float32
3P%       float32
2P        float32
2PA       float32
2P%       float32
eFG%      float32
FT        float32
FTA       float32
FT%       float32
ORB       float32
DRB       float32
TRB       float32
AST       float32
STL       float32
BLK       float32
TOV       float32
PF        float32
PTS       float32
dtype: object

In [12]:
player_df = player_df.dropna()

player_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 466 entries, 0 to 538
Data columns (total 30 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Rk      466 non-null    float32
 1   Player  466 non-null    object 
 2   Pos     466 non-null    object 
 3   Age     466 non-null    object 
 4   Tm      466 non-null    object 
 5   G       466 non-null    object 
 6   GS      466 non-null    float32
 7   MP      466 non-null    float32
 8   FG      466 non-null    float32
 9   FGA     466 non-null    float32
 10  FG%     466 non-null    float32
 11  3P      466 non-null    float32
 12  3PA     466 non-null    float32
 13  3P%     466 non-null    float32
 14  2P      466 non-null    float32
 15  2PA     466 non-null    float32
 16  2P%     466 non-null    float32
 17  eFG%    466 non-null    float32
 18  FT      466 non-null    float32
 19  FTA     466 non-null    float32
 20  FT%     466 non-null    float32
 21  ORB     466 non-null    float32
 22  DR

In [13]:
player_df.to_csv('player_ratings.csv', sep=',', encoding='utf-8', index=False)

# Team Ratings

In [14]:
from selenium import webdriver
from datetime import datetime
import pandas as pd
import numpy as np
import time
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import Select
from selenium.webdriver.common.by import By
from selenium.common.exceptions import StaleElementReferenceException

# Set up the Selenium WebDriver (make sure to have the appropriate webdriver installed)
driver = webdriver.Edge(executable_path=r"C:\Program Files (x86)\msedgedriver.exe")

# URL of the webpage containing the dropdown menu
url = f"https://www.basketball-reference.com/leagues/NBA_2024_ratings.html"
driver.get(url)

In [15]:
# Extract table
table = WebDriverWait(driver, 5).until(
    EC.presence_of_element_located((By.XPATH, '//*[@id="ratings"]'))
)

# Get headers used in table
thead = table.find_element(By.TAG_NAME, 'thead')
headers = [header.text.split() for header in thead.find_elements(By.TAG_NAME,'tr')][1]

# Get data from table
tbody = table.find_element(By.XPATH, '//*[@id="ratings"]/tbody')
team_rows = [row.text.split() for row in tbody.find_elements(By.TAG_NAME, 'tr')]

In [16]:
team_data = []

for row in team_rows:
    # Find rows where the team name has two parts 'Boston Celtics'
    if len(row) == 16:
        row.pop(1)
        row.pop(1)
    # Find rows where the team name has three parts 'Oklahoma City Thunder'
    elif len(row) == 17:
        row.pop(1)
        row.pop(1)
        row.pop(1)
    elif row[0] == 'Unadjusted' or row[0] == 'Rk':
        continue  # Skip headers and other irrelevant rows
    team_data.append(row)
    
team_data[:2]

[['1',
  'E',
  'A',
  '43',
  '12',
  '.782',
  '10.11',
  '122.77',
  '112.41',
  '10.36',
  '10.05',
  '122.68',
  '112.38',
  '10.30'],
 ['2',
  'W',
  'NW',
  '39',
  '16',
  '.709',
  '7.25',
  '117.26',
  '109.78',
  '7.48',
  '7.24',
  '117.30',
  '109.85',
  '7.45']]

In [17]:
team_names = []

# Extract team names from table
names = tbody.find_elements(By.CLASS_NAME, 'left')

# Extract team names and add to array
for name in names:
    if name.text == 'Team':
        continue
    else:
        team_names.append(name.text)
# Close the browser
driver.close()

team_names

['Boston Celtics',
 'Minnesota Timberwolves',
 'Oklahoma City Thunder',
 'Los Angeles Clippers',
 'Cleveland Cavaliers',
 'New York Knicks',
 'New Orleans Pelicans',
 'Denver Nuggets',
 'Philadelphia 76ers',
 'Golden State Warriors',
 'Milwaukee Bucks',
 'Phoenix Suns',
 'Dallas Mavericks',
 'Orlando Magic',
 'Indiana Pacers',
 'Houston Rockets',
 'Sacramento Kings',
 'Los Angeles Lakers',
 'Miami Heat',
 'Chicago Bulls',
 'Brooklyn Nets',
 'Utah Jazz',
 'Atlanta Hawks',
 'Toronto Raptors',
 'Memphis Grizzlies',
 'San Antonio Spurs',
 'Portland Trail Blazers',
 'Detroit Pistons',
 'Washington Wizards',
 'Charlotte Hornets']

In [18]:
# Add respective team names to corresponding table data rows
for i in range(len(team_data)):
    team_data[i].insert(1, team_names[i])

In [19]:
team_data[:3]

[['1',
  'Boston Celtics',
  'E',
  'A',
  '43',
  '12',
  '.782',
  '10.11',
  '122.77',
  '112.41',
  '10.36',
  '10.05',
  '122.68',
  '112.38',
  '10.30'],
 ['2',
  'Minnesota Timberwolves',
  'W',
  'NW',
  '39',
  '16',
  '.709',
  '7.25',
  '117.26',
  '109.78',
  '7.48',
  '7.24',
  '117.30',
  '109.85',
  '7.45'],
 ['3',
  'Oklahoma City Thunder',
  'W',
  'NW',
  '37',
  '17',
  '.685',
  '7.19',
  '121.31',
  '114.14',
  '7.17',
  '7.26',
  '121.52',
  '114.27',
  '7.25']]

In [20]:
team_ratings = pd.DataFrame(data=team_data, columns=headers)
team_ratings

Unnamed: 0,Rk,Team,Conf,Div,W,L,W/L%,MOV,ORtg,DRtg,NRtg,MOV/A,ORtg/A,DRtg/A,NRtg/A
0,1,Boston Celtics,E,A,43,12,0.782,10.11,122.77,112.41,10.36,10.05,122.68,112.38,10.3
1,2,Minnesota Timberwolves,W,NW,39,16,0.709,7.25,117.26,109.78,7.48,7.24,117.3,109.85,7.45
2,3,Oklahoma City Thunder,W,NW,37,17,0.685,7.19,121.31,114.14,7.17,7.26,121.52,114.27,7.25
3,4,Los Angeles Clippers,W,P,36,17,0.679,5.57,121.49,115.88,5.61,5.5,121.65,116.1,5.55
4,5,Cleveland Cavaliers,E,C,36,17,0.679,5.68,117.58,111.77,5.81,4.98,116.92,111.82,5.1
5,6,New York Knicks,E,A,33,22,0.6,4.55,119.52,114.71,4.81,4.12,119.29,114.92,4.37
6,7,New Orleans Pelicans,W,SW,33,22,0.6,4.24,118.79,114.37,4.42,4.13,118.94,114.64,4.3
7,8,Denver Nuggets,W,NW,36,19,0.655,3.13,118.99,115.7,3.29,3.45,119.02,115.43,3.59
8,9,Philadelphia 76ers,E,A,32,22,0.593,4.39,120.22,115.78,4.44,3.48,119.58,116.04,3.55
9,10,Golden State Warriors,W,P,27,26,0.509,1.55,119.52,117.93,1.59,2.35,119.83,117.45,2.38


In [21]:
team_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 15 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Rk      30 non-null     object
 1   Team    30 non-null     object
 2   Conf    30 non-null     object
 3   Div     30 non-null     object
 4   W       30 non-null     object
 5   L       30 non-null     object
 6   W/L%    30 non-null     object
 7   MOV     30 non-null     object
 8   ORtg    30 non-null     object
 9   DRtg    30 non-null     object
 10  NRtg    30 non-null     object
 11  MOV/A   30 non-null     object
 12  ORtg/A  30 non-null     object
 13  DRtg/A  30 non-null     object
 14  NRtg/A  30 non-null     object
dtypes: object(15)
memory usage: 3.6+ KB


In [22]:
# Convert selected columns from object to float
team_ratings = team_ratings.apply(pd.to_numeric, errors='ignore', downcast='float')
team_ratings.dtypes

Rk        float32
Team       object
Conf       object
Div        object
W         float32
L         float32
W/L%      float32
MOV       float32
ORtg      float32
DRtg      float32
NRtg      float32
MOV/A     float32
ORtg/A    float32
DRtg/A    float32
NRtg/A    float32
dtype: object

In [23]:
team_ratings.to_csv('team_ratings.csv', sep=',', encoding='utf-8', index=False)