In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [2]:
def extract_season_data(season_year, max_players=25):
    """Collect player profile links from a season's totals page.

    Two pages on basketball-reference.com are requested through one retrying
    session:

    * ``/leagues/NBA_<season_year>.html`` -- used only as an availability
      check; its body is not parsed.
    * ``/leagues/NBA_<season_year>_totals.html`` -- its
      ``<td data-stat="name_display">`` cells are scanned for player links.

    Args:
        season_year: Value interpolated into the two season URLs.
        max_players: How many name cells (in page order) to inspect.
            Cells without a usable link are skipped, so fewer links than
            this may be returned. ``None`` inspects every cell.

    Returns:
        A list of absolute player-profile URLs in page order, or an empty
        list when either page answers with a status other than 200.

    Raises:
        Exceptions raised by ``session.get`` (connection errors, timeouts,
        exhausted retries) are not caught here and propagate to the caller.
    """
    url = 'https://www.basketball-reference.com/'

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/140.0.0.0 Safari/537.36',
        'Accept-Language': 'en'
    }

    retry = Retry(connect=5, read=5, backoff_factor=1)

    # The context manager closes the session (and its pooled connections) on
    # every exit path, including the early returns below.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retry))

        # Request for Seasons URL (status check only; the body is not needed).
        season_url = urljoin(url, f'/leagues/NBA_{season_year}.html')
        season_page = session.get(season_url, headers=headers, timeout=20)
        if season_page.status_code != 200:
            print(f'Bad status code {season_page.status_code} for {season_url}')
            return []
        print(f'NBA_{season_year} URL:', season_page.url)

        # Request for Seasons Totals URL
        season_total_url = urljoin(url, f'/leagues/NBA_{season_year}_totals.html')
        season_total_page = session.get(season_total_url, headers=headers, timeout=20)
        if season_total_page.status_code != 200:
            print(f'Bad status code {season_total_page.status_code} for {season_total_url}')
            return []
        season_total_soup = BeautifulSoup(season_total_page.text, 'html.parser')
        print(f'NBA_{season_year}_totals URL:', season_total_page.url)

    player_links = []

    name_cells = season_total_soup.find_all('td', {'data-stat': 'name_display'})[:max_players]

    for td in name_cells:
        anchor = td.a
        # .get() avoids a KeyError on an <a> that carries no href attribute.
        href = anchor.get('href') if anchor else None
        if not href:
            continue

        player_link = urljoin(url, href)
        player_links.append(player_link)

        # The ID is the file stem of the LAST path segment. Indexing a fixed
        # position (split('/')[2]) is wrong for hrefs that start with '/':
        # the leading empty segment shifts every index by one, so the
        # directory name would be picked instead of the file name.
        player_id = href.rstrip('/').rsplit('/', 1)[-1].split('.')[0]

        print(f'Player Link: {player_link}, Player ID: {player_id}')

    return player_links

In [3]:
def extract_player_details(player_links):
    """Fetch each player page and pull basic bio fields from its meta block.

    Every link is requested through one retrying session, using the same
    browser-like headers and 20 second timeout as ``extract_season_data``.
    For each ``<div id="meta">`` found on a page, exactly one entry is
    appended to every result list, so index ``i`` always refers to the same
    player in all five lists. A field that cannot be located is stored as
    ``None`` rather than skipped.

    Args:
        player_links: Iterable of absolute player-page URLs.

    Returns:
        A 5-tuple ``(players, positions, heights, weights, experiences)`` of
        equal-length lists. Pages that answer with a status other than 200,
        or that contain no ``<div id="meta">``, contribute no entries.

    Raises:
        Exceptions raised by ``session.get`` (connection errors, timeouts,
        exhausted retries) are not caught here and propagate to the caller.
    """
    players = []
    positions = []
    heights = []
    weights = []
    experiences = []

    if not player_links:
        return players, positions, heights, weights, experiences

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/140.0.0.0 Safari/537.36',
        'Accept-Language': 'en'
    }

    retry = Retry(connect=5, read=5, backoff_factor=1)

    # One session for all players: connections are reused and then closed
    # when the block exits.
    with requests.Session() as session:
        session.mount('https://', HTTPAdapter(max_retries=retry))

        for player_link in player_links:
            # A timeout keeps one stalled response from hanging the whole run.
            page = session.get(player_link, headers=headers, timeout=20)
            if page.status_code != 200:
                print(f'Bad status code {page.status_code} for {player_link}')
                continue
            player_soup = BeautifulSoup(page.text, 'html.parser')

            for meta_div in player_soup.find_all('div', id='meta'):
                name = position = height = weight = experience = None

                # NOTE(review): the `string=` filters below are tested against
                # each <p>'s `.string`, which BeautifulSoup reports as None
                # when the tag has more than one child. If the page wraps the
                # labels in <strong> inside the paragraph, these filters will
                # never match -- confirm against the live markup.

                # Player name
                name_tag = meta_div.find('h1')
                name_span = name_tag.find('span') if name_tag else None
                if name_span:
                    name = name_span.text.strip()

                # Position
                position_tag = meta_div.find('p', string=lambda text: text and 'Position:' in text)
                if position_tag:
                    # find_next() yields None instead of raising when nothing
                    # follows, unlike indexing find_all_next(...)[0].
                    first_strong = position_tag.find_next('strong')
                    value_tag = first_strong.find_next('p') if first_strong else None
                    if value_tag is not None:
                        position = value_tag.text.strip()

                # Height and Weight
                height_weight_tag = meta_div.find('p', string=lambda text: text and 'cm' in text and 'kg' in text)
                if height_weight_tag:
                    height_weight_parts = height_weight_tag.text.strip().split(',')
                    if len(height_weight_parts) == 2:
                        height = height_weight_parts[0].strip()
                        weight = height_weight_parts[1].strip()

                # Experience
                experience_tag = meta_div.find('p', string=lambda text: text and 'Experience:' in text)
                if experience_tag:
                    value_tag = experience_tag.find_next('p')
                    if value_tag is not None:
                        experience = value_tag.text.strip()

                # Append unconditionally so the five lists stay index-aligned
                # even when some fields are missing for a player.
                players.append(name)
                positions.append(position)
                heights.append(height)
                weights.append(weight)
                experiences.append(experience)

    return players, positions, heights, weights, experiences

In [4]:
# Walk the 2019-2024 seasons: gather the player links for each one, then
# fetch and print the per-player details.
for season_year in range(2019, 2025):
    print(f'\nExtracting data for the {season_year} season')
    player_links = extract_season_data(season_year)

    # No links came back for this season: report it and move on.
    if not player_links:
        print(f'Not found data for the {season_year} season')
        continue

    (players,
     positions,
     heights,
     weights,
     experiences) = extract_player_details(player_links)

    print('Players:', players)
    print('Position:', positions)
    print('Heights:', heights)
    print('Weights:', weights)
    print('Experience:', experiences)
    print('-' * 40)


Extracting data for the 2019 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2019.html
Not found data for the 2019 season

Extracting data for the 2020 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2020.html
Not found data for the 2020 season

Extracting data for the 2021 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2021.html
Not found data for the 2021 season

Extracting data for the 2022 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2022.html
Not found data for the 2022 season

Extracting data for the 2023 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2023.html
Not found data for the 2023 season

Extracting data for the 2024 season
Bad status code 403 for https://www.basketball-reference.com/leagues/NBA_2024.html
Not found data for the 2024 season
