In [1]:
!pip install requests beautifulsoup4 pandas




In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


In [3]:
# Season end years to scrape; each one is later labelled "{season-1}/{season}".
seasons = list(range(2021, 2025))

In [3]:
# Plain (non-f) string: "{season}" stays a literal placeholder in this value.
url = (
    "https://www.basketball-reference.com"
    "/leagues/NBA_{season}.html"
)


In [4]:
# Scrape the per-game team stats table for every season in `seasons`,
# tag each season's rows with a "Season" label, and combine them into
# `final_historic_data`.
Historic_data = []
for season in seasons:
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"
    print(f"Scraping data for {season} season...")

    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    table = soup.find('table', {'id': 'per_game-team'})

    if not table:
        # Skip this season: without the table there is nothing to parse, and
        # falling through would fail on `table.find(...)` below.
        print(f"Team stats table not found for {season}")
        continue

    # Drop the first header cell; the data rows below collect only <td> cells.
    headers = [th.text for th in table.find('thead').find_all('th')][1:]

    rows = table.find('tbody').find_all('tr')
    data = []
    for row in rows:
        # Skip header rows that are repeated inside the table body.
        if row.get('class') and 'thead' in row.get('class'):
            continue
        data.append([td.text for td in row.find_all('td')])

    df = pd.DataFrame(data, columns=headers)
    df['Season'] = f"{season-1}/{season}"

    Historic_data.append(df)

# Concatenate once, after the loop, instead of rebuilding the frame on every
# iteration; guard the empty case because pd.concat([]) raises ValueError.
if not Historic_data:
    final_historic_data = pd.DataFrame()
    print("No team stats were scraped; nothing to combine.")
else:
    final_historic_data = pd.concat(Historic_data, ignore_index=True)
    if len(Historic_data) == len(seasons):
        # Later cells read this file. Only a complete scrape is persisted so
        # that a partial run cannot overwrite a good file.
        final_historic_data.to_csv('nba_team_stats_2020_to_2024.csv', index=False)
        print("Team stats data saved to 'nba_team_stats_2020_to_2024.csv'.")
    else:
        print("Some seasons are missing; 'nba_team_stats_2020_to_2024.csv' was not rewritten.")

    





Scraping data for 2021 season...
Scraping data for 2022 season...
Scraping data for 2023 season...
Scraping data for 2024 season...


In [4]:
# Load the saved team stats file and print a plain-text preview of its first rows.
df = pd.read_csv('nba_team_stats_2020_to_2024.csv')
print(df.head(n=5))

                      Team   G     MP    FG   FGA    FG%    3P   3PA    3P%  \
0         Milwaukee Bucks*  72  240.7  44.7  91.8  0.487  14.4  37.1  0.389   
1           Brooklyn Nets*  72  241.7  43.1  87.3  0.494  14.2  36.1  0.392   
2      Washington Wizards*  72  241.7  43.2  90.9  0.475  10.2  29.0  0.351   
3               Utah Jazz*  72  241.0  41.3  88.1  0.468  16.7  43.0  0.389   
4  Portland Trail Blazers*  72  240.3  41.3  91.1  0.453  15.7  40.8  0.385   

     2P  ...   ORB   DRB   TRB   AST  STL  BLK   TOV    PF    PTS     Season  
0  30.3  ...  10.3  37.8  48.1  25.5  8.1  4.6  13.8  17.3  120.1  2020/2021  
1  29.0  ...   8.9  35.5  44.4  26.8  6.7  5.3  13.5  19.0  118.6  2020/2021  
2  33.0  ...   9.7  35.5  45.2  25.5  7.3  4.1  14.4  21.6  116.6  2020/2021  
3  24.5  ...  10.6  37.6  48.3  23.7  6.6  5.2  14.2  18.5  116.4  2020/2021  
4  25.6  ...  10.6  33.9  44.5  21.3  6.9  5.0  11.1  18.9  116.1  2020/2021  

[5 rows x 25 columns]


## Collecting total wins and conferences



In [5]:
# Scrape the East and West conference standings for every season in `seasons`
# and collect one combined DataFrame per season in `conference_standings`.
# NOTE(review): the following code cell repeats this scrape and also writes the
# combined CSV; one of the two cells is likely redundant.
conference_standings = []

for season in seasons:
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"
    print(f"Scraping Conference Standings for {season} season...")

    # Fetch the webpage content (the timeout prevents an indefinite hang)
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Define conferences and an empty list to store data
    conferences = ["E", "W"]  # East and West
    standings_data = []

    for conference in conferences:
        # Locate the standings table
        table = soup.find('table', {'id': f"confs_standings_{conference}"})
        if not table:
            print(f"Standings table for {conference} conference not found for {season}. Skipping...")
            continue

        # Keep every header cell; the first one heads the team-name column,
        # so it is renamed to "Team" instead of being dropped.
        headers = [th.text for th in table.find('thead').find_all('th')]
        headers[0] = "Team"

        # Extract rows of data
        rows = table.find('tbody').find_all('tr')
        data = []
        for row in rows:
            if row.get('class') and 'thead' in row.get('class'):
                continue  # Skip header rows embedded in the body
            # The team name sits in the row's <th>; the stats are in its <td> cells.
            team_name = row.find('th').text
            team_data = [team_name] + [td.text for td in row.find_all('td')]
            data.append(team_data)

        # Create a DataFrame for the conference
        df = pd.DataFrame(data, columns=headers)
        df['Conference'] = "East" if conference == "E" else "West"  # Add conference column
        standings_data.append(df)

    if not standings_data:
        # Neither table was found: pd.concat([]) would raise, so skip the season.
        continue

    # Combine East and West standings into one DataFrame
    standings_df = pd.concat(standings_data, ignore_index=True)
    standings_df['Season'] = f"{season-1}/{season}"  # Add season column
    # Keep the per-season result; without this the scraped data is discarded.
    conference_standings.append(standings_df)




Scraping Conference Standings for 2021 season...
Scraping Conference Standings for 2022 season...
Scraping Conference Standings for 2023 season...
Scraping Conference Standings for 2024 season...


In [7]:
# Scrape the East and West conference standings for every season in `seasons`,
# combine them into `final_standings`, and save the result as a CSV.
conference_standings = []

for season in seasons:
    url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"
    print(f"Scraping Conference Standings for {season} season...")

    # Fetch the webpage content (the timeout prevents an indefinite hang)
    response = requests.get(url, timeout=30)
    soup = BeautifulSoup(response.text, "html.parser")

    # Define conferences and an empty list to store data
    conferences = ["E", "W"]  # East and West
    standings_data = []

    for conference in conferences:
        # Locate the standings table
        table = soup.find('table', {'id': f"confs_standings_{conference}"})
        if not table:
            print(f"Standings table for {conference} conference not found for {season}. Skipping...")
            continue

        # Extract headers
        headers = [th.text for th in table.find('thead').find_all('th')]  # Keep the leading team-name header
        headers[0] = "Team"  # Rename the first column to "Team"

        # Extract rows of data
        rows = table.find('tbody').find_all('tr')
        data = []
        for row in rows:
            if row.get('class') and 'thead' in row.get('class'):
                continue  # Skip embedded headers
            # Extract the team name from the <th> tag
            team_name = row.find('th').text
            # Extract other stats from the <td> tags
            stats = [td.text for td in row.find_all('td')]
            # Combine team name with stats
            team_data = [team_name] + stats
            data.append(team_data)

        # Create a DataFrame for the conference
        df = pd.DataFrame(data, columns=headers)
        df['Conference'] = "East" if conference == "E" else "West"  # Add conference column
        standings_data.append(df)

    if not standings_data:
        # Both tables were skipped above; pd.concat([]) would raise ValueError,
        # so move on to the next season instead of aborting the whole loop.
        continue

    # Combine East and West standings into one DataFrame
    standings_df = pd.concat(standings_data, ignore_index=True)
    standings_df['Season'] = f"{season-1}/{season}"  # Add season column
    conference_standings.append(standings_df)

# Combine all seasons into one DataFrame (guarded: nothing may have been scraped)
if conference_standings:
    final_standings = pd.concat(conference_standings, ignore_index=True)
    final_standings.to_csv('nba_conference_standings_2021_to_2024.csv', index=False)
    print("Conference standings data saved to 'nba_conference_standings_2021_to_2024.csv'.")
else:
    final_standings = pd.DataFrame()
    print("No conference standings were scraped; CSV not written.")


Scraping Conference Standings for 2021 season...
Scraping Conference Standings for 2022 season...
Scraping Conference Standings for 2023 season...
Scraping Conference Standings for 2024 season...
Conference standings data saved to 'nba_conference_standings_2021_to_2024.csv'.


## Combining the two datasets

In [8]:
# Read the two saved CSVs back in: team stats first, conference standings second.
team_stats, conference_standings = (
    pd.read_csv(csv_name)
    for csv_name in (
        'nba_team_stats_2020_to_2024.csv',
        'nba_conference_standings_2021_to_2024.csv',
    )
)


In [12]:
# Select the standings columns that are carried into the merge, then display them.
standings_subset = conference_standings.loc[
    :,
    ['Team', 'Season', 'W', 'L', 'W/L%', 'PS/G', 'PA/G', 'Conference'],
]
standings_subset

Unnamed: 0,Team,Season,W,L,W/L%,PS/G,PA/G,Conference
0,Philadelphia 76ers*,2020/2021,49,23,0.681,113.6,108.1,East
1,Brooklyn Nets*,2020/2021,48,24,0.667,118.6,114.1,East
2,Milwaukee Bucks*,2020/2021,46,26,0.639,120.1,114.2,East
3,New York Knicks*,2020/2021,41,31,0.569,107.0,104.7,East
4,Atlanta Hawks*,2020/2021,41,31,0.569,113.7,111.4,East
...,...,...,...,...,...,...,...,...
115,Houston Rockets,2023/2024,41,41,0.500,114.3,113.2,West
116,Utah Jazz,2023/2024,31,51,0.378,115.7,120.5,West
117,Memphis Grizzlies,2023/2024,27,55,0.329,105.8,112.8,West
118,San Antonio Spurs,2023/2024,22,60,0.268,112.1,118.6,West


In [13]:
# Left join: every team_stats row is kept, and the standings columns are
# attached where both 'Team' and 'Season' match.
merged_data = team_stats.merge(
    standings_subset,
    how='left',
    on=['Team', 'Season'],
)

In [14]:
# Write the combined table to disk, then print a confirmation followed by a
# plain-text preview of the first rows.
merged_data.to_csv('nba_combined_team_stats_2020_to_2024.csv', index=False)
print(
    "Data successfully combined and saved to 'nba_combined_team_stats_2020_to_2024.csv'.",
    merged_data.head(),
    sep="\n",
)

Data successfully combined and saved to 'nba_combined_team_stats_2020_to_2024.csv'.
                      Team   G     MP    FG   FGA    FG%    3P   3PA    3P%  \
0         Milwaukee Bucks*  72  240.7  44.7  91.8  0.487  14.4  37.1  0.389   
1           Brooklyn Nets*  72  241.7  43.1  87.3  0.494  14.2  36.1  0.392   
2      Washington Wizards*  72  241.7  43.2  90.9  0.475  10.2  29.0  0.351   
3               Utah Jazz*  72  241.0  41.3  88.1  0.468  16.7  43.0  0.389   
4  Portland Trail Blazers*  72  240.3  41.3  91.1  0.453  15.7  40.8  0.385   

     2P  ...   TOV    PF    PTS     Season   W   L   W/L%   PS/G   PA/G  \
0  30.3  ...  13.8  17.3  120.1  2020/2021  46  26  0.639  120.1  114.2   
1  29.0  ...  13.5  19.0  118.6  2020/2021  48  24  0.667  118.6  114.1   
2  33.0  ...  14.4  21.6  116.6  2020/2021  34  38  0.472  116.6  118.5   
3  24.5  ...  14.2  18.5  116.4  2020/2021  52  20  0.722  116.4  107.2   
4  25.6  ...  11.1  18.9  116.1  2020/2021  42  30  0.583  116.1  