In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [2]:
# Season is identified by its end year (2025 -> the 2024/2025 season).
# The URL is derived from it so the year is only written in one place,
# matching how the standings URL is built later in the notebook.
season = 2025
url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"


In [3]:
# Download the season summary page. Fail fast on HTTP errors (for example a
# rate-limit response) instead of parsing an error page, and never hang
# forever on a stalled connection.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")
season = 2025

# Per-game team statistics table
table = soup.find('table', {'id': 'per_game-team'})

# Without the table nothing below can work, so stop with a clear error rather
# than continuing and failing later with an AttributeError on None.
if table is None:
    raise ValueError(f"Team stats table not found for {season}")

# Column names come from the header cells; the first <th> is dropped because
# the body rows below are built from <td> cells only.
headers = [th.text for th in table.find('thead').find_all('th')][1:]

rows = table.find('tbody').find_all('tr')
# Keep every body row except header rows repeated inside the table body
# (rows carrying the 'thead' CSS class).
data = [
    [td.text for td in row.find_all('td')]
    for row in rows
    if 'thead' not in (row.get('class') or [])
]

df = pd.DataFrame(data, columns=headers)
df['Season'] = f"{season-1}/{season}"


# Persist the scraped table so later cells can reload it from disk
data_season_2025 = df
data_season_2025.to_csv('nba_team_stats_2025.csv', index=False)



## Getting conference standings and team wins

In [4]:

season = 2025

url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"
print(f"Scraping Conference Standings for {season} season...")

# Fetch the webpage content. Raise on HTTP errors so an error page is not
# mistaken for "table not found", and bound the wait with a timeout.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Define conferences and an empty list to store one DataFrame per conference
conferences = ["E", "W"]  # East and West
standings_data = []

for conference in conferences:
    # Locate the standings table for this conference
    table = soup.find('table', {'id': f"confs_standings_{conference}"})
    if not table:
        print(f"Standings table for {conference} conference not found for {season}. Skipping...")
        continue

    # Extract headers; the first header cell labels the team-name column
    headers = [th.text for th in table.find('thead').find_all('th')]
    headers[0] = "Team"  # Rename the first column to "Team"

    # Extract rows of data
    rows = table.find('tbody').find_all('tr')
    data = []
    for row in rows:
        if row.get('class') and 'thead' in row.get('class'):
            continue  # Skip embedded headers
        # The team name is in the row's <th>; the stats are in its <td> cells
        team_name = row.find('th').text
        stats = [td.text for td in row.find_all('td')]
        data.append([team_name] + stats)

    df = pd.DataFrame(data, columns=headers)
    df['Conference'] = "East" if conference == "E" else "West"  # Add conference column
    standings_data.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so report the actual problem when neither table was found.
if not standings_data:
    raise ValueError(f"No conference standings tables found for {season}")

# Combine East and West standings into one DataFrame
standings_df = pd.concat(standings_data, ignore_index=True)
standings_df['Season'] = f"{season-1}/{season}"  # Add season column


# Save the combined standings for this season
final_standings = standings_df
final_standings.to_csv('nba_conference_standings_2025.csv', index=False)
print("Conference standings data saved to 'nba_conference_standings_2025.csv'.")

Scraping Conference Standings for 2025 season...
Conference standings data saved to 'nba_conference_standings_2025.csv'.


## Cleaning and merging data


In [5]:
import re

In [6]:
# Reload both tables from the CSV files written by the scraping cells
team_stats_2025 = pd.read_csv('nba_team_stats_2025.csv')
conference_standings_2025 = pd.read_csv('nba_conference_standings_2025.csv')


def _drop_parenthesised_suffix(team_label):
    """Return the team label without any "(...)" part and surrounding whitespace."""
    without_parens = re.sub(r'\s*\(.*?\)', '', team_label)
    return without_parens.strip()


# Clean the standings 'Team' column so its values can be used as a merge key,
# e.g. "Cleveland Cavaliers (1)" -> "Cleveland Cavaliers"
conference_standings_2025['Team'] = conference_standings_2025['Team'].apply(
    _drop_parenthesised_suffix
)

The pattern passed to `re.sub()` matches the text to remove:
- `\s*`: any whitespace before the opening parenthesis.
- `\(.*?\)`: the parentheses themselves and everything inside them.

`strip()` is then called on the result to remove any leading or trailing whitespace.

Example: "Cleveland Cavaliers (1)" → "Cleveland Cavaliers"



In [7]:
# Columns carried over from the standings; 'Team' and 'Season' are the join keys
standings_subset = conference_standings_2025[['Team', 'Season', 'W', 'L', 'W/L%','PS/G','PA/G', 'Conference']]

# Merge the DataFrames on 'Team' and 'Season', keeping every team-stats row.
# validate='many_to_one' makes pandas raise a MergeError if a (Team, Season)
# key appears more than once in the standings, which would otherwise silently
# duplicate team-stats rows.
merged_data_2025 = pd.merge(
    team_stats_2025,
    standings_subset,
    on=['Team', 'Season'],
    how='left',
    validate='many_to_one',
)

# A left merge leaves the standings columns empty for rows with no match.
# The scraping cell sets 'Conference' on every standings row, so a missing
# value here indicates a team name that did not line up between the tables.
unmatched_teams = merged_data_2025.loc[merged_data_2025['Conference'].isna(), 'Team']
if not unmatched_teams.empty:
    print(f"Warning: no standings match for: {', '.join(unmatched_teams.astype(str))}")

# Save the combined DataFrame to a new CSV file
merged_data_2025.to_csv('nba_combined_team_stats_2025.csv', index=False)
print("Data successfully combined and saved to 'nba_combined_team_stats_2025.csv'.")

# Display a preview of the merged data
print(merged_data_2025.head())


Data successfully combined and saved to 'nba_combined_team_stats_2025.csv'.
                  Team   G     MP    FG   FGA    FG%    3P   3PA    3P%    2P  \
0    Memphis Grizzlies  30  240.0  45.1  92.9  0.486  13.9  37.6  0.370  31.2   
1  Cleveland Cavaliers  30  240.0  44.7  88.7  0.504  16.1  39.6  0.406  28.6   
2       Denver Nuggets  28  242.7  44.3  89.3  0.496  11.6  30.9  0.376  32.6   
3       Boston Celtics  30  242.5  41.7  91.5  0.456  18.4  50.4  0.365  23.3   
4     Dallas Mavericks  30  240.8  43.2  88.9  0.486  13.5  35.8  0.377  29.7   

   ...   TOV    PF    PTS     Season   W   L   W/L%   PS/G   PA/G  Conference  
0  ...  17.1  21.6  122.7  2024/2025  20  10  0.667  122.7  113.0        West  
1  ...  13.2  18.3  121.8  2024/2025  26   4  0.867  121.8  110.2        East  
2  ...  14.9  18.3  119.1  2024/2025  16  12  0.571  119.1  116.1        West  
3  ...  12.1  16.2  118.9  2024/2025  22   8  0.733  118.9  109.8        East  
4  ...  14.2  19.2  118.2  2024/2025 