In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np


# Basketball Reference league summary page for the 2025 season; the next cell
# downloads this page and looks for the table with id 'per_game-team' in it.
url = "https://www.basketball-reference.com/leagues/NBA_2025.html"


In [None]:
# Scrape the per-game team statistics table for one season and save it as CSV.
season = 2025

# Build the URL from `season` so this cell does not depend on whichever cell
# last assigned the global `url` (the standings cell reuses that name).
url = f"https://www.basketball-reference.com/leagues/NBA_{season}.html"

# requests has no default timeout; set one so a stalled connection cannot hang
# the notebook, and fail loudly on HTTP errors instead of parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

table = soup.find('table', {'id': 'per_game-team'})

if table is None:
    # Everything below needs the table, so stop here with a clear message
    # rather than failing later with an AttributeError on None.
    raise RuntimeError(f"Team stats table not found for {season}")

# The first header cell is dropped because body rows are read from <td> cells
# only; presumably the leading column (rank) is a <th> in each row -- verify
# against the page if the column count ever mismatches.
headers = [th.text for th in table.find('thead').find_all('th')][1:]

rows = table.find('tbody').find_all('tr')
data = []
for row in rows:
    # Skip header rows repeated inside the table body (marked with class "thead").
    if 'thead' in (row.get('class') or []):
        continue
    data.append([td.text for td in row.find_all('td')])

df = pd.DataFrame(data, columns=headers)
# Label the season as "<start year>/<end year>", e.g. "2024/2025".
df['Season'] = f"{season-1}/{season}"

data_season_2025 = df
data_season_2025.to_csv('nba_team_stats_2025.csv', index=False)



## Getting conference standings and team wins

In [None]:

# Scrape the East and West conference standings for one season and save them
# as a single CSV with added 'Conference' and 'Season' columns.
season = 2025

url = f"https://www.basketball-reference.com/leagues/NBA_{season}_standings.html"
print(f"Scraping Conference Standings for {season} season...")

# Fetch the page. requests has no default timeout, so set one; raise on HTTP
# errors so an error page is not silently parsed as if it were standings.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, "html.parser")

# Suffixes of the table ids looked up below: "confs_standings_E" / "confs_standings_W"
conferences = ["E", "W"]
standings_data = []

for conference in conferences:
    table = soup.find('table', {'id': f"confs_standings_{conference}"})
    if table is None:
        print(f"Standings table for {conference} conference not found for {season}. Skipping...")
        continue

    # Keep every header cell; the first one heads the team-name column and is
    # renamed so both conferences share the same "Team" column label.
    headers = [th.text for th in table.find('thead').find_all('th')]
    headers[0] = "Team"

    rows = table.find('tbody').find_all('tr')
    data = []
    for row in rows:
        # Skip header rows repeated inside the table body.
        if 'thead' in (row.get('class') or []):
            continue
        # The team name is read from the row's <th>; the other values from its <td> cells.
        team_name = row.find('th').text
        data.append([team_name] + [td.text for td in row.find_all('td')])

    df = pd.DataFrame(data, columns=headers)
    df['Conference'] = "East" if conference == "E" else "West"
    standings_data.append(df)

if not standings_data:
    # pd.concat([]) would fail with an opaque "No objects to concatenate";
    # report the actual problem instead.
    raise RuntimeError(f"No conference standings tables found for {season}")

# Stack East and West into one DataFrame and tag the season ("2024/2025" style).
standings_df = pd.concat(standings_data, ignore_index=True)
standings_df['Season'] = f"{season-1}/{season}"

# Only one season is scraped in this cell, so the final table is this season's standings.
final_standings = standings_df
final_standings.to_csv('nba_conference_standings_2025.csv', index=False)
print("Conference standings data saved to 'nba_conference_standings_2025.csv'.")

## Cleaning and merging data


In [2]:
import re

In [None]:
# Load the two CSV files written by the scraping cells.
team_stats_2025 = pd.read_csv('nba_team_stats_2025.csv')
conference_standings_2025 = pd.read_csv('nba_conference_standings_2025.csv')

# Normalise standings team names so they can be matched against the team-stats
# names: remove any parenthesised text (presumably the seed, e.g. "(1)" --
# verify against the scraped CSV) together with the whitespace before it, then
# trim the ends. The vectorised .str methods leave missing values as NaN,
# whereas applying re.sub row by row would raise on them.
conference_standings_2025['Team'] = (
    conference_standings_2025['Team']
    .str.replace(r'\s*\(.*?\)', '', regex=True)
    .str.strip()
)

In [None]:
# Standings columns to attach to the per-game team statistics.
standings_subset = conference_standings_2025[['Team', 'Season', 'W', 'L', 'W/L%','PS/G','PA/G', 'Conference']]

# Left-merge on 'Team' and 'Season' so every team-stats row is kept. The
# temporary '_merge' indicator column marks rows that found no standings match.
merged_with_flag = pd.merge(
    team_stats_2025,
    standings_subset,
    on=['Team', 'Season'],
    how='left',
    indicator=True,
)

# A left merge leaves NaN in the standings columns when team names differ
# between the two sources; report those teams instead of failing silently.
unmatched_teams = merged_with_flag.loc[merged_with_flag['_merge'] == 'left_only', 'Team'].tolist()
if unmatched_teams:
    print(f"Warning: no standings row matched for: {unmatched_teams}")

# Drop the helper column so the saved file has the same columns as before.
merged_data_2025 = merged_with_flag.drop(columns='_merge')

# Save the combined DataFrame to a new CSV file
merged_data_2025.to_csv('nba_combined_team_stats_2025.csv', index=False)
print("Data successfully combined and saved to 'nba_combined_team_stats_2025.csv'.")

# Display a preview of the merged data
print(merged_data_2025.head())


## Feature engineering


In [None]:
# NOTE(review): "merged_nba_stats_2025.csv" is not written by any cell visible
# in this notebook (the merge cell saves 'nba_combined_team_stats_2025.csv');
# presumably it is produced elsewhere -- confirm before a Restart & Run All.
nba_stats_2025 = pd.read_csv("merged_nba_stats_2025.csv")

# Share of missing values per column, computed once for the whole frame.
null_share = nba_stats_2025.isna().mean()

# Drop columns whose name contains "Unnamed" and columns that are more than
# half empty.
columns_to_drop = [
    name
    for name in nba_stats_2025.columns
    if "Unnamed" in name or null_share[name] > 0.5
]
nba_cleaned = nba_stats_2025.drop(columns=columns_to_drop)

In [None]:
# Columns kept for modelling: the team identifier followed by the chosen metrics.
Features_2025 = [
    "Team",
    "W/L%",
    "PW",
    "SRS",
    "NRtg",
    "ORtg",
    "TS%",
    "DRtg",
    "PA/G",
    "eFG%",
    "3P%",
    "FG%",
]

# Keep only the feature columns, in the order listed above.
# NOTE(review): this selects from nba_stats_2025, so the nba_cleaned frame
# built in the previous cell is not used -- confirm that is intended.
filtered_data_2025 = nba_stats_2025.loc[:, Features_2025]

In [None]:
# Save the selected features. index=False matches every other to_csv call in
# this notebook and keeps the row index from being written as an extra
# unnamed column (which shows up as "Unnamed: 0" when the file is read back).
filtered_data_2025.to_csv("Featured_data_2025.csv", index=False)