In [41]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO
import io
import time
import numpy as np

In [3]:
years = [2023, 2024, 2025]
months  = ["october", "november", "december", "january", "february", "march", "april"]
teams = ['ATL', 'BOS', 'BRK', 'CHO', 'CHI', 'CLE', 'NYK', 'IND', 'MIL', 'DET', 'MIA', 'ORL', 'PHI', 'TOR', 'WAS',
         'OKC', 'MEM', 'DEN', 'LAL', 'HOU', 'LAC', 'MIN', 'DAL', 'SAC', 'GSW', 'PHO', 'SAS', 'POR', 'UTA', 'NOP']
team_name_to_code = {
    "Atlanta Hawks": "ATL",
    "Boston Celtics": "BOS",
    "Brooklyn Nets": "BRK",
    "Charlotte Hornets": "CHO",
    "Chicago Bulls": "CHI",
    "Cleveland Cavaliers": "CLE",
    "New York Knicks": "NYK",
    "Indiana Pacers": "IND",
    "Milwaukee Bucks": "MIL",
    "Detroit Pistons": "DET",
    "Miami Heat": "MIA",
    "Orlando Magic": "ORL",
    "Philadelphia 76ers": "PHI",
    "Toronto Raptors": "TOR",
    "Washington Wizards": "WAS",
    "Oklahoma City Thunder": "OKC",
    "Memphis Grizzlies": "MEM",
    "Denver Nuggets": "DEN",
    "Los Angeles Lakers": "LAL",
    "Houston Rockets": "HOU",
    "Los Angeles Clippers": "LAC",
    "Minnesota Timberwolves": "MIN",
    "Dallas Mavericks": "DAL",
    "Sacramento Kings": "SAC",
    "Golden State Warriors": "GSW",
    "Phoenix Suns": "PHO",
    "San Antonio Spurs": "SAS",
    "Portland Trail Blazers": "POR",
    "Utah Jazz": "UTA",
    "New Orleans Pelicans": "NOP"
}
all_games = []

In [None]:
def scrape_team_games(team, year):
    """
    Scrape all games for a given team from Basketball-Reference.
    """
    url = f"https://www.basketball-reference.com/teams/{team}/{year}_games.html"
    response = requests.get(url)
    
    if response.status_code != 200:
        print(f"Failed to fetch data for {team}")
        return None
    
    soup = BeautifulSoup(response.text, "html.parser")
    

    table = soup.find("table", {"id": "games"})
    if table is None:
        print(f"No table found for {team}")
        return None
    

    table_html = str(table)
    df = pd.read_html(io.StringIO(table_html))[0] 
    df = df.dropna(axis=1, how="all")
    

    

    if df.iloc[0, 0] == "Date":
        df = df[1:]

    

    rename_dict = {
        "Date": "date",
        "Start (ET)": "start_time",
        "Opponent": "opponent",
        "Unnamed: 5": "home_away", 
        "Tm": "points_scored", 
        "Opp": "points_allowed",  
        "Unnamed: 7": "result"  
    }
    
    df = df.rename(columns={col: rename_dict[col] for col in rename_dict if col in df.columns})
    
    df["date"] = pd.to_datetime(df["date"], format="%a, %b %d, %Y", errors="coerce")
    df["opponent"] = df["opponent"].map(team_name_to_code)

    df["home_away"] = df["home_away"].apply(lambda x: "Away" if x == "@" else "Home")
    df = df[df["result"].notna()]
    # Add team name
    df["team"] = team
    df["year"] = year
    

    df = df[["date", "team", "home_away", "opponent", "points_scored", "points_allowed", "result", "year"]]
    return df

In [5]:
for year in years:
    for team in teams:
        team_games = scrape_team_games(team, year)
        if team_games is not None:
            all_games.append(team_games)
        time.sleep(1) 


games_df = pd.concat(all_games, ignore_index=True)
games_df.to_csv("nba_games_by_team.csv", index=False)
print(games_df.head)
        

<bound method NDFrame.head of            date team home_away opponent points_scored points_allowed result  \
0    2022-10-19  ATL      Home      HOU           117            107      W   
1    2022-10-21  ATL      Home      ORL           108             98      W   
2    2022-10-23  ATL      Home      CHO           109            126      L   
3    2022-10-26  ATL      Away      DET           118            113      W   
4    2022-10-28  ATL      Away      DET           136            112      W   
...         ...  ...       ...      ...           ...            ...    ...   
6983 2025-03-09  NOP      Home      MEM           104            107      L   
6984 2025-03-11  NOP      Home      LAC           127            120      W   
6985 2025-03-13  NOP      Home      ORL            93            113      L   
6986 2025-03-15  NOP      Away      SAS           115            119      L   
6987 2025-03-17  NOP      Home      DET            81            127      L   

      year  
0     20

In [6]:
df = pd.read_csv("nba_games_by_team.csv") 

In [7]:
df["points_scored"] = pd.to_numeric(df["points_scored"], errors="coerce")
df["points_allowed"] = pd.to_numeric(df["points_allowed"], errors="coerce")

In [8]:
print(df.head)


<bound method NDFrame.head of             date team home_away opponent  points_scored  points_allowed  \
0     2022-10-19  ATL      Home      HOU            117             107   
1     2022-10-21  ATL      Home      ORL            108              98   
2     2022-10-23  ATL      Home      CHO            109             126   
3     2022-10-26  ATL      Away      DET            118             113   
4     2022-10-28  ATL      Away      DET            136             112   
...          ...  ...       ...      ...            ...             ...   
6983  2025-03-09  NOP      Home      MEM            104             107   
6984  2025-03-11  NOP      Home      LAC            127             120   
6985  2025-03-13  NOP      Home      ORL             93             113   
6986  2025-03-15  NOP      Away      SAS            115             119   
6987  2025-03-17  NOP      Home      DET             81             127   

     result  year  
0         W  2023  
1         W  2023  
2        

In [None]:
all_team_stats = pd.DataFrame()

for year in years:
    url = f"https://www.basketball-reference.com/leagues/NBA_{year}.html"
    headers = {"User-Agent": "Mozilla/5.0"}
    
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    
    table = soup.find("table", {"id": "advanced-team"})
    
    if table:
        table_str = StringIO(str(table))
        team_stats = pd.read_html(table_str)[0]  

        team_stats.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in team_stats.columns]
        
        rename_dict = {
            "Unnamed: 1_level_0_Team": "team",
            "Unnamed: 10_level_0_ORtg": "ORtg",
            "Unnamed: 11_level_0_DRtg": "DRtg",
            "Unnamed: 12_level_0_NRtg": "NRtg",
            "Unnamed: 13_level_0_Pace": "Pace"
        }
        # Rename relevant columns
        team_stats.rename(columns=rename_dict, inplace=True)

        team_stats = team_stats[["team", "ORtg", "DRtg", "NRtg", "Pace"]]
        team_stats["team"] = team_stats["team"].str.replace("*", "", regex=False).map(team_name_to_code)
        # Add season year
        team_stats["Year"] = year


        team_stats = team_stats[~team_stats["team"].str.contains("League Average", na=False)]
        team_stats = team_stats.dropna(subset=["team"])

        all_team_stats = pd.concat([all_team_stats, team_stats], ignore_index=True)


all_team_stats.to_csv("nba_team_advanced_stats_all_seasons.csv", index=False)


print(all_team_stats.head)


<bound method NDFrame.head of    team   ORtg   DRtg  NRtg   Pace  Year
0   BOS  118.0  111.5   6.5   98.5  2023
1   CLE  116.1  110.6   5.5   95.7  2023
2   PHI  117.7  113.3   4.4   96.9  2023
3   MEM  115.1  111.2   3.9  101.1  2023
4   MIL  115.4  111.9   3.5  100.5  2023
..  ...    ...    ...   ...    ...   ...
85  BRK  109.6  115.8  -6.2   96.0  2025
86  UTA  111.6  119.1  -7.5  100.0  2025
87  CHO  107.7  115.5  -7.8   97.7  2025
88  NOP  110.8  119.5  -8.7   99.2  2025
89  WAS  107.0  118.6 -11.6  101.3  2025

[90 rows x 6 columns]>


In [11]:
games = pd.read_csv("nba_games_by_team.csv")
team_stats = pd.read_csv("nba_team_advanced_stats_all_seasons.csv")

In [12]:
print("Game Data Columns:", games.columns)
print("Team Stats Columns:", team_stats.columns)

Game Data Columns: Index(['date', 'team', 'home_away', 'opponent', 'points_scored',
       'points_allowed', 'result', 'year'],
      dtype='object')
Team Stats Columns: Index(['team', 'ORtg', 'DRtg', 'NRtg', 'Pace', 'Year'], dtype='object')


In [13]:
games = games.merge(team_stats, left_on=["team", "year"], right_on=["team", "Year"], suffixes=("", "_home"))

In [14]:
games = games.merge(team_stats, left_on=["opponent", "year"], right_on=["team", "Year"], suffixes=("", "_opponent"))
games.drop(columns=["team_opponent"], inplace=True)

In [15]:
games.to_csv("nba_game_data_with_stats_all_seasons.csv", index=False)

In [16]:
print(games.head)

<bound method NDFrame.head of             date team home_away opponent  points_scored  points_allowed  \
0     2022-10-19  ATL      Home      HOU            117             107   
1     2022-11-25  ATL      Away      HOU            122             128   
2     2022-12-27  BOS      Home      HOU            126             102   
3     2023-03-13  BOS      Away      HOU            109             111   
4     2023-03-07  BRK      Away      HOU            118              96   
...          ...  ...       ...      ...            ...             ...   
6983  2025-02-05  SAS      Away      ATL            126             125   
6984  2024-11-17  POR      Home      ATL            114             110   
6985  2025-01-07  UTA      Home      ATL            121             124   
6986  2024-11-03  NOP      Home      ATL            111             126   
6987  2024-12-02  NOP      Away      ATL            112             124   

     result  year   ORtg   DRtg  NRtg   Pace  Year  ORtg_opponent  \


In [17]:
games = pd.read_csv("nba_game_data_with_stats_all_seasons.csv")
games["date"] = pd.to_datetime(games["date"])
games = games.sort_values(by=["team", "date"]) 

In [18]:
def add_rolling_avg(df, team_col, stat_col, new_col_name, window=5):
    df[new_col_name] = df.groupby(team_col)[stat_col].transform(lambda x: x.shift().rolling(window, min_periods=1).mean())

In [19]:
def last_x_games(df, games):
    col_name = "wins_last_" + str(games)
    df["win"] = (df["result"] == "W").astype(int)
    df = df.sort_values(by=["team", "date"])
    df[col_name] = df.groupby("team")["win"].transform(lambda x: x.shift().rolling(games, min_periods=1).sum())
    df[col_name] = df[col_name].fillna(0).astype(int)
    return df

In [20]:
add_rolling_avg(games, "team", "points_scored", "points_scored_rolling5")
add_rolling_avg(games, "team", "points_allowed", "points_allowed_rolling5")
games = last_x_games(games, 5)
games = last_x_games(games, 10)

In [21]:
print(games.head)

<bound method NDFrame.head of            date team home_away opponent  points_scored  points_allowed result  \
0    2022-10-19  ATL      Home      HOU            117             107      W   
82   2022-10-21  ATL      Home      ORL            108              98      W   
164  2022-10-23  ATL      Home      CHO            109             126      L   
246  2022-10-26  ATL      Away      DET            118             113      W   
247  2022-10-28  ATL      Away      DET            136             112      W   
...         ...  ...       ...      ...            ...             ...    ...   
6415 2025-03-10  WAS      Away      TOR            104             119      L   
5529 2025-03-11  WAS      Away      DET            103             123      L   
5530 2025-03-13  WAS      Away      DET            129             125      W   
6055 2025-03-15  WAS      Away      DEN            126             123      W   
5645 2025-03-17  WAS      Away      POR             97             112      L  

In [22]:
games.to_csv("nba_game_data_with_features.csv", index=False)

In [None]:
def calculate_streaks(df):
    df = df.sort_values(by=["team", "date"]) 

    win_streaks = []
    loss_streaks = []

    for team, group in df.groupby("team"):
        win_streak = 0
        loss_streak = 0

        team_win_streaks = [0]  # First game always has a 0 streak
        team_loss_streaks = [0] 

        for result in group["win"].iloc[:-1]:  
            if result == 1:
                win_streak += 1
                loss_streak = 0 
            else:
                loss_streak += 1
                win_streak = 0  

            team_win_streaks.append(win_streak)
            team_loss_streaks.append(loss_streak)

        win_streaks.extend(team_win_streaks)
        loss_streaks.extend(team_loss_streaks)

    df["win_streak"] = win_streaks
    df["loss_streak"] = loss_streaks

    return df


In [24]:
games = pd.read_csv("nba_game_data_with_features.csv")
games = calculate_streaks(games)

In [25]:
games.to_csv("nba_game_data_with_features_win_streak.csv", index=False)

In [26]:
print(games[["date", "team", "opponent", "win_streak", "win", "loss_streak"]].head())

         date team opponent  win_streak  win  loss_streak
0  2022-10-19  ATL      HOU           0    1            0
1  2022-10-21  ATL      ORL           1    1            0
2  2022-10-23  ATL      CHO           2    0            0
3  2022-10-26  ATL      DET           0    1            1
4  2022-10-28  ATL      DET           1    1            0


In [27]:
print(games.columns)

Index(['date', 'team', 'home_away', 'opponent', 'points_scored',
       'points_allowed', 'result', 'year', 'ORtg', 'DRtg', 'NRtg', 'Pace',
       'Year', 'ORtg_opponent', 'DRtg_opponent', 'NRtg_opponent',
       'Pace_opponent', 'Year_opponent', 'points_scored_rolling5',
       'points_allowed_rolling5', 'win', 'wins_last_5', 'wins_last_10',
       'win_streak', 'loss_streak'],
      dtype='object')


In [28]:
games = games = pd.read_csv("nba_game_data_with_features_win_streak.csv")

In [29]:
games["date"] = pd.to_datetime(games["date"], errors="coerce")
games = games.sort_values(by=["team", "date"])
games["rest_days"] = games.groupby("team")["date"].diff().dt.days.fillna(3).astype(int)

In [30]:
print(games[["date", "team", "opponent", "rest_days"]].head())

        date team opponent  rest_days
0 2022-10-19  ATL      HOU          3
1 2022-10-21  ATL      ORL          2
2 2022-10-23  ATL      CHO          2
3 2022-10-26  ATL      DET          3
4 2022-10-28  ATL      DET          2


In [None]:

games["home_win"] = ((games["home_away"] == "Home") & (games["result"] == "W")).astype(int)
games["away_win"] = ((games["home_away"] == "Away") & (games["result"] == "W")).astype(int)


games["home_win_pct"] = games.groupby("team")["home_win"].transform(lambda x: x.shift().rolling(10, min_periods=1).mean())
games["away_win_pct"] = games.groupby("team")["away_win"].transform(lambda x: x.shift().rolling(10, min_periods=1).mean())

In [None]:

games["head_to_head_wins"] = games.groupby(["team", "opponent"])["win"].transform(lambda x: x.shift().rolling(5, min_periods=1).sum())

In [None]:

games["season_win_pct"] = games.groupby("team")["win"].transform(lambda x: x.expanding().mean())


In [None]:

games["points_scored"] = pd.to_numeric(games["points_scored"], errors="coerce")
games["points_allowed"] = pd.to_numeric(games["points_allowed"], errors="coerce")


games["point_differential"] = games["points_scored"] - games["points_allowed"]


games["rolling_point_differential"] = games.groupby("team")["point_differential"].transform(lambda x: x.shift().rolling(5, min_periods=1).mean())


In [35]:
print(games[["date", "team", "opponent", "rest_days", "home_win_pct", "away_win_pct", "head_to_head_wins", "season_win_pct"]].head())

        date team opponent  rest_days  home_win_pct  away_win_pct  \
0 2022-10-19  ATL      HOU          3           NaN           NaN   
1 2022-10-21  ATL      ORL          2      1.000000          0.00   
2 2022-10-23  ATL      CHO          2      1.000000          0.00   
3 2022-10-26  ATL      DET          3      0.666667          0.00   
4 2022-10-28  ATL      DET          2      0.500000          0.25   

   head_to_head_wins  season_win_pct  
0                NaN        1.000000  
1                NaN        1.000000  
2                NaN        0.666667  
3                NaN        0.750000  
4                1.0        0.800000  


In [None]:

games["head_to_head_wins"] = games["head_to_head_wins"].fillna(0).astype(int)
games["points_scored_rolling5"] = games["points_scored_rolling5"].fillna(0).astype(int)
games["points_allowed_rolling5"] = games["points_allowed_rolling5"].fillna(0).astype(int)
games["rolling_point_differential"] = games["rolling_point_differential"].fillna(0).astype(int)


games["home_win_pct"] = games["home_win_pct"].fillna(0.5)
games["away_win_pct"] = games["away_win_pct"].fillna(0.5)


games["season_win_pct"] = games["season_win_pct"].fillna(0)

In [None]:
print(games.isnull().sum())
print(games.dtypes)

date                          0
team                          0
home_away                     0
opponent                      0
points_scored                 0
points_allowed                0
result                        0
year                          0
ORtg                          0
DRtg                          0
NRtg                          0
Pace                          0
Year                          0
ORtg_opponent                 0
DRtg_opponent                 0
NRtg_opponent                 0
Pace_opponent                 0
Year_opponent                 0
points_scored_rolling5        0
points_allowed_rolling5       0
win                           0
wins_last_5                   0
wins_last_10                  0
win_streak                    0
loss_streak                   0
rest_days                     0
home_win                      0
away_win                      0
home_win_pct                  0
away_win_pct                  0
head_to_head_wins             0
season_w

In [38]:
games.to_csv("nba_game_data_with_all_features.csv", index=False)

In [39]:
games = games.sort_values(by=["team", "date"])

In [None]:
# Function for weighted rolling average
def weighted_rolling_avg(series, window):
    weights = np.arange(1, window + 1) 
    return series.rolling(window).apply(lambda x: np.dot(x, weights) / weights.sum(), raw=True)

# Apply to key features
games["weighted_points_scored"] = (
    games.groupby("team")["points_scored"]
    .apply(lambda x: weighted_rolling_avg(x.shift(1), 5))  
    .reset_index(level=0, drop=True)  
)

games["weighted_points_allowed"] = (
    games.groupby("team")["points_allowed"]
    .apply(lambda x: weighted_rolling_avg(x.shift(1), 5))  
    .reset_index(level=0, drop=True)  
)

In [None]:
def games_played_last_x_days(df, team_col, date_col, days):
    df = df.sort_values(by=[team_col, date_col])
    

    games_played = (
        df.groupby(team_col)[date_col]
        .apply(lambda x: x.diff().dt.days.rolling(days, min_periods=1).sum())
        .reset_index(level=0, drop=True)  
    )
    
    return games_played.fillna(0).astype(int)


games["games_last_3"] = games_played_last_x_days(games, "team", "date", 3)
games["games_last_5"] = games_played_last_x_days(games, "team", "date", 5)
games["games_last_7"] = games_played_last_x_days(games, "team", "date", 7)


In [None]:
home_rolling = (
    games[games["home_away"] == "Home"]
    .groupby("team")["points_scored"]
    .apply(lambda x: x.shift().rolling(5, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)
home_rolling_allowed = (
    games[games["home_away"] == "Home"]
    .groupby("team")["points_allowed"]
    .apply(lambda x: x.shift().rolling(5, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)
games["home_points_allowed_rolling5"] = home_rolling_allowed.reindex(games.index)
games["home_points_scored_rolling5"] = home_rolling.reindex(games.index)

In [45]:
away_rolling = (
    games[games["home_away"] == "Away"]
    .groupby("team")["points_scored"]
    .apply(lambda x: x.shift().rolling(5, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)
games["away_points_scored_rolling5"] = away_rolling.reindex(games.index)

away_rolling_allowed = (
    games[games["home_away"] == "Away"]
    .groupby("team")["points_allowed"]
    .apply(lambda x: x.shift().rolling(5, min_periods=1).mean())
    .reset_index(level=0, drop=True)
)
games["away_points_allowed_rolling5"] = away_rolling_allowed.reindex(games.index)


In [46]:
games.to_csv("nba_game_data_with_all_features.csv", index=False)

In [52]:
print(games.head)

<bound method NDFrame.head of            date team home_away opponent  points_scored  points_allowed result  \
0    2022-10-19  ATL      Home      HOU            117             107      W   
1    2022-10-21  ATL      Home      ORL            108              98      W   
2    2022-10-23  ATL      Home      CHO            109             126      L   
3    2022-10-26  ATL      Away      DET            118             113      W   
4    2022-10-28  ATL      Away      DET            136             112      W   
...         ...  ...       ...      ...            ...             ...    ...   
6983 2025-03-10  WAS      Away      TOR            104             119      L   
6984 2025-03-11  WAS      Away      DET            103             123      L   
6985 2025-03-13  WAS      Away      DET            129             125      W   
6986 2025-03-15  WAS      Away      DEN            126             123      W   
6987 2025-03-17  WAS      Away      POR             97             112      L  

In [None]:
games.fillna(0, inplace=True) 

In [49]:
max(games["away_points_allowed_rolling5"])

142.4

In [53]:
games["home_points_scored_rolling5"].fillna(
    games.groupby("team")["points_scored"].transform("mean"), inplace=True
)
games["home_points_allowed_rolling5"].fillna(
    games.groupby("team")["points_allowed"].transform("mean"), inplace=True
)
games["away_points_scored_rolling5"].fillna(
    games.groupby("team")["points_scored"].transform("mean"), inplace=True
)
games["away_points_allowed_rolling5"].fillna(
    games.groupby("team")["points_allowed"].transform("mean"), inplace=True
)


In [None]:
games.to_csv("nba_game_data_with_all_features_final.csv", index=False)