## Import libraries

In [1]:
from nba_api.stats.endpoints import leaguegamefinder, playergamelog, commonteamroster
from nba_api.stats.static import teams, players
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import os
import matplotlib.pyplot as plt

%load_ext autoreload 
%autoreload 2

## Data collection

**References:**
- https://medium.com/@juliuscecilia33/predicting-nba-game-results-using-machine-learning-and-python-6be209d6d165
- https://github.com/luke-lite/NBA-Web-Scraper/tree/main
- https://github.com/luke-lite/NBA-Prediction-Modeling?tab=readme-ov-file

In [2]:
# URL for web scraping of Boston Celtics games
FUTURE_SCHEDULE_URL = 'https://www.basketball-reference.com/teams/BOS/2025_games.html'

# 1 Fetch historical stats with LeagueGameFinder
def fetch_past_games():
    nba_teams = teams.get_teams()
    celtics = [team for team in nba_teams if team['abbreviation'] == 'BOS'][0]
    team_id = celtics['id']

    # Fetch past games
    gamefinder = leaguegamefinder.LeagueGameFinder(team_id_nullable=team_id)
    past_games = gamefinder.get_data_frames()[0]

    # Clean and process data
    past_games['GAME_DATE'] = pd.to_datetime(past_games['GAME_DATE'])
    past_games['WIN'] = past_games['WL'].apply(lambda x: 1 if x == 'W' else 0)
    past_games['HOME_GAME'] = past_games['MATCHUP'].apply(lambda x: 1 if 'vs.' in x else 0)
    return past_games

# 2 Fetch upcoming games from Basketball Reference
def fetch_future_games():
    response = requests.get(FUTURE_SCHEDULE_URL)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract the schedule table
    table = soup.find('table', {'id': 'games'})
    future_games = pd.read_html(str(table))[0]

    # Filter and clean the data
    future_games = future_games[future_games['Date'].notna()]                   # Drop invalid rows
    future_games['Date'] = pd.to_datetime(future_games['Date'], errors='coerce')
    future_games = future_games[future_games['Date'] >= datetime.now()]         # Only future dates

    return future_games

# 3 Fetch player data
def fetch_player_data():
    # Boston Celtics Team ID (1610612738)
    team_id = 1610612738

    # Fetch current team roster
    roster = commonteamroster.CommonTeamRoster(team_id=team_id).get_data_frames()[0]
    player_ids = roster['PLAYER_ID']
    player_names = roster['PLAYER']

    # Create an empty DataFrame to store player game logs
    all_player_logs = pd.DataFrame()

    # Fetch recent game logs for each player on the team
    for player_id, player_name in zip(player_ids, player_names):
        print(f"Fetching game log for: {player_name}")
        try:
            player_log = playergamelog.PlayerGameLog(player_id=player_id, season='2023-24').get_data_frames()[0]
            player_log['PLAYER_NAME'] = player_name
            all_player_logs = pd.concat([all_player_logs, player_log], ignore_index=True)
        except Exception as e:
            print(f"Error fetching data for {player_name}: {e}")

    return all_player_logs


# Step 4: Combine Historical and Future Games
def main():

    # Call functions
    print("\nFetching past games...")
    past_games = fetch_past_games()
    print("\nFetching future games...")
    future_games = fetch_future_games()
    print("\nFetching player data...")
    player_logs = fetch_player_data()

    # Save to CSV
    output_folder = "data"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Save files into the 'data' folder
    future_games.to_csv(os.path.join(output_folder, 'celtics_future_games.csv'), index=False)
    past_games.to_csv(os.path.join(output_folder, 'celtics_past_games.csv'), index=False)
    player_logs.to_csv(os.path.join(output_folder, 'celtics_player_data.csv'), index=False)
    print(".csv files saved.")

if __name__ == "__main__":
    main()



Fetching past games...

Fetching future games...


  future_games = pd.read_html(str(table))[0]



Fetching player data...
Fetching game log for: Jayson Tatum
Fetching game log for: Jrue Holiday
Fetching game log for: Jaylen Brown
Fetching game log for: Kristaps Porziņģis
Fetching game log for: Derrick White
Fetching game log for: Payton Pritchard
Fetching game log for: Drew Peterson
Fetching game log for: JD Davison
Fetching game log for: Xavier Tillman
Fetching game log for: Jordan Walsh
Fetching game log for: Anton Watson


  all_player_logs = pd.concat([all_player_logs, player_log], ignore_index=True)


Fetching game log for: Sam Hauser
Fetching game log for: Luke Kornet
Fetching game log for: Al Horford
Fetching game log for: Jaden Springer
Fetching game log for: Baylor Scheierman


  all_player_logs = pd.concat([all_player_logs, player_log], ignore_index=True)


Fetching game log for: Neemias Queta
.csv files saved.


## Data cleaning