# NBA Data Collection for Predictive Modeling

This notebook collects data from ESPN's NBA APIs and structures it into tables suitable for predictive modeling.

## Data Sources:
- Team Information
- Game Scores/Results
- Game Summaries
- News (for context)


In [None]:
# Import necessary libraries
import requests
import pandas as pd
import json
from datetime import datetime, timedelta
import time
from typing import Dict, List, Optional
import warnings
warnings.filterwarnings('ignore')


## API Configuration and Helper Functions


In [None]:
# Base URL for ESPN's NBA site API.
# HTTPS is used so requests and responses are not sent in plaintext.
NBA_BASE_URL = "https://site.api.espn.com/apis/site/v2/sports/basketball/nba"

def fetch_api_data(url: str, params: Optional[Dict] = None) -> Optional[Dict]:
    """
    Fetch JSON data from an ESPN API endpoint with error handling.

    Args:
        url: Full endpoint URL to request.
        params: Optional query-string parameters passed to requests.get.

    Returns:
        The decoded JSON body, or None when the request fails, the server
        answers with an error status, or the body is not valid JSON. Errors
        are reported with print() rather than raised, so callers must check
        for None.
    """
    try:
        response = requests.get(url, params=params, timeout=10)
        response.raise_for_status()
        return response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error is not a RequestException subclass.
        print(f"Error fetching data from {url}: {e}")
        return None

def get_all_teams() -> pd.DataFrame:
    """
    Build a DataFrame with one row per team listed by the /teams endpoint.

    The payload is walked as sports -> leagues -> teams. An empty DataFrame
    is returned when the request failed or the payload has no 'sports' key.
    """
    payload = fetch_api_data(f"{NBA_BASE_URL}/teams")

    # Guard clause: nothing usable came back
    if not payload or 'sports' not in payload:
        return pd.DataFrame()

    rows = []
    for sport in payload['sports']:
        for league in sport.get('leagues', []):
            for entry in league.get('teams', []):
                info = entry.get('team', {})
                groups = info.get('groups', {})
                rows.append({
                    'team_id': info.get('id'),
                    'team_name': info.get('displayName'),
                    'team_abbreviation': info.get('abbreviation'),
                    'location': info.get('location'),
                    'conference': groups.get('conference'),
                    'division': groups.get('division'),
                    'logo': info.get('logo'),
                    'color': info.get('color'),
                    'alternate_color': info.get('alternateColor'),
                })
    return pd.DataFrame(rows)

# Confirmation message printed once the helper definitions above have run
print("Helper functions loaded successfully!")


## Table 1: NBA Teams Information


In [None]:
# Fetch all NBA teams
teams_df = get_all_teams()

print(f"Total teams fetched: {len(teams_df)}")
print("\nTeams DataFrame:")
print(teams_df.head(10))
print("\nTeams DataFrame Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would add a stray "None" line.
teams_df.info()


## Table 2: Game Scores and Results


In [None]:
def _to_score(raw):
    """Convert a raw score value to a number, or None when absent or unparseable."""
    if raw is None or isinstance(raw, bool):
        return None
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return None
    # Keep whole scores as ints so they read naturally in the table
    return int(value) if value.is_integer() else value


def _parse_scoreboard_event(event: Dict, date: str) -> Dict:
    """Flatten one scoreboard event into a single game record."""
    status_type = (event.get('status') or {}).get('type') or {}
    game_info = {
        'game_id': event.get('id'),
        'date': date,
        'name': event.get('name'),
        'short_name': event.get('shortName'),
        'status': status_type.get('description'),
        'completed': status_type.get('completed', False),
    }

    # Get competition details. An empty 'competitions' list would make a
    # bare [0] raise IndexError, so fall back to an empty competition.
    competitions = event.get('competitions') or [{}]
    competition = competitions[0]
    game_info['venue'] = (competition.get('venue') or {}).get('fullName')
    game_info['attendance'] = competition.get('attendance')

    # Pre-fill every team column so all records share the same keys even
    # when an event lists fewer than two competitors.
    for side in ('home', 'away'):
        game_info[f'{side}_team_id'] = None
        game_info[f'{side}_team_name'] = None
        game_info[f'{side}_team_abbrev'] = None
        game_info[f'{side}_score'] = None
        game_info[f'{side}_winner'] = None
    game_info['point_differential'] = None

    # Get team information and scores
    competitors = competition.get('competitors') or []
    if len(competitors) >= 2:
        first_is_home = competitors[0].get('homeAway') == 'home'
        home_team = competitors[0] if first_is_home else competitors[1]
        away_team = competitors[1] if first_is_home else competitors[0]

        for side, competitor in (('home', home_team), ('away', away_team)):
            team = competitor.get('team') or {}
            game_info[f'{side}_team_id'] = team.get('id')
            game_info[f'{side}_team_name'] = team.get('displayName')
            game_info[f'{side}_team_abbrev'] = team.get('abbreviation')
            # Scores are converted to numbers so they can be subtracted
            # and averaged; textual scores cannot.
            game_info[f'{side}_score'] = _to_score(competitor.get('score'))
            game_info[f'{side}_winner'] = competitor.get('winner', False)

        # Calculate point differential (home minus away). Compare against
        # None rather than truthiness so a score of 0 is still valid.
        home_score = game_info['home_score']
        away_score = game_info['away_score']
        if home_score is not None and away_score is not None:
            game_info['point_differential'] = home_score - away_score

    return game_info


def get_scoreboard(dates: Optional[List[str]] = None) -> pd.DataFrame:
    """
    Fetch scoreboard data for the specified dates.

    Args:
        dates: Dates as 'YYYYMMDD' strings. When omitted, today and the six
            preceding days are fetched.

    Returns:
        DataFrame with one row per game: identifiers, status, venue,
        attendance, home/away team details, numeric scores and the
        home-minus-away point differential. Empty when no games are found.
    """
    if dates is None:
        # Default window: today plus the six days before it
        today = datetime.now()
        dates = [(today - timedelta(days=i)).strftime('%Y%m%d') for i in range(7)]

    all_games = []

    for date in dates:
        url = f"{NBA_BASE_URL}/scoreboard"
        data = fetch_api_data(url, {'dates': date})

        if data and 'events' in data:
            all_games.extend(
                _parse_scoreboard_event(event, date) for event in data['events']
            )

        time.sleep(0.5)  # Be respectful to the API

    return pd.DataFrame(all_games)

# Fetch recent game scores
games_df = get_scoreboard()

print(f"Total games fetched: {len(games_df)}")
if len(games_df) > 0:
    print("\nGames DataFrame:")
    print(games_df.head(10))
    print("\nGames DataFrame Info:")
    # DataFrame.info() prints its own report and returns None, so it is
    # called directly instead of being wrapped in print().
    games_df.info()
else:
    print("No games found. This might be during off-season.")


## Table 3: Detailed Game Statistics


In [None]:
def get_game_summary(game_id: str) -> Dict:
    """
    Request the summary payload for a single game.

    The game is selected through the 'event' query parameter, and whatever
    fetch_api_data returns is handed back unchanged.
    """
    return fetch_api_data(f"{NBA_BASE_URL}/summary", {'event': game_id})

def extract_game_stats(game_data: Dict) -> pd.DataFrame:
    """
    Extract per-team statistics from a game summary payload.

    Args:
        game_data: Decoded summary payload, or None/empty when the fetch failed.

    Returns:
        DataFrame with one row per competitor holding game_id, team_id,
        team_name, home_away, score, winner and one column per named
        statistic (lower-cased, spaces replaced by underscores, values kept
        as the payload's display strings). Empty when there is no payload or
        no competitors.
    """
    if not game_data:
        return pd.DataFrame()

    header = game_data.get('header') or {}
    # An empty 'competitions' list would make a bare [0] raise IndexError
    competitions = header.get('competitions') or [{}]
    competitors = competitions[0].get('competitors') or []

    # NOTE(review): assumes summary payloads may also carry team stats under
    # boxscore.teams[].statistics keyed by team.id. Confirm against a live
    # response. It is only used when a competitor has no statistics.
    boxscore_stats = {}
    for entry in (game_data.get('boxscore') or {}).get('teams') or []:
        entry_team_id = (entry.get('team') or {}).get('id')
        if entry_team_id is not None:
            boxscore_stats[str(entry_team_id)] = entry.get('statistics') or []

    stats_list = []
    for competitor in competitors:
        team = competitor.get('team') or {}
        team_id = team.get('id')

        stats_dict = {
            'game_id': header.get('id'),
            'team_id': team_id,
            'team_name': team.get('displayName'),
            'home_away': competitor.get('homeAway'),
            'score': competitor.get('score'),
            'winner': competitor.get('winner', False)
        }

        # Prefer statistics attached to the competitor, else the box score
        team_stats = competitor.get('statistics') or boxscore_stats.get(str(team_id), [])
        for stat in team_stats:
            stat_name = (stat.get('name') or '').lower().replace(' ', '_')
            if not stat_name:
                # Unnamed stats would all overwrite one '' column
                continue
            stats_dict[stat_name] = stat.get('displayValue')

        stats_list.append(stats_dict)

    return pd.DataFrame(stats_list)

# Get detailed stats for completed games.
# Only the first five completed games are requested, with a short pause
# between summary calls. detailed_stats_df is assigned on every path.
if len(games_df) == 0:
    print("No games data available")
    detailed_stats_df = pd.DataFrame()
else:
    completed_games = games_df[games_df['completed'] == True]

    if len(completed_games) == 0:
        print("No completed games found in recent data")
        detailed_stats_df = pd.DataFrame()
    else:
        print(f"Fetching detailed stats for {min(5, len(completed_games))} completed games...")

        detailed_stats_list = []
        for game_id in completed_games.head(5)['game_id']:
            game_stats = extract_game_stats(get_game_summary(game_id))
            if len(game_stats) > 0:
                detailed_stats_list.append(game_stats)
            time.sleep(0.5)

        if not detailed_stats_list:
            print("No detailed stats available")
            detailed_stats_df = pd.DataFrame()
        else:
            detailed_stats_df = pd.concat(detailed_stats_list, ignore_index=True)
            print(f"\nDetailed Stats DataFrame:")
            print(detailed_stats_df.head())
            print(f"\nColumns: {list(detailed_stats_df.columns)}")


## Table 4: Team Performance Metrics (Aggregated)


In [None]:
def calculate_team_metrics(games_df: pd.DataFrame) -> pd.DataFrame:
    """
    Calculate aggregated team performance metrics from game results.

    Only completed games with both scores present are used. Scores are
    coerced to numbers first, so textual scores are accepted.

    Args:
        games_df: Game table with the columns 'completed', 'home_team_id',
            'away_team_id', 'home_score', 'away_score', 'home_winner' and
            'away_winner'.

    Returns:
        DataFrame with one row per team (team_id as string) holding
        total/home/away game and win counts, win_percentage, avg_points_for,
        avg_points_against and net_rating, sorted by win_percentage
        descending. Empty when there are no usable games.
    """
    if len(games_df) == 0:
        return pd.DataFrame()

    games = games_df.copy()
    games['home_score'] = pd.to_numeric(games['home_score'], errors='coerce')
    games['away_score'] = pd.to_numeric(games['away_score'], errors='coerce')

    # Filter completed games with scores
    completed = games[
        (games['completed'] == True)
        & games['home_score'].notna()
        & games['away_score'].notna()
    ]

    if len(completed) == 0:
        return pd.DataFrame()

    # Reshape to one row per (team, game) so a single groupby covers both
    # the home and the away appearances of every team.
    home_view = pd.DataFrame({
        'team_id': completed['home_team_id'],
        'points_for': completed['home_score'],
        'points_against': completed['away_score'],
        # "== True" maps missing winner flags to False instead of NaN
        'win': (completed['home_winner'] == True).astype(int),
        'home_game': 1,
        'away_game': 0,
    })
    home_view['home_win'] = home_view['win']
    home_view['away_win'] = 0

    away_view = pd.DataFrame({
        'team_id': completed['away_team_id'],
        'points_for': completed['away_score'],
        'points_against': completed['home_score'],
        'win': (completed['away_winner'] == True).astype(int),
        'home_game': 0,
        'away_game': 1,
    })
    away_view['home_win'] = 0
    away_view['away_win'] = away_view['win']

    per_team_games = pd.concat([home_view, away_view], ignore_index=True)
    per_team_games = per_team_games.dropna(subset=['team_id'])
    if len(per_team_games) == 0:
        return pd.DataFrame()
    per_team_games['team_id'] = per_team_games['team_id'].astype(str)

    metrics_df = per_team_games.groupby('team_id').agg(
        total_games=('win', 'count'),
        total_wins=('win', 'sum'),
        home_games=('home_game', 'sum'),
        home_wins=('home_win', 'sum'),
        away_games=('away_game', 'sum'),
        away_wins=('away_win', 'sum'),
        avg_points_for=('points_for', 'mean'),
        avg_points_against=('points_against', 'mean'),
    ).reset_index()

    metrics_df['win_percentage'] = metrics_df['total_wins'] / metrics_df['total_games']
    metrics_df['net_rating'] = metrics_df['avg_points_for'] - metrics_df['avg_points_against']

    metrics_df = metrics_df[[
        'team_id', 'total_games', 'total_wins', 'win_percentage',
        'home_games', 'home_wins', 'away_games', 'away_wins',
        'avg_points_for', 'avg_points_against', 'net_rating',
    ]]
    # Stable sort keeps tied teams in team_id order for reproducible output
    return metrics_df.sort_values('win_percentage', ascending=False, kind='stable')

# Build the aggregated per-team metrics table from the fetched games
team_metrics_df = calculate_team_metrics(games_df)

if len(team_metrics_df) == 0:
    print("No team metrics calculated (insufficient game data)")
else:
    print("Team Performance Metrics:")
    print(team_metrics_df.head(10))
    print(f"\nTotal teams with metrics: {len(team_metrics_df)}")


## Table 5: Head-to-Head Matchup History


In [None]:
def create_matchup_history(games_df: pd.DataFrame) -> pd.DataFrame:
    """
    Create head-to-head matchup statistics between teams.

    Each pair of teams is keyed by its two ids as strings in sorted order;
    "team1" is the id that sorts first. Scores, names and wins are assigned
    to team1/team2 according to which side each team played in each game.

    Args:
        games_df: Game table with the columns 'completed', 'home_team_id',
            'away_team_id', 'home_team_name', 'away_team_name',
            'home_score', 'away_score', 'home_winner' and 'away_winner'.

    Returns:
        DataFrame with one row per team pair: ids, names, games_played,
        team1_wins, team2_wins, team1_win_pct, average scores and
        avg_point_diff (team1 minus team2). Empty when there are no
        completed games with scores.
    """
    if len(games_df) == 0:
        return pd.DataFrame()

    games = games_df.copy()
    games['home_score'] = pd.to_numeric(games['home_score'], errors='coerce')
    games['away_score'] = pd.to_numeric(games['away_score'], errors='coerce')

    completed = games[
        (games['completed'] == True)
        & games['home_score'].notna()
        & games['away_score'].notna()
    ].reset_index(drop=True)

    if len(completed) == 0:
        return pd.DataFrame()

    home_id = completed['home_team_id'].astype(str)
    away_id = completed['away_team_id'].astype(str)
    # True where the home team is the pair's team1 (string order, which is
    # what sorting the two string ids yields).
    home_first = home_id <= away_id

    home_won = completed['home_winner'] == True
    away_won = completed['away_winner'] == True

    # Pick each team1/team2 value from the side that team actually played
    matchup_df = pd.DataFrame({
        'team1_id': home_id.where(home_first, away_id),
        'team2_id': away_id.where(home_first, home_id),
        'team1_name': completed['home_team_name'].where(home_first, completed['away_team_name']),
        'team2_name': completed['away_team_name'].where(home_first, completed['home_team_name']),
        'team1_score': completed['home_score'].where(home_first, completed['away_score']),
        'team2_score': completed['away_score'].where(home_first, completed['home_score']),
        'team1_winner': home_won.where(home_first, away_won),
    })

    # Calculate head-to-head statistics
    h2h_stats = []
    for (team1_id, team2_id), matchup_games in matchup_df.groupby(['team1_id', 'team2_id']):
        games_played = len(matchup_games)
        team1_wins = int(matchup_games['team1_winner'].sum())
        team1_avg_score = matchup_games['team1_score'].mean()
        team2_avg_score = matchup_games['team2_score'].mean()

        h2h_stats.append({
            'team1_id': team1_id,
            'team2_id': team2_id,
            'team1_name': matchup_games['team1_name'].iloc[0],
            'team2_name': matchup_games['team2_name'].iloc[0],
            'games_played': games_played,
            'team1_wins': team1_wins,
            'team2_wins': games_played - team1_wins,
            'team1_win_pct': team1_wins / games_played,
            'team1_avg_score': team1_avg_score,
            'team2_avg_score': team2_avg_score,
            'avg_point_diff': team1_avg_score - team2_avg_score
        })

    return pd.DataFrame(h2h_stats)

# Build the head-to-head table from the fetched games
matchup_history_df = create_matchup_history(games_df)

if len(matchup_history_df) == 0:
    print("No matchup history available")
else:
    print("Head-to-Head Matchup Statistics:")
    print(matchup_history_df.head(10))
    print(f"\nTotal unique matchups: {len(matchup_history_df)}")


## Table 6: Recent Form (Last N Games)


In [None]:
def get_team_recent_form(games_df: pd.DataFrame, n_games: int = 5) -> pd.DataFrame:
    """
    Calculate recent form for each team over its last N completed games.

    Team ids are compared as strings throughout, so integer and string ids
    behave the same. Scores are coerced to numbers before use.

    Args:
        games_df: Game table with the columns 'completed', 'date'
            ('YYYYMMDD'), 'home_team_id', 'away_team_id', 'home_team_name',
            'away_team_name', 'home_score', 'away_score', 'home_winner' and
            'away_winner'.
        n_games: Number of most recent games to consider per team.

    Returns:
        DataFrame with one row per team: team_id, team_name,
        games_in_period, wins, losses, win_percentage, avg_points_for,
        avg_points_against, net_rating, home_games and away_games, sorted
        by win_percentage descending. Empty when there are no usable games.
    """
    if len(games_df) == 0:
        return pd.DataFrame()

    games = games_df.copy()
    games['home_score'] = pd.to_numeric(games['home_score'], errors='coerce')
    games['away_score'] = pd.to_numeric(games['away_score'], errors='coerce')

    completed = games[
        (games['completed'] == True)
        & games['home_score'].notna()
        & games['away_score'].notna()
    ].copy()

    if len(completed) == 0:
        return pd.DataFrame()

    # Sort by date (most recent first). The index is reset so row labels are
    # unique and can be used to align the per-team masks below.
    completed['date'] = pd.to_datetime(completed['date'], format='%Y%m%d', errors='coerce')
    completed = completed.sort_values('date', ascending=False, kind='stable').reset_index(drop=True)

    # String ids are computed once instead of once per team
    home_ids = completed['home_team_id'].astype(str)
    away_ids = completed['away_team_id'].astype(str)
    all_teams = set(completed['home_team_id'].dropna().astype(str)) | \
                set(completed['away_team_id'].dropna().astype(str))

    home_won = completed['home_winner'] == True
    away_won = completed['away_winner'] == True

    team_form = []
    for team_id in sorted(all_teams):
        is_home = home_ids == team_id
        team_games = completed[is_home | (away_ids == team_id)].head(n_games)
        games_in_period = len(team_games)
        if games_in_period == 0:
            continue

        at_home = is_home.loc[team_games.index]
        points_for = team_games['home_score'].where(at_home, team_games['away_score'])
        points_against = team_games['away_score'].where(at_home, team_games['home_score'])
        won = home_won.loc[team_games.index].where(at_home, away_won.loc[team_games.index])

        wins = int(won.sum())
        total_points_for = points_for.sum()
        total_points_against = points_against.sum()
        home_games = int(at_home.sum())

        # Take the name from the side this team played in its latest game;
        # the side is decided on string ids so integer ids match too.
        latest = team_games.iloc[0]
        team_name = latest['home_team_name'] if at_home.iloc[0] else latest['away_team_name']

        team_form.append({
            'team_id': team_id,
            'team_name': team_name,
            'games_in_period': games_in_period,
            'wins': wins,
            'losses': games_in_period - wins,
            'win_percentage': wins / games_in_period,
            'avg_points_for': total_points_for / games_in_period,
            'avg_points_against': total_points_against / games_in_period,
            'net_rating': (total_points_for - total_points_against) / games_in_period,
            'home_games': home_games,
            'away_games': games_in_period - home_games
        })

    form_df = pd.DataFrame(team_form)
    if len(form_df) > 0:
        form_df = form_df.sort_values('win_percentage', ascending=False, kind='stable')

    return form_df

# Compute each team's form over its five most recent completed games
recent_form_df = get_team_recent_form(games_df, n_games=5)

if len(recent_form_df) == 0:
    print("No recent form data available")
else:
    print("Recent Form (Last 5 Games):")
    print(recent_form_df.head(10))


## Data Summary and Export


In [None]:
# Print a row-count summary for every table built above, followed by a
# short list of modeling uses.
banner = "=" * 60
print(banner)
print("DATA COLLECTION SUMMARY")
print(banner)
print(f"\n1. Teams DataFrame: {len(teams_df)} teams")
print(f"2. Games DataFrame: {len(games_df)} games")
# The locals() checks report 0 for any table whose cell was not run
print(f"3. Detailed Stats DataFrame: {len(detailed_stats_df) if 'detailed_stats_df' in locals() else 0} game records")
print(f"4. Team Metrics DataFrame: {len(team_metrics_df) if 'team_metrics_df' in locals() else 0} teams")
print(f"5. Matchup History DataFrame: {len(matchup_history_df) if 'matchup_history_df' in locals() else 0} matchups")
print(f"6. Recent Form DataFrame: {len(recent_form_df) if 'recent_form_df' in locals() else 0} teams")

print("\n" + banner)
print("DATA TABLES READY FOR PREDICTIVE MODELING")
print(banner)
for summary_line in (
    "\nThese tables can be used for:",
    "- Feature engineering (team stats, recent form, head-to-head)",
    "- Target variable creation (win/loss prediction, score prediction)",
    "- Time series analysis (trends over time)",
    "- Team strength ratings",
    "- Home/away performance analysis",
):
    print(summary_line)


## Optional: Export Data to CSV Files


In [None]:
# Optional CSV export of the tables built above.
# The export code is left commented out so running this cell writes no files.
# Uncomment the lines below to write one CSV per table; the file names are
# relative paths. The derived tables are only written when they exist and
# are non-empty.
# teams_df.to_csv('nba_teams.csv', index=False)
# games_df.to_csv('nba_games.csv', index=False)
# if 'detailed_stats_df' in locals() and len(detailed_stats_df) > 0:
#     detailed_stats_df.to_csv('nba_detailed_stats.csv', index=False)
# if 'team_metrics_df' in locals() and len(team_metrics_df) > 0:
#     team_metrics_df.to_csv('nba_team_metrics.csv', index=False)
# if 'matchup_history_df' in locals() and len(matchup_history_df) > 0:
#     matchup_history_df.to_csv('nba_matchup_history.csv', index=False)
# if 'recent_form_df' in locals() and len(recent_form_df) > 0:
#     recent_form_df.to_csv('nba_recent_form.csv', index=False)

# Reminder that nothing was written by this cell
print("Data export code ready (commented out). Uncomment to save CSV files.")


## Additional: Fetch Historical Data for Multiple Dates

To build a more comprehensive dataset for predictive modeling, you can fetch data for specific date ranges:


## Fetch Historical Data for Full Season

Let's fetch every game day of the 2023-24 season (October 1, 2023 through June 30, 2024, capped at today's date) to build a comprehensive dataset.


In [None]:
# Fetch historical data for 2023-24 season (Oct 2023 - June 2024)
# NBA season typically runs from October to June
print("Fetching historical game data...")
print("This may take a few minutes...\n")

# Define date ranges for 2023-24 season
start_date = datetime(2023, 10, 1)
end_date = datetime(2024, 6, 30)
current_date = datetime.now()
# Never request dates in the future
last_date = min(end_date, current_date)

# Generate date list
date_list = []
current = start_date
while current <= last_date:
    date_list.append(current.strftime('%Y%m%d'))
    current += timedelta(days=1)

print(f"Fetching data for {len(date_list)} days ({start_date.strftime('%Y-%m-%d')} to {last_date.strftime('%Y-%m-%d')})")

# Fetch games in batches
historical_batches = []
batch_size = 30  # Process 30 days at a time
total_batches = (len(date_list) + batch_size - 1) // batch_size

for batch_num in range(0, len(date_list), batch_size):
    batch_dates = date_list[batch_num:batch_num + batch_size]
    batch_num_display = (batch_num // batch_size) + 1

    print(f"Processing batch {batch_num_display}/{total_batches} ({len(batch_dates)} dates)...", end=" ")

    # Reuse get_scoreboard so the parsing logic lives in one place;
    # it also pauses between requests to be respectful to the API.
    batch_df = get_scoreboard(dates=batch_dates)
    if len(batch_df) > 0:
        historical_batches.append(batch_df)

    # Count exactly the completed games returned for this batch
    if 'completed' in batch_df.columns:
        completed_in_batch = int((batch_df['completed'] == True).sum())
    else:
        completed_in_batch = 0
    print(f"✓ Found {completed_in_batch} completed games")

    # Save progress periodically
    if batch_num_display % 5 == 0 and historical_batches:
        temp_df = pd.concat(historical_batches, ignore_index=True)
        temp_df.to_csv('nba_games_temp.csv', index=False)

# Create final DataFrame
if historical_batches:
    historical_games_df = pd.concat(historical_batches, ignore_index=True)
else:
    historical_games_df = pd.DataFrame()

# Filter to completed games with scores. When nothing was fetched the
# expected columns are absent, so fall back to an empty table.
if {'completed', 'home_score', 'away_score'}.issubset(historical_games_df.columns):
    # Coerce scores to numbers so the saved table has numeric score columns
    historical_games_df['home_score'] = pd.to_numeric(historical_games_df['home_score'], errors='coerce')
    historical_games_df['away_score'] = pd.to_numeric(historical_games_df['away_score'], errors='coerce')
    completed_games = historical_games_df[
        (historical_games_df['completed'] == True) &
        (historical_games_df['home_score'].notna()) &
        (historical_games_df['away_score'].notna())
    ].copy()
else:
    completed_games = historical_games_df.iloc[0:0].copy()

print(f"\n{'='*60}")
print(f"Data Collection Complete!")
print(f"{'='*60}")
print(f"Total games fetched: {len(historical_games_df)}")
print(f"Completed games with scores: {len(completed_games)}")

if len(completed_games) > 0:
    print(f"Date range: {completed_games['date'].min()} to {completed_games['date'].max()}")

    # Save to CSV
    completed_games['date'] = pd.to_datetime(completed_games['date'], format='%Y%m%d')
    completed_games = completed_games.sort_values('date').reset_index(drop=True)
    completed_games.to_csv('nba_games.csv', index=False)
    print(f"\n✓ Saved {len(completed_games)} games to nba_games.csv")
else:
    print("\nNo completed games with scores were found; nothing was saved.")


In [None]:
# Example: Fetch data for a specific date range
# Uncomment and modify dates as needed. The template builds one 'YYYYMMDD'
# string per day in the range and passes the list to get_scoreboard.
# Note that it reassigns start_date, end_date, date_list, current_date and
# historical_games_df.

# from datetime import datetime, timedelta
# 
# start_date = datetime(2024, 1, 1)  # Start of date range
# end_date = datetime(2024, 1, 31)   # End of date range
# 
# date_list = []
# current_date = start_date
# while current_date <= end_date:
#     date_list.append(current_date.strftime('%Y%m%d'))
#     current_date += timedelta(days=1)
# 
# print(f"Fetching data for {len(date_list)} dates...")
# historical_games_df = get_scoreboard(dates=date_list)
# print(f"Fetched {len(historical_games_df)} games")

# Reminder that nothing was fetched by this cell
print("Historical data fetching code template ready (commented out).")
