In [1]:
# STEP 1: Install packages
%pip install nba_api pandas numpy scikit-learn matplotlib seaborn

Collecting nba_api
  Downloading nba_api-1.11.3-py3-none-any.whl.metadata (5.8 kB)
Downloading nba_api-1.11.3-py3-none-any.whl (318 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m319.0/319.0 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nba_api
Successfully installed nba_api-1.11.3


In [2]:
# NBA Props Predictor - Data Collection
# Run this in Google Colab or Jupyter Notebook

# STEP 2: Imports
import pandas as pd
import numpy as np
from datetime import datetime
import time
from nba_api.stats.endpoints import playergamelog, leaguedashteamstats
from nba_api.stats.static import players, teams
import warnings
warnings.filterwarnings('ignore')

print("✓ All imports successful!")
print(f"Current date: {datetime.now().strftime('%Y-%m-%d')}")

# STEP 3: Configuration
# Using NBA API for historical data (reliable and complete)
# Will use ESPN scraping for current 2025-26 season (NBA API not updating)
SEASONS = ['2022-23', '2023-24', '2024-25']  # Historical seasons via NBA API
OUTPUT_FILE = 'nba_player_game_logs.csv'
TEAM_STATS_FILE = 'nba_team_stats.csv'
CURRENT_SEASON = '2025-26'  # Will collect separately via ESPN

TARGET_PLAYERS = [
    # Superstars - USE EXACT NAMES WITH UNICODE CHARACTERS
    'LeBron James', 'Stephen Curry', 'Kevin Durant', 'Giannis Antetokounmpo',
    'Luka Dončić', 'Nikola Jokić', 'Joel Embiid', 'Jayson Tatum',
    'Damian Lillard', 'Anthony Davis', 'Devin Booker', 'Donovan Mitchell',
    'Jaylen Brown', 'Trae Young', 'Anthony Edwards', 'Shai Gilgeous-Alexander',
    'Jimmy Butler III', 'Paul George', 'Tyrese Haliburton', 'De\'Aaron Fox',  # Fixed: added III
    'Domantas Sabonis', 'Bam Adebayo', 'Julius Randle', 'DeMar DeRozan',
    'Pascal Siakam', 'LaMelo Ball', 'James Harden', 'Karl-Anthony Towns',
    'Nikola Vučević', 'Jalen Brunson', 'Fred VanVleet', 'Tyler Herro',
    'Victor Wembanyama', 'Paolo Banchero', 'Franz Wagner', 'Scottie Barnes',
    'Cade Cunningham', 'Alperen Sengun', 'Jaren Jackson Jr.', 'Mikal Bridges',
    'Darius Garland', 'Lauri Markkanen', 'Desmond Bane', 'Jalen Williams',
    'OG Anunoby', 'Jarrett Allen', 'Kristaps Porziņģis', 'CJ McCollum'
]

print(f"Configuration:")
print(f"  Seasons: {SEASONS}")
print(f"  Target players: {len(TARGET_PLAYERS)}")
print(f"  Output file: {OUTPUT_FILE}")

# STEP 4: Get active players
all_active = players.get_active_players()
active_df = pd.DataFrame(all_active)

target_df = active_df[active_df['full_name'].isin(TARGET_PLAYERS)].copy()

found_players = set(target_df['full_name'].tolist())
missing_players = set(TARGET_PLAYERS) - found_players

print(f"\n✓ Found {len(target_df)} out of {len(TARGET_PLAYERS)} target players")
if missing_players:
    print(f"\n⚠ Missing players:")
    for player in missing_players:
        print(f"  - {player}")

print(f"\nPlayers to collect:")
print(target_df[['full_name', 'id']])

# STEP 5: Collect player game logs
def get_player_season_logs(player_id, player_name, season):
    time.sleep(0.6)
    
    try:
        gamelog = playergamelog.PlayerGameLog(
            player_id=player_id,
            season=season,
            season_type_all_star='Regular Season'
        )
        df = gamelog.get_data_frames()[0]
        
        if len(df) > 0:
            df['PLAYER_ID'] = player_id
            df['PLAYER_NAME'] = player_name
            df['SEASON'] = season
            return df
        return None
        
    except Exception as e:
        print(f"  ✗ Error for {player_name} ({season}): {e}")
        return None

all_game_logs = []
total_requests = len(target_df) * len(SEASONS)
current_request = 0

print(f"\nStarting collection of {total_requests} requests...\n")

for idx, row in target_df.iterrows():
    player_id = row['id']
    player_name = row['full_name']
    
    for season in SEASONS:
        current_request += 1
        print(f"[{current_request}/{total_requests}] {player_name} - {season}", end='')
        
        df = get_player_season_logs(player_id, player_name, season)
        
        if df is not None:
            all_game_logs.append(df)
            print(f" ✓ ({len(df)} games)")
        else:
            print(" ✗ (no data)")

print(f"\n{'='*60}")
print(f"Collection complete! Total dataframes: {len(all_game_logs)}")

# STEP 6: Combine data
if all_game_logs:
    player_data = pd.concat(all_game_logs, ignore_index=True)
    
    print(f"\nCombined dataset shape: {player_data.shape}")
    print(f"Date range: {player_data['GAME_DATE'].min()} to {player_data['GAME_DATE'].max()}")
    print(f"Unique players: {player_data['PLAYER_NAME'].nunique()}")
    print(f"Total games: {len(player_data)}")
    
    print("\nColumn names:")
    print(player_data.columns.tolist())
    
    print("\nSample data:")
    print(player_data[['GAME_DATE', 'PLAYER_NAME', 'MATCHUP', 'MIN', 'PTS', 'REB', 'AST']].head())
else:
    raise Exception("ERROR: No data collected!")

# STEP 7: Parse opponent
def parse_opponent(matchup):
    if 'vs.' in matchup:
        return matchup.split('vs.')[-1].strip()
    elif '@' in matchup:
        return matchup.split('@')[-1].strip()
    return None

player_data['OPPONENT'] = player_data['MATCHUP'].apply(parse_opponent)
player_data['IS_HOME'] = player_data['MATCHUP'].str.contains('vs.').astype(int)

print("\nOpponent extraction:")
print(player_data[['MATCHUP', 'OPPONENT', 'IS_HOME']].head())

# STEP 8: Get team defensive stats
print("\n\nCollecting team statistics...")

def get_team_defensive_stats(season):
    time.sleep(0.6)
    try:
        team_stats = leaguedashteamstats.LeagueDashTeamStats(
            season=season,
            season_type_all_star='Regular Season',
            measure_type_detailed_defense='Advanced'
        )
        df = team_stats.get_data_frames()[0]
        df['SEASON'] = season
        return df
    except Exception as e:
        print(f"Error fetching team stats for {season}: {e}")
        return None

all_team_stats = []
for season in SEASONS:
    print(f"Fetching team stats for {season}...")
    team_df = get_team_defensive_stats(season)
    if team_df is not None:
        all_team_stats.append(team_df)

if all_team_stats:
    team_stats = pd.concat(all_team_stats, ignore_index=True)
    print(f"\n✓ Team stats collected: {len(team_stats)} team-seasons")
    print("\nTeam stats columns available:")
    print(team_stats.columns.tolist())
    
    # Show sample with actual column names
    display_cols = []
    if 'TEAM_NAME' in team_stats.columns:
        display_cols.append('TEAM_NAME')
    if 'TEAM_ABBREVIATION' in team_stats.columns:
        display_cols.append('TEAM_ABBREVIATION')
    elif 'TEAM_ID' in team_stats.columns:
        display_cols.append('TEAM_ID')
    if 'DEF_RATING' in team_stats.columns:
        display_cols.append('DEF_RATING')
    if 'PACE' in team_stats.columns:
        display_cols.append('PACE')
    if 'SEASON' in team_stats.columns:
        display_cols.append('SEASON')
    
    if display_cols:
        print("\nSample team stats:")
        print(team_stats[display_cols].head())
else:
    print("⚠ Warning: No team stats collected")
    team_stats = None

# STEP 9: Merge opponent defensive ratings
if team_stats is not None:
    # Find the correct team abbreviation column
    team_abbrev_col = None
    if 'TEAM_ABBREVIATION' in team_stats.columns:
        team_abbrev_col = 'TEAM_ABBREVIATION'
    elif 'TEAM_ABBRV' in team_stats.columns:
        team_abbrev_col = 'TEAM_ABBRV'
    elif 'TEAM_ID' in team_stats.columns:
        # Create mapping from team names to abbreviations
        from nba_api.stats.static import teams
        team_list = teams.get_teams()
        team_mapping = {t['full_name']: t['abbreviation'] for t in team_list}
        team_stats['TEAM_ABBREVIATION'] = team_stats['TEAM_NAME'].map(team_mapping)
        team_abbrev_col = 'TEAM_ABBREVIATION'
    
    if team_abbrev_col:
        opp_def_stats = team_stats[[team_abbrev_col, 'SEASON', 'DEF_RATING', 'PACE']].copy()
        opp_def_stats.columns = ['OPPONENT', 'SEASON', 'OPP_DEF_RATING', 'OPP_PACE']
        
        player_data = player_data.merge(
            opp_def_stats,
            on=['OPPONENT', 'SEASON'],
            how='left'
        )
        print(f"\n✓ Merged opponent defensive ratings")
        print(f"Missing opponent data: {player_data['OPP_DEF_RATING'].isna().sum()} games")
    else:
        print("\n⚠ Could not find team abbreviation column, skipping opponent merge")
else:
    print("\n⚠ No team stats available, skipping opponent merge")

# STEP 10: Convert data types
player_data['GAME_DATE'] = pd.to_datetime(player_data['GAME_DATE'])
player_data = player_data.sort_values(['PLAYER_NAME', 'GAME_DATE'])

numeric_cols = ['MIN', 'PTS', 'REB', 'AST', 'STL', 'BLK', 'TOV', 'FGA', 'FGM', 
                'FG3A', 'FG3M', 'FTA', 'FTM', 'PLUS_MINUS']

for col in numeric_cols:
    if col in player_data.columns:
        player_data[col] = pd.to_numeric(player_data[col], errors='coerce')

print("\nData types converted to numeric")

# STEP 11: Save data
player_data.to_csv(OUTPUT_FILE, index=False)
print(f"\n✓ Player game logs saved to: {OUTPUT_FILE}")

if team_stats is not None:
    team_stats.to_csv(TEAM_STATS_FILE, index=False)
    print(f"✓ Team stats saved to: {TEAM_STATS_FILE}")

# STEP 12: Summary statistics
print(f"\n{'='*60}")
print("DATA COLLECTION SUMMARY")
print(f"{'='*60}")
print(f"Total games collected: {len(player_data)}")
print(f"Unique players: {player_data['PLAYER_NAME'].nunique()}")
print(f"Date range: {player_data['GAME_DATE'].min().date()} to {player_data['GAME_DATE'].max().date()}")
print(f"\nGames per player:")
print(player_data.groupby('PLAYER_NAME').size().describe())
print(f"\nAverage stats:")
print(player_data[['PTS', 'REB', 'AST', 'MIN']].describe())
print(f"\n✓ Data collection complete!")
print(f"\nNext step: Run 02_data_exploration.ipynb")

✓ All imports successful!
Current date: 2025-12-04
Configuration:
  Seasons: ['2022-23', '2023-24', '2024-25']
  Target players: 48
  Output file: nba_player_game_logs.csv

✓ Found 48 out of 48 target players

Players to collect:
                   full_name       id
2                Bam Adebayo  1628389
8              Jarrett Allen  1628386
12     Giannis Antetokounmpo   203507
15                OG Anunoby  1628384
20               LaMelo Ball  1630163
22            Paolo Banchero  1631094
23              Desmond Bane  1630217
26            Scottie Barnes  1630567
41              Devin Booker  1626164
47             Mikal Bridges  1628969
52              Jaylen Brown  1627759
54             Jalen Brunson  1628973
57          Jimmy Butler III   202710
97           Cade Cunningham  1630595
98             Stephen Curry   201939
102            Anthony Davis   203076
104            DeMar DeRozan   201942
113              Luka Dončić  1629029
119             Kevin Durant   201142
125       

In [7]:
# UPDATED Configuration - Save to your betting folder
import os

# Set your project path (change this to your actual path)
PROJECT_PATH = r'C:\Users\noell\Documents\betting'  # UPDATE THIS!
# Or use: PROJECT_PATH = r'C:\Users\YourName\betting'  # Windows
# Or use: PROJECT_PATH = '/Users/YourName/betting'     # Mac

# Create data folder if it doesn't exist
DATA_FOLDER = os.path.join(PROJECT_PATH, 'data')
os.makedirs(DATA_FOLDER, exist_ok=True)

# Update output paths
OUTPUT_FILE = os.path.join(DATA_FOLDER, 'nba_player_game_logs.csv')
TEAM_STATS_FILE = os.path.join(DATA_FOLDER, 'nba_team_stats.csv')

print(f"Files will be saved to:")
print(f"  {OUTPUT_FILE}")
print(f"  {TEAM_STATS_FILE}")

Files will be saved to:
  C:\Users\noell\Documents\betting/data/nba_player_game_logs.csv
  C:\Users\noell\Documents\betting/data/nba_team_stats.csv
