# CFP Committee Simulator - Data Pipeline

This notebook sets up the CFBD API connection and builds our data infrastructure.


In [52]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import cfbd
from cfbd.rest import ApiException
import requests
import os
from datetime import datetime
import pickle
import json
from dotenv import load_dotenv

# Load environment variables.
# Cell 2 reads CFBD_API_KEY through os.getenv() and raises if it is missing,
# so this call has to run before the key lookup.
load_dotenv()

# Display settings: never elide DataFrame columns, and allow wide console output
# so printed frames stay on one line.
pd.options.display.max_columns = None
pd.options.display.width = 1000


In [53]:
# Cell 2: Configure CFBD API
# Get your API key from: https://collegefootballdata.com/key
# Store it in .env file as CFBD_API_KEY=your_key_here
#
# This cell defines the notebook-level names `api_key`, `configuration`,
# `api_client`, `games_api`, `teams_api`, `stats_api` and `ratings_api`.
# It raises ValueError when the key is missing or shorter than 10 characters.

# Load environment variables (also done in Cell 1)
load_dotenv()

# Get API key and verify it's loaded
api_key = os.getenv('CFBD_API_KEY')
if not api_key:
    raise ValueError("‚ùå CFBD_API_KEY not found! Please set it in .env file or environment variables.")

# Remove any accidental spaces or quotes.
# Order matters: surrounding whitespace first, then double quotes, then single quotes.
api_key = api_key.strip().strip('"').strip("'")

# Cheap sanity check on the cleaned key; only its length is ever printed, never the key itself.
if len(api_key) < 10:
    raise ValueError(f"‚ùå API key appears invalid (too short: {len(api_key)} chars)")

print(f"‚úÖ API Key loaded (length: {len(api_key)} chars)")

# Configure API with correct format:
# the key is stored under 'Authorization' with a 'Bearer' prefix.
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = api_key
configuration.api_key_prefix['Authorization'] = 'Bearer'

# Create API instances that all share one client.
# NOTE(review): the cells visible here call the REST endpoints through `requests`
# and only reuse `api_key`; confirm whether these client objects are used elsewhere.
api_client = cfbd.ApiClient(configuration)
games_api = cfbd.GamesApi(api_client)
teams_api = cfbd.TeamsApi(api_client)
stats_api = cfbd.StatsApi(api_client)
ratings_api = cfbd.RatingsApi(api_client)

print("‚úÖ API Configuration Complete")

‚úÖ API Key loaded (length: 64 chars)
‚úÖ API Configuration Complete


In [54]:
# Cell 3: Fetch Games Data (Final Fixed Version)
import requests
import pandas as pd
from pathlib import Path
import os

# Configuration
current_year = 2025
FETCH_START_WEEK = 1
FETCH_END_WEEK = 15
# Upper bound (seconds) for each HTTP call so a stalled connection cannot hang the kernel.
REQUEST_TIMEOUT_SECONDS = 30

# Setup paths (use absolute path from notebook location).
# Compare the directory *name* rather than searching the whole path string, so a
# project that merely lives under e.g. ".../notebooks-old/project" is not mistaken
# for the notebooks folder.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent if notebook_dir.name == 'notebooks' else notebook_dir

cache_dir = project_root / 'data' / 'cache' / str(current_year)
# File name follows the last requested week (still 'games_w15.csv' for the defaults above).
csv_path = cache_dir / f'games_w{FETCH_END_WEEK}.csv'
cache_dir.mkdir(parents=True, exist_ok=True)

print(f"üìÅ Will save to: {csv_path}")

# Headers
headers = {
    "Authorization": f"Bearer {api_key}",
    "accept": "application/json"
}


def _first_present(record, *keys):
    """Return the first value in `record` stored under `keys` that is not None.

    The API payload may use snake_case or camelCase keys. Falsy-but-valid values
    (a score of 0, neutral_site == False) are returned as-is instead of being
    skipped, which is what a plain `a or b` chain would do.
    Returns None when every key is missing or null.
    """
    for key in keys:
        value = record.get(key)
        if value is not None:
            return value
    return None


print(f"‚¨áÔ∏è  Fetching fresh data from CFBD API (Weeks {FETCH_START_WEEK}-{FETCH_END_WEEK})...")

try:
    # 1. Fetch FBS Team List
    print("   Fetching FBS team list...")
    teams_url = "https://api.collegefootballdata.com/teams/fbs"
    teams_response = requests.get(
        teams_url,
        headers=headers,
        params={"year": current_year},
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Surface auth/rate-limit failures as an HTTP error instead of an obscure
    # TypeError while iterating an error payload.
    teams_response.raise_for_status()
    fbs_teams_data = teams_response.json()
    fbs_team_names = {t['school'] for t in fbs_teams_data}
    print(f"   ‚úì Found {len(fbs_team_names)} FBS teams")

    # 2. Fetch Games
    print("   Fetching games...")
    processed_games = []
    base_games_url = "https://api.collegefootballdata.com/games"

    for week in range(FETCH_START_WEEK, FETCH_END_WEEK + 1):
        # NOTE(review): confirm "division" is the filter name the current API version
        # expects; the FBS-vs-FBS check below does not rely on it either way.
        params = {
            "year": current_year,
            "week": week,
            "seasonType": "regular",
            "division": "fbs"
        }

        response = requests.get(
            base_games_url,
            headers=headers,
            params=params,
            timeout=REQUEST_TIMEOUT_SECONDS,
        )

        if response.status_code != 200:
            # A failed week is reported and skipped; the remaining weeks are still fetched.
            print(f"     ‚ö†Ô∏è Week {week} failed: {response.status_code}")
            continue

        for game in response.json():
            # Handle keys (snake_case vs camelCase)
            h_team = _first_present(game, 'home_team', 'homeTeam')
            a_team = _first_present(game, 'away_team', 'awayTeam')
            h_pts = _first_present(game, 'home_points', 'homePoints')
            a_pts = _first_present(game, 'away_points', 'awayPoints')

            # Robust lookups for Date and Neutral Site
            date_val = _first_present(game, 'start_date', 'startDate')
            neutral_val = _first_present(game, 'neutral_site', 'neutralSite')

            # Check 1: Are both teams FBS?
            is_fbs_matchup = (h_team in fbs_team_names and a_team in fbs_team_names)

            # Check 2: Does the game have a score? (0 is a valid score; only None means unplayed)
            has_score = (h_pts is not None and a_pts is not None)

            if is_fbs_matchup and has_score:
                processed_games.append({
                    'game_id': game.get('id'),
                    'week': game.get('week'),
                    'date': date_val,
                    'neutral_site': neutral_val,
                    'home_team': h_team,
                    'home_conference': _first_present(game, 'home_conference', 'homeConference'),
                    'home_score': h_pts,
                    'away_team': a_team,
                    'away_conference': _first_present(game, 'away_conference', 'awayConference'),
                    'away_score': a_pts
                })

    # 3. Create and Save DataFrame
    if processed_games:
        games_df = pd.DataFrame(processed_games)

        # Verify date/neutral columns aren't empty
        null_dates = games_df['date'].isnull().sum()
        if null_dates > 0:
            print(f"‚ö†Ô∏è Warning: {null_dates} games have missing dates.")

        # Save to CSV
        games_df.to_csv(csv_path, index=False)
        print(f"\n‚úÖ Fetched and cached {len(games_df)} valid FBS vs FBS games")
        print(f"   Saved to: {csv_path}")
        print(f"   Weeks covered: {games_df['week'].min()}-{games_df['week'].max()}")

        # Show sample
        print(f"\nüìã Sample data:")
        print(games_df[['week', 'date', 'home_team', 'home_score', 'away_team', 'neutral_site']].head(3))

        # Verify a few team records
        print(f"\nüèà Sample team game counts:")
        for team_name in ['Notre Dame', 'Ohio State', 'Indiana']:
            team_games = games_df[(games_df['home_team'] == team_name) | (games_df['away_team'] == team_name)]
            print(f"   {team_name}: {len(team_games)} games")

    else:
        print("\n‚ö†Ô∏è  No games found.")

except Exception as e:
    # Best-effort cell: report the failure with a traceback instead of aborting the notebook run.
    print(f"‚ùå Error: {e}")
    import traceback
    traceback.print_exc()

üìÅ Will save to: /app/data/cache/2025/games_w15.csv
‚¨áÔ∏è  Fetching fresh data from CFBD API (Weeks 1-15)...
   Fetching FBS team list...
   ‚úì Found 136 FBS teams
   Fetching games...

‚úÖ Fetched and cached 752 valid FBS vs FBS games
   Saved to: /app/data/cache/2025/games_w15.csv
   Weeks covered: 1-14

üìã Sample data:
   week                      date         home_team  home_score     away_team  neutral_site
0     1  2025-08-23T16:00:00.000Z      Kansas State          21    Iowa State          True
1     1  2025-08-23T22:30:00.000Z            Kansas          31  Fresno State         False
2     1  2025-08-23T23:00:00.000Z  Western Kentucky          41   Sam Houston         False

üèà Sample team game counts:
   Notre Dame: 12 games
   Ohio State: 11 games
   Indiana: 11 games

‚úÖ Fetched and cached 752 valid FBS vs FBS games
   Saved to: /app/data/cache/2025/games_w15.csv
   Weeks covered: 1-14

üìã Sample data:
   week                      date         home_team  home_sco

In [55]:
# Cell 4: Fetch Advanced Stats (Fixed Keys)
def fetch_advanced_stats(year, week=None):
    """
    Pull EPA, success rate, and other advanced metrics from CFBD API.

    Args:
        year: Season passed to the endpoint as the `year` query parameter.
        week: Optional week; when truthy it is sent as the `week` query parameter.
            NOTE(review): this is the season-level endpoint; confirm it honours a
            `week` filter rather than a start/end week pair.

    Returns:
        A DataFrame with one row per team and the columns `team`,
        `offensive_epa_per_play`, `defensive_epa_per_play`,
        `offensive_success_rate`, `defensive_success_rate`,
        `offensive_explosiveness`, `defensive_explosiveness`.
        An empty DataFrame is returned (and a warning printed) when no API key is
        available, the API answers with a non-200 status, or the request fails.
    """
    base_url = "https://api.collegefootballdata.com/stats/season/advanced"

    # Prefer the key already validated at notebook level; otherwise read it from the
    # environment and apply the same whitespace/quote cleanup.
    if 'api_key' in globals():
        api_key = globals()['api_key']
    else:
        load_dotenv()
        api_key = os.getenv('CFBD_API_KEY')
        api_key = api_key.strip().strip('"').strip("'") if api_key else None

    if not api_key:
        print("‚ö†Ô∏è  API key not found, skipping advanced stats")
        return pd.DataFrame()

    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }

    params = {
        "year": year,
        "excludeGarbageTime": "true"
    }
    if week:
        params["week"] = week

    try:
        # Bounded wait so a stalled connection cannot hang the kernel.
        response = requests.get(base_url, headers=headers, params=params, timeout=30)
        if response.status_code != 200:
            print(f"‚ö†Ô∏è  Advanced stats API returned status {response.status_code}")
            return pd.DataFrame()

        stats_data = []
        for stat in response.json():
            # `or {}` also covers a key that is present but null; with a plain
            # .get(key, {}) default that one team would raise and discard every row.
            offense = stat.get('offense') or {}
            defense = stat.get('defense') or {}

            # Use 'ppa' instead of 'totalEPA':
            # PPA (Predicted Points Added) is the CFBD equivalent of EPA
            stats_data.append({
                'team': stat.get('team'),
                'offensive_epa_per_play': offense.get('ppa'),
                'defensive_epa_per_play': defense.get('ppa'),
                'offensive_success_rate': offense.get('successRate'),
                'defensive_success_rate': defense.get('successRate'),
                'offensive_explosiveness': offense.get('explosiveness'),
                'defensive_explosiveness': defense.get('explosiveness')
            })
        return pd.DataFrame(stats_data)
    except Exception as e:
        # Deliberately best-effort: advanced stats are optional, so report and carry on.
        print(f"‚ö†Ô∏è  Advanced stats not available: {e}")
        return pd.DataFrame()

# Season-level advanced stats for the configured year (no week filter);
# the frame is empty when the fetch failed, so the count below doubles as a sanity check.
adv_stats_df = fetch_advanced_stats(current_year)
print(f"üìà Loaded advanced stats for {adv_stats_df.shape[0]} teams")
adv_stats_df.head()

üìà Loaded advanced stats for 136 teams


Unnamed: 0,team,offensive_epa_per_play,defensive_epa_per_play,offensive_success_rate,defensive_success_rate,offensive_explosiveness,defensive_explosiveness
0,Air Force,0.285923,0.409162,0.472637,0.497804,1.218247,1.367531
1,Akron,0.072618,0.110227,0.38358,0.391931,1.253991,1.288075
2,Alabama,0.240271,0.054294,0.452037,0.372274,1.266788,1.25356
3,App State,0.127412,0.177679,0.414573,0.447134,1.206297,1.204328
4,Arizona,0.197982,0.090551,0.425272,0.382102,1.301498,1.203614


In [56]:
# Cell 5: Cache Management
def save_data_cache(games_df, stats_df, year, week):
    """Write the games and stats frames to ./data/cache/<year>/ so later runs can skip the API.

    Files are named games_w<week> and stats_w<week>. Parquet is attempted first;
    if pandas raises ImportError (no parquet engine installed) the same frames are
    written as CSV instead. The stats file is only written when `stats_df` is not empty.
    The cache directory is relative to the current working directory.
    """
    target_dir = f'./data/cache/{year}'
    os.makedirs(target_dir, exist_ok=True)

    games_stem = f'{target_dir}/games_w{week}'
    stats_stem = f'{target_dir}/stats_w{week}'

    try:
        # Preferred format: typed and compact.
        games_df.to_parquet(f'{games_stem}.parquet')
        if not stats_df.empty:
            stats_df.to_parquet(f'{stats_stem}.parquet')
    except ImportError:
        # ModuleNotFoundError is a subclass of ImportError, so this single clause
        # covers both cases; fall back to CSV when no parquet engine is available.
        games_df.to_csv(f'{games_stem}.csv', index=False)
        if not stats_df.empty:
            stats_df.to_csv(f'{stats_stem}.csv', index=False)
        print(f"üíæ Cached data for Year {year}, Week {week} (CSV format - install pyarrow for parquet)")
    else:
        print(f"üíæ Cached data for Year {year}, Week {week} (parquet format)")

def load_data_cache(year, week):
    """Load cached data if available.

    Looks in ./data/cache/<year>/ (relative to the current working directory) for
    games_w<week> and stats_w<week>. Parquet is tried first, then CSV.

    Returns:
        (games_df, stats_df). `stats_df` is an empty DataFrame when no stats file
        exists. Returns (None, None) when no games file could be read.

    Only ordinary errors (missing file, missing parquet engine, unreadable file)
    trigger a fallback; KeyboardInterrupt and SystemExit propagate to the caller.
    """
    cache_dir = f'./data/cache/{year}'
    games_parquet = f'{cache_dir}/games_w{week}.parquet'
    stats_parquet = f'{cache_dir}/stats_w{week}.parquet'
    games_csv = f'{cache_dir}/games_w{week}.csv'
    stats_csv = f'{cache_dir}/stats_w{week}.csv'

    def _read_stats_csv():
        # Stats are optional: a missing CSV yields an empty frame rather than an error.
        return pd.read_csv(stats_csv) if os.path.exists(stats_csv) else pd.DataFrame()

    try:
        # Try parquet first
        games_df = pd.read_parquet(games_parquet)
        try:
            stats_df = pd.read_parquet(stats_parquet)
        except Exception:
            # Games were cached as parquet but stats were not; look for a CSV instead.
            stats_df = _read_stats_csv()
        print(f"üìÇ Loaded cached data for Year {year}, Week {week}")
        return games_df, stats_df
    except Exception:
        # Parquet cache missing or unreadable: fall through to the CSV files below.
        pass

    try:
        games_df = pd.read_csv(games_csv)
        stats_df = _read_stats_csv()
        print(f"üìÇ Loaded cached data for Year {year}, Week {week} (CSV format)")
        return games_df, stats_df
    except Exception:
        return None, None

# Persist the fetched frames, but only when the games fetch actually produced rows;
# otherwise tell the reader which earlier cell to revisit.
if 'games_df' not in globals() or games_df.empty:
    print("‚ö†Ô∏è  No games data to cache - check filtering logic or run Cell 3 first")
else:
    save_data_cache(games_df, adv_stats_df, current_year, 14)  # Week 14 for 2025 season

üíæ Cached data for Year 2025, Week 14 (parquet format)
