# CFP Committee Simulator - Data Pipeline

This notebook sets up CFBD API connection and builds our data infrastructure


In [1]:
# Cell 1: Setup and Imports
import pandas as pd
import numpy as np
import cfbd
from cfbd.rest import ApiException
import requests
import os
from datetime import datetime
import pickle
import json
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)


In [2]:
# Cell 2: Configure CFBD API
# Get your API key from: https://collegefootballdata.com/key
# Store it in .env file as CFBD_API_KEY=your_key_here

# Load environment variables
load_dotenv()

# Get API key and verify it's loaded
api_key = os.getenv('CFBD_API_KEY')
if not api_key:
    raise ValueError("‚ùå CFBD_API_KEY not found! Please set it in .env file or environment variables.")

# Remove any accidental spaces or quotes
api_key = api_key.strip().strip('"').strip("'")

if len(api_key) < 10:
    raise ValueError(f"‚ùå API key appears invalid (too short: {len(api_key)} chars)")

print(f"‚úÖ API Key loaded (length: {len(api_key)} chars)")

# Configure API with correct format
configuration = cfbd.Configuration()
configuration.api_key['Authorization'] = api_key
configuration.api_key_prefix['Authorization'] = 'Bearer'

# Create API instances
api_client = cfbd.ApiClient(configuration)
games_api = cfbd.GamesApi(api_client)
teams_api = cfbd.TeamsApi(api_client)
stats_api = cfbd.StatsApi(api_client)
ratings_api = cfbd.RatingsApi(api_client)

print("‚úÖ API Configuration Complete")

‚úÖ API Key loaded (length: 64 chars)
‚úÖ API Configuration Complete


In [3]:
# Cell 3: Data Fetching Functions
def get_fbs_teams_list():
    """
    Fetch list of FBS teams for filtering using requests library.
    """
    # Get API key
    if 'api_key' not in globals():
        load_dotenv()
        api_key = os.getenv('CFBD_API_KEY')
        api_key = api_key.strip().strip('"').strip("'") if api_key else None
    else:
        api_key = globals()['api_key']
    
    if not api_key:
        print("‚ùå API key not found!")
        return set()
    
    # Use requests library directly for more control
    url = "https://api.collegefootballdata.com/teams/fbs"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }
    params = {"year": 2025}
    
    try:
        response = requests.get(url, headers=headers, params=params)
        if response.status_code == 200:
            teams_data = response.json()
            fbs_team_names = set([team['school'] for team in teams_data])
            print(f"‚úÖ Loaded {len(fbs_team_names)} FBS teams")
            return fbs_team_names
        else:
            print(f"‚ùå Error fetching FBS teams: Status {response.status_code}")
            print(f"   Response: {response.text[:200]}")
            return set()
    except Exception as e:
        print(f"‚ùå Exception fetching FBS teams: {e}")
        return set()

def fetch_season_games(year, start_week=1, fbs_teams=None):
    """
    Fetch all games for a season starting from specified week.
    Filters to include only games between FBS teams.
    """
    base_url = "https://api.collegefootballdata.com/games"
    
    # Get API key (should already be loaded from Cell 2)
    if 'api_key' not in globals():
        load_dotenv()
        api_key = os.getenv('CFBD_API_KEY')
        api_key = api_key.strip().strip('"').strip("'") if api_key else None
    else:
        api_key = globals()['api_key']
    
    if not api_key:
        raise ValueError("‚ùå CFBD_API_KEY not found!")
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }
    
    # Get FBS teams list if not provided
    if fbs_teams is None:
        fbs_teams = get_fbs_teams_list()
    
    if not fbs_teams:
        print("‚ö†Ô∏è  Warning: No FBS teams loaded. All games will be filtered out.")
        print("   Check your API key in the .env file.")
        return pd.DataFrame()
    
    all_games = []
    
    for week in range(start_week, 16):  # Regular season + conf championships
        params = {
            "year": year,
            "week": week,
            "seasonType": "regular",
            "division": "fbs"  # Request FBS division games
        }
        
        try:
            response = requests.get(base_url, headers=headers, params=params)
            if response.status_code == 200:
                games = response.json()
                all_games.extend(games)
                print(f"  Week {week}: {len(games)} games fetched")
            else:
                print(f"  Week {week}: API Error - Status {response.status_code}")
                if response.status_code == 401:
                    print(f"    ‚ö†Ô∏è  Authorization failed. Check your API key.")
                    break
        except Exception as e:
            print(f"  Week {week}: Error - {type(e).__name__}: {str(e)[:100]}")
            continue
    
    # Convert to DataFrame, filtering FBS-only games
    games_data = []
    for game in all_games:
        # Handle both snake_case and camelCase field names from API
        home_team = game.get('home_team') or game.get('homeTeam')
        away_team = game.get('away_team') or game.get('awayTeam')
        home_score = game.get('home_points') or game.get('homePoints')
        away_score = game.get('away_points') or game.get('awayPoints')
        
        # Filter: both teams must be FBS teams and scores must exist
        both_fbs = (home_team in fbs_teams) and (away_team in fbs_teams)
        has_scores = home_score is not None and away_score is not None
        
        if both_fbs and has_scores:
            try:
                home_conf = game.get('home_conference') or game.get('homeConference')
                away_conf = game.get('away_conference') or game.get('awayConference')
                
                games_data.append({
                    'game_id': game.get('id') or game.get('gameId'),
                    'week': game.get('week'),
                    'home_team': home_team,
                    'away_team': away_team,
                    'home_score': int(home_score),
                    'away_score': int(away_score),
                    'home_conference': home_conf,
                    'away_conference': away_conf,
                    'neutral_site': game.get('neutral_site') or game.get('neutralSite', False),
                    'date': game.get('start_date') or game.get('startDate')
                })
            except (ValueError, KeyError) as e:
                # Skip games with invalid data
                continue
    
    df = pd.DataFrame(games_data)
    print(f"\nüìä Fetched {len(df)} FBS vs FBS games for {year} season (from {len(all_games)} total games)")
    
    if len(df) == 0 and len(all_games) > 0:
        print("‚ö†Ô∏è  Warning: No FBS games found after filtering.")
        print("   This might indicate:")
        print("   1. FBS teams list failed to load (check API key)")
        print("   2. Team name mismatch between FBS list and game data")
    
    return df

# Fetch current season data
current_year = 2025  # 2025-2026 season
games_df = fetch_season_games(current_year, start_week=5)  # Start week 5 per report
games_df.head()

‚úÖ Loaded 136 FBS teams
  Week 5: 267 games fetched
  Week 6: 292 games fetched
  Week 7: 287 games fetched
  Week 8: 295 games fetched
  Week 9: 307 games fetched
  Week 10: 304 games fetched
  Week 11: 309 games fetched
  Week 12: 304 games fetched
  Week 13: 138 games fetched
  Week 14: 94 games fetched
  Week 15: 32 games fetched

üìä Fetched 557 FBS vs FBS games for 2025 season (from 2629 total games)


Unnamed: 0,game_id,week,home_team,away_team,home_score,away_score,home_conference,away_conference,neutral_site,date
0,401762471,5,East Carolina,Army,28,6,American Athletic,American Athletic,False,2025-09-25T23:30:00.000Z
1,401754543,5,Virginia,Florida State,46,38,ACC,ACC,False,2025-09-26T23:00:00.000Z
2,401756902,5,Arizona State,TCU,27,24,Big 12,Big 12,False,2025-09-27T01:00:00.000Z
3,401752939,5,Oregon State,Houston,24,27,Pac-12,Big 12,False,2025-09-27T02:30:00.000Z
4,401756906,5,Kansas State,UCF,34,20,Big 12,Big 12,False,2025-09-27T16:00:00.000Z


In [4]:
# Cell 4: Fetch Advanced Stats
def fetch_advanced_stats(year, week=None):
    """
    Pull EPA, success rate, and other advanced metrics from CFBD API.
    """
    base_url = "https://api.collegefootballdata.com/stats/season/advanced"
    
    # Get API key
    if 'api_key' not in globals():
        load_dotenv()
        api_key = os.getenv('CFBD_API_KEY')
        api_key = api_key.strip().strip('"').strip("'") if api_key else None
    else:
        api_key = globals()['api_key']
    
    if not api_key:
        print("‚ö†Ô∏è  API key not found, skipping advanced stats")
        return pd.DataFrame()
    
    headers = {
        "Authorization": f"Bearer {api_key}",
        "accept": "application/json"
    }
    
    params = {
        "year": year,
        "excludeGarbageTime": "true"
    }
    if week:
        params["week"] = week
    
    try:
        response = requests.get(base_url, headers=headers, params=params)
        if response.status_code == 200:
            stats_list = response.json()
            stats_data = []
            for stat in stats_list:
                offense = stat.get('offense', {})
                defense = stat.get('defense', {})
                stats_data.append({
                    'team': stat.get('team'),
                    'offensive_epa_per_play': offense.get('totalEPA') if offense else None,
                    'defensive_epa_per_play': defense.get('totalEPA') if defense else None,
                    'offensive_success_rate': offense.get('successRate') if offense else None,
                    'defensive_success_rate': defense.get('successRate') if defense else None,
                    'offensive_explosiveness': offense.get('explosiveness') if offense else None,
                    'defensive_explosiveness': defense.get('explosiveness') if defense else None
                })
            return pd.DataFrame(stats_data)
        else:
            print(f"‚ö†Ô∏è  Advanced stats API returned status {response.status_code}")
            return pd.DataFrame()
    except Exception as e:
        print(f"‚ö†Ô∏è  Advanced stats not available: {e}")
        return pd.DataFrame()

adv_stats_df = fetch_advanced_stats(current_year)
print(f"üìà Loaded advanced stats for {len(adv_stats_df)} teams")
adv_stats_df.head()


üìà Loaded advanced stats for 136 teams


Unnamed: 0,team,offensive_epa_per_play,defensive_epa_per_play,offensive_success_rate,defensive_success_rate,offensive_explosiveness,defensive_explosiveness
0,Air Force,,,0.472637,0.497804,1.218247,1.367531
1,Akron,,,0.38358,0.391931,1.253991,1.288075
2,Alabama,,,0.452037,0.372274,1.266788,1.25356
3,App State,,,0.414573,0.447134,1.206297,1.204328
4,Arizona,,,0.425272,0.382102,1.301498,1.203614


In [5]:
# Cell 5: Cache Management
def save_data_cache(games_df, stats_df, year, week):
    """Save processed data to avoid repeated API calls"""
    cache_dir = f'./data/cache/{year}'
    os.makedirs(cache_dir, exist_ok=True)
    
    try:
        # Try parquet first (more efficient)
        games_df.to_parquet(f'{cache_dir}/games_w{week}.parquet')
        if not stats_df.empty:
            stats_df.to_parquet(f'{cache_dir}/stats_w{week}.parquet')
        print(f"üíæ Cached data for Year {year}, Week {week} (parquet format)")
    except (ImportError, ModuleNotFoundError):
        # Fallback to CSV if pyarrow not available
        games_df.to_csv(f'{cache_dir}/games_w{week}.csv', index=False)
        if not stats_df.empty:
            stats_df.to_csv(f'{cache_dir}/stats_w{week}.csv', index=False)
        print(f"üíæ Cached data for Year {year}, Week {week} (CSV format - install pyarrow for parquet)")

def load_data_cache(year, week):
    """Load cached data if available"""
    cache_dir = f'./data/cache/{year}'
    try:
        # Try parquet first
        games_df = pd.read_parquet(f'{cache_dir}/games_w{week}.parquet')
        try:
            stats_df = pd.read_parquet(f'{cache_dir}/stats_w{week}.parquet')
        except:
            stats_df = pd.read_csv(f'{cache_dir}/stats_w{week}.csv') if os.path.exists(f'{cache_dir}/stats_w{week}.csv') else pd.DataFrame()
        print(f"üìÇ Loaded cached data for Year {year}, Week {week}")
        return games_df, stats_df
    except:
        # Try CSV fallback
        try:
            games_df = pd.read_csv(f'{cache_dir}/games_w{week}.csv')
            stats_df = pd.read_csv(f'{cache_dir}/stats_w{week}.csv') if os.path.exists(f'{cache_dir}/stats_w{week}.csv') else pd.DataFrame()
            print(f"üìÇ Loaded cached data for Year {year}, Week {week} (CSV format)")
            return games_df, stats_df
        except Exception:
            return None, None

# Save current data (only if we have games)
if 'games_df' in globals() and not games_df.empty:
    save_data_cache(games_df, adv_stats_df, current_year, 15)  # Week 15 for 2025 season
else:
    print("‚ö†Ô∏è  No games data to cache - check filtering logic or run Cell 3 first")

üíæ Cached data for Year 2025, Week 15 (parquet format)
