In [1]:
!pip install cfbd

Collecting cfbd
  Downloading cfbd-5.6.9-py3-none-any.whl.metadata (736 bytes)
Collecting pydantic<2,>=1.10.5 (from cfbd)
  Downloading pydantic-1.10.21-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (153 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.9/153.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hCollecting aenum (from cfbd)
  Downloading aenum-3.1.15-py3-none-any.whl.metadata (3.7 kB)
Downloading cfbd-5.6.9-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.9/233.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydantic-1.10.21-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m55.7 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hDownloading aenum-3.1.15-py3-none-any.whl (137 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m137.6/137.6 kB[0m [31

In [3]:
import cfbd
import os
import json
from datetime import datetime
from kaggle_secrets import UserSecretsClient
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time  # For adding delays

In [4]:
# Starting page of the 247Sports 2025 football transfer portal; passed to
# scrape_for_players() as the first URL to fetch.
url = "https://247sports.com/season/2025-football/transferportal/"

In [5]:
def _parse_player_container(container):
    """Extract one player's fields from a ``transfer-player`` element.

    Any element (or logo ``alt`` attribute) that is missing is reported as
    the string "NA".
    """
    name_element = container.find("h3")
    rating_element = container.find(class_="rating")
    stars_element = container.find(class_="stars")

    origin_logo = container.find(class_="logo source")
    # Searching for class_="logo" also matches the "logo source" element, so a
    # plain find(class_="logo") hands back the origin logo again whenever it
    # comes first. Take the first logo that is not the origin logo instead.
    destination_logo = next(
        (logo for logo in container.find_all(class_="logo") if logo is not origin_logo),
        None,
    )

    return {
        "Name": name_element.text.strip() if name_element is not None else "NA",
        "Origin": origin_logo.get("alt", "NA") if origin_logo is not None else "NA",
        "Destination": (
            destination_logo.get("alt", "NA") if destination_logo is not None else "NA"
        ),
        "Stars": stars_element.text.strip() if stars_element is not None else "NA",
        "Rating": rating_element.text.strip() if rating_element is not None else "NA",
    }


def scrape_for_players(url, delay=1, timeout=10, max_pages=None):
    """
    Scrapes player data from the 247Sports transfer portal, handling pagination and rate limiting.

    Pages are requested as ``url``, ``url?page=2``, ``url?page=3``, ... for as
    long as the fetched page contains players and a "Load More" button.
    Scraping stops at the first error (the error is printed, not raised) and
    whatever was collected up to that point is returned.

    Args:
        url (str): The starting URL of the transfer portal page.
        delay (int, optional): The delay (in seconds) between requests. Defaults to 1 second.
        timeout (float, optional): Per-request timeout in seconds, so a stalled
            connection cannot hang the notebook. Defaults to 10.
        max_pages (int, optional): Upper bound on the number of pages fetched.
            Defaults to None (no bound).

    Returns:
        pd.DataFrame: A DataFrame containing the scraped player data with the
        columns Name, Origin, Destination, Stars and Rating. Empty if nothing
        could be scraped.
    """
    all_players_data = []
    page_num = 1

    while max_pages is None or page_num <= max_pages:
        current_url = f"{url}?page={page_num}" if page_num > 1 else url
        print(f"Scraping: {current_url}")  # For debugging

        try:
            response = requests.get(current_url, timeout=timeout)
            response.raise_for_status()

            soup = BeautifulSoup(response.content, "html.parser")
            player_containers = soup.find_all(class_="transfer-player")

            for container in player_containers:
                all_players_data.append(_parse_player_container(container))

            # Keep paginating only while the page actually yielded players and
            # still shows a "Load More" button; an empty page means we ran past
            # the end and must not loop forever.
            load_more_button = soup.find(class_="action-button transfer-group-loadMore")
            has_more = bool(player_containers) and load_more_button is not None

        except requests.exceptions.RequestException as e:
            print(f"Error fetching page {page_num}: {e}")
            break  # Stop scraping on error
        except Exception as e:
            # Best effort: report the problem and return what was collected.
            print(f"An error occurred on page {page_num}: {e}")
            break

        if not has_more:
            break

        page_num += 1
        time.sleep(delay)  # Respect rate limits (only needed between requests)

    return pd.DataFrame(all_players_data)

## NOTE (not a TODO): this project has sadly been relegated to the depths of Tartarus, because message-board sites such as 247Sports do not allow web scraping (requests are rejected with 403 Forbidden) :(

In [6]:
# Run the scraper against the portal URL and preview whatever came back.
player_data_df = scrape_for_players(url)

if not player_data_df.empty:
    print(player_data_df.head())
    # DataFrame.info() prints its summary itself and returns None, so wrapping
    # it in print() would emit a stray "None" line after the summary.
    player_data_df.info()
else:
    print("No player data scraped.")

Scraping: https://247sports.com/season/2025-football/transferportal/
Error fetching page 1: 403 Client Error: Forbidden for url: https://247sports.com/season/2025-football/transferportal/
No player data scraped.


In [None]:
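# NOTE: everything in this cell is commented out (disabled); it sketches an
# alternative approach that uses the CFBD API instead of scraping.
# # Get API key from Kaggle Secrets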
# user_secrets = UserSecretsClient()
# api_key = user_secrets.get_secret("CFBD KEY")

# # Configure API client with explicit authorization header
# configuration = cfbd.Configuration()

# # Create API instances with explicit authorization header
# players_api = cfbd.PlayersApi(
#     cfbd.ApiClient(configuration, header_name='Authorization', header_value=f'Bearer {api_key}'))
# metrics_api = cfbd.MetricsApi(
#     cfbd.ApiClient(configuration, header_name='Authorization', header_value=f'Bearer {api_key}'))

# def fetch_transfer_portal_data(year=2024):
#     """Fetches transfer portal data for a given year."""
#     try:
#         api_response = players_api.get_transfer_portal(year=year)
#         return api_response
#     except cfbd.ApiException as e:
#         print(f"Exception when calling PlayersApi->get_transfer_portal: {e}")
#         return None


# def fetch_player_season_ppa_data(year=2024, player_id=None):
#     """Fetches player season PPA data for a given year and player ID."""
#     try:
#         if player_id is not None:
#             api_response = metrics_api.get_player_season_ppa(year=year, player_id=player_id)
#         else:
#             api_response = metrics_api.get_player_season_ppa(year=year)
#         return api_response
#     except cfbd.ApiException as e:
#         print(f"Exception when calling MetricsApi->get_player_season_ppa: {e}")
#         return None


# def fetch_player_id(first_name, last_name, origin, year=2024):
#     """Fetches player ID using player search API."""
#     try:
#         search_term = f"{first_name} {last_name}"
#         api_response = players_api.search_players(search_term, team=origin, year=year)
#         if api_response and len(api_response) > 0:
#             return api_response[0].id  # Return the first matching player's ID
#         return None
#     except cfbd.ApiException as e:
#         print(f"Exception when calling PlayersApi->search_players: {e}")
#         return None


# def save_data_to_json(data, filename):
#     """Saves data to a JSON file in Kaggle's working directory."""

#     if data:

#         def convert_datetime(obj):
#             if isinstance(obj, datetime):
#                 return obj.isoformat()
#             return obj

#         filepath = os.path.join("/kaggle/working/", filename)
#         with open(filepath, "w") as f:
#             json.dump(
#                 [json.loads(json.dumps(obj.to_dict(), default=convert_datetime)) for obj in data],
#                 f,
#                 indent=4,
#             )
#         print(f"Data saved to {filepath}")


# def create_transfer_dataframe(portal_data):
#     """Creates a Pandas DataFrame from the transfer portal data."""
#     if portal_data:
#         transfers = []
#         for player in portal_data:
#             transfers.append({
#                 'firstName': player.firstName,
#                 'lastName': player.lastName,
#                 'origin': player.origin,
#                 'stars': player.stars,
#             })
#         return pd.DataFrame(transfers)
#     return pd.DataFrame()  # Return an empty DataFrame if no data


# def add_ppa_to_dataframe(transfer_df):
#     """Adds PPA data to the transfer DataFrame."""
#     ppa_data = []
#     for index, row in transfer_df.iterrows():
#         try:
#             player_id = fetch_player_id(
#                 first_name=row['firstName'],
#                 last_name=row['lastName'],
#                 origin=row['origin'],
#             )
#             if player_id:
#                 time.sleep(1)  # Increase delay to 1 second
#                 player_ppa_data = metrics_api.get_player_season_ppa(player_id=player_id)
#                 if player_ppa_data and player_ppa_data:
#                     # Assuming player_ppa_data is a list, take the first element if available
#                     ppa = player_ppa_data[0].ppa if player_ppa_data and player_ppa_data[0].ppa else None
#                     transfer_df.loc[index, 'ppa'] = ppa
#             else:
#                 print(
#                     f"No matching player found for {row['firstName']} {row['lastName']} from {row['origin']}"
#                 )
#         except cfbd.ApiException as e:
#             print(f"API Exception: {e}")
#             time.sleep(10)  # Wait longer after an exception
#     return transfer_df


# # Fetch transfer portal data
# portal_data = fetch_transfer_portal_data()

# if portal_data:
#     transfer_df = create_transfer_dataframe(portal_data)
#     print(transfer_df.head())
#     transfer_df = add_ppa_to_dataframe(transfer_df)
#     print(transfer_df.head())
# else:
#     print("Failed to fetch transfer portal data. Exiting.")