In [4]:
!pip install requests pandas lxml

Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting lxml
  Downloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.5 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
[?25hDownloading lxml-5.4.0-cp312-cp312-manylinux_2_28_x86_64.whl (5.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m19.7 MB/s[0m eta [36m0:00:00[0ma [36m0:0

In [6]:
!pip list

Package                   Version
------------------------- --------------
anyio                     4.8.0
argon2-cffi               23.1.0
argon2-cffi-bindings      21.2.0
arrow                     1.3.0
astroid                   3.3.8
asttokens                 3.0.0
async-lru                 2.0.4
attrs                     25.1.0
autopep8                  2.3.2
babel                     2.17.0
bandit                    1.8.2
beautifulsoup4            4.13.3
bleach                    6.2.0
certifi                   2025.1.31
cffi                      1.17.1
charset-normalizer        3.4.1
comm                      0.2.2
cryptography              44.0.0
debugpy                   1.8.12
decorator                 5.1.1
defusedxml                0.7.1
dill                      0.3.9
distlib                   0.3.9
docutils                  0.21.2
executing                 2.2.0
fastjsonschema            2.21.1
filelock                  3.17.0
flake8                    7.1.1
fqdn          

In [2]:
import requests
import json

# Define the URL to fetch data from
url = "https://inmotion.dhl/api/f1-award-element-data/6367"

print(f"Attempting to fetch data from: {url}")

try:
    # Make the GET request
    response = requests.get(url, timeout=10) # Added a timeout

    # Check for HTTP errors (like 404 Not Found, 500 Internal Server Error)
    response.raise_for_status()
    print(f"Successfully fetched data (Status Code: {response.status_code})")

    # Parse the JSON response
    try:
        parsed_data = response.json()

        # Navigate through the dictionary to extract the 'events' list
        # Path: root -> 'data' -> 'chart' -> 'events'
        data_section = parsed_data.get('data')
        chart_section = data_section.get('chart') if data_section else None
        events_data = chart_section.get('events') if chart_section else None

        # Check if the data was found and print it
        if events_data is not None:
            print("\nSuccessfully extracted events data:")
            # Pretty print the extracted list
            print(json.dumps(events_data, indent=4))
        else:
            print("\nError: Could not find the 'events' data at the expected path ('data' -> 'chart' -> 'events') in the JSON response.")
            # Optional: Print the structure if keys are missing
            # print("\nReceived JSON structure:")
            # print(json.dumps(parsed_data, indent=4))


    except json.JSONDecodeError:
        print("\nError: Failed to decode JSON from the response.")
        print("Response text was:")
        print(response.text[:500] + "..." if len(response.text) > 500 else response.text) # Print beginning of text

except requests.exceptions.Timeout:
    print(f"\nError: The request to {url} timed out.")
except requests.exceptions.RequestException as e:
    print(f"\nError during request to {url}: {e}")
except Exception as e:
    print(f"\nAn unexpected error occurred: {e}")

Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6367
Successfully fetched data (Status Code: 200)

Successfully extracted events data:
[
    {
        "id": 1086,
        "title": "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025",
        "short_title": "Albert Park Grand Prix Circuit",
        "abbr": "AUS",
        "date": {
            "date": "2025-03-15 23:00:00.000000",
            "timezone_type": 3,
            "timezone": "UTC"
        }
    },
    {
        "id": 1087,
        "title": "FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025",
        "short_title": "Shanghai International Circuit",
        "abbr": "CHI",
        "date": {
            "date": "2025-03-23 00:00:00.000000",
            "timezone_type": 3,
            "timezone": "UTC"
        }
    },
    {
        "id": 1088,
        "title": "FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025",
        "short_title": "Suzuka Circuit",
        "abbr": "JAP",
        "date": {
            "date": "20

In [3]:
import requests
import json
import time # Import time for potential delays

# --- 1. Sample events_data (Replace with your actual data) ---
# Since the tool environment couldn't fetch the initial list,
# we'll use the sample data from your first prompt here.


# --- 2. Configuration ---
base_event_url = "https://inmotion.dhl/api/f1-award-element-data/6365"
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}
timeout_seconds = 15
# Optional: Add a small delay between requests to avoid overwhelming the server
delay_between_requests = 0.5 # seconds

# --- 3. Data Fetching Loop ---
all_event_specific_data = {} # Dictionary to store results {event_id: data}

print(f"Found {len(events_data)} events to process.")

for event in events_data:
    event_id = event.get('id')
    event_title = event.get('title', 'Unknown Title') # Get title for logging

    if not event_id:
        print(f"Warning: Skipping event with missing ID: {event_title}")
        continue

    # Construct the specific URL for this event
    specific_url = f"{base_event_url}?event={event_id}"
    print(f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})")
    print(f"URL: {specific_url}")

    try:
        # Make the GET request for the specific event
        response = requests.get(specific_url, headers=headers, timeout=timeout_seconds)
        response.raise_for_status() # Check for HTTP errors

        # Parse the JSON response
        try:
            event_specific_data = response.json()
            all_event_specific_data[event_id] = event_specific_data # Store the data
            print(f"Successfully fetched and parsed data for Event ID: {event_id}")
            # Optional: Print a snippet of the fetched data
            # print(json.dumps(event_specific_data, indent=2)[:200] + "...")

        except json.JSONDecodeError:
            print(f"Error: Failed to decode JSON for Event ID: {event_id}")
            print(f"Response text (first 500 chars): {response.text[:500]}")
            all_event_specific_data[event_id] = {"error": "JSONDecodeError", "response_text": response.text[:500]}

    except requests.exceptions.Timeout:
        print(f"Error: Request timed out for Event ID: {event_id} at {specific_url}")
        all_event_specific_data[event_id] = {"error": "Timeout"}
    except requests.exceptions.RequestException as e:
        print(f"Error during request for Event ID: {event_id} at {specific_url}: {e}")
        all_event_specific_data[event_id] = {"error": str(e)}
    except Exception as e:
        print(f"An unexpected error occurred for Event ID: {event_id}: {e}")
        all_event_specific_data[event_id] = {"error": f"Unexpected: {str(e)}"}

    # Optional delay
    if delay_between_requests > 0:
        time.sleep(delay_between_requests)

# --- 4. Final Output ---
print("\n--- Processing Complete ---")
successful_fetches = sum(1 for data in all_event_specific_data.values() if isinstance(data, dict) and 'error' not in data)
failed_fetches = len(events_data) - successful_fetches
print(f"Successfully fetched data for {successful_fetches} events.")
print(f"Failed to fetch data for {failed_fetches} events.")

# Optional: Print all collected data (can be very large)
# print("\n--- Collected Data ---")
# print(json.dumps(all_event_specific_data, indent=4))

# Example: Print data for a specific event ID if it exists and wasn't an error
target_id_to_show = 1086
if target_id_to_show in all_event_specific_data and isinstance(all_event_specific_data[target_id_to_show], dict) and 'error' not in all_event_specific_data[target_id_to_show]:
     print(f"\n--- Sample Data for Event ID {target_id_to_show} ---")
     print(json.dumps(all_event_specific_data[target_id_to_show], indent=4))
elif target_id_to_show in all_event_specific_data:
     print(f"\n--- Error Data for Event ID {target_id_to_show} ---")
     print(json.dumps(all_event_specific_data[target_id_to_show], indent=4))


Found 24 events to process.

Attempting to fetch data for Event ID: 1086 (FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1086
Successfully fetched and parsed data for Event ID: 1086

Attempting to fetch data for Event ID: 1087 (FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1087
Successfully fetched and parsed data for Event ID: 1087

Attempting to fetch data for Event ID: 1088 (FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1088
Successfully fetched and parsed data for Event ID: 1088

Attempting to fetch data for Event ID: 1089 (FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1089
Successfully fetched and parsed data for Event ID: 1089

Attempting to fetch data for Event ID: 1090 (FORMULA 1 STC SAUDI ARABIAN GRAND PRIX 2025)
URL: https://inmot

In [8]:
import pandas as pd
import io # Needed to treat the string as a file for read_html
import json

# --- Sample Data (as provided in the prompt) ---
# In a real scenario, this would come from your 'all_event_specific_data[1086]'
sample_event_data  = all_event_specific_data[target_id_to_show]

def html_table_to_dataframe(event_json_data):
    """
    Extracts an HTML table string from event JSON data and converts it to a Pandas DataFrame.

    Args:
        event_json_data (dict): The JSON data dictionary for a specific event,
                                expected to contain ['htmlList']['table'].

    Returns:
        pandas.DataFrame: The DataFrame created from the HTML table, or None if an error occurs.
    """
    if not isinstance(event_json_data, dict):
        print("Error: Input must be a dictionary.")
        return None

    # Safely extract the HTML table string
    html_table_str = event_json_data.get('htmlList', {}).get('table')

    if not html_table_str:
        print("Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty.")
        return None

    if not isinstance(html_table_str, str):
        print("Error: The value at ['htmlList']['table'] is not a string.")
        return None

    print("Found HTML table string. Attempting to parse...")
    try:
        # pd.read_html returns a list of DataFrames. We expect only one table.
        list_of_dfs = pd.read_html(io.StringIO(html_table_str))

        if list_of_dfs:
            print("Successfully parsed HTML table into DataFrame.")
            return list_of_dfs[0] # Return the first DataFrame found
        else:
            # This case is unlikely if the input contains a <table> tag,
            # pd.read_html usually raises ValueError if no tables are found.
            print("Warning: No tables found by pd.read_html, although HTML string was present.")
            return None

    except ValueError as ve:
        # This error often means no tables were found in the string
        print(f"Error parsing HTML with pandas (ValueError): {ve}")
        print("Check if the HTML string actually contains a <table> tag.")
        return None
    except ImportError:
        print("Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`).")
        # Note: The tool environment usually has common libraries like lxml.
        return None
    except Exception as e:
        print(f"An unexpected error occurred during HTML parsing: {e}")
        return None

# --- Example Usage ---
# Assuming 'sample_event_data' holds the JSON for Event ID 1086
event_dataframe = html_table_to_dataframe(sample_event_data)

if event_dataframe is not None:
    print("\n--- DataFrame for Event ID 1086 ---")
    # Display the DataFrame. print() works, but display() might be nicer in some environments.
    # Using print() for compatibility here.
    print(event_dataframe.to_string()) # .to_string() ensures all rows/cols are printed

    # You can also print specific info, e.g., the first 5 rows:
    # print("\n--- First 5 Rows ---")
    # print(event_dataframe.head())
else:
    print("\nFailed to create DataFrame.")


Found HTML table string. Attempting to parse...
Successfully parsed HTML table into DataFrame.

--- DataFrame for Event ID 1086 ---
    Pos.          Team      Driver  Time (sec)  Lap  Points
0      1       Ferrari     Leclerc        2.32   34    25.0
1      2       Ferrari    Hamilton        2.38   47    18.0
2      3      Mercedes     Russell        2.43   34    15.0
3      4  Racing Bulls     Tsunoda        2.47   47    12.0
4      5          Haas     Bearman        2.49    4    10.0
5      6          Haas        Ocon        2.54    4     8.0
6      7       Ferrari    Hamilton        2.55   33     NaN
7      8      Red Bull  Verstappen        2.56   46     6.0
8      9      Red Bull      Lawson        2.58    4     4.0
9     10  Racing Bulls     Tsunoda        2.67   33     NaN
10    11          Haas        Ocon        2.69   46     NaN
11    12      Mercedes   Antonelli        2.73   33     2.0
12    13      Williams       Albon        2.75   33     1.0
13    14      Mercedes     R

In [9]:
event_dataframe

Unnamed: 0,Pos.,Team,Driver,Time (sec),Lap,Points
0,1,Ferrari,Leclerc,2.32,34,25.0
1,2,Ferrari,Hamilton,2.38,47,18.0
2,3,Mercedes,Russell,2.43,34,15.0
3,4,Racing Bulls,Tsunoda,2.47,47,12.0
4,5,Haas,Bearman,2.49,4,10.0
5,6,Haas,Ocon,2.54,4,8.0
6,7,Ferrari,Hamilton,2.55,33,
7,8,Red Bull,Verstappen,2.56,46,6.0
8,9,Red Bull,Lawson,2.58,4,4.0
9,10,Racing Bulls,Tsunoda,2.67,33,


In [None]:
import requests
import json
from typing import Dict, Any, Optional, List, Tuple


def fetch_data(url: str, timeout: int = 10) -> Tuple[bool, Any]:
    """
    Fetch data from the specified URL.

    Args:
        url: The URL to fetch data from
        timeout: Request timeout in seconds

    Returns:
        Tuple containing:
            - Success status (True/False)
            - Response data if successful, error message if not
    """
    print(f"Attempting to fetch data from: {url}")

    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        print(f"Successfully fetched data (Status Code: {response.status_code})")

        try:
            return True, response.json()
        except json.JSONDecodeError:
            error_msg = "Failed to decode JSON from the response."
            print(f"\nError: {error_msg}")
            print("Response text was:")
            print(response.text[:500] + "..." if len(response.text) > 500 else response.text)
            return False, error_msg

    except requests.exceptions.Timeout:
        error_msg = f"The request to {url} timed out."
        print(f"\nError: {error_msg}")
        return False, error_msg
    except requests.exceptions.RequestException as e:
        error_msg = f"Request error: {e}"
        print(f"\nError during request to {url}: {error_msg}")
        return False, error_msg
    except Exception as e:
        error_msg = f"Unexpected error: {e}"
        print(f"\nAn unexpected error occurred: {error_msg}")
        return False, error_msg


def extract_events_data(data: Dict[str, Any]) -> Tuple[bool, Optional[List[Dict[str, Any]]]]:
    """
    Extract events data from the parsed JSON response.

    Args:
        data: Parsed JSON data

    Returns:
        Tuple containing:
            - Success status (True/False)
            - Events data if successful, None if not
    """
    try:
        # Navigate through the dictionary to extract the 'events' list
        # Path: root -> 'data' -> 'chart' -> 'events'
        data_section = data.get('data')
        chart_section = data_section.get('chart') if data_section else None
        events_data = chart_section.get('events') if chart_section else None

        if events_data is not None:
            return True, events_data
        else:
            error_msg = "Could not find the 'events' data at the expected path ('data' -> 'chart' -> 'events') in the JSON response."
            print(f"\nError: {error_msg}")
            return False, None
    except Exception as e:
        error_msg = f"Error extracting events data: {e}"
        print(f"\nError: {error_msg}")
        return False, None


def process_f1_data(url: str = "https://inmotion.dhl/api/f1-award-element-data/6367") -> None:
    """
    Main function to fetch and process F1 data.

    Args:
        url: The API URL to fetch data from
    """
    # Fetch data
    success, response_data = fetch_data(url)

    if not success:
        return

    # Extract events data
    success, events_data = extract_events_data(response_data)

    if success and events_data is not None:
        print("\nSuccessfully extracted events data:")
        # Pretty print the extracted list
        print(json.dumps(events_data, indent=4))


# Execute the code if this cell is run
if __name__ == "__main__" or 'get_ipython' in globals():
    process_f1_data()


Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6367
Successfully fetched data (Status Code: 200)

Successfully extracted events data:
[
    {
        "id": 1086,
        "title": "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025",
        "short_title": "Albert Park Grand Prix Circuit",
        "abbr": "AUS",
        "date": {
            "date": "2025-03-15 23:00:00.000000",
            "timezone_type": 3,
            "timezone": "UTC"
        }
    },
    {
        "id": 1087,
        "title": "FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025",
        "short_title": "Shanghai International Circuit",
        "abbr": "CHI",
        "date": {
            "date": "2025-03-23 00:00:00.000000",
            "timezone_type": 3,
            "timezone": "UTC"
        }
    },
    {
        "id": 1088,
        "title": "FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025",
        "short_title": "Suzuka Circuit",
        "abbr": "JAP",
        "date": {
            "date": "20

In [None]:
import requests
import json
import time
from typing import Dict, List, Any, Optional, Union


class EventDataFetcher:
    """
    A class to fetch event-specific data from the DHL F1 API.
    """

    def __init__(
        self,
        base_url: str = "https://inmotion.dhl/api/f1-award-element-data/6365",
        timeout: int = 15,
        delay: float = 0.5,
        user_agent: str = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
    ):
        """
        Initialize the EventDataFetcher with configuration parameters.

        Args:
            base_url: The base URL for the API endpoint
            timeout: Request timeout in seconds
            delay: Delay between requests in seconds
            user_agent: User agent string for the HTTP requests
        """
        self.base_url = base_url
        self.timeout = timeout
        self.delay = delay
        self.headers = {
            'User-Agent': user_agent
        }
        self.results = {}  # Dictionary to store results {event_id: data}

    def fetch_event_data(self, event_id: Union[int, str], event_title: str = "Unknown Title") -> Dict[str, Any]:
        """
        Fetch data for a specific event.

        Args:
            event_id: The ID of the event
            event_title: The title of the event (for logging purposes)

        Returns:
            A dictionary containing the event data or error information
        """
        specific_url = f"{self.base_url}?event={event_id}"
        print(f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})")
        print(f"URL: {specific_url}")

        try:
            # Make the GET request for the specific event
            response = requests.get(specific_url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()  # Check for HTTP errors

            # Parse the JSON response
            try:
                event_data = response.json()
                print(f"Successfully fetched and parsed data for Event ID: {event_id}")
                return event_data
            except json.JSONDecodeError:
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")
                print(f"Response text (first 500 chars): {response.text[:500]}")
                return {"error": "JSONDecodeError", "response_text": response.text[:500]}

        except requests.exceptions.Timeout:
            print(f"Error: Request timed out for Event ID: {event_id}")
            return {"error": "Timeout"}
        except requests.exceptions.RequestException as e:
            print(f"Error during request for Event ID: {event_id}: {e}")
            return {"error": str(e)}
        except Exception as e:
            print(f"An unexpected error occurred for Event ID: {event_id}: {e}")
            return {"error": f"Unexpected: {str(e)}"}

    def process_events(self, events_data: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Process a list of events and fetch data for each one.

        Args:
            events_data: A list of event dictionaries, each containing at least 'id' and optionally 'title'

        Returns:
            A dictionary mapping event IDs to their fetched data
        """
        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get('id')
            event_title = event.get('title', 'Unknown Title')

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Fetch data for this event
            event_data = self.fetch_event_data(event_id, event_title)
            self.results[event_id] = event_data

            # Optional delay between requests
            if self.delay > 0:
                time.sleep(self.delay)

        return self.results

    def print_summary(self) -> None:
        """
        Print a summary of the fetching results.
        """
        print("\n--- Processing Complete ---")

        successful_fetches = sum(1 for data in self.results.values()
                                if isinstance(data, dict) and 'error' not in data)
        failed_fetches = len(self.results) - successful_fetches

        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")

    def print_sample_data(self, event_id: Union[int, str]) -> None:
        """
        Print sample data for a specific event ID.

        Args:
            event_id: The ID of the event to display data for
        """
        if event_id in self.results:
            if isinstance(self.results[event_id], dict) and 'error' not in self.results[event_id]:
                print(f"\n--- Sample Data for Event ID {event_id} ---")
                print(json.dumps(self.results[event_id], indent=4))
            else:
                print(f"\n--- Error Data for Event ID {event_id} ---")
                print(json.dumps(self.results[event_id], indent=4))
        else:
            print(f"\nNo data found for Event ID {event_id}")


# Example usage
if __name__ == "__main__":
    # Sample events data (replace with your actual data)
    sample_events = [
        {"id": 1086, "title": "Sample Event 1"},
        {"id": 1087, "title": "Sample Event 2"},
        # Add more events as needed
    ]

    # Create the fetcher with default or custom configuration
    fetcher = EventDataFetcher(
        base_url="https://inmotion.dhl/api/f1-award-element-data/6365",
        timeout=15,
        delay=0.5
    )

    # Process all events
    fetcher.process_events(sample_events)

    # Print summary
    fetcher.print_summary()

    # Print sample data for a specific event
    fetcher.print_sample_data(1086)


Found 2 events to process.

Attempting to fetch data for Event ID: 1086 (Sample Event 1)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1086
Successfully fetched and parsed data for Event ID: 1086

Attempting to fetch data for Event ID: 1087 (Sample Event 2)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1087
Successfully fetched and parsed data for Event ID: 1087

--- Processing Complete ---
Successfully fetched data for 2 events.
Failed to fetch data for 0 events.

--- Sample Data for Event ID 1086 ---
{
    "data": {
        "chart": [
            {
                "id": 7240,
                "driverNr": 16,
                "tla": "LEC",
                "firstName": "Charles",
                "lastName": "Leclerc",
                "team": "Ferrari",
                "duration": 2.32,
                "startTime": {
                    "date": "2025-03-16 16:17:28.000000",
                    "timezone_type": 3,
                    "timezone": "UTC"
    

In [5]:
import requests
import json
import time
import pandas as pd
import io
from typing import Dict, List, Optional, Any, Tuple, Union


def fetch_data_from_url(url: str, headers: Optional[Dict] = None,
                        timeout: int = 10) -> Tuple[bool, Union[Dict, str]]:
    """
    Fetch data from a URL with error handling.

    Args:
        url: The URL to fetch data from
        headers: Optional HTTP headers
        timeout: Request timeout in seconds

    Returns:
        Tuple containing success status and either the parsed JSON data or error message
    """
    print(f"Attempting to fetch data from: {url}")

    try:
        # Make the GET request
        response = requests.get(url, headers=headers, timeout=timeout)
        response.raise_for_status()
        print(f"Successfully fetched data (Status Code: {response.status_code})")

        # Parse the JSON response
        try:
            return True, response.json()
        except json.JSONDecodeError:
            error_msg = f"Failed to decode JSON from the response. Response text: {response.text[:500]}"
            print(f"Error: {error_msg}")
            return False, error_msg

    except requests.exceptions.Timeout:
        error_msg = f"The request to {url} timed out."
        print(f"Error: {error_msg}")
        return False, error_msg
    except requests.exceptions.RequestException as e:
        error_msg = f"Request error for {url}: {e}"
        print(f"Error: {error_msg}")
        return False, error_msg
    except Exception as e:
        error_msg = f"Unexpected error: {e}"
        print(f"Error: {error_msg}")
        return False, error_msg


def extract_events_data(parsed_data: Dict) -> Tuple[bool, Union[List, str]]:
    """
    Extract events data from parsed JSON.

    Args:
        parsed_data: The parsed JSON data

    Returns:
        Tuple containing success status and either the events data or error message
    """
    try:
        # Navigate through the dictionary to extract the 'events' list
        data_section = parsed_data.get("data")
        chart_section = data_section.get("chart") if data_section else None
        events_data = chart_section.get("events") if chart_section else None

        if events_data is not None:
            print("Successfully extracted events data")
            return True, events_data
        else:
            error_msg = "Could not find the 'events' data at the expected path ('data' -> 'chart' -> 'events')"
            print(f"Error: {error_msg}")
            return False, error_msg

    except Exception as e:
        error_msg = f"Error extracting events data: {e}"
        print(f"Error: {error_msg}")
        return False, error_msg


def fetch_event_specific_data(events_data: List[Dict], base_url: str,
                             headers: Optional[Dict] = None, timeout: int = 15,
                             delay: float = 0.5) -> Dict[str, Any]:
    """
    Fetch specific data for each event.

    Args:
        events_data: List of events
        base_url: Base URL for event-specific data
        headers: Optional HTTP headers
        timeout: Request timeout in seconds
        delay: Delay between requests in seconds

    Returns:
        Dictionary mapping event IDs to their specific data
    """
    all_event_specific_data = {}  # Dictionary to store results {event_id: data}

    print(f"Found {len(events_data)} events to process.")

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id:
            print(f"Warning: Skipping event with missing ID: {event_title}")
            continue

        # Construct the specific URL for this event
        specific_url = f"{base_url}?event={event_id}"
        print(f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})")

        success, result = fetch_data_from_url(specific_url, headers, timeout)

        if success:
            all_event_specific_data[event_id] = result
            print(f"Successfully fetched and parsed data for Event ID: {event_id}")
        else:
            all_event_specific_data[event_id] = {"error": result}

        # Optional delay
        if delay > 0:
            time.sleep(delay)

    return all_event_specific_data


def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
    """
    Extracts an HTML table string from event JSON data and converts it to a Pandas DataFrame.

    Args:
        event_json_data: The JSON data dictionary for a specific event,
                         expected to contain ['htmlList']['table'].

    Returns:
        The DataFrame created from the HTML table, or None if an error occurs.
    """
    if not isinstance(event_json_data, dict):
        print("Error: Input must be a dictionary.")
        return None

    # Safely extract the HTML table string
    html_table_str = event_json_data.get("htmlList", {}).get("table")

    if not html_table_str:
        print("Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty.")
        return None

    if not isinstance(html_table_str, str):
        print("Error: The value at ['htmlList']['table'] is not a string.")
        return None

    print("Found HTML table string. Attempting to parse...")
    try:
        # pd.read_html returns a list of DataFrames. We expect only one table.
        list_of_dfs = pd.read_html(io.StringIO(html_table_str))

        if list_of_dfs:
            print("Successfully parsed HTML table into DataFrame.")
            return list_of_dfs[0]  # Return the first DataFrame found
        else:
            print("Warning: No tables found by pd.read_html, although HTML string was present.")
            return None

    except ValueError as ve:
        print(f"Error parsing HTML with pandas (ValueError): {ve}")
        return None
    except ImportError:
        print("Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`).")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during HTML parsing: {e}")
        return None


def process_event_data(event_data: Dict, event_id: Any) -> None:
    """
    Process and display data for a specific event.

    Args:
        event_data: The event data to process
        event_id: The ID of the event
    """
    if "error" in event_data:
        print(f"\n--- Error Data for Event ID {event_id} ---")
        print(json.dumps(event_data, indent=4))
        return

    # Convert HTML table to DataFrame
    event_dataframe = html_table_to_dataframe(event_data)

    if event_dataframe is not None:
        print(f"\n--- DataFrame for Event ID {event_id} ---")
        print(event_dataframe.to_string())
    else:
        print(f"\nFailed to create DataFrame for Event ID {event_id}.")


def main():
    """Main function to orchestrate the workflow."""
    # Configuration
    config = {
        "events_url": "https://inmotion.dhl/api/f1-award-element-data/6367",
        "base_event_url": "https://inmotion.dhl/api/f1-award-element-data/6365",
        "headers": {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
        },
        "timeout": 15,
        "delay": 0.5,
        "target_event_id": 1086  # Example event ID to display
    }

    # Step 1: Fetch events data
    success, result = fetch_data_from_url(config["events_url"], config["headers"], config["timeout"])
    if not success:
        print("Failed to fetch initial events data. Exiting.")
        return

    # Step 2: Extract events data
    success, events_data = extract_events_data(result)
    if not success:
        print("Failed to extract events data. Exiting.")
        return

    # Step 3: Fetch specific data for each event
    all_event_specific_data = fetch_event_specific_data(
        events_data,
        config["base_event_url"],
        config["headers"],
        config["timeout"],
        config["delay"]
    )

    # Step 4: Display summary
    print("\n--- Processing Complete ---")
    successful_fetches = sum(
        1 for data in all_event_specific_data.values()
        if isinstance(data, dict) and "error" not in data
    )
    failed_fetches = len(events_data) - successful_fetches
    print(f"Successfully fetched data for {successful_fetches} events.")
    print(f"Failed to fetch data for {failed_fetches} events.")

    # Step 5: Process and display data for a specific event
    target_id = config["target_event_id"]
    if target_id in all_event_specific_data:
        process_event_data(all_event_specific_data[target_id], target_id)
    else:
        print(f"\nNo data found for Event ID {target_id}")


if __name__ == "__main__":
    main()


Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6367
Successfully fetched data (Status Code: 200)
Successfully extracted events data
Found 24 events to process.

Attempting to fetch data for Event ID: 1086 (FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025)
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6365?event=1086
Successfully fetched data (Status Code: 200)
Successfully fetched and parsed data for Event ID: 1086

Attempting to fetch data for Event ID: 1087 (FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025)
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6365?event=1087
Successfully fetched data (Status Code: 200)
Successfully fetched and parsed data for Event ID: 1087

Attempting to fetch data for Event ID: 1088 (FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025)
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6365?event=1088
Successfully fetched data (Status Code: 200)
Suc

In [None]:
import requests
import json
import time
import pandas as pd
import io
import os
import re
from typing import Dict, List, Optional, Tuple, Union, Any

# Configuration constants
DEFAULT_TIMEOUT = 10
EVENT_DATA_URL = "https://inmotion.dhl/api/f1-award-element-data/6367"
EVENT_SPECIFIC_URL = "https://inmotion.dhl/api/f1-award-element-data/6365"
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds
FUEL_CORRECTION_FACTOR = 0.03
INITIAL_FUEL_LOAD = {"Race": 100, "Sprint": 30}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(self, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None):
        """
        Initialize the F1DataFetcher.

        Args:
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

    def fetch_events_data(self, url: str = EVENT_DATA_URL) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(self, events_data: List[Dict],
                                  base_url: str = EVENT_SPECIFIC_URL) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[event_id]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})")
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": response.text[:500] if hasattr(response, 'text') else "No response text"
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(self, events_data: List[Dict],
                            all_event_specific_data: Dict[str, Any]) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1 for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print("Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty.")
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print("Warning: No tables found by pd.read_html, although HTML string was present.")
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print("Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`).")
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None



    @staticmethod
    def save_dataframe_to_json(df: pd.DataFrame, event_title: str, year: int = 2025, output_dir: str = "race_data") -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # Extract the Grand Prix name from the title
        # Example: "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025" -> "Australian_Grand_Prix"
        match = re.search(r'([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX', event_title, re.IGNORECASE)
        if match:
            grand_prix_name = match.group(1).strip()
        else:
            # Fallback: extract words between the sponsor and the year
            parts = event_title.split()
            if len(parts) >= 3:
                # Try to find the Grand Prix name by removing common elements
                filtered_parts = [p for p in parts if p not in ["FORMULA", "1", "GRAND", "PRIX", str(year)]]
                # Remove likely sponsor names (all caps words at the beginning)
                while filtered_parts and filtered_parts[0].isupper():
                    filtered_parts.pop(0)
                grand_prix_name = " ".join(filtered_parts[:2])  # Take the first two remaining words
            else:
                grand_prix_name = "Unknown_Grand_Prix"

        # Clean up the name and replace spaces with underscores
        grand_prix_name = re.sub(r'[^\w\s]', '', grand_prix_name)  # Remove special characters
        grand_prix_name = grand_prix_name.replace(' ', '_')

        # Create the filename
        filename = f"{year}_{grand_prix_name}.json"

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, filename)
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main():
    """Main function to demonstrate the usage of the classes."""
    # Initialize the data fetcher
    fetcher = F1DataFetcher()

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Create a directory to store the JSON files
    output_dir = "race_data"
    os.makedirs(output_dir, exist_ok=True)

    # Process each event and save to JSON
    saved_files = []
    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(f"Skipping event {event_title}: Error in data - {event_data.get('error')}")
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Extract year from title or use default
            year_match = re.search(r'(\d{4})', event_title)
            year = int(year_match.group(1)) if year_match else 2025

            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe,
                event_title,
                year,
                output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory.")
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    main()

Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6367

Successfully extracted events data
Found 24 events to process.

Attempting to fetch data for Event ID: 1086 (FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1086
Successfully fetched and parsed data for Event ID: 1086

Attempting to fetch data for Event ID: 1087 (FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1087
Successfully fetched and parsed data for Event ID: 1087

Attempting to fetch data for Event ID: 1088 (FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1088
Successfully fetched and parsed data for Event ID: 1088

Attempting to fetch data for Event ID: 1089 (FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1089
Successfully fetched and parsed data for Event

In [2]:
import requests
import json
import time
import pandas as pd
import io
import os
import re
from typing import Dict, List, Optional, Tuple, Union, Any

# Configuration constants
DEFAULT_TIMEOUT = 10
EVENT_DATA_URL = "https://inmotion.dhl/api/f1-award-element-data/6367"
EVENT_SPECIFIC_URL = "https://inmotion.dhl/api/f1-award-element-data/6365"
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds
FUEL_CORRECTION_FACTOR = 0.03
INITIAL_FUEL_LOAD = {"Race": 100, "Sprint": 30}

# 2025 F1 Race Calendar - Official race names without sponsors
F1_RACES_2025 = {
    "AUSTRALIAN": "Australian Grand Prix",
    "CHINESE": "Chinese Grand Prix",
    "JAPANESE": "Japanese Grand Prix",
    "BAHRAIN": "Bahrain Grand Prix",
    "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
    "MIAMI": "Miami Grand Prix",
    "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
    "MONACO": "Monaco Grand Prix",
    "ESPAÑA": "Spanish Grand Prix",
    "CANADA": "Canadian Grand Prix",
    "AUSTRIAN": "Austrian Grand Prix",
    "BRITISH": "British Grand Prix",
    "BELGIAN": "Belgian Grand Prix",
    "HUNGARIAN": "Hungarian Grand Prix",
    "DUTCH": "Dutch Grand Prix",
    "ITALIA": "Italian Grand Prix",
    "AZERBAIJAN": "Azerbaijan Grand Prix",
    "SINGAPORE": "Singapore Grand Prix",
    "UNITED STATES": "United States Grand Prix",
    "MÉXICO": "Mexico City Grand Prix",
    "SÃO PAULO": "São Paulo Grand Prix",
    "LAS VEGAS": "Las Vegas Grand Prix",
    "QATAR": "Qatar Grand Prix",
    "ABU DHABI": "Abu Dhabi Grand Prix"
}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(self, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None):
        """
        Initialize the F1DataFetcher.

        Args:
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

    def fetch_events_data(self, url: str = EVENT_DATA_URL) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(self, events_data: List[Dict],
                                  base_url: str = EVENT_SPECIFIC_URL) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[event_id]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})")
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": response.text[:500] if hasattr(response, 'text') else "No response text"
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(self, events_data: List[Dict],
                            all_event_specific_data: Dict[str, Any]) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1 for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print("Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty.")
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print("Warning: No tables found by pd.read_html, although HTML string was present.")
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print("Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`).")
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None

    @staticmethod
    def apply_fuel_correction(lap_time: float, lap_number: int,
                             total_laps: int, session_type: str) -> float:
        """
        Apply fuel correction to lap time.

        Args:
            lap_time: Original lap time in seconds
            lap_number: Current lap number
            total_laps: Total number of laps in the session
            session_type: Type of session (Race, Sprint, etc.)

        Returns:
            Corrected lap time
        """
        initial_fuel = INITIAL_FUEL_LOAD.get(session_type, 0)
        if initial_fuel == 0:
            return lap_time

        fuel_remaining = initial_fuel * (1 - (lap_number / total_laps))
        fuel_penalty = fuel_remaining * FUEL_CORRECTION_FACTOR
        return lap_time - fuel_penalty

    @staticmethod
    def save_dataframe_to_json(df: pd.DataFrame, event_title: str, year: int = 2025, output_dir: str = None) -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file (defaults to year folder)

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # If no output directory is specified, use the year as the directory name
        if output_dir is None:
            output_dir = str(year)

        # Find the proper race name from the event title
        race_name = None

        # First, try to match with our predefined race names
        for key, proper_name in F1_RACES_2025.items():
            if key in event_title.upper():
                race_name = proper_name
                break

        # If no match found, try to extract from the title
        if race_name is None:
            # Try to extract Grand Prix name using regex
            match = re.search(r'([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX', event_title, re.IGNORECASE)
            if match:
                location = match.group(1).strip()
                race_name = f"{location} Grand Prix"
            else:
                # Fallback to a generic name with the event ID
                race_name = "Unknown Grand Prix"
                print(f"Warning: Could not determine race name for: {event_title}")

        # # Clean up the filename - remove special characters and replace spaces with underscores
        filename = race_name
        # filename = race_name.replace(' ', '_')
        # filename = re.sub(r'[^\w\s_-]', '', filename)  # Remove special characters except underscores and hyphens

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, f"{filename}.json")
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main():
    """Main function to demonstrate the usage of the classes."""
    # Initialize the data fetcher
    fetcher = F1DataFetcher()

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Process each event and save to JSON
    saved_files = []
    year = 2025  # Default year
    output_dir = str(year)  # Use year as directory name

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(f"Skipping event {event_title}: Error in data - {event_data.get('error')}")
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Extract year from title or use default
            year_match = re.search(r'(\d{4})', event_title)
            if year_match:
                year = int(year_match.group(1))
                output_dir = str(year)  # Update output directory based on year

            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe,
                event_title,
                year,
                output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory.")
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    main()


Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6367



Successfully extracted events data
Found 24 events to process.

Attempting to fetch data for Event ID: 1086 (FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1086
Successfully fetched and parsed data for Event ID: 1086

Attempting to fetch data for Event ID: 1087 (FORMULA 1 HEINEKEN CHINESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1087
Successfully fetched and parsed data for Event ID: 1087

Attempting to fetch data for Event ID: 1088 (FORMULA 1 LENOVO JAPANESE GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1088
Successfully fetched and parsed data for Event ID: 1088

Attempting to fetch data for Event ID: 1089 (FORMULA 1 GULF AIR BAHRAIN GRAND PRIX 2025)
URL: https://inmotion.dhl/api/f1-award-element-data/6365?event=1089
Successfully fetched and parsed data for Event ID: 1089

Attempting to fetch data for Event ID: 1090 (FORMULA 1 STC SAUDI ARABIAN

In [3]:
import io
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import requests

# Configuration constants
DEFAULT_TIMEOUT = 10
# URLs by year
F1_URLS = {
    2024: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6276",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6273"
    },
    2025: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6367",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6365"
    }
}
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# F1 race names by year
F1_RACES = {
    2025: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2024: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    }
}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(self, year: int = 2025, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None):
        """
        Initialize the F1DataFetcher.

        Args:
            year: Year for which to fetch F1 data
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.year = year
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

        # Set URLs based on year
        self.set_year(year)

    def set_year(self, year: int) -> None:
        """
        Set the year and update URLs accordingly.

        Args:
            year: Year for which to fetch F1 data
        """
        self.year = year
        if year in F1_URLS:
            self.event_data_url = F1_URLS[year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[year]["EVENT_SPECIFIC_URL"]
        else:
            # Default to latest year if requested year is not available
            latest_year = max(F1_URLS.keys())
            print(f"Warning: Data for year {year} not available. Using {latest_year} instead.")
            self.year = latest_year
            self.event_data_url = F1_URLS[latest_year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[latest_year]["EVENT_SPECIFIC_URL"]

    def fetch_events_data(self, url: str = None) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from (defaults to year-specific URL)

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        if url is None:
            url = self.event_data_url

        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(
        self, events_data: List[Dict], base_url: str = None
    ) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data (defaults to year-specific URL)

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        if base_url is None:
            base_url = self.event_specific_url

        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[
                    event_id
                ]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(
                f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})"
            )
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": (
                        response.text[:500]
                        if hasattr(response, "text")
                        else "No response text"
                    ),
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(
        self, events_data: List[Dict], all_event_specific_data: Dict[str, Any]
    ) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1
            for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print(
                "Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty."
            )
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print(
                    "Warning: No tables found by pd.read_html, although HTML string was present."
                )
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print(
                "Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`)."
            )
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None

    @staticmethod
    def save_dataframe_to_json(
        df: pd.DataFrame, event_title: str, year: int, output_dir: str = None
    ) -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file (defaults to year folder)

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # If no output directory is specified, use the year as the directory name
        if output_dir is None:
            output_dir = str(year)

        # Find the proper race name from the event title
        race_name = None

        # First, try to match with our predefined race names for the specific year
        if year in F1_RACES:
            year_races = F1_RACES[year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break
        else:
            # If year not in F1_RACES, use the latest year's race names
            latest_year = max(F1_RACES.keys())
            year_races = F1_RACES[latest_year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break

        # If no match found, try to extract from the title
        if race_name is None:
            # Try to extract Grand Prix name using regex
            match = re.search(
                r"([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX", event_title, re.IGNORECASE
            )
            if match:
                location = match.group(1).strip()
                race_name = f"{location} Grand Prix"
            else:
                # Fallback to a generic name with the event ID
                race_name = "Unknown Grand Prix"
                print(f"Warning: Could not determine race name for: {event_title}")

        # Clean up the filename
        filename = race_name

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, f"{filename}.json")
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main(year: int = 2025):
    """
    Main function to fetch and process F1 data for a specific year.

    Args:
        year: Year to fetch data for (default: 2025)
    """
    print(f"Fetching F1 data for year: {year}")

    # Initialize the data fetcher with the specified year
    fetcher = F1DataFetcher(year=year)

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Process each event and save to JSON
    saved_files = []
    output_dir = str(year)  # Use year as directory name

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(
                f"Skipping event {event_title}: Error in data - {event_data.get('error')}"
            )
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe, event_title, year, output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(
        f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory."
    )
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    # Call main function with desired year
    # Example: main(2024) to fetch 2024 data
    main(2024)  # Default to 2025 if no year specified

Fetching F1 data for year: 2024
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6276

Successfully extracted events data
Found 24 events to process.

Attempting to fetch data for Event ID: 1016 (Formula 1 Gulf Air Bahrain Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1016
Successfully fetched and parsed data for Event ID: 1016

Attempting to fetch data for Event ID: 1017 (Formula 1 stc Saudi Arabian Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1017
Successfully fetched and parsed data for Event ID: 1017

Attempting to fetch data for Event ID: 1018 (Formula 1 Rolex Australian Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1018
Successfully fetched and parsed data for Event ID: 1018

Attempting to fetch data for Event ID: 1019 (Formula 1 MSC Cruises Japanese Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1019
Successfully fet

In [4]:
import io
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import requests

# Configuration constants
DEFAULT_TIMEOUT = 10
# URLs by year
F1_URLS = {
    2023: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6284",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6282",
    },
    2024: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6276",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6273",
    },
    2025: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6367",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6365",
    },
}
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# F1 race names by year
F1_RACES = {
    2025: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2024: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MEXICO": "Mexico City Grand Prix",
        "SÃO PAOLO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(
        self, year: int = 2025, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None
    ):
        """
        Initialize the F1DataFetcher.

        Args:
            year: Year for which to fetch F1 data
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.year = year
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

        # Set URLs based on year
        self.set_year(year)

    def set_year(self, year: int) -> None:
        """
        Set the year and update URLs accordingly.

        Args:
            year: Year for which to fetch F1 data
        """
        self.year = year
        if year in F1_URLS:
            self.event_data_url = F1_URLS[year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[year]["EVENT_SPECIFIC_URL"]
        else:
            # Default to latest year if requested year is not available
            latest_year = max(F1_URLS.keys())
            print(
                f"Warning: Data for year {year} not available. Using {latest_year} instead."
            )
            self.year = latest_year
            self.event_data_url = F1_URLS[latest_year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[latest_year]["EVENT_SPECIFIC_URL"]

    def fetch_events_data(self, url: str = None) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from (defaults to year-specific URL)

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        if url is None:
            url = self.event_data_url

        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(
        self, events_data: List[Dict], base_url: str = None
    ) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data (defaults to year-specific URL)

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        if base_url is None:
            base_url = self.event_specific_url

        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[
                    event_id
                ]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(
                f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})"
            )
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": (
                        response.text[:500]
                        if hasattr(response, "text")
                        else "No response text"
                    ),
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(
        self, events_data: List[Dict], all_event_specific_data: Dict[str, Any]
    ) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1
            for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print(
                "Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty."
            )
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print(
                    "Warning: No tables found by pd.read_html, although HTML string was present."
                )
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print(
                "Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`)."
            )
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None

    @staticmethod
    def save_dataframe_to_json(
        df: pd.DataFrame, event_title: str, year: int, output_dir: str = None
    ) -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file (defaults to year folder)

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # If no output directory is specified, use the year as the directory name
        if output_dir is None:
            output_dir = str(year)

        # Find the proper race name from the event title
        race_name = None

        # First, try to match with our predefined race names for the specific year
        if year in F1_RACES:
            year_races = F1_RACES[year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break
        else:
            # If year not in F1_RACES, use the latest year's race names
            latest_year = max(F1_RACES.keys())
            year_races = F1_RACES[latest_year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break

        # If no match found, try to extract from the title
        if race_name is None:
            # Try to extract Grand Prix name using regex
            match = re.search(
                r"([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX", event_title, re.IGNORECASE
            )
            if match:
                location = match.group(1).strip()
                race_name = f"{location} Grand Prix"
            else:
                # Fallback to a generic name with the event ID
                race_name = "Unknown Grand Prix"
                print(f"Warning: Could not determine race name for: {event_title}")

        # Clean up the filename
        filename = race_name

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, f"{filename}.json")
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main(year: int = 2025):
    """
    Main function to fetch and process F1 data for a specific year.

    Args:
        year: Year to fetch data for (default: 2025)
    """
    print(f"Fetching F1 data for year: {year}")

    # Initialize the data fetcher with the specified year
    fetcher = F1DataFetcher(year=year)

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Process each event and save to JSON
    saved_files = []
    output_dir = str(year)  # Use year as directory name

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(
                f"Skipping event {event_title}: Error in data - {event_data.get('error')}"
            )
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe, event_title, year, output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(
        f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory."
    )
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    main(2024)


Fetching F1 data for year: 2024
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6276

Successfully extracted events data
Found 24 events to process.

Attempting to fetch data for Event ID: 1016 (Formula 1 Gulf Air Bahrain Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1016
Successfully fetched and parsed data for Event ID: 1016

Attempting to fetch data for Event ID: 1017 (Formula 1 stc Saudi Arabian Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1017
Successfully fetched and parsed data for Event ID: 1017

Attempting to fetch data for Event ID: 1018 (Formula 1 Rolex Australian Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1018
Successfully fetched and parsed data for Event ID: 1018

Attempting to fetch data for Event ID: 1019 (Formula 1 MSC Cruises Japanese Grand Prix 2024)
URL: https://inmotion.dhl/api/f1-award-element-data/6273?event=1019
Successfully fet

In [5]:
import io
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import requests

# Configuration constants
DEFAULT_TIMEOUT = 10
# URLs by year
F1_URLS = {
    2023: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6284",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6282",
    },
    2024: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6276",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6273",
    },
    2025: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6367",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6365",
    },
}
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# F1 race names by year
F1_RACES = {
    2025: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2024: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MEXICO": "Mexico City Grand Prix",
        "SÃO PAOLO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2023: {
        "ABU DHABI": "Abu Dhabi Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "MEXICO": "Mexico City Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "BRITISH": "British Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "AUSTRALIAN": "Australian Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix"
},

}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(
        self, year: int = 2025, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None
    ):
        """
        Initialize the F1DataFetcher.

        Args:
            year: Year for which to fetch F1 data
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.year = year
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

        # Set URLs based on year
        self.set_year(year)

    def set_year(self, year: int) -> None:
        """
        Set the year and update URLs accordingly.

        Args:
            year: Year for which to fetch F1 data
        """
        self.year = year
        if year in F1_URLS:
            self.event_data_url = F1_URLS[year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[year]["EVENT_SPECIFIC_URL"]
        else:
            # Default to latest year if requested year is not available
            latest_year = max(F1_URLS.keys())
            print(
                f"Warning: Data for year {year} not available. Using {latest_year} instead."
            )
            self.year = latest_year
            self.event_data_url = F1_URLS[latest_year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[latest_year]["EVENT_SPECIFIC_URL"]

    def fetch_events_data(self, url: str = None) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from (defaults to year-specific URL)

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        if url is None:
            url = self.event_data_url

        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(
        self, events_data: List[Dict], base_url: str = None
    ) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data (defaults to year-specific URL)

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        if base_url is None:
            base_url = self.event_specific_url

        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[
                    event_id
                ]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(
                f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})"
            )
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": (
                        response.text[:500]
                        if hasattr(response, "text")
                        else "No response text"
                    ),
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(
        self, events_data: List[Dict], all_event_specific_data: Dict[str, Any]
    ) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1
            for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print(
                "Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty."
            )
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print(
                    "Warning: No tables found by pd.read_html, although HTML string was present."
                )
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print(
                "Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`)."
            )
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None

    @staticmethod
    def save_dataframe_to_json(
        df: pd.DataFrame, event_title: str, year: int, output_dir: str = None
    ) -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file (defaults to year folder)

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # If no output directory is specified, use the year as the directory name
        if output_dir is None:
            output_dir = str(year)

        # Find the proper race name from the event title
        race_name = None

        # First, try to match with our predefined race names for the specific year
        if year in F1_RACES:
            year_races = F1_RACES[year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break
        else:
            # If year not in F1_RACES, use the latest year's race names
            latest_year = max(F1_RACES.keys())
            year_races = F1_RACES[latest_year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break

        # If no match found, try to extract from the title
        if race_name is None:
            # Try to extract Grand Prix name using regex
            match = re.search(
                r"([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX", event_title, re.IGNORECASE
            )
            if match:
                location = match.group(1).strip()
                race_name = f"{location} Grand Prix"
            else:
                # Fallback to a generic name with the event ID
                race_name = "Unknown Grand Prix"
                print(f"Warning: Could not determine race name for: {event_title}")

        # Clean up the filename
        filename = race_name

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, f"{filename}.json")
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main(year: int = 2025):
    """
    Main function to fetch and process F1 data for a specific year.

    Args:
        year: Year to fetch data for (default: 2025)
    """
    print(f"Fetching F1 data for year: {year}")

    # Initialize the data fetcher with the specified year
    fetcher = F1DataFetcher(year=year)

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Process each event and save to JSON
    saved_files = []
    output_dir = str(year)  # Use year as directory name

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(
                f"Skipping event {event_title}: Error in data - {event_data.get('error')}"
            )
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe, event_title, year, output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(
        f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory."
    )
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    main(2023)


Fetching F1 data for year: 2023
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6284

Successfully extracted events data
Found 22 events to process.

Attempting to fetch data for Event ID: 837 (Formula 1 Gulf Air Bahrain Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=837
Successfully fetched and parsed data for Event ID: 837

Attempting to fetch data for Event ID: 838 (Formula 1 stc Saudi Arabian Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=838
Successfully fetched and parsed data for Event ID: 838

Attempting to fetch data for Event ID: 839 (Formula 1 Rolex Australian Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=839
Successfully fetched and parsed data for Event ID: 839

Attempting to fetch data for Event ID: 841 (Formula 1 Azerbaijan Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=841
Successfully fetched and parsed data 

In [6]:
import io
import json
import os
import re
import time
from typing import Any, Dict, List, Optional, Tuple, Union

import pandas as pd
import requests

# Configuration constants
DEFAULT_TIMEOUT = 10
# URLs by year
F1_URLS = {
    2023: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6284",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6282",
    },
    2024: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6276",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6273",
    },
    2025: {
        "EVENT_DATA_URL": "https://inmotion.dhl/api/f1-award-element-data/6367",
        "EVENT_SPECIFIC_URL": "https://inmotion.dhl/api/f1-award-element-data/6365",
    },
}
DEFAULT_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"
}
DELAY_BETWEEN_REQUESTS = 0.5  # seconds

# F1 race names by year
F1_RACES = {
    2025: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2024: {
        "AUSTRALIAN": "Australian Grand Prix",
        "CHINESE": "Chinese Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "EMILIA-ROMAGNA": "Emilia Romagna Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "AUSTRIAN": "Austrian Grand Prix",
        "BRITISH": "British Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "MEXICO": "Mexico City Grand Prix",
        "SÃO PAOLO": "São Paulo Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "ABU DHABI": "Abu Dhabi Grand Prix",
    },
    2023: {
        "ABU DHABI": "Abu Dhabi Grand Prix",
        "LAS VEGAS": "Las Vegas Grand Prix",
        "SÃO PAULO": "São Paulo Grand Prix",
        "MÉXICO": "Mexico City Grand Prix",
        "UNITED STATES": "United States Grand Prix",
        "QATAR": "Qatar Grand Prix",
        "JAPANESE": "Japanese Grand Prix",
        "SINGAPORE": "Singapore Grand Prix",
        "ITALIA": "Italian Grand Prix",
        "DUTCH": "Dutch Grand Prix",
        "BELGIAN": "Belgian Grand Prix",
        "HUNGARIAN": "Hungarian Grand Prix",
        "BRITISH": "British Grand Prix",
        "ÖSTERREICH": "Austrian Grand Prix",
        "CANADA": "Canadian Grand Prix",
        "ESPAÑA": "Spanish Grand Prix",
        "MONACO": "Monaco Grand Prix",
        "MIAMI": "Miami Grand Prix",
        "AZERBAIJAN": "Azerbaijan Grand Prix",
        "AUSTRALIAN": "Australian Grand Prix",
        "SAUDI ARABIAN": "Saudi Arabian Grand Prix",
        "BAHRAIN": "Bahrain Grand Prix",
    },
}


class F1DataFetcher:
    """Class for fetching and processing Formula 1 data."""

    def __init__(
        self, year: int = 2025, timeout: int = DEFAULT_TIMEOUT, headers: Dict = None
    ):
        """
        Initialize the F1DataFetcher.

        Args:
            year: Year for which to fetch F1 data
            timeout: Request timeout in seconds
            headers: HTTP headers for requests
        """
        self.year = year
        self.timeout = timeout
        self.headers = headers or DEFAULT_HEADERS
        self.event_data_cache = {}
        self.event_specific_data_cache = {}

        # Set URLs based on year
        self.set_year(year)

    def set_year(self, year: int) -> None:
        """
        Set the year and update URLs accordingly.

        Args:
            year: Year for which to fetch F1 data
        """
        self.year = year
        if year in F1_URLS:
            self.event_data_url = F1_URLS[year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[year]["EVENT_SPECIFIC_URL"]
        else:
            # Default to latest year if requested year is not available
            latest_year = max(F1_URLS.keys())
            print(
                f"Warning: Data for year {year} not available. Using {latest_year} instead."
            )
            self.year = latest_year
            self.event_data_url = F1_URLS[latest_year]["EVENT_DATA_URL"]
            self.event_specific_url = F1_URLS[latest_year]["EVENT_SPECIFIC_URL"]

    def fetch_events_data(self, url: str = None) -> Dict:
        """
        Fetch events data from the specified URL.

        Args:
            url: URL to fetch data from (defaults to year-specific URL)

        Returns:
            Dictionary containing events data

        Raises:
            requests.exceptions.RequestException: If request fails
            json.JSONDecodeError: If response is not valid JSON
        """
        if url is None:
            url = self.event_data_url

        print(f"Attempting to fetch data from: {url}")

        try:
            response = self._make_request(url)
            parsed_data = response.json()

            # Extract events list from the nested structure
            data_section = parsed_data.get("data", {})
            chart_section = data_section.get("chart", {})
            events_data = chart_section.get("events", [])

            if events_data:
                print("\nSuccessfully extracted events data")
                return events_data
            else:
                print("\nError: Could not find the 'events' data at the expected path")
                return []

        except (requests.exceptions.RequestException, json.JSONDecodeError) as e:
            print(f"\nError fetching events data: {e}")
            return []

    def fetch_event_specific_data(
        self, events_data: List[Dict], base_url: str = None
    ) -> Dict[str, Any]:
        """
        Fetch specific data for each event.

        Args:
            events_data: List of event dictionaries
            base_url: Base URL for event-specific data (defaults to year-specific URL)

        Returns:
            Dictionary mapping event IDs to their specific data
        """
        if base_url is None:
            base_url = self.event_specific_url

        all_event_specific_data = {}

        print(f"Found {len(events_data)} events to process.")

        for event in events_data:
            event_id = event.get("id")
            event_title = event.get("title", "Unknown Title")

            if not event_id:
                print(f"Warning: Skipping event with missing ID: {event_title}")
                continue

            # Check cache first
            if event_id in self.event_specific_data_cache:
                all_event_specific_data[event_id] = self.event_specific_data_cache[
                    event_id
                ]
                print(f"Using cached data for Event ID: {event_id} ({event_title})")
                continue

            specific_url = f"{base_url}?event={event_id}"
            print(
                f"\nAttempting to fetch data for Event ID: {event_id} ({event_title})"
            )
            print(f"URL: {specific_url}")

            try:
                response = self._make_request(specific_url)
                event_specific_data = response.json()

                # Cache the result
                self.event_specific_data_cache[event_id] = event_specific_data
                all_event_specific_data[event_id] = event_specific_data

                print(f"Successfully fetched and parsed data for Event ID: {event_id}")

            except requests.exceptions.RequestException as e:
                error_info = {"error": str(e)}
                all_event_specific_data[event_id] = error_info
                print(f"Error during request for Event ID: {event_id}: {e}")

            except json.JSONDecodeError:
                error_info = {
                    "error": "JSONDecodeError",
                    "response_text": (
                        response.text[:500]
                        if hasattr(response, "text")
                        else "No response text"
                    ),
                }
                all_event_specific_data[event_id] = error_info
                print(f"Error: Failed to decode JSON for Event ID: {event_id}")

            # Add delay between requests
            if DELAY_BETWEEN_REQUESTS > 0:
                time.sleep(DELAY_BETWEEN_REQUESTS)

        # Print summary
        self._print_fetch_summary(events_data, all_event_specific_data)
        return all_event_specific_data

    def _make_request(self, url: str) -> requests.Response:
        """
        Make an HTTP request with error handling.

        Args:
            url: URL to request

        Returns:
            Response object

        Raises:
            requests.exceptions.RequestException: If request fails
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            return response
        except requests.exceptions.Timeout:
            print(f"Error: The request to {url} timed out.")
            raise
        except requests.exceptions.RequestException:
            print(f"Error during request to {url}")
            raise

    def _print_fetch_summary(
        self, events_data: List[Dict], all_event_specific_data: Dict[str, Any]
    ) -> None:
        """Print a summary of the fetch operation."""
        print("\n--- Processing Complete ---")
        successful_fetches = sum(
            1
            for data in all_event_specific_data.values()
            if isinstance(data, dict) and "error" not in data
        )
        failed_fetches = len(events_data) - successful_fetches
        print(f"Successfully fetched data for {successful_fetches} events.")
        print(f"Failed to fetch data for {failed_fetches} events.")


class DataProcessor:
    """Class for processing F1 data."""

    @staticmethod
    def html_table_to_dataframe(event_json_data: Dict) -> Optional[pd.DataFrame]:
        """
        Extract HTML table from event JSON data and convert to DataFrame.

        Args:
            event_json_data: JSON data dictionary for a specific event

        Returns:
            DataFrame created from HTML table, or None if extraction fails
        """
        if not isinstance(event_json_data, dict):
            print("Error: Input must be a dictionary.")
            return None

        # Extract HTML table string
        html_table_str = event_json_data.get("htmlList", {}).get("table")

        if not html_table_str:
            print(
                "Error: Could not find 'htmlList' -> 'table' in the provided JSON data or it's empty."
            )
            return None

        if not isinstance(html_table_str, str):
            print("Error: The value at ['htmlList']['table'] is not a string.")
            return None

        print("Found HTML table string. Attempting to parse...")
        try:
            # Parse HTML table into DataFrame
            list_of_dfs = pd.read_html(io.StringIO(html_table_str))

            if list_of_dfs:
                print("Successfully parsed HTML table into DataFrame.")
                return list_of_dfs[0]
            else:
                print(
                    "Warning: No tables found by pd.read_html, although HTML string was present."
                )
                return None

        except ValueError as ve:
            print(f"Error parsing HTML with pandas (ValueError): {ve}")
            print("Check if the HTML string actually contains a <table> tag.")
            return None
        except ImportError:
            print(
                "Error: The 'lxml' library might be required by pd.read_html. Please install it (`pip install lxml`)."
            )
            return None
        except Exception as e:
            print(f"An unexpected error occurred during HTML parsing: {e}")
            return None

    @staticmethod
    def save_dataframe_to_json(
        df: pd.DataFrame, event_title: str, year: int, output_dir: str = None
    ) -> str:
        """
        Save DataFrame to a JSON file with a standardized filename.

        Args:
            df: DataFrame to save
            event_title: Title of the event (e.g., "FORMULA 1 LOUIS VUITTON AUSTRALIAN GRAND PRIX 2025")
            year: Year of the event
            output_dir: Directory to save the JSON file (defaults to year folder)

        Returns:
            Path to the saved JSON file
        """
        if df is None:
            print(f"Error: Cannot save None DataFrame for {event_title}")
            return ""

        # If no output directory is specified, use the year as the directory name
        if output_dir is None:
            output_dir = str(year)

        # Find the proper race name from the event title
        race_name = None

        # First, try to match with our predefined race names for the specific year
        if year in F1_RACES:
            year_races = F1_RACES[year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break
        else:
            # If year not in F1_RACES, use the latest year's race names
            latest_year = max(F1_RACES.keys())
            year_races = F1_RACES[latest_year]
            for key, proper_name in year_races.items():
                if key in event_title.upper():
                    race_name = proper_name
                    break

        # If no match found, try to extract from the title
        if race_name is None:
            # Try to extract Grand Prix name using regex
            match = re.search(
                r"([A-Z]+(?:\s+[A-Z]+)*)\s+GRAND\s+PRIX", event_title, re.IGNORECASE
            )
            if match:
                location = match.group(1).strip()
                race_name = f"{location} Grand Prix"
            else:
                # Fallback to a generic name with the event ID
                race_name = "Unknown Grand Prix"
                print(f"Warning: Could not determine race name for: {event_title}")

        # Clean up the filename
        filename = race_name

        # Create the output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Save the DataFrame to a JSON file
        file_path = os.path.join(output_dir, f"{filename}.json")
        df.to_json(file_path, orient="records", indent=4)

        print(f"Saved data to {file_path}")
        return file_path


def main(year: int = 2025):
    """
    Main function to fetch and process F1 data for a specific year.

    Args:
        year: Year to fetch data for (default: 2025)
    """
    print(f"Fetching F1 data for year: {year}")

    # Initialize the data fetcher with the specified year
    fetcher = F1DataFetcher(year=year)

    # Fetch events data
    events_data = fetcher.fetch_events_data()
    if not events_data:
        print("No events data found. Exiting.")
        return

    # Fetch specific data for each event
    all_event_specific_data = fetcher.fetch_event_specific_data(events_data)

    # Create a DataProcessor instance
    processor = DataProcessor()

    # Process each event and save to JSON
    saved_files = []
    output_dir = str(year)  # Use year as directory name

    for event in events_data:
        event_id = event.get("id")
        event_title = event.get("title", "Unknown Title")

        if not event_id or event_id not in all_event_specific_data:
            print(f"Skipping event {event_title}: No data available")
            continue

        event_data = all_event_specific_data[event_id]

        # Check if there was an error fetching this event's data
        if isinstance(event_data, dict) and "error" in event_data:
            print(
                f"Skipping event {event_title}: Error in data - {event_data.get('error')}"
            )
            continue

        # Convert HTML table to DataFrame
        print(f"\nProcessing event: {event_title}")
        event_dataframe = processor.html_table_to_dataframe(event_data)

        if event_dataframe is not None:
            # Save DataFrame to JSON
            file_path = processor.save_dataframe_to_json(
                event_dataframe, event_title, year, output_dir
            )

            if file_path:
                saved_files.append(file_path)
        else:
            print(f"Failed to create DataFrame for {event_title}")

    # Print summary
    print(f"\n--- JSON Export Complete ---")
    print(
        f"Successfully saved {len(saved_files)} event data files to the '{output_dir}' directory."
    )
    if saved_files:
        print("Files saved:")
        for file_path in saved_files:
            print(f"  - {file_path}")


if __name__ == "__main__":
    main(2023)


Fetching F1 data for year: 2023
Attempting to fetch data from: https://inmotion.dhl/api/f1-award-element-data/6284

Successfully extracted events data
Found 22 events to process.

Attempting to fetch data for Event ID: 837 (Formula 1 Gulf Air Bahrain Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=837
Successfully fetched and parsed data for Event ID: 837

Attempting to fetch data for Event ID: 838 (Formula 1 stc Saudi Arabian Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=838
Successfully fetched and parsed data for Event ID: 838

Attempting to fetch data for Event ID: 839 (Formula 1 Rolex Australian Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=839
Successfully fetched and parsed data for Event ID: 839

Attempting to fetch data for Event ID: 841 (Formula 1 Azerbaijan Grand Prix 2023)
URL: https://inmotion.dhl/api/f1-award-element-data/6282?event=841
Successfully fetched and parsed data 

In [8]:
pip install requests tqdm


Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [12]:
import requests
import time
import json
from tqdm import tqdm  # For progress bar

def check_url(id_number):
    """Check if the URL with the given ID returns data"""
    url = f"https://inmotion.dhl/api/f1-award-element-data/{id_number}"
    try:
        response = requests.get(url, timeout=10)

        # Check if response is successful
        if response.status_code == 200:
            # Try to parse JSON
            try:
                data = response.json()
                # Check if data is not empty (this might need adjustment based on actual response structure)
                if data and not (isinstance(data, dict) and len(data) == 0):
                    return True, data
            except json.JSONDecodeError:
                pass
        return False, None
    except requests.exceptions.RequestException:
        return False, None

def main():
    # List to store IDs with data
    valid_ids = []

    # File to save detailed results
    with open("valid_dhl_ids.txt", "w") as file:
        file.write("IDs that return data from https://inmotion.dhl/api/f1-award-element-data/:\n\n")

        # Iterate through IDs with progress bar (from 6284 down to 6000)
        for id_number in tqdm(range(6284, 5999, -1)):  # 6284 down to 6000 inclusive
            has_data, data = check_url(id_number)

            if has_data:
                valid_ids.append(id_number)
                file.write(f"ID: {id_number}\n")
                file.write(f"Data: {json.dumps(data, indent=2)}\n\n")

            # Rate limiting - sleep to avoid overwhelming the server
            time.sleep(0.2)  # 200ms delay between requests

    # Save summary of valid IDs
    with open("valid_dhl_ids_summary.txt", "w") as file:
        file.write("Summary of IDs that return data:\n")
        file.write(f"Total valid IDs: {len(valid_ids)}\n\n")
        file.write("Valid IDs: " + ", ".join(map(str, valid_ids)))

    print(f"Scan complete. Found {len(valid_ids)} valid IDs.")
    print(f"Valid IDs: {valid_ids}")

if __name__ == "__main__":
    main()


 15%|█▌        | 43/285 [01:05<06:08,  1.52s/it]


KeyboardInterrupt: 

In [13]:
import requests
import json
import concurrent.futures
from tqdm import tqdm

def check_url(id_number):
    """Check if the URL with the given ID returns data"""
    url = f"https://inmotion.dhl/api/f1-award-element-data/{id_number}"
    try:
        response = requests.get(url, timeout=5)

        # Check if response is successful
        if response.status_code == 200:
            # Try to parse JSON
            try:
                data = response.json()
                # Check if data is not empty
                if data and not (isinstance(data, dict) and len(data) == 0):
                    return id_number, True, data
            except json.JSONDecodeError:
                pass
        return id_number, False, None
    except requests.exceptions.RequestException:
        return id_number, False, None

def main():
    # Configuration
    start_id = 6284
    end_id = 6000
    max_workers = 20  # Number of parallel requests

    # Create a list of IDs to check
    ids_to_check = list(range(start_id, end_id - 1, -1))
    valid_results = []

    # Use ThreadPoolExecutor for parallel requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks and get future objects
        future_to_id = {executor.submit(check_url, id_num): id_num for id_num in ids_to_check}

        # Process results as they complete with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_id), total=len(ids_to_check)):
            id_num, has_data, data = future.result()
            if has_data:
                valid_results.append((id_num, data))

    # Sort results by ID for consistent output
    valid_results.sort(key=lambda x: x[0])
    valid_ids = [id_num for id_num, _ in valid_results]

    # Save detailed results
    with open("valid_dhl_ids.txt", "w") as file:
        file.write("IDs that return data from https://inmotion.dhl/api/f1-award-element-data/:\n\n")
        for id_num, data in valid_results:
            file.write(f"ID: {id_num}\n")
            file.write(f"Data: {json.dumps(data, indent=2)}\n\n")

    # Save summary of valid IDs
    with open("valid_dhl_ids_summary.txt", "w") as file:
        file.write("Summary of IDs that return data:\n")
        file.write(f"Total valid IDs: {len(valid_ids)}\n\n")
        file.write("Valid IDs: " + ", ".join(map(str, valid_ids)))

    print(f"Scan complete. Found {len(valid_ids)} valid IDs.")
    print(f"Valid IDs: {valid_ids}")

if __name__ == "__main__":
    main()


100%|██████████| 285/285 [00:22<00:00, 12.94it/s]


Scan complete. Found 12 valid IDs.
Valid IDs: [6268, 6269, 6271, 6273, 6275, 6276, 6279, 6280, 6281, 6282, 6283, 6284]


In [15]:
import requests
import json
import concurrent.futures
from tqdm import tqdm

def check_url(id_number):
    """Check if the URL with the given ID returns data"""
    url = f"https://inmotion.dhl/api/f1-award-element-data/{id_number}"
    try:
        response = requests.get(url, timeout=5)

        # Check if response is successful
        if response.status_code == 200:
            # Try to parse JSON
            try:
                data = response.json()
                # Check if data is not empty
                if data and not (isinstance(data, dict) and len(data) == 0):
                    return id_number, True, data
            except json.JSONDecodeError:
                pass
        return id_number, False, None
    except requests.exceptions.RequestException:
        return id_number, False, None

def main():
    # Configuration
    start_id = 9999
    end_id = 6284
    max_workers = 200  # Number of parallel requests

    # Create a list of IDs to check
    ids_to_check = list(range(start_id, end_id - 1, -1))
    valid_results = []

    # Use ThreadPoolExecutor for parallel requests
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit all tasks and get future objects
        future_to_id = {executor.submit(check_url, id_num): id_num for id_num in ids_to_check}

        # Process results as they complete with a progress bar
        for future in tqdm(concurrent.futures.as_completed(future_to_id), total=len(ids_to_check)):
            id_num, has_data, data = future.result()
            if has_data:
                valid_results.append((id_num, data))

    # Sort results by ID for consistent output
    valid_results.sort(key=lambda x: x[0])
    valid_ids = [id_num for id_num, _ in valid_results]

    # Save detailed results
    with open("valid_dhl_ids.txt", "w") as file:
        file.write("IDs that return data from https://inmotion.dhl/api/f1-award-element-data/:\n\n")
        for id_num, data in valid_results:
            file.write(f"ID: {id_num}\n")
            file.write(f"Data: {json.dumps(data, indent=2)}\n\n")

    # Save summary of valid IDs
    with open("valid_dhl_ids_summary.txt", "w") as file:
        file.write("Summary of IDs that return data:\n")
        file.write(f"Total valid IDs: {len(valid_ids)}\n\n")
        file.write("Valid IDs: " + ", ".join(map(str, valid_ids)))

    print(f"Scan complete. Found {len(valid_ids)} valid IDs.")
    print(f"Valid IDs: {valid_ids}")

if __name__ == "__main__":
    main()


100%|██████████| 3716/3716 [00:24<00:00, 151.89it/s]

Scan complete. Found 0 valid IDs.
Valid IDs: []



