In [0]:
# DESCRIPTION: Iterates through every F1 season (1950-Current) to extract race results.
# ---------------------------------------------------------

import requests
import json
from datetime import datetime
from pyspark.sql.functions import lit

# 1. Configuration
# ---------------------------------------------------------
# Source side: root of the Ergast-style F1 REST API and the inclusive range
# of seasons to pull. The upper bound is the calendar year at run time.
# NOTE(review): the endpoint is plain HTTP — confirm whether HTTPS should be used.
base_url = "http://ergast.com/api/f1"
start_year = 1950
end_year = datetime.now().year

# Sink side: storage account and container that hold the raw ("bronze")
# data. Replace the account placeholder before running.
storage_account_name = "YOUR_STORAGE_ACCOUNT"
container_name = "bronze"

# abfss:// URI of the "results" folder; each season is written beneath it.
target_path = "abfss://{container}@{account}.dfs.core.windows.net/results".format(
    container=container_name,
    account=storage_account_name,
)

# 2. Ingestion Loop (The "Heavy Lifting")
# ---------------------------------------------------------
# For every season from start_year to end_year (inclusive): download the
# season's race results, load them into a Spark DataFrame, tag each row with
# ingestion metadata and overwrite that season's folder under target_path.
# A problem with one season is reported and recorded; it never stops the run.

# Upper bound (seconds) on each HTTP call so that an unresponsive server
# cannot hang the whole ingestion run.
REQUEST_TIMEOUT_SECONDS = 30

failed_years = []     # seasons that could not be fetched or written
truncated_years = []  # seasons where the API reported more rows than it returned

print(f"🚀 Starting Historical Ingestion from {start_year} to {end_year}...")

for year in range(start_year, end_year + 1):
    # The API pages its output (default page size is 30); limit=1000 asks for
    # the largest page so a whole season normally arrives in one response.
    url = f"{base_url}/{year}/results.json?limit=1000"

    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
        if response.status_code != 200:
            print(f"⚠️ Failed to fetch {year}: Status {response.status_code}")
            failed_years.append(year)
            continue

        data = response.json()
        race_table = data['MRData']['RaceTable']['Races']

        if not race_table:
            # Not an error: e.g. a season that has not produced results yet.
            print(f"   No data found for {year} (Skipping)")
            continue

        # Guard against silent truncation: only a single page is requested,
        # so compare the size reported by the API with what actually arrived.
        # NOTE(review): assumes MRData.total counts result rows — confirm
        # against the API documentation.
        reported_total = data['MRData'].get('total')
        fetched_rows = sum(len(race.get('Results', [])) for race in race_table)
        if str(reported_total).isdigit() and int(reported_total) > fetched_rows:
            print(
                f"⚠️ Season {year} may be truncated: API reports "
                f"{reported_total} rows, received {fetched_rows}"
            )
            truncated_years.append(year)

        # 3. Parallelize & Save (Partition by Year)
        # -----------------------------------------------------
        # Each race dict becomes one JSON document; Spark infers the schema.
        rdd = spark.sparkContext.parallelize([json.dumps(r) for r in race_table])
        df = spark.read.json(rdd)

        df_final = df.withColumn("ingestion_date", lit(datetime.now())) \
                     .withColumn("source_system", lit("Ergast API"))

        # Write to Data Lake (one "season=<year>" folder per season).
        # "overwrite" makes a re-run replace that season's previous output.
        save_path = f"{target_path}/season={year}"

        df_final.write.mode("overwrite").format("json").save(save_path)
        print(f"✅ Ingested Season {year}: {len(race_table)} races processed.")

    except Exception as e:
        # Best-effort boundary: keep going with the next season, but remember
        # the failure so the final summary does not claim a clean run.
        print(f"❌ Error processing {year}: {str(e)}")
        failed_years.append(year)

# 4. Summary
# ---------------------------------------------------------
if truncated_years:
    print(f"⚠️ Possibly truncated seasons: {truncated_years}")

if failed_years:
    print(f"❌ Ingestion finished with failures for seasons: {failed_years}")
else:
    print("🎉 Full History Ingestion Complete!")