In [3]:
import time
import pandas as pd
import openmeteo_requests
import requests_cache
from retry_requests import retry
import os

# 1. API Setup with Cache
cache_session = requests_cache.CachedSession('.cache', expire_after=-1)
retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
openmeteo = openmeteo_requests.Client(session=retry_session)

def fetch_openmeteo_final(lat, lon, city_name, start_year=2010, end_year=2024):
    url = "https://archive-api.open-meteo.com/v1/archive"
    
    # Using variables confirmed to work in testing
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": f"{start_year}-01-01",
        "end_date": f"{end_year}-12-31",
        "hourly": ["temperature_2m", "pressure_msl", "relative_humidity_2m", "dewpoint_2m"],
        "timezone": "Europe/Berlin"
    }

    print(f"   -> Loading data for {city_name}...")
    responses = openmeteo.weather_api(url, params=params)
    res = responses[0]
    hourly = res.Hourly()

    # Extract data (order according to params['hourly'])
    t2m = hourly.Variables(0).ValuesAsNumpy()
    pres = hourly.Variables(1).ValuesAsNumpy()
    humi = hourly.Variables(2).ValuesAsNumpy()
    dew = hourly.Variables(3).ValuesAsNumpy()

    # Create timeline
    date_range = pd.date_range(
        start=pd.to_datetime(hourly.Time(), unit="s", utc=True),
        periods=len(t2m),
        freq=pd.Timedelta(seconds=hourly.Interval())
    )

    # Hourly DataFrame
    df_h = pd.DataFrame({
        "date": date_range,
        "temp_ref": t2m,
        "gph": pres,
        "humi": humi,
        "dew": dew
    })

    # Daily Aggregation
    df_d = df_h.resample('D', on='date').agg({
        'temp_ref': 'max',
        'gph': 'mean',
        'humi': 'mean',
        'dew': 'mean'
    }).reset_index()

    # Feature Engineering: The Spread (Difference between surface heat and dew point)
    # A high spread is an indicator for extreme, dry heatwaves.
    df_d['temp_dew_spread'] = df_d['temp_ref'] - df_d['dew']
    
    # Rename columns for compatibility with the rest of the code
    df_d = df_d.rename(columns={'temp_ref': 'temp_max_ref', 'gph': 'gph500_mean'})
    df_d['city'] = city_name
    
    # Normalize date (Remove timezone, keep date only)
    df_d['date'] = df_d['date'].dt.tz_localize(None).dt.normalize()
    
    print(f"      [OK] {len(df_d)} days loaded. Avg Spread: {df_d['temp_dew_spread'].mean():.2f}")
    return df_d

# --- CONFIGURATION & EXECUTION ---

# Precise coordinates (slightly inland for better data coverage)
cities_coords = {
    "Bordeaux": (44.83, -0.60), 
    "Marseille": (43.30, 5.40),
    "Paris": (48.8566, 2.3522),
    "Lyon": (45.7640, 4.8357)
}

all_results = []

print("Starting final data download (2010-2024)...")
for city, (lat, lon) in cities_coords.items():
    try:
        df_city = fetch_openmeteo_final(lat, lon, city)
        all_results.append(df_city)
        time.sleep(1) # API cooling period
    except Exception as e:
        print(f"   ‚ùå Error at {city}: {e}")


# ======================================================
# 1. Merging Results
# ======================================================
if all_results:
    df_atm_all = pd.concat(all_results, ignore_index=True)
    
    # --- DATA REPORT & PLAUSIBILITY CHECK ---
    print("\n" + "‚ïê"*70)
    print("üìä DATA REPORT & PLAUSIBILITY CHECK")
    print("‚ïê"*70)
    
    # Calculate key markers
    marker_stats = df_atm_all.groupby('city').agg({
        'temp_max_ref': 'mean',
        'temp_dew_spread': 'mean',
        'gph500_mean': 'mean'
    }).round(2)

    # Rename columns for display with units
    marker_stats.columns = [
        'Avg Max Temp [¬∞C]', 
        'Avg Spread [Œî¬∞C]', 
        'Avg Pressure [hPa]'
    ]

    # PLAUSIBILITY LOGIC
    def check_plausibility(row):
        temp_ok = 10 <= row['Avg Max Temp [¬∞C]'] <= 22
        pres_ok = 1000 <= row['Avg Pressure [hPa]'] <= 1025
        spread_ok = 5 <= row['Avg Spread [Œî¬∞C]'] <= 15
        
        if temp_ok and pres_ok and spread_ok:
            return "‚úÖ Plausible"
        else:
            return "‚ö†Ô∏è Review needed"

    # Apply new column
    marker_stats['Status'] = marker_stats.apply(check_plausibility, axis=1)
    
    # Print table
    print(marker_stats.to_string())

    print("\n" + "‚îÄ"*70)
    print(f"üîπ Total data points: {len(df_atm_all)} (~5480 per city)")
    print(f"üîπ NaNs found: {df_atm_all.isnull().sum().sum()}")
    print(f"üîπ Columns: {df_atm_all.columns.tolist()}")

    # ======================================================
    # 2. Saving Data (for GitHub)
    # ======================================================
    output_dir = "../data/2_outputs"
    os.makedirs(output_dir, exist_ok=True) 
    
    output_path = f"{output_dir}/openmeteo_atm_data_merged.csv"
    df_atm_all.to_csv(output_path, index=False)
    
    print("‚îÄ"*70)
    print(f"üíæ FILE SAVED: {output_path}")
    print("‚ïê"*70)

else:
    print("\n‚ùå Error: No data available for analysis or saving.")

Starting final data download (2010-2024)...
   -> Loading data for Bordeaux...
      [OK] 5480 days loaded. Avg Spread: 9.46
   -> Loading data for Marseille...
      [OK] 5480 days loaded. Avg Spread: 9.05
   -> Loading data for Paris...
      [OK] 5480 days loaded. Avg Spread: 8.57
   -> Loading data for Lyon...
      [OK] 5480 days loaded. Avg Spread: 9.70

‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
üìä DATA REPORT & PLAUSIBILITY CHECK
‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê‚ïê
           Avg Max Temp [¬∞C]  Avg Spread [Œî¬∞C]  Avg Pressure [hPa]       Status
city                                                                           
Bordeaux      