In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import requests
from bs4 import BeautifulSoup
from io import StringIO
import seaborn as sns
import re  
import os

In [None]:
# =============================================================================
# 1. Set up urls and functions
# =============================================================================

# Repo containing snapshots for ILI incidence
repo_url = "https://github.com/european-modelling-hubs/RespiCast-SyndromicIndicators/tree/main/target-data/ERVISS/snapshots"

# Base URL for raw snapshot files
raw_base_url = "https://raw.githubusercontent.com/european-modelling-hubs/RespiCast-SyndromicIndicators/main/target-data/ERVISS/snapshots"

# URL for the final reported data
final_data_url = "https://raw.githubusercontent.com/european-modelling-hubs/RespiCast-SyndromicIndicators/refs/heads/main/target-data/ERVISS/latest-ILI_incidence.csv"

# Get all snapshot file names 
def get_all_snapshot_files(base_url):
    response = requests.get(base_url)
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        # Find CSV files with "-ILI_incidence.csv"
        return [
            link.get('href').split('/')[-1]
            for link in soup.find_all('a', href=True)
            if link.get('href').endswith('-ILI_incidence.csv')
        ]
    else:
        raise Exception(f"Error loading file: {base_url}")

# Load CSV from a URL
def load_csv_from_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        return pd.read_csv(StringIO(response.text))
    else:
        raise Exception(f"Error loading file: {url}")
    
# Get the snapshot date from the file name 
def extract_date_from_filename(filename):
    match = re.match(r'(\d{4}-\d{2}-\d{2})', filename) # (format: YYYY-MM-DD)
    return match.group(1) if match else None

# Compute the week in season from a given date 
def calculate_week_in_season(date):
    season_start = pd.Timestamp(year=date.year, month=9, day=1)
    if date.month < 9:
        season_start = pd.Timestamp(year=date.year - 1, month=9, day=1)
    weeks_in_season = (date - season_start).days // 7 + 1
    return weeks_in_season


In [30]:
# =============================================================================
# 2. Get snapshot files and load data
# =============================================================================

# Get all snapshot names
snapshot_files = get_all_snapshot_files(repo_url)
print(f"{len(snapshot_files)} snapshot file found: {snapshot_files}")

all_snapshot_data = pd.DataFrame()

# Create dataframe with all snapshot files
final_data = load_csv_from_url(final_data_url)

# Iterate trough snapshot files and keep data 
for file_name in snapshot_files:
    snapshot_url = f"{raw_base_url}/{file_name}"
    try:
        snapshot_data = load_csv_from_url(snapshot_url)
        snapshot_data['source_file'] = file_name  # Add column to identify file 
        all_snapshot_data = pd.concat([all_snapshot_data, snapshot_data], ignore_index=True)
    except Exception as e:
        print(f"Error loading file {snapshot_url}: {e}")

print(all_snapshot_data.head())

38 snapshot file found: ['2024-10-11-ILI_incidence.csv', '2024-10-11-ILI_incidence.csv', '2024-10-18-ILI_incidence.csv', '2024-10-18-ILI_incidence.csv', '2024-10-25-ILI_incidence.csv', '2024-10-25-ILI_incidence.csv', '2024-11-08-ILI_incidence.csv', '2024-11-08-ILI_incidence.csv', '2024-11-15-ILI_incidence.csv', '2024-11-15-ILI_incidence.csv', '2024-11-22-ILI_incidence.csv', '2024-11-22-ILI_incidence.csv', '2024-11-29-ILI_incidence.csv', '2024-11-29-ILI_incidence.csv', '2024-12-06-ILI_incidence.csv', '2024-12-06-ILI_incidence.csv', '2024-12-13-ILI_incidence.csv', '2024-12-13-ILI_incidence.csv', '2024-12-20-ILI_incidence.csv', '2024-12-20-ILI_incidence.csv', '2025-01-03-ILI_incidence.csv', '2025-01-03-ILI_incidence.csv', '2025-01-10-ILI_incidence.csv', '2025-01-10-ILI_incidence.csv', '2025-01-17-ILI_incidence.csv', '2025-01-17-ILI_incidence.csv', '2025-01-24-ILI_incidence.csv', '2025-01-24-ILI_incidence.csv', '2025-01-31-ILI_incidence.csv', '2025-01-31-ILI_incidence.csv', '2025-02-07-ILI

In [31]:
# =============================================================================
# 3. Convert date columns and filter data 
# =============================================================================


# Convert date in datetime 
all_snapshot_data['truth_date'] = pd.to_datetime(all_snapshot_data['truth_date'])
final_data['truth_date'] = pd.to_datetime(final_data['truth_date'])

# Filter data from a fixed date 
all_snapshot_data = all_snapshot_data[all_snapshot_data['truth_date'] >= '2024-10']
final_data = final_data[final_data['truth_date'] >= '2024-10']

# Filter to include only dates in snapshot 
common_dates = all_snapshot_data['truth_date'].unique()
filtered_final_data = final_data[final_data['truth_date'].isin(common_dates)]

In [None]:
# =============================================================================
# 4. Process data for each country 
# =============================================================================
# For each country extact incidence value,
# check reevision status,
# compute age of data 

final_dataset = []

for country in all_snapshot_data['location'].unique():
    print(f"Processing country: {country}")

    # Select data for current country 
    country_snapshot_data = all_snapshot_data[all_snapshot_data['location'] == country].copy()
    country_final_data = filtered_final_data[filtered_final_data['location'] == country].copy()

    # Extact source file date from file name 
    country_snapshot_data['source_file_date'] = country_snapshot_data['source_file'].apply(extract_date_from_filename)
    country_snapshot_data['source_file_date'] = pd.to_datetime(country_snapshot_data['source_file_date'], errors='coerce')
    country_snapshot_data = country_snapshot_data.dropna(subset=['source_file_date'])
    
    # Check missing data
    if country_snapshot_data.empty or country_final_data.empty:
        print(f"ERROR: Missing data for country {country}")
        continue

    for idx, snapshot_row in country_snapshot_data.iterrows():
        date = pd.to_datetime(snapshot_row['truth_date'])

        # Incidence value
        snapshot_value = snapshot_row['value']

        #Revision status
        final_value = country_final_data[country_final_data['truth_date'] == date]['value'].iloc[0]
        revised = 1 if not np.isclose(snapshot_value, final_value, atol=1e-6) else 0

        # Age of data (weeks after original report)
        weeks_after = (snapshot_row['source_file_date'] - date).days // 7


        final_dataset.append({
            'truth_date': date,
            'country': country,
            'snapshot_date': snapshot_row['source_file_date'],
            'value': snapshot_value,
            'age': weeks_after,
            'revision_status': revised,
            'week_in_season': calculate_week_in_season(date),
            'revision_amount': final_value - snapshot_value
        })




Processing country: BE
Processing country: CZ
Processing country: DK
Processing country: GR
Processing country: HU
Processing country: LV
Processing country: LT
Processing country: LU
Processing country: MT
Processing country: NL
Processing country: NO
Processing country: PL
Processing country: RO
Processing country: SI
Processing country: AT
Processing country: HR
Processing country: FR
Processing country: IE
Processing country: EE
Processing country: IS
Processing country: IT
Processing country: FI


In [35]:
# =============================================================================
# 5 Save final dataset to a csv file
# =============================================================================

folder_path = "ILI_datasets"  # Folder name
os.makedirs(folder_path, exist_ok=True)  # Create folder

file_path = os.path.join(folder_path, "1_ILI_revisions_dataset_season_24_25_prova.csv")

# Save final dataset 
if final_dataset:
    final_df = pd.DataFrame(final_dataset)
    final_df.to_csv(file_path, index=False)
    print(f"Revision dataset for season 24/25: {len(final_dataset)} lines saved in {file_path}")
else:
    print("ERROR: Revision dataset for season 24/25 is empty.")

Revision dataset for season 24/25: 8958 lines saved in ILI_datasets/1_ILI_revisions_dataset_season_24_25_prova.csv
