mean +- standard error of mean of pollutants on day without fire, with fire and up to 5 days after fire outbreak


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Portugal'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000    

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Portugal):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Italy'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Italy):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Spain'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Spain):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Greece'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Greece):")
print(df.to_string(index=False))


Divide daily concentrations of pollutants into quartiles Q1 (lowest), Q2, Q3 and Q4 (highest) for days when fires occurred and days they did not - Assess the impact of fire events on the concentration of air pollutants. Calculation of percentage of days (non-wildfire and wildfire) in each of the four qaurtiles

Splits pollutant concentrations into quartiles (25% intervals) separately for fire and no-fire cases:

Q1 = lowest 25%

Q2 = 25–50%

Q3 = 50–75%

Q4 = highest 25%

In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Portugal"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)

In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Italy"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Spain"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


In [None]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Greece"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


Pearson correlation performed between meteorological variables and air pollutants, for all days and every label established

Wind speed

In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Wind dataset (corrected multi-year loading)
# --------------------
wind_files = glob.glob(r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_yearly_regridded\daily_wind_speed_stats_*_regrid.nc")
datasets = [xr.open_dataset(f) for f in wind_files]
ds_wind = xr.concat(datasets, dim="Year")

# Build datetime index
years = ds_wind["Year"].values
months = ds_wind["Month"].values
days = ds_wind["Day"].values

time_list = []
for y in years:
    for m in months:
        for d in days:
            try:
                time_list.append(pd.Timestamp(year=int(y), month=int(m), day=int(d)))
            except ValueError:
                continue  # skip invalid dates

time_index = pd.DatetimeIndex(time_list)

# Stack Year, Month, Day into single dimension
ds_wind = ds_wind.stack(date=("Year","Month","Day"))
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
wind_ts.name = "WindSpeed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0, 7):
    mask = fire_ds["fire_label_Portugal"] == label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation functions
# --------------------
def correlation(df, pollutants, label_name="All", mask=None):
    results = []
    if mask is not None:
        df = df[mask]
    if len(df) < 2:
        return results
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(
    list(pollutant_series.values()) + [wind_ts],
    axis=1,
    keys=list(pollutant_series.keys()) + ["WindSpeed"]
)
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation(df_all, pollutant_names, label_name=label, mask=mask_aligned))

# --------------------
# 6. Save results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(
    r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Portugal.csv",
    index=False
)

print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Wind dataset
# --------------------
wind_file = r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_regrid.nc"
ds_wind = xr.open_dataset(wind_file)

# Build datetime index
years = ds_wind["Year"].values
months = ds_wind["Month"].values
days = ds_wind["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_wind = ds_wind.stack(date=("Year","Month","Day"))
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
wind_ts.name = "WindSpeed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Wind dataset
# --------------------
wind_file = r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_regrid.nc"
ds_wind = xr.open_dataset(wind_file)

# Build datetime index
years = ds_wind["Year"].values
months = ds_wind["Month"].values
days = ds_wind["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_wind = ds_wind.stack(date=("Year","Month","Day"))
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
wind_ts.name = "WindSpeed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Wind dataset
# --------------------
wind_file = r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_regrid.nc"
ds_wind = xr.open_dataset(wind_file)

# Build datetime index
years = ds_wind["Year"].values
months = ds_wind["Month"].values
days = ds_wind["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_wind = ds_wind.stack(date=("Year","Month","Day"))
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
wind_ts.name = "WindSpeed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Total Precipitation

In [8]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0, 7):
    mask = fire_ds["fire_label_Portugal"] == label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation functions
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Save results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Portugal.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.080466 5.598159e-13   -0.122477 3.913681e-28
      All     PM2.5  -0.128727 6.301294e-31   -0.213593 2.957227e-83
      All      PM10  -0.108419 2.302336e-22   -0.166530 7.116551e-51
      All       NO2  -0.123881 9.492308e-29   -0.125814 1.315172e-29
      All        NO  -0.051974 3.278822e-06   -0.154166 9.079908e-44
        0        CO  -0.080466 5.598159e-13   -0.122477 3.913681e-28
        0     PM2.5  -0.128727 6.301294e-31   -0.213593 2.957227e-83
        0      PM10  -0.108419 2.302336e-22   -0.166530 7.116551e-51
        0       NO2  -0.123881 9.492308e-29   -0.125814 1.315172e-29
        0        NO  -0.051974 3.278822e-06   -0.154166 9.079908e-44
        1        CO  -0.056788 2.683047e-02   -0.141001 3.383884e-08
        1     PM2.5  -0.088396 5.601037e-04   -0.204364 8.549511e-16
        1      PM10  -0.084189 1.018225e-03   -0.190662 6.574106e-14


In [10]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1520, After drop NaN = 1520
Fire Label 2: Before drop NaN = 471, After drop NaN = 471
Fire Label 3: Before drop NaN = 238, After drop NaN = 238
Fire Label 4: Before drop NaN = 137, After drop NaN = 137
Fire Label 5: Before drop NaN = 94, After drop NaN = 94
Fire Label 6: Before drop NaN = 59, After drop NaN = 59


In [11]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels


# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO   0.002178 8.455022e-01    0.013340 2.327232e-01
      All     PM2.5  -0.122759 2.948227e-28   -0.145867 2.563738e-39
      All      PM10  -0.105296 3.531798e-21   -0.120618 2.489953e-27
      All       NO2   0.058834 1.381854e-07    0.071907 1.187644e-10
      All        NO   0.099689 3.886345e-19    0.100217 2.523323e-19
        0        CO   0.002178 8.455022e-01    0.013340 2.327232e-01
        0     PM2.5  -0.122759 2.948227e-28   -0.145867 2.563738e-39
        0      PM10  -0.105296 3.531798e-21   -0.120618 2.489953e-27
        0       NO2   0.058834 1.381854e-07    0.071907 1.187644e-10
        0        NO   0.099689 3.886345e-19    0.100217 2.523323e-19
        1        CO  -0.128928 7.839295e-09   -0.161975 3.618210e-13
        1     PM2.5  -0.127873 1.038728e-08   -0.136565 9.539817e-10
        1      PM10  -0.115236 2.542326e-07   -0.116911 1.695159e-07


In [12]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1990, After drop NaN = 1990
Fire Label 2: Before drop NaN = 450, After drop NaN = 450
Fire Label 3: Before drop NaN = 165, After drop NaN = 165
Fire Label 4: Before drop NaN = 62, After drop NaN = 62
Fire Label 5: Before drop NaN = 25, After drop NaN = 25
Fire Label 6: Before drop NaN = 11, After drop NaN = 11


In [13]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO  -0.096539  4.874706e-18   -0.074079  3.224354e-11
      All     PM2.5  -0.316072 3.482753e-185   -0.372187 1.802676e-261
      All      PM10  -0.293053 2.735504e-158   -0.331956 3.407424e-205
      All       NO2  -0.007655  4.934977e-01   -0.014001  2.103857e-01
      All        NO  -0.002853  7.985872e-01   -0.046701  2.914665e-05
        0        CO  -0.096539  4.874706e-18   -0.074079  3.224354e-11
        0     PM2.5  -0.316072 3.482753e-185   -0.372187 1.802676e-261
        0      PM10  -0.293053 2.735504e-158   -0.331956 3.407424e-205
        0       NO2  -0.007655  4.934977e-01   -0.014001  2.103857e-01
        0        NO  -0.002853  7.985872e-01   -0.046701  2.914665e-05
        1        CO  -0.105420  2.512263e-07   -0.116667  1.126186e-08
        1     PM2.5  -0.278283  1.287661e-43   -0.322564  8.366587e-59
        1      PM10  -0.262461  8.048397e-3

In [14]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 2384, After drop NaN = 2382
Fire Label 2: Before drop NaN = 493, After drop NaN = 493
Fire Label 3: Before drop NaN = 180, After drop NaN = 180
Fire Label 4: Before drop NaN = 85, After drop NaN = 85
Fire Label 5: Before drop NaN = 41, After drop NaN = 41
Fire Label 6: Before drop NaN = 17, After drop NaN = 17


In [15]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.046102 3.686414e-05   -0.034874 1.804434e-03
      All     PM2.5   0.008725 4.351080e-01   -0.072716 7.337158e-11
      All      PM10   0.012042 2.813469e-01   -0.065907 3.585211e-09
      All       NO2   0.147757 2.616319e-40    0.180305 1.807327e-59
      All        NO   0.093468 5.307814e-17    0.159656 7.474738e-47
        0        CO  -0.046102 3.686414e-05   -0.034874 1.804434e-03
        0     PM2.5   0.008725 4.351080e-01   -0.072716 7.337158e-11
        0      PM10   0.012042 2.813469e-01   -0.065907 3.585211e-09
        0       NO2   0.147757 2.616319e-40    0.180305 1.807327e-59
        0        NO   0.093468 5.307814e-17    0.159656 7.474738e-47
        1        CO  -0.030378 1.814088e-01   -0.031897 1.605378e-01
        1     PM2.5  -0.001266 9.556030e-01   -0.049859 2.821161e-02
        1      PM10  -0.002738 9.041320e-01   -0.050827 2.528841e-02


In [16]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1937, After drop NaN = 1937
Fire Label 2: Before drop NaN = 379, After drop NaN = 379
Fire Label 3: Before drop NaN = 127, After drop NaN = 127
Fire Label 4: Before drop NaN = 57, After drop NaN = 57
Fire Label 5: Before drop NaN = 28, After drop NaN = 28
Fire Label 6: Before drop NaN = 21, After drop NaN = 21


Temperature

In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Temperature dataset
# --------------------
temp_file = r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_regrid.nc"
ds_temp = xr.open_dataset(temp_file)

# Build datetime index
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_temp = ds_temp.stack(date=("Year","Month","Day"))
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean temperature (°C, mean across grid)
temp_ts = ds_temp["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Portugal"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Portugal.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Temperature dataset
# --------------------
temp_file = r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_regrid.nc"
ds_temp = xr.open_dataset(temp_file)

# Build datetime index
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_temp = ds_temp.stack(date=("Year","Month","Day"))
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean temperature (°C, mean across grid)
temp_ts = ds_temp["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Temperature dataset
# --------------------
temp_file = r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_regrid.nc"
ds_temp = xr.open_dataset(temp_file)

# Build datetime index
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_temp = ds_temp.stack(date=("Year","Month","Day"))
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean temperature (°C, mean across grid)
temp_ts = ds_temp["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


In [None]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None  # store the first dataset for fire labels

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    
    # Daily spatial mean (skip NaNs)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    
    if fire_ds is None:
        fire_ds = ds  # keep dataset with fire labels

# --------------------
# 2. Temperature dataset
# --------------------
temp_file = r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_regrid.nc"
ds_temp = xr.open_dataset(temp_file)

# Build datetime index
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    [f"{y}-{m:02d}-{d:02d}" for y in years for m in months for d in days],
    errors="coerce"
).dropna()

# Stack Year, Month, Day into a single dimension
ds_temp = ds_temp.stack(date=("Year","Month","Day"))
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date":"time"}).drop_vars("date")

# Daily spatial mean temperature (°C, mean across grid)
temp_ts = ds_temp["Mean"].mean(dim=["latitude","longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))
