mean +- standard error of mean of pollutants on day without fire, with fire and up to 5 days after fire outbreak


In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Portugal'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000    

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Portugal):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Portugal):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          15.096757          0.240055           10.208306           0.173664          5.472955         0.036699       167.779115        1.175729         0.590265        0.022267
Day 0 (Fire outbreak)          45.886020          9.656258           32.863526           7.041121          7.475000         0.747153       299.751848       51.305623         2.228505        0.824601
                Day 1          97.716508         29.250056           70.534644          21.259187         10.964914         2.256131       506.716504      134.266926         4.769814        2.165469
                Day 2         182.308947         45.015734          132.549703          33.439520         17.231838         4.008

In [2]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Italy'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Italy):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Italy):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          21.032259          0.151412           14.823289           0.109723          9.571507         0.055528       225.298092        0.911101         2.182623        0.038641
Day 0 (Fire outbreak)          28.344443          3.841608           20.155887           2.786949          9.392919         1.056645       233.280944       19.220328         1.646102        0.565466
                Day 1          45.923278          7.305911           32.983355           5.278086         11.120176         1.314471       298.765534       28.002509         1.362950        0.346480
                Day 2          48.739358         12.259807           34.876954           8.912328         10.923386         2.033146

In [3]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Spain'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Spain):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Spain):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          15.303822          0.138710           10.660586           0.101369          6.713976         0.042117       179.675069        0.706041         1.065766        0.024024
Day 0 (Fire outbreak)          30.919225          7.826939           22.153211           5.730354          7.640597         0.949351       244.667172       40.292259         2.064513        0.923958
                Day 1          64.766813         15.502962           46.728961          11.290378          9.582287         1.350382       420.424068       82.593527         5.297486        1.760777
                Day 2          80.894601         16.983552           58.491989          12.370796         11.797910         1.810596

In [4]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Greece'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Greece):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Greece):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          17.370958          0.178065           12.347840           0.129621          4.253410         0.030815       181.039304        0.637393         0.507762        0.011021
Day 0 (Fire outbreak)          41.560897         13.200809           29.979675           9.569354          5.979247         1.176284       264.980246       46.437584         1.140654        0.481704
                Day 1         115.732091         43.005361           83.915637          31.429443         12.473766         4.026267       613.641962      228.567842         4.726936        3.225457
                Day 2         141.804015         53.015578          104.165278          39.594776         17.518174         6.84916

Divide daily concentrations of pollutants into quartiles Q1 (lowest), Q2, Q3 and Q4 (highest) for days when fires occurred and days they did not - Assess the impact of fire events on the concentration of air pollutants. Calculation of percentage of days (non-wildfire and wildfire) in each of the four qaurtiles

Splits pollutant concentrations into quartiles (25% intervals) separately for fire and no-fire cases:

Q1 = lowest 25%

Q2 = 25–50%

Q3 = 50–75%

Q4 = highest 25%

In [5]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Portugal"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)



Quartile ranges table:
  pollutant fire_status       min    Q1_cut     median     Q3_cut         max
0        CO        fire  0.079913  0.123000   0.146981   0.170307    0.684976
1        CO     no-fire  0.077922  0.128411   0.153172   0.187197    1.635118
2     PM2.5        fire  1.291478  5.822477   8.450580  12.487320   84.005044
3     PM2.5     no-fire  0.901536  5.214086   7.818812  11.461342  226.269582
4      PM10        fire  2.129281  8.791217  12.123889  19.752628  116.111259
5      PM10     no-fire  1.281670  7.867608  11.779428  17.626056  312.896080
6       NO2        fire  0.459816  2.747020   3.615078   5.453642   18.560624
7       NO2     no-fire  0.331370  2.932381   4.368222   6.804293   33.008960
8        NO        fire  0.011899  0.140404   0.229597   0.406666   12.731893
9        NO     no-fire  0.020685  0.118617   0.214062   0.433333   33.712170
Counts table:
quartile  fire   Q1   Q2   Q3   Q4 pollutant
0          0.0  385  384  384  384        CO
1          1.0

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [6]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Italy"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)



Quartile ranges table:
  pollutant fire_status       min     Q1_cut     median     Q3_cut         max
0        CO        fire  0.118169   0.158251   0.201658   0.249816    0.446383
1        CO     no-fire  0.092919   0.160314   0.195127   0.254807    1.856855
2     PM2.5        fire  2.045881   6.835057  12.798281  19.501022   34.328284
3     PM2.5     no-fire  0.000000   7.756739  12.146608  18.635389  268.887881
4      PM10        fire  2.956275   9.678339  18.962145  27.736973   44.093082
5      PM10     no-fire  0.000000  11.305971  17.437176  26.344184  379.107154
6       NO2        fire  0.260109   2.659705   6.813898  14.021867   27.346491
7       NO2     no-fire  0.030481   3.377165   7.571072  13.641735   85.678398
8        NO        fire  0.011900   0.145023   0.469583   1.136128    8.694460
9        NO     no-fire  0.000000   0.140645   0.373592   1.417939  130.744216
Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  10282  10282  10281  102

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [7]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Spain"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)



Quartile ranges table:
  pollutant fire_status       min    Q1_cut     median     Q3_cut          max
0        CO        fire  0.099439  0.141615   0.167788   0.202757     3.578842
1        CO     no-fire  0.078180  0.141680   0.166041   0.201856    11.108744
2     PM2.5        fire  0.987569  6.050161   8.260741  11.927139   422.341080
3     PM2.5     no-fire  0.094829  5.959718   8.610981  12.733974  1285.176813
4      PM10        fire  1.395989  8.914573  12.131431  17.455286   582.338838
5      PM10     no-fire  0.121319  8.838515  12.657086  18.305592  1771.925230
6       NO2        fire  0.178757  3.565339   5.640326   9.001415    37.212929
7       NO2     no-fire  0.074855  3.384412   5.571284   8.940925   104.585299
8        NO        fire  0.007268  0.128577   0.256333   0.726875    95.406514
9        NO     no-fire  0.000000  0.123136   0.245979   0.657960   258.257714
Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  11124  11123  11123  111

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [8]:
import xarray as xr
import pandas as pd
import numpy as np

# --- List of pollutant files ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

all_results = []
all_ranges = []   # <-- NEW

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if needed
    fire_flag = ds["fire_binary_Greece"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # --- Counts ---
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # --- Percentages ---
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

    # --- Quartile ranges (NEW) ---
    for fire_status, subset in [("fire", df_fire), ("no-fire", df_nofire)]:
        if len(subset) > 0:
            edges = np.percentile(subset["pollutant"], [0, 25, 50, 75, 100])
            all_ranges.append({
                "pollutant": pol_name,
                "fire_status": fire_status,
                "min": edges[0],
                "Q1_cut": edges[1],
                "median": edges[2],
                "Q3_cut": edges[3],
                "max": edges[4]
            })

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()
ranges_table = pd.DataFrame(all_ranges)

print("\nQuartile ranges table:")
print(ranges_table)

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)



Quartile ranges table:
  pollutant fire_status       min    Q1_cut     median     Q3_cut         max
0        CO        fire  0.112425  0.146658   0.171655   0.197587    0.394946
1        CO     no-fire  0.089007  0.147243   0.169660   0.201879    2.066146
2     PM2.5        fire  3.182889  6.928646   9.196265  11.783550   48.969473
3     PM2.5     no-fire  0.000000  6.473638  10.136619  15.173227  220.708740
4      PM10        fire  4.621994  9.749328  13.222781  17.162592   57.418474
5      PM10     no-fire  0.000000  9.320546  14.444010  21.343669  345.944777
6       NO2        fire  0.451333  1.173427   1.951272   4.157555   13.884953
7       NO2     no-fire  0.080533  1.098579   2.361537   5.665453   57.897680
8        NO        fire  0.009764  0.050419   0.107469   0.206957    6.030846
9        NO     no-fire  0.000000  0.052551   0.106479   0.258669   45.056572
Counts table:
quartile  fire    Q1    Q2    Q3    Q4 pollutant
0          0.0  4955  4954  4954  4954        CO
1     

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Pearson correlation performed between meteorological variables and air pollutants, for all days and every label established

Understanding the p-values

Null hypothesis (H₀): There is no correlation between the variables.

Alternative hypothesis (H₁): There is a correlation.

p-value: Probability of observing the correlation (or more extreme) if H₀ is true.

Interpretation rule of thumb:

p < 0.05 → significant: reject the null → correlation is likely real.

p ≥ 0.05 → not significant: cannot reject the null → correlation might be due to chance.

..................................

How to read weak vs strong correlations

|r| < 0.1 → negligible

0.1 ≤ |r| < 0.3 → weak

0.3 ≤ |r| < 0.5 → moderate

|r| ≥ 0.5 → strong

Wind speed

In [9]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files (same as before)
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Wind speed dataset (multiple yearly files)
# --------------------
wind_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_yearly_regridded\daily_wind_speed_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in wind_files]
ds_wind = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_wind = ds_wind.stack(date=("Year", "Month", "Day"))

# Build datetime index from stacked coordinates
time_index = pd.to_datetime(
    {"year": ds_wind["Year"].values, 
     "month": ds_wind["Month"].values, 
     "day": ds_wind["Day"].values},
    errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_wind = ds_wind.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
wind_ts.name = "Wind_Speed"

# --------------------
# 3. Prepare FireLabel masks (same as before)
# --------------------
fire_labels = {}
for label in range(0, 7):
    mask = fire_ds["fire_label_Portugal"] == label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation functions
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Wind_Speed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Wind_Speed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Wind_Speed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Save results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Portugal.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r    Spearman_p
      All        CO   0.041060 2.381489e-04    0.286097 1.226853e-150
      All     PM2.5  -0.063740 1.144764e-08    0.016218  1.468094e-01
      All      PM10  -0.049672 8.737263e-06    0.047543  2.086005e-05
      All       NO2   0.048673 1.320594e-05    0.089923  7.592559e-16
      All        NO   0.005809 6.033248e-01    0.051586  3.878156e-06
        0        CO   0.041060 2.381489e-04    0.286097 1.226853e-150
        0     PM2.5  -0.063740 1.144764e-08    0.016218  1.468094e-01
        0      PM10  -0.049672 8.737263e-06    0.047543  2.086005e-05
        0       NO2   0.048673 1.320594e-05    0.089923  7.592559e-16
        0        NO   0.005809 6.033248e-01    0.051586  3.878156e-06
        1        CO   0.010122 6.933594e-01    0.242094  1.032769e-21
        1     PM2.5  -0.077415 2.525781e-03    0.008616  7.371283e-01
        1      PM10  -0.077043 2.649744e-03    0.011140 

In [10]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Wind speed dataset (multiple yearly files)
# --------------------
wind_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_yearly_regridded\daily_wind_speed_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in wind_files]
ds_wind = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_wind = ds_wind.stack(date=("Year", "Month", "Day"))

# Build datetime index from stacked coordinates
time_index = pd.to_datetime(
    {"year": ds_wind["Year"].values, 
     "month": ds_wind["Month"].values, 
     "day": ds_wind["Day"].values},
    errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_wind = ds_wind.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
wind_ts.name = "Wind_Speed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO   0.353584 2.014737e-234    0.391134 6.501905e-291
      All     PM2.5   0.005227  6.400528e-01    0.005541  6.200967e-01
      All      PM10   0.033493  2.726529e-03    0.041978  1.720572e-04
      All       NO2   0.340207 4.535020e-216    0.358083 8.466310e-241
      All        NO   0.345617 2.198147e-223    0.404671  0.000000e+00
        0        CO   0.353584 2.014737e-234    0.391134 6.501905e-291
        0     PM2.5   0.005227  6.400528e-01    0.005541  6.200967e-01
        0      PM10   0.033493  2.726529e-03    0.041978  1.720572e-04
        0       NO2   0.340207 4.535020e-216    0.358083 8.466310e-241
        0        NO   0.345617 2.198147e-223    0.404671  0.000000e+00
        1        CO   0.219925  3.198423e-23    0.105803  2.244001e-06
        1     PM2.5  -0.053555  1.688152e-02   -0.074151  9.318907e-04
        1      PM10  -0.028385  2.056260e-0

In [11]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Wind speed dataset (multiple yearly files)
# --------------------
wind_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_yearly_regridded\daily_wind_speed_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in wind_files]
ds_wind = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_wind = ds_wind.stack(date=("Year", "Month", "Day"))

# Build datetime index from stacked coordinates
time_index = pd.to_datetime(
    {"year": ds_wind["Year"].values, 
     "month": ds_wind["Month"].values, 
     "day": ds_wind["Day"].values},
    errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_wind = ds_wind.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
wind_ts.name = "Wind_Speed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO   0.323513 2.084859e-194    0.382740 1.254085e-277
      All     PM2.5  -0.125007  3.011491e-29   -0.153974  1.157936e-43
      All      PM10  -0.095510  1.094429e-17   -0.104504  6.968924e-21
      All       NO2   0.299453 1.599382e-165    0.300355 1.474884e-166
      All        NO   0.242362 2.318118e-107    0.223441  3.952327e-91
        0        CO   0.323513 2.084859e-194    0.382740 1.254085e-277
        0     PM2.5  -0.125007  3.011491e-29   -0.153974  1.157936e-43
        0      PM10  -0.095510  1.094429e-17   -0.104504  6.968924e-21
        0       NO2   0.299453 1.599382e-165    0.300355 1.474884e-166
        0        NO   0.242362 2.318118e-107    0.223441  3.952327e-91
        1        CO   0.350368  9.551582e-70    0.410121  2.647556e-97
        1     PM2.5  -0.058724  4.143476e-03   -0.051387  1.213011e-02
        1      PM10  -0.037777  6.526553e-0

In [12]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Wind speed dataset (multiple yearly files)
# --------------------
wind_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\UV_wind\daily_wind_speed_stats_yearly_regridded\daily_wind_speed_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in wind_files]
ds_wind = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_wind = ds_wind.stack(date=("Year", "Month", "Day"))

# Build datetime index from stacked coordinates
time_index = pd.to_datetime(
    {"year": ds_wind["Year"].values, 
     "month": ds_wind["Month"].values, 
     "day": ds_wind["Day"].values},
    errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_wind = ds_wind.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_wind = ds_wind.assign_coords(time=("date", time_index))
ds_wind = ds_wind.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean wind speed
wind_ts = ds_wind["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
wind_ts.name = "Wind_Speed"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["WindSpeed"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["WindSpeed"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + wind)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [wind_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["WindSpeed"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_wind_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO   0.191711  3.912663e-67    0.262411 3.502970e-126
      All     PM2.5   0.003496  7.544449e-01   -0.064205  8.951467e-09
      All      PM10   0.015418  1.677985e-01   -0.043965  8.325840e-05
      All       NO2   0.299295 2.420727e-165    0.335926 2.192978e-210
      All        NO   0.229949  1.509679e-96    0.344712 3.769150e-222
        0        CO   0.191711  3.912663e-67    0.262411 3.502970e-126
        0     PM2.5   0.003496  7.544449e-01   -0.064205  8.951467e-09
        0      PM10   0.015418  1.677985e-01   -0.043965  8.325840e-05
        0       NO2   0.299295 2.420727e-165    0.335926 2.192978e-210
        0        NO   0.229949  1.509679e-96    0.344712 3.769150e-222
        1        CO   0.078064  5.844512e-04    0.108506  1.696787e-06
        1     PM2.5  -0.060727  7.507997e-03   -0.125170  3.255635e-08
        1      PM10  -0.054336  1.677310e-0

Total Precipitation

In [13]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0, 7):
    mask = fire_ds["fire_label_Portugal"] == label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation functions
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Save results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Portugal.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.080466 5.598159e-13   -0.122477 3.913681e-28
      All     PM2.5  -0.128727 6.301294e-31   -0.213593 2.957227e-83
      All      PM10  -0.108419 2.302336e-22   -0.166530 7.116551e-51
      All       NO2  -0.123881 9.492308e-29   -0.125814 1.315172e-29
      All        NO  -0.051974 3.278822e-06   -0.154166 9.079908e-44
        0        CO  -0.080466 5.598159e-13   -0.122477 3.913681e-28
        0     PM2.5  -0.128727 6.301294e-31   -0.213593 2.957227e-83
        0      PM10  -0.108419 2.302336e-22   -0.166530 7.116551e-51
        0       NO2  -0.123881 9.492308e-29   -0.125814 1.315172e-29
        0        NO  -0.051974 3.278822e-06   -0.154166 9.079908e-44
        1        CO  -0.056788 2.683047e-02   -0.141001 3.383884e-08
        1     PM2.5  -0.088396 5.601037e-04   -0.204364 8.549511e-16
        1      PM10  -0.084189 1.018225e-03   -0.190662 6.574106e-14


In [14]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1520, After drop NaN = 1520
Fire Label 2: Before drop NaN = 471, After drop NaN = 471
Fire Label 3: Before drop NaN = 238, After drop NaN = 238
Fire Label 4: Before drop NaN = 137, After drop NaN = 137
Fire Label 5: Before drop NaN = 94, After drop NaN = 94
Fire Label 6: Before drop NaN = 59, After drop NaN = 59


In [15]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels


# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO   0.002178 8.455022e-01    0.013340 2.327232e-01
      All     PM2.5  -0.122759 2.948227e-28   -0.145867 2.563738e-39
      All      PM10  -0.105296 3.531798e-21   -0.120618 2.489953e-27
      All       NO2   0.058834 1.381854e-07    0.071907 1.187644e-10
      All        NO   0.099689 3.886345e-19    0.100217 2.523323e-19
        0        CO   0.002178 8.455022e-01    0.013340 2.327232e-01
        0     PM2.5  -0.122759 2.948227e-28   -0.145867 2.563738e-39
        0      PM10  -0.105296 3.531798e-21   -0.120618 2.489953e-27
        0       NO2   0.058834 1.381854e-07    0.071907 1.187644e-10
        0        NO   0.099689 3.886345e-19    0.100217 2.523323e-19
        1        CO  -0.128928 7.839295e-09   -0.161975 3.618210e-13
        1     PM2.5  -0.127873 1.038728e-08   -0.136565 9.539817e-10
        1      PM10  -0.115236 2.542326e-07   -0.116911 1.695159e-07


In [16]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1990, After drop NaN = 1990
Fire Label 2: Before drop NaN = 450, After drop NaN = 450
Fire Label 3: Before drop NaN = 165, After drop NaN = 165
Fire Label 4: Before drop NaN = 62, After drop NaN = 62
Fire Label 5: Before drop NaN = 25, After drop NaN = 25
Fire Label 6: Before drop NaN = 11, After drop NaN = 11


In [17]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO  -0.096539  4.874706e-18   -0.074079  3.224354e-11
      All     PM2.5  -0.316072 3.482753e-185   -0.372187 1.802676e-261
      All      PM10  -0.293053 2.735504e-158   -0.331956 3.407424e-205
      All       NO2  -0.007655  4.934977e-01   -0.014001  2.103857e-01
      All        NO  -0.002853  7.985872e-01   -0.046701  2.914665e-05
        0        CO  -0.096539  4.874706e-18   -0.074079  3.224354e-11
        0     PM2.5  -0.316072 3.482753e-185   -0.372187 1.802676e-261
        0      PM10  -0.293053 2.735504e-158   -0.331956 3.407424e-205
        0       NO2  -0.007655  4.934977e-01   -0.014001  2.103857e-01
        0        NO  -0.002853  7.985872e-01   -0.046701  2.914665e-05
        1        CO  -0.105420  2.512263e-07   -0.116667  1.126186e-08
        1     PM2.5  -0.278283  1.287661e-43   -0.322564  8.366587e-59
        1      PM10  -0.262461  8.048397e-3

In [18]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 2384, After drop NaN = 2382
Fire Label 2: Before drop NaN = 493, After drop NaN = 493
Fire Label 3: Before drop NaN = 180, After drop NaN = 180
Fire Label 4: Before drop NaN = 85, After drop NaN = 85
Fire Label 5: Before drop NaN = 41, After drop NaN = 41
Fire Label 6: Before drop NaN = 17, After drop NaN = 17


In [19]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Precipitation dataset (multiple yearly files)
# --------------------
precip_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Precipitation\daily_precipitation_stats_yearly_regridded\daily_precipitation_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in precip_files]
ds_precip = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_precip = ds_precip.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_precip = ds_precip.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_precip = ds_precip.assign_coords(time=("date", time_index))
ds_precip = ds_precip.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean of total precipitation
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Precipitation"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Precipitation"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame (pollutants + precipitation)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_precipitation_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.046102 3.686414e-05   -0.034874 1.804434e-03
      All     PM2.5   0.008725 4.351080e-01   -0.072716 7.337158e-11
      All      PM10   0.012042 2.813469e-01   -0.065907 3.585211e-09
      All       NO2   0.147757 2.616319e-40    0.180305 1.807327e-59
      All        NO   0.093468 5.307814e-17    0.159656 7.474738e-47
        0        CO  -0.046102 3.686414e-05   -0.034874 1.804434e-03
        0     PM2.5   0.008725 4.351080e-01   -0.072716 7.337158e-11
        0      PM10   0.012042 2.813469e-01   -0.065907 3.585211e-09
        0       NO2   0.147757 2.616319e-40    0.180305 1.807327e-59
        0        NO   0.093468 5.307814e-17    0.159656 7.474738e-47
        1        CO  -0.030378 1.814088e-01   -0.031897 1.605378e-01
        1     PM2.5  -0.001266 9.556030e-01   -0.049859 2.821161e-02
        1      PM10  -0.002738 9.041320e-01   -0.050827 2.528841e-02


In [20]:
# --------------------
# Print invalid dates removed
# --------------------
years = ds_precip["Year"].values
months = ds_precip["Month"].values
days = ds_precip["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

invalid_dates = time_index[pd.isna(time_index)]
if len(invalid_dates) > 0:
    print("Invalid dates removed during datetime conversion:")
    for dt in invalid_dates:
        print(dt)
else:
    print("No invalid dates found.")

# --------------------
# Print number of days per fire label
# --------------------
# Build df_all (pollutants + precipitation)
precip_ts = ds_precip["Total_Precipitation"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
precip_ts.name = "Precipitation"

df_all = pd.concat(list(pollutant_series.values()) + [precip_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Precipitation"])

print("\nNumber of days per fire label:")
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    
    # Days before dropping NaNs
    days_before = mask_aligned.sum()
    
    # Days after dropping NaNs
    days_after = mask_aligned[mask_aligned & df_all.notna().all(axis=1)].sum()
    
    print(f"Fire Label {label}: Before drop NaN = {days_before}, After drop NaN = {days_after}")


No invalid dates found.

Number of days per fire label:
Fire Label 0: Before drop NaN = 8036, After drop NaN = 8005
Fire Label 1: Before drop NaN = 1937, After drop NaN = 1937
Fire Label 2: Before drop NaN = 379, After drop NaN = 379
Fire Label 3: Before drop NaN = 127, After drop NaN = 127
Fire Label 4: Before drop NaN = 57, After drop NaN = 57
Fire Label 5: Before drop NaN = 28, After drop NaN = 28
Fire Label 6: Before drop NaN = 21, After drop NaN = 21


Temperature

In [21]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files (same as before)
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Temperature dataset (multiple yearly files)
# --------------------
temp_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_yearly_regridded\daily_temperature_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in temp_files]
ds_temp = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_temp = ds_temp.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_temp = ds_temp.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean temperature
temp_ts = ds_temp["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks (same as precipitation script)
# --------------------
fire_labels = {}
for label in range(0, 7):
    mask = fire_ds["fire_label_Portugal"] == label
    daily_label_present = mask.any(dim=["latitude", "longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation functions
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build full DataFrame
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Save results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Portugal.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r    Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.064985 5.903967e-09   -0.460717 0.000000e+00
      All     PM2.5   0.102367 4.251325e-20   -0.059410 1.041643e-07
      All      PM10   0.091438 2.466033e-16   -0.083385 7.875847e-14
      All       NO2  -0.176145 8.486392e-57   -0.229698 2.461385e-96
      All        NO  -0.013125 2.403177e-01   -0.162864 1.044045e-48
        0        CO  -0.064985 5.903967e-09   -0.460717 0.000000e+00
        0     PM2.5   0.102367 4.251325e-20   -0.059410 1.041643e-07
        0      PM10   0.091438 2.466033e-16   -0.083385 7.875847e-14
        0       NO2  -0.176145 8.486392e-57   -0.229698 2.461385e-96
        0        NO  -0.013125 2.403177e-01   -0.162864 1.044045e-48
        1        CO  -0.003970 8.771086e-01   -0.332064 1.888368e-40
        1     PM2.5   0.126393 7.674212e-07   -0.021416 4.040738e-01
        1      PM10   0.129262 4.267179e-07   -0.013475 5.996170e-01


In [22]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Temperature dataset (multiple yearly files)
# --------------------
temp_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_yearly_regridded\daily_temperature_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in temp_files]
ds_temp = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_temp = ds_temp.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_temp = ds_temp.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean temperature
temp_ts = ds_temp["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Italy"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Italy.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r   Spearman_p
      All        CO  -0.640711  0.000000e+00   -0.674164 0.000000e+00
      All     PM2.5  -0.163245  6.252091e-49   -0.169392 1.334731e-52
      All      PM10  -0.194930  2.177857e-69   -0.209066 9.136096e-80
      All       NO2  -0.593847  0.000000e+00   -0.604238 0.000000e+00
      All        NO  -0.530763  0.000000e+00   -0.630654 0.000000e+00
        0        CO  -0.640711  0.000000e+00   -0.674164 0.000000e+00
        0     PM2.5  -0.163245  6.252091e-49   -0.169392 1.334731e-52
        0      PM10  -0.194930  2.177857e-69   -0.209066 9.136096e-80
        0       NO2  -0.593847  0.000000e+00   -0.604238 0.000000e+00
        0        NO  -0.530763  0.000000e+00   -0.630654 0.000000e+00
        1        CO  -0.507012 1.881525e-130   -0.219978 3.121026e-23
        1     PM2.5  -0.088549  7.640043e-05    0.009874 6.597755e-01
        1      PM10  -0.120355  7.237430e-08   -0.018524

In [23]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Temperature dataset (multiple yearly files)
# --------------------
temp_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_yearly_regridded\daily_temperature_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in temp_files]
ds_temp = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_temp = ds_temp.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_temp = ds_temp.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean temperature
temp_ts = ds_temp["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Spain"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Spain.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO  -0.589745  0.000000e+00   -0.654414  0.000000e+00
      All     PM2.5   0.030907  5.684269e-03    0.037728  7.349875e-04
      All      PM10   0.002222  8.424720e-01   -0.011092  3.210554e-01
      All       NO2  -0.528570  0.000000e+00   -0.524153  0.000000e+00
      All        NO  -0.388977 1.836533e-287   -0.427194  0.000000e+00
        0        CO  -0.589745  0.000000e+00   -0.654414  0.000000e+00
        0     PM2.5   0.030907  5.684269e-03    0.037728  7.349875e-04
        0      PM10   0.002222  8.424720e-01   -0.011092  3.210554e-01
        0       NO2  -0.528570  0.000000e+00   -0.524153  0.000000e+00
        0        NO  -0.388977 1.836533e-287   -0.427194  0.000000e+00
        1        CO  -0.584316 3.864292e-218   -0.610301 4.124670e-243
        1     PM2.5  -0.046593  2.296343e-02   -0.049856  1.495443e-02
        1      PM10  -0.067396  9.971404e-0

In [24]:
import xarray as xr
import pandas as pd
from scipy.stats import pearsonr, spearmanr
import glob

# --------------------
# 1. Pollutant files
# --------------------
pollutant_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

pollutant_series = {}
fire_ds = None

for name, file in pollutant_files.items():
    ds = xr.open_dataset(file)
    ts = ds["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
    pollutant_series[name] = ts
    if fire_ds is None:
        fire_ds = ds  # keep dataset for fire labels

# --------------------
# 2. Temperature dataset (multiple yearly files)
# --------------------
temp_files = sorted(glob.glob(
    r"D:\IPMA\ERA5\Temperature\daily_temperature_stats_yearly_regridded\daily_temperature_stats_*_regrid.nc"
))

# Open all files and concatenate along 'Year'
ds_list = [xr.open_dataset(f) for f in temp_files]
ds_temp = xr.concat(ds_list, dim="Year")

# Stack Year, Month, Day into single dimension
ds_temp = ds_temp.stack(date=("Year", "Month", "Day"))

# Build datetime index from the stacked coordinates
years = ds_temp["Year"].values
months = ds_temp["Month"].values
days = ds_temp["Day"].values

time_index = pd.to_datetime(
    {"year": years, "month": months, "day": days}, errors="coerce"
)

# Keep only valid dates
valid_mask = ~pd.isna(time_index)
ds_temp = ds_temp.isel(date=valid_mask)
time_index = time_index[valid_mask]

# Assign time coordinate
ds_temp = ds_temp.assign_coords(time=("date", time_index))
ds_temp = ds_temp.swap_dims({"date": "time"}).drop_vars("date")

# Daily spatial mean temperature
temp_ts = ds_temp["Mean"].mean(dim=["latitude", "longitude"], skipna=True).to_series()
temp_ts.name = "Temperature"

# --------------------
# 3. Prepare FireLabel masks for each label (0-6)
# --------------------
fire_labels = {}
for label in range(0,7):
    mask = fire_ds["fire_label_Greece"] == label
    # True if any grid cell has this label
    daily_label_present = mask.any(dim=["latitude","longitude"])
    fire_labels[label] = daily_label_present.to_series()

# --------------------
# 4. Correlation function
# --------------------
def correlation_with_pvalues(df, pollutants):
    results = []
    for pol in pollutants:
        x = df[pol]
        y = df["Temperature"]
        if len(df) < 2 or x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": "All",
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

def correlation_by_label(df, label_name, mask, pollutants):
    results = []
    subset = df[mask]
    if len(subset) < 2:
        return results
    for pol in pollutants:
        x = subset[pol]
        y = subset["Temperature"]
        if x.nunique() < 2 or y.nunique() < 2:
            continue
        pearson_r, pearson_p = pearsonr(x, y)
        spearman_r, spearman_p = spearmanr(x, y)
        results.append({
            "FireLabel": label_name,
            "Pollutant": pol,
            "Pearson_r": pearson_r,
            "Pearson_p": pearson_p,
            "Spearman_r": spearman_r,
            "Spearman_p": spearman_p
        })
    return results

# --------------------
# 5. Build combined DataFrame (pollutants + temperature)
# --------------------
df_all = pd.concat(list(pollutant_series.values()) + [temp_ts], axis=1,
                   keys=list(pollutant_series.keys()) + ["Temperature"])
df_all = df_all.dropna()

pollutant_names = list(pollutant_series.keys())
all_results = []

# Overall correlation
all_results.extend(correlation_with_pvalues(df_all, pollutant_names))

# Correlation by fire label
for label, mask in fire_labels.items():
    # Align mask with df_all
    mask_aligned = mask.reindex(df_all.index, fill_value=False)
    all_results.extend(correlation_by_label(df_all, label, mask_aligned, pollutant_names))

# --------------------
# 6. Print results
# --------------------
results_df = pd.DataFrame(all_results)
results_df.to_csv(r"D:\IPMA\CAMS\pollutant_temperature_correlations_by_fire_label_Greece.csv", index=False)
print("Correlation results saved to CSV.")
print(results_df.to_string(index=False))


Correlation results saved to CSV.
FireLabel Pollutant  Pearson_r     Pearson_p  Spearman_r    Spearman_p
      All        CO  -0.366165 1.624080e-252   -0.501935  0.000000e+00
      All     PM2.5  -0.033242  2.934377e-03    0.033073  3.082572e-03
      All      PM10  -0.044908  5.835367e-05    0.009565  3.921574e-01
      All       NO2  -0.463600  0.000000e+00   -0.514868  0.000000e+00
      All        NO  -0.328754 4.604017e-201   -0.511462  0.000000e+00
        0        CO  -0.366165 1.624080e-252   -0.501935  0.000000e+00
        0     PM2.5  -0.033242  2.934377e-03    0.033073  3.082572e-03
        0      PM10  -0.044908  5.835367e-05    0.009565  3.921574e-01
        0       NO2  -0.463600  0.000000e+00   -0.514868  0.000000e+00
        0        NO  -0.328754 4.604017e-201   -0.511462  0.000000e+00
        1        CO  -0.151254  2.214539e-11   -0.219583  1.406943e-22
        1     PM2.5   0.103940  4.561350e-06    0.167608  1.132233e-13
        1      PM10   0.095472  2.567760e-0