mean +- standard error of mean of pollutants on day without fire, with fire and up to 5 days after fire outbreak


In [1]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Portugal'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000    

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Portugal):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Portugal):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          15.096757          0.240055           10.208306           0.173664          5.472955         0.036699       167.779115        1.175729         0.590265        0.022267
Day 0 (Fire outbreak)          45.886020          9.656258           32.863526           7.041121          7.475000         0.747153       299.751848       51.305623         2.228505        0.824601
                Day 1          97.716508         29.250056           70.534644          21.259187         10.964914         2.256131       506.716504      134.266926         4.769814        2.165469
                Day 2         182.308947         45.015734          132.549703          33.439520         17.231838         4.008

In [2]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Italy'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Italy):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Italy):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          21.032259          0.151412           14.823289           0.109723          9.571507         0.055528       225.298092        0.911101         2.182623        0.038641
Day 0 (Fire outbreak)          28.344443          3.841608           20.155887           2.786949          9.392919         1.056645       233.280944       19.220328         1.646102        0.565466
                Day 1          45.923278          7.305911           32.983355           5.278086         11.120176         1.314471       298.765534       28.002509         1.362950        0.346480
                Day 2          48.739358         12.259807           34.876954           8.912328         10.923386         2.033146

In [3]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Spain'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Spain):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Spain):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          15.303822          0.138710           10.660586           0.101369          6.713976         0.042117       179.675069        0.706041         1.065766        0.024024
Day 0 (Fire outbreak)          30.919225          7.826939           22.153211           5.730354          7.640597         0.949351       244.667172       40.292259         2.064513        0.923958
                Day 1          64.766813         15.502962           46.728961          11.290378          9.582287         1.350382       420.424068       82.593527         5.297486        1.760777
                Day 2          80.894601         16.983552           58.491989          12.370796         11.797910         1.810596

In [4]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings for invalid SEM calculations
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# Human-readable labels
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Initialize results dictionary
results = { "Label": list(label_names.values()) }

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    ds = xr.open_dataset(filepath)

    data = ds['Mean']  # pollutant values
    labels = ds['fire_label_Greece'].transpose('latitude', 'longitude', 'time')

    mean_list, sem_list = [], []

    # Loop over fire labels
    for label in label_names.keys():
        mask = labels == label
        masked_data = data.where(mask)

        # Stats across time per grid cell
        mean = masked_data.mean(dim='time', skipna=True)
        std = masked_data.std(dim='time', skipna=True)
        count = masked_data.count(dim='time')
        sem = std / np.sqrt(count)

        # Spatial average
        mean_val = mean.mean(skipna=True).item()
        sem_val = sem.mean(skipna=True).item()

        # Convert CO mg/m³ → µg/m³
        if pollutant == "CO":
            mean_val *= 1000
            sem_val *= 1000

        mean_list.append(mean_val)
        sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Greece):")
print(df.to_string(index=False))



Pollutant concentrations by fire label (spatial average across Greece):
                Label  PM10 Mean (µg/m³)  PM10 SEM (µg/m³)  PM2.5 Mean (µg/m³)  PM2.5 SEM (µg/m³)  NO2 Mean (µg/m³)  NO2 SEM (µg/m³)  CO Mean (µg/m³)  CO SEM (µg/m³)  NO Mean (µg/m³)  NO SEM (µg/m³)
              No fire          17.370958          0.178065           12.347840           0.129621          4.253410         0.030815       181.039304        0.637393         0.507762        0.011021
Day 0 (Fire outbreak)          41.560897         13.200809           29.979675           9.569354          5.979247         1.176284       264.980246       46.437584         1.140654        0.481704
                Day 1         115.732091         43.005361           83.915637          31.429443         12.473766         4.026267       613.641962      228.567842         4.726936        3.225457
                Day 2         141.804015         53.015578          104.165278          39.594776         17.518174         6.84916

Divide daily concentrations of pollutants into quartiles Q1 (lowest), Q2, Q3 and Q4 (highest) for days when fires occurred and days they did not - Assess the impact of fire events on the concentration of air pollutants. Calculation of percentage of days (non-wildfire and wildfire) in each of the four qaurtiles

In [4]:
import xarray as xr
import pandas as pd

# --- List of pollutant files (example with CO and PM2.5) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

all_results = []

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if different variable name
    fire_flag = ds["fire_binary_Portugal"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # Summary: percentages
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire   Q1   Q2   Q3   Q4 pollutant
0          0.0  385  384  384  384        CO
1          1.0   36   36   36   36        CO
2          0.0  385  384  384  384     PM2.5
3          1.0   36   36   36   36     PM2.5
4          0.0  385  384  384  384      PM10
5          1.0   36   36   36   36      PM10
6          0.0  385  384  384  384       NO2
7          1.0   36   36   36   36       NO2
8          0.0  385  384  384  384        NO
9          1.0   36   36   36   36        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.048796  24.983735  24.983735  24.983735        CO
1          1.0  25.000000  25.000000  25.000000  25.000000        CO
2          0.0  25.048796  24.983735  24.983735  24.983735     PM2.5
3          1.0  25.000000  25.000000  25.000000  25.000000     PM2.5
4          0.0  25.048796  24.983735  24.983735  24.983735      PM10
5          1.0  25.000000  25.000000  25.000000  25.00000

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [11]:
import xarray as xr
import pandas as pd

# --- List of pollutant files (example with CO and PM2.5) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

all_results = []

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if different variable name
    fire_flag = ds["fire_binary_Italy"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # Summary: percentages
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  10282  10282  10281  10282        CO
1          1.0      6      6      6      6        CO
2          0.0  10282  10282  10281  10282     PM2.5
3          1.0      6      6      6      6     PM2.5
4          0.0  10282  10281  10281  10282      PM10
5          1.0      6      6      6      6      PM10
6          0.0  10282  10282  10281  10282       NO2
7          1.0      6      6      6      6       NO2
8          0.0  10277  10277  10277  10277        NO
9          1.0      6      6      6      6        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.000608  25.000608  24.998176  25.000608        CO
1          1.0  25.000000  25.000000  25.000000  25.000000        CO
2          0.0  25.000608  25.000608  24.998176  25.000608     PM2.5
3          1.0  25.000000  25.000000  25.000000  25.000000     PM2.5
4          0.0  25.001216  24.998784  

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [12]:
import xarray as xr
import pandas as pd

# --- List of pollutant files (example with CO and PM2.5) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

all_results = []

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if different variable name
    fire_flag = ds["fire_binary_Spain"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # Summary: percentages
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  11124  11123  11123  11123        CO
1          1.0     88     87     87     87        CO
2          0.0  11124  11123  11123  11123     PM2.5
3          1.0     88     87     87     87     PM2.5
4          0.0  11124  11123  11123  11123      PM10
5          1.0     88     87     87     87      PM10
6          0.0  11124  11123  11123  11123       NO2
7          1.0     88     87     87     87       NO2
8          0.0  11123  11122  11122  11123        NO
9          1.0     88     87     87     87        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.001686  24.999438  24.999438  24.999438        CO
1          1.0  25.214900  24.928367  24.928367  24.928367        CO
2          0.0  25.001686  24.999438  24.999438  24.999438     PM2.5
3          1.0  25.214900  24.928367  24.928367  24.928367     PM2.5
4          0.0  25.001686  24.999438  

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [13]:
import xarray as xr
import pandas as pd

# --- List of pollutant files (example with CO and PM2.5) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

all_results = []

for pol_name, file_path in files.items():
    ds = xr.open_dataset(file_path)

    pollutant = ds["Mean"]  # adjust if different variable name
    fire_flag = ds["fire_binary_Greece"]

    # Convert to DataFrame
    df = pollutant.to_dataframe(name="pollutant").reset_index()
    df["fire"] = fire_flag.to_dataframe(name="fire").reset_index(drop=True)

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts
    summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)

    # Summary: percentages
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire    Q1    Q2    Q3    Q4 pollutant
0          0.0  4955  4954  4954  4954        CO
1          1.0    15    14    14    15        CO
2          0.0  4955  4954  4954  4954     PM2.5
3          1.0    15    14    14    15     PM2.5
4          0.0  4953  4952  4952  4952      PM10
5          1.0    15    14    14    15      PM10
6          0.0  4955  4954  4954  4954       NO2
7          1.0    15    14    14    15       NO2
8          0.0  4953  4953  4953  4953        NO
9          1.0    15    14    14    15        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.003785  24.998738  24.998738  24.998738        CO
1          1.0  25.862069  24.137931  24.137931  25.862069        CO
2          0.0  25.003785  24.998738  24.998738  24.998738     PM2.5
3          1.0  25.862069  24.137931  24.137931  25.862069     PM2.5
4          0.0  25.003786  24.998738  24.998738  24.998738      PM10
5          1.

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [5]:
import xarray as xr
import pandas as pd

# Your pollutant NetCDF files
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset
    ds = xr.open_dataset(filepath)

    # Change "Mean" if variable name differs
    df = ds["Mean"].to_dataframe(name="pollutant").reset_index()
    df["fire"] = ds["fire_binary_Portugal"].to_dataframe(name="fire").reset_index(drop=True)

    # Global quartiles (all days together)
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Assign quartile
    def assign_quartile(x):
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    df["quartile"] = df["pollutant"].apply(assign_quartile)

    # Counts & percentages
    summary_counts = df.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


Processing CO...
Processing PM2.5...
Processing PM10...
Processing NO2...
Processing NO...

Counts of days in each quartile by pollutant & fire status:
quartile         Q1   Q2   Q3      Q4
Pollutant Fire                       
CO        0.0   408  369  360  187967
          1.0    45   39   39    3637
PM2.5     0.0   408  363  411  187922
          1.0    32   33   38    3657
PM10      0.0   404  387  362  187951
          1.0    30   38   35    3657
NO2       0.0   401  381  381  187941
          1.0    53   36   32    3639
NO        0.0   389  383  389  187943
          1.0    32   37   42    3649

Percentages of days in each quartile by pollutant & fire status:
quartile              Q1        Q2        Q3         Q4
Pollutant Fire                                         
CO        0.0   0.215754  0.195131  0.190371  99.398744
          1.0   1.196809  1.037234  1.037234  96.728723
PM2.5     0.0   0.215754  0.191958  0.217341  99.374947
          1.0   0.851064  0.877660  1.010638  

In [6]:
import xarray as xr
import pandas as pd

# Your pollutant NetCDF files
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset
    ds = xr.open_dataset(filepath)

    # Change "Mean" if variable name differs
    df = ds["Mean"].to_dataframe(name="pollutant").reset_index()
    df["fire"] = ds["fire_binary_Italy"].to_dataframe(name="fire").reset_index(drop=True)

    # Global quartiles (all days together)
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Assign quartile
    def assign_quartile(x):
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    df["quartile"] = df["pollutant"].apply(assign_quartile)

    # Counts & percentages
    summary_counts = df.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


Processing CO...
Processing PM2.5...
Processing PM10...
Processing NO2...
Processing NO...

Counts of days in each quartile by pollutant & fire status:
quartile           Q1     Q2     Q3      Q4
Pollutant Fire                             
CO        0.0   10237  10315  10354  928270
          1.0       9      3      6    5126
PM2.5     0.0   10498  10124  10325  928229
          1.0       7      5      5    5127
PM10      0.0   10493  10161  10244  928278
          1.0       7      4      6    5127
NO2       0.0   10239  10376  10399  928162
          1.0      10      3      4    5127
NO        0.0   10293  10163  10412  928308
          1.0       6      5      7    5126

Percentages of days in each quartile by pollutant & fire status:
quartile              Q1        Q2        Q3         Q4
Pollutant Fire                                         
CO        0.0   1.067270  1.075402  1.079468  96.777859
          1.0   0.174961  0.058320  0.116641  99.650078
PM2.5     0.0   1.094481  1.05

In [7]:
import xarray as xr
import pandas as pd

# Your pollutant NetCDF files
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset
    ds = xr.open_dataset(filepath)

    # Change "Mean" if variable name differs
    df = ds["Mean"].to_dataframe(name="pollutant").reset_index()
    df["fire"] = ds["fire_binary_Spain"].to_dataframe(name="fire").reset_index(drop=True)

    # Global quartiles (all days together)
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Assign quartile
    def assign_quartile(x):
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    df["quartile"] = df["pollutant"].apply(assign_quartile)

    # Counts & percentages
    summary_counts = df.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


Processing CO...
Processing PM2.5...
Processing PM10...
Processing NO2...
Processing NO...

Counts of days in each quartile by pollutant & fire status:
quartile           Q1     Q2     Q3      Q4
Pollutant Fire                             
CO        0.0   11016  11245  11053  966247
          1.0      87     84     88    4680
PM2.5     0.0   10991  11374  11075  966121
          1.0      84    105     77    4673
PM10      0.0   11041  11318  11033  966169
          1.0      86     97     85    4671
NO2       0.0   11081  11005  11129  966346
          1.0      81     92     86    4680
NO        0.0   11124  11104  11107  966226
          1.0      82     86     90    4681

Percentages of days in each quartile by pollutant & fire status:
quartile              Q1        Q2        Q3         Q4
Pollutant Fire                                         
CO        0.0   1.102084  1.124994  1.105785  96.667137
          1.0   1.761490  1.700749  1.781737  94.756023
PM2.5     0.0   1.099583  1.13

In [8]:
import xarray as xr
import pandas as pd

# Your pollutant NetCDF files
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset
    ds = xr.open_dataset(filepath)

    # Change "Mean" if variable name differs
    df = ds["Mean"].to_dataframe(name="pollutant").reset_index()
    df["fire"] = ds["fire_binary_Greece"].to_dataframe(name="fire").reset_index(drop=True)

    # Global quartiles (all days together)
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Assign quartile
    def assign_quartile(x):
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    df["quartile"] = df["pollutant"].apply(assign_quartile)

    # Counts & percentages
    summary_counts = df.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


Processing CO...
Processing PM2.5...
Processing PM10...
Processing NO2...
Processing NO...

Counts of days in each quartile by pollutant & fire status:
quartile          Q1    Q2    Q3      Q4
Pollutant Fire                          
CO        0.0   4929  4764  4946  648445
          1.0     15    13    16    3860
PM2.5     0.0   4795  5050  5108  648131
          1.0     11    23    16    3854
PM10      0.0   4798  5040  5084  648162
          1.0     13    20    17    3854
NO2       0.0   4758  5003  5058  648265
          1.0     12    20    15    3857
NO        0.0   4789  4957  5103  648235
          1.0     15    12    20    3857

Percentages of days in each quartile by pollutant & fire status:
quartile              Q1        Q2        Q3         Q4
Pollutant Fire                                         
CO        0.0   0.743345  0.718461  0.745909  97.792286
          1.0   0.384221  0.332992  0.409836  98.872951
PM2.5     0.0   0.723136  0.761593  0.770340  97.744931
          

Pearson correlation performed between meteorological variables and air pollutants, for all days and every label established

In [17]:
import xarray as xr
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import glob
import os

# --- Paths ---
wind_folder = r"D:\IPMA\ERA5\UV_wind\2wind_speed_direction"
pollutants = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# --- Load and combine all wind speed files ---
wind_files = sorted(glob.glob(os.path.join(wind_folder, "*.nc")))
wind_list = []

for f in wind_files:
    ds = xr.open_dataset(f)
    ds = ds.sel(latitude=slice(34,42), longitude=slice(19,29))  # Greece subset
    # Daily mean using 'valid_time'
    wind_daily = ds['wind_speed'].resample(valid_time='1D').mean(dim='valid_time')
    # Spatial mean
    wind_avg = wind_daily.mean(dim=['latitude','longitude'])
    wind_list.append(wind_avg)

wind_all = xr.concat(wind_list, dim='valid_time')
wind_series = pd.Series(wind_all.values, index=pd.to_datetime(wind_all['valid_time'].values))

# --- Load pollutant data and fire labels ---
poll_data = {}
fire_labels = None

for name, path in pollutants.items():
    ds = xr.open_dataset(path)
    # Daily mean over region
    data_avg = ds['Mean'].mean(dim=['latitude','longitude']).values
    poll_data[name] = pd.Series(data_avg, index=pd.to_datetime(ds['time'].values))
    
    # Fire labels (only once)
    if fire_labels is None:
        labels_avg = ds['fire_label_Portugal'].mean(dim=['latitude','longitude']).values
        fire_labels = pd.Series(labels_avg, index=pd.to_datetime(ds['time'].values))

# --- Combine into DataFrame ---
df = pd.DataFrame(poll_data)
df['wind_speed'] = wind_series
df = df.loc[df.index.intersection(fire_labels.index)]  # align indices

# --- Define subsets ---
def get_subset(label):
    return fire_labels[fire_labels==label].index

# --- Function to compute correlation with safety check ---
def compute_corr(idx):
    sub = df.loc[idx]
    corr_dict = {}
    for col in df.columns[:-1]:  # pollutants only
        if len(sub) < 2:
            corr_dict[col] = np.nan  # Not enough data
        else:
            r, _ = pearsonr(sub['wind_speed'], sub[col])
            corr_dict[col] = r
    return pd.Series(corr_dict, name='wind_speed')

# --- Print subset sizes for info ---
print("All days:", len(df))
print("NFD:", len(get_subset(0)))
for i in range(1,6):
    print(f"Day{i}:", len(get_subset(i)))

# --- Build full correlation table ---
all_days = compute_corr(df.index)
nfd = compute_corr(get_subset(0))
day_tables = [compute_corr(get_subset(i)) for i in range(1,6)]

day_tables = [compute_corr(get_subset(i)) for i in range(1,7)]  # 1 to 6 inclusive
full_table = pd.concat([all_days, nfd] + day_tables, axis=1)
full_table.columns = ['All_days', 'NFD', 'Day0','Day1','Day2','Day3','Day4','Day5']

print(full_table)


All days: 8036
NFD: 6302
Day1: 0
Day2: 0
Day3: 0
Day4: 0
Day5: 0
       All_days  NFD  Day0  Day1  Day2  Day3  Day4  Day5
CO          NaN  NaN   NaN   NaN   NaN   NaN   NaN   NaN
PM2.5       NaN  NaN   NaN   NaN   NaN   NaN   NaN   NaN
PM10        NaN  NaN   NaN   NaN   NaN   NaN   NaN   NaN
NO2         NaN  NaN   NaN   NaN   NaN   NaN   NaN   NaN
NO          NaN  NaN   NaN   NaN   NaN   NaN   NaN   NaN


In [23]:
import xarray as xr
import pandas as pd
import numpy as np
import glob, os

# -------------------------
# 1. Load wind dataset (hourly, 0.25° grid)
# -------------------------
wind_folder = r"D:\IPMA\ERA5\UV_wind\2wind_speed_direction"
wind_files = sorted(glob.glob(os.path.join(wind_folder, "*.nc")))

ds_wind = xr.open_mfdataset(wind_files, combine="by_coords")

# Detect correct time coordinate
time_coord = None
for cand in ["time", "valid_time", "date", "time_counter"]:
    if cand in ds_wind.dims or cand in ds_wind.coords:
        time_coord = cand
        break
if time_coord is None:
    raise ValueError("❌ Could not find a time coordinate in wind dataset")

# Regrid 0.25° → 0.75° (3x3 bins)
wind_coarse = ds_wind.coarsen(latitude=3, longitude=3, boundary="trim").mean()

# Hourly → daily mean
wind_daily = wind_coarse["wind_speed"].resample({time_coord: "1D"}).mean()

# Regional average (Portugal)
wind_series = wind_daily.mean(dim=["latitude", "longitude"]).to_pandas()
wind_series.name = "wind_speed"

# -------------------------
# 2. Load pollutant datasets (already daily + fire labels)
# -------------------------
pollutants_files = {
    "CO":   r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5":r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2":  r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO":   r"D:\IPMA\CAMS\no_fire_Portugal.nc",
}

pollutants = {}
fire_labels = None

for pol, path in pollutants_files.items():
    ds = xr.open_dataset(path)

    # ✅ Take spatial mean over lat/lon to get a single daily series
    pol_series = ds["Mean"].mean(dim=["latitude", "longitude"]).to_pandas()
    pol_series.name = pol
    pollutants[pol] = pol_series

    # ✅ Fire labels: take spatial mean and round (they should already be consistent across grid)
    if fire_labels is None:
        label_var = [v for v in ds.data_vars if "fire_label" in v][0]
        fire_labels = ds[label_var].mean(dim=["latitude", "longitude"]).round().astype(int).to_pandas()

# -------------------------
# 3. Combine into one dataframe
# -------------------------
df = pd.concat(pollutants.values(), axis=1)
df["wind_speed"] = wind_series
df["fire_label"] = fire_labels
df = df.dropna()

# -------------------------
# 4. Correlation helper
# -------------------------
def compute_corr(subset):
    """Return correlation between wind_speed and all pollutants."""
    return subset.corr().loc["wind_speed", ["CO", "NO", "NO2", "PM2.5", "PM10"]]

# All data (no filtering)
all_days = compute_corr(df)

# Label-based subsets
corrs_by_label = {}
for label in range(6):  # 0 = NFD, 1=Day0 ... 5=Day5
    subset = df[df["fire_label"] == label]
    if not subset.empty:
        corrs_by_label[label] = compute_corr(subset)
    else:
        corrs_by_label[label] = pd.Series([np.nan]*5, index=["CO","NO","NO2","PM2.5","PM10"])

# -------------------------
# 5. Build final correlation table
# -------------------------
# Build final correlation table
corr_table = pd.concat(
    [all_days,
     corrs_by_label[0],  # NFD
     corrs_by_label[1],  # Day0
     corrs_by_label[2],  # Day1
     corrs_by_label[3],  # Day2
     corrs_by_label[4],  # Day3
     corrs_by_label[5],  # Day4
     corrs_by_label[6]], # Day5
    axis=1
)

corr_table.columns = ["All_days", "NFD", "Day0", "Day1", "Day2", "Day3", "Day4", "Day5"]

print(corr_table)

# Save to Excel
corr_table.to_excel(r"D:\IPMA\Results\wind_pollutant_correlation_Portugal.xlsx")


KeyError: 6