Mean ± standard error of the mean (SEM) of each pollutant on days without fire, on the day of the fire outbreak (Day 0), and on each of the 5 days following the outbreak


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings raised by invalid SEM calculations (e.g. grid cells
# that have no sample, or only one sample, for a given fire label).
# NOTE: this filter is process-wide and stays active for later cells.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# Human-readable names for the integer codes of the fire-label variable
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Results table: one row per fire label; two columns (mean, SEM) are added
# for every pollutant inside the loop below.
results = {"Label": list(label_names.values())}

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    mean_list, sem_list = [], []

    # The context manager closes the NetCDF file once this pollutant is done.
    with xr.open_dataset(filepath) as ds:
        data = ds['Mean']  # pollutant values
        labels = ds['fire_label_Portugal'].transpose('latitude', 'longitude', 'time')

        # Loop over fire labels
        for label in label_names:
            # Keep only the samples carrying this label; everything else is NaN
            masked_data = data.where(labels == label)

            # Stats across time per grid cell
            mean = masked_data.mean(dim='time', skipna=True)
            count = masked_data.count(dim='time')
            # The SEM is defined with the *sample* standard deviation (ddof=1).
            # With ddof=0 a cell holding a single sample reports std = 0 and
            # therefore SEM = 0, which biases the spatial average downwards.
            std = masked_data.std(dim='time', skipna=True, ddof=1)
            # Cells with fewer than two samples have no defined SEM; mask them
            # so they are skipped by the spatial average below.
            sem = (std / np.sqrt(count)).where(count > 1)

            # Spatial average
            mean_val = mean.mean(skipna=True).item()
            sem_val = sem.mean(skipna=True).item()

            # Convert CO mg/m³ → µg/m³
            if pollutant == "CO":
                mean_val *= 1000
                sem_val *= 1000

            mean_list.append(mean_val)
            sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Portugal):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings raised by invalid SEM calculations (e.g. grid cells
# that have no sample, or only one sample, for a given fire label).
# NOTE: this filter is process-wide and stays active for later cells.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# Human-readable names for the integer codes of the fire-label variable
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Results table: one row per fire label; two columns (mean, SEM) are added
# for every pollutant inside the loop below.
results = {"Label": list(label_names.values())}

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    mean_list, sem_list = [], []

    # The context manager closes the NetCDF file once this pollutant is done.
    with xr.open_dataset(filepath) as ds:
        data = ds['Mean']  # pollutant values
        labels = ds['fire_label_Italy'].transpose('latitude', 'longitude', 'time')

        # Loop over fire labels
        for label in label_names:
            # Keep only the samples carrying this label; everything else is NaN
            masked_data = data.where(labels == label)

            # Stats across time per grid cell
            mean = masked_data.mean(dim='time', skipna=True)
            count = masked_data.count(dim='time')
            # The SEM is defined with the *sample* standard deviation (ddof=1).
            # With ddof=0 a cell holding a single sample reports std = 0 and
            # therefore SEM = 0, which biases the spatial average downwards.
            std = masked_data.std(dim='time', skipna=True, ddof=1)
            # Cells with fewer than two samples have no defined SEM; mask them
            # so they are skipped by the spatial average below.
            sem = (std / np.sqrt(count)).where(count > 1)

            # Spatial average
            mean_val = mean.mean(skipna=True).item()
            sem_val = sem.mean(skipna=True).item()

            # Convert CO mg/m³ → µg/m³
            if pollutant == "CO":
                mean_val *= 1000
                sem_val *= 1000

            mean_list.append(mean_val)
            sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Italy):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings raised by invalid SEM calculations (e.g. grid cells
# that have no sample, or only one sample, for a given fire label).
# NOTE: this filter is process-wide and stays active for later cells.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# Human-readable names for the integer codes of the fire-label variable
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Results table: one row per fire label; two columns (mean, SEM) are added
# for every pollutant inside the loop below.
results = {"Label": list(label_names.values())}

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    mean_list, sem_list = [], []

    # The context manager closes the NetCDF file once this pollutant is done.
    with xr.open_dataset(filepath) as ds:
        data = ds['Mean']  # pollutant values
        labels = ds['fire_label_Spain'].transpose('latitude', 'longitude', 'time')

        # Loop over fire labels
        for label in label_names:
            # Keep only the samples carrying this label; everything else is NaN
            masked_data = data.where(labels == label)

            # Stats across time per grid cell
            mean = masked_data.mean(dim='time', skipna=True)
            count = masked_data.count(dim='time')
            # The SEM is defined with the *sample* standard deviation (ddof=1).
            # With ddof=0 a cell holding a single sample reports std = 0 and
            # therefore SEM = 0, which biases the spatial average downwards.
            std = masked_data.std(dim='time', skipna=True, ddof=1)
            # Cells with fewer than two samples have no defined SEM; mask them
            # so they are skipped by the spatial average below.
            sem = (std / np.sqrt(count)).where(count > 1)

            # Spatial average
            mean_val = mean.mean(skipna=True).item()
            sem_val = sem.mean(skipna=True).item()

            # Convert CO mg/m³ → µg/m³
            if pollutant == "CO":
                mean_val *= 1000
                sem_val *= 1000

            mean_list.append(mean_val)
            sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Spain):")
print(df.to_string(index=False))


In [None]:
import xarray as xr
import numpy as np
import pandas as pd
import warnings

# Suppress RuntimeWarnings raised by invalid SEM calculations (e.g. grid cells
# that have no sample, or only one sample, for a given fire label).
# NOTE: this filter is process-wide and stays active for later cells.
warnings.filterwarnings("ignore", category=RuntimeWarning)

# Dictionary of pollutants and their NetCDF file paths
pollutant_files = {
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# Human-readable names for the integer codes of the fire-label variable
label_names = {
    0: "No fire",
    1: "Day 0 (Fire outbreak)",
    2: "Day 1",
    3: "Day 2",
    4: "Day 3",
    5: "Day 4",
    6: "Day 5",
}

# Results table: one row per fire label; two columns (mean, SEM) are added
# for every pollutant inside the loop below.
results = {"Label": list(label_names.values())}

# Loop over pollutants
for pollutant, filepath in pollutant_files.items():
    mean_list, sem_list = [], []

    # The context manager closes the NetCDF file once this pollutant is done.
    with xr.open_dataset(filepath) as ds:
        data = ds['Mean']  # pollutant values
        labels = ds['fire_label_Greece'].transpose('latitude', 'longitude', 'time')

        # Loop over fire labels
        for label in label_names:
            # Keep only the samples carrying this label; everything else is NaN
            masked_data = data.where(labels == label)

            # Stats across time per grid cell
            mean = masked_data.mean(dim='time', skipna=True)
            count = masked_data.count(dim='time')
            # The SEM is defined with the *sample* standard deviation (ddof=1).
            # With ddof=0 a cell holding a single sample reports std = 0 and
            # therefore SEM = 0, which biases the spatial average downwards.
            std = masked_data.std(dim='time', skipna=True, ddof=1)
            # Cells with fewer than two samples have no defined SEM; mask them
            # so they are skipped by the spatial average below.
            sem = (std / np.sqrt(count)).where(count > 1)

            # Spatial average
            mean_val = mean.mean(skipna=True).item()
            sem_val = sem.mean(skipna=True).item()

            # Convert CO mg/m³ → µg/m³
            if pollutant == "CO":
                mean_val *= 1000
                sem_val *= 1000

            mean_list.append(mean_val)
            sem_list.append(sem_val)

    # Add columns for this pollutant
    results[f"{pollutant} Mean (µg/m³)"] = mean_list
    results[f"{pollutant} SEM (µg/m³)"] = sem_list

# Convert to DataFrame for pretty table
df = pd.DataFrame(results)

# Print the table
print("\nPollutant concentrations by fire label (spatial average across Greece):")
print(df.to_string(index=False))


Divide the daily pollutant concentrations into quartiles Q1 (lowest), Q2, Q3 and Q4 (highest) for days on which fires occurred and days on which they did not, in order to assess the impact of fire events on air-pollutant concentrations. For each group (non-wildfire and wildfire days), compute the percentage of days falling in each of the four quartiles.

In [None]:
import xarray as xr
import pandas as pd

# --- Pollutant files for Portugal (CO, PM2.5, PM10, NO2, NO) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# One (counts, percentages) pair of tables per pollutant
all_results = []

for pol_name, file_path in files.items():
    # The context manager releases the NetCDF file handle after each pollutant.
    with xr.open_dataset(file_path) as ds:
        pollutant = ds["Mean"]  # adjust if different variable name
        fire_flag = ds["fire_binary_Portugal"]

        # Convert to DataFrame
        df = pollutant.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant; otherwise a different on-disk
        # dimension order would silently pair values from different cells/days.
        df["fire"] = fire_flag.to_dataframe(
            name="fire", dim_order=list(pollutant.dims)
        )["fire"].to_numpy()

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group.
    # NOTE: qcut builds equal-frequency bins separately for each group, so
    # every quartile holds ~25% of its group by construction; the resulting
    # percentages cannot show a shift between fire and no-fire days.
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts. "quartile" is categorical, so pass `observed`
    # explicitly: this keeps the current behaviour (all four quartile columns
    # are always present) and silences the pandas FutureWarning about the
    # changing default.
    summary_counts = (
        df_quartiles.groupby(["fire", "quartile"], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    # Summary: percentages (each row is normalised by its own total)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire   Q1   Q2   Q3   Q4 pollutant
0          0.0  385  384  384  384        CO
1          1.0   36   36   36   36        CO
2          0.0  385  384  384  384     PM2.5
3          1.0   36   36   36   36     PM2.5
4          0.0  385  384  384  384      PM10
5          1.0   36   36   36   36      PM10
6          0.0  385  384  384  384       NO2
7          1.0   36   36   36   36       NO2
8          0.0  385  384  384  384        NO
9          1.0   36   36   36   36        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.048796  24.983735  24.983735  24.983735        CO
1          1.0  25.000000  25.000000  25.000000  25.000000        CO
2          0.0  25.048796  24.983735  24.983735  24.983735     PM2.5
3          1.0  25.000000  25.000000  25.000000  25.000000     PM2.5
4          0.0  25.048796  24.983735  24.983735  24.983735      PM10
5          1.0  25.000000  25.000000  25.000000  25.00000

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [11]:
import xarray as xr
import pandas as pd

# --- Pollutant files for Italy (CO, PM2.5, PM10, NO2, NO) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# One (counts, percentages) pair of tables per pollutant
all_results = []

for pol_name, file_path in files.items():
    # The context manager releases the NetCDF file handle after each pollutant.
    with xr.open_dataset(file_path) as ds:
        pollutant = ds["Mean"]  # adjust if different variable name
        fire_flag = ds["fire_binary_Italy"]

        # Convert to DataFrame
        df = pollutant.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant; otherwise a different on-disk
        # dimension order would silently pair values from different cells/days.
        df["fire"] = fire_flag.to_dataframe(
            name="fire", dim_order=list(pollutant.dims)
        )["fire"].to_numpy()

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group.
    # NOTE: qcut builds equal-frequency bins separately for each group, so
    # every quartile holds ~25% of its group by construction; the resulting
    # percentages cannot show a shift between fire and no-fire days.
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts. "quartile" is categorical, so pass `observed`
    # explicitly: this keeps the current behaviour (all four quartile columns
    # are always present) and silences the pandas FutureWarning about the
    # changing default.
    summary_counts = (
        df_quartiles.groupby(["fire", "quartile"], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    # Summary: percentages (each row is normalised by its own total)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)

# Display the per-row quartile assignment of the last pollutant processed
df_quartiles

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  10282  10282  10281  10282        CO
1          1.0      6      6      6      6        CO
2          0.0  10282  10282  10281  10282     PM2.5
3          1.0      6      6      6      6     PM2.5
4          0.0  10282  10281  10281  10282      PM10
5          1.0      6      6      6      6      PM10
6          0.0  10282  10282  10281  10282       NO2
7          1.0      6      6      6      6       NO2
8          0.0  10277  10277  10277  10277        NO
9          1.0      6      6      6      6        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.000608  25.000608  24.998176  25.000608        CO
1          1.0  25.000000  25.000000  25.000000  25.000000        CO
2          0.0  25.000608  25.000608  24.998176  25.000608     PM2.5
3          1.0  25.000000  25.000000  25.000000  25.000000     PM2.5
4          0.0  25.001216  24.998784  

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Unnamed: 0,latitude,longitude,time,pollutant,fire,quartile
783147,35.25,12.00,2013-01-03,0.011900,1.0,Q1
791598,35.25,12.75,2014-02-22,0.032564,1.0,Q1
2884858,38.25,12.75,2024-10-27,0.072328,1.0,Q1
3360466,39.00,9.00,2006-11-19,0.086161,1.0,Q1
3362877,39.00,9.00,2013-06-26,0.278755,1.0,Q2
...,...,...,...,...,...,...
9143518,47.25,12.00,2021-01-12,3.720558,0.0,Q4
9143519,47.25,12.00,2021-01-13,1.138191,0.0,Q3
9143580,47.25,12.00,2021-03-15,0.275044,0.0,Q2
9143581,47.25,12.00,2021-03-16,0.412056,0.0,Q3


In [12]:
import xarray as xr
import pandas as pd

# --- Pollutant files for Spain (CO, PM2.5, PM10, NO2, NO) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# One (counts, percentages) pair of tables per pollutant
all_results = []

for pol_name, file_path in files.items():
    # The context manager releases the NetCDF file handle after each pollutant.
    with xr.open_dataset(file_path) as ds:
        pollutant = ds["Mean"]  # adjust if different variable name
        fire_flag = ds["fire_binary_Spain"]

        # Convert to DataFrame
        df = pollutant.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant; otherwise a different on-disk
        # dimension order would silently pair values from different cells/days.
        df["fire"] = fire_flag.to_dataframe(
            name="fire", dim_order=list(pollutant.dims)
        )["fire"].to_numpy()

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group.
    # NOTE: qcut builds equal-frequency bins separately for each group, so
    # every quartile holds ~25% of its group by construction; the resulting
    # percentages cannot show a shift between fire and no-fire days.
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts. "quartile" is categorical, so pass `observed`
    # explicitly: this keeps the current behaviour (all four quartile columns
    # are always present) and silences the pandas FutureWarning about the
    # changing default.
    summary_counts = (
        df_quartiles.groupby(["fire", "quartile"], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    # Summary: percentages (each row is normalised by its own total)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire     Q1     Q2     Q3     Q4 pollutant
0          0.0  11124  11123  11123  11123        CO
1          1.0     88     87     87     87        CO
2          0.0  11124  11123  11123  11123     PM2.5
3          1.0     88     87     87     87     PM2.5
4          0.0  11124  11123  11123  11123      PM10
5          1.0     88     87     87     87      PM10
6          0.0  11124  11123  11123  11123       NO2
7          1.0     88     87     87     87       NO2
8          0.0  11123  11122  11122  11123        NO
9          1.0     88     87     87     87        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.001686  24.999438  24.999438  24.999438        CO
1          1.0  25.214900  24.928367  24.928367  24.928367        CO
2          0.0  25.001686  24.999438  24.999438  24.999438     PM2.5
3          1.0  25.214900  24.928367  24.928367  24.928367     PM2.5
4          0.0  25.001686  24.999438  

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [13]:
import xarray as xr
import pandas as pd

# --- Pollutant files for Greece (CO, PM2.5, PM10, NO2, NO) ---
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# One (counts, percentages) pair of tables per pollutant
all_results = []

for pol_name, file_path in files.items():
    # The context manager releases the NetCDF file handle after each pollutant.
    with xr.open_dataset(file_path) as ds:
        pollutant = ds["Mean"]  # adjust if different variable name
        fire_flag = ds["fire_binary_Greece"]

        # Convert to DataFrame
        df = pollutant.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant; otherwise a different on-disk
        # dimension order would silently pair values from different cells/days.
        df["fire"] = fire_flag.to_dataframe(
            name="fire", dim_order=list(pollutant.dims)
        )["fire"].to_numpy()

    # Drop NaNs before analysis
    df = df.dropna(subset=["pollutant", "fire"])

    # Split into fire / no-fire
    df_fire = df[df["fire"] == 1].copy()
    df_nofire = df[df["fire"] == 0].copy()

    # Quartiles within each group.
    # NOTE: qcut builds equal-frequency bins separately for each group, so
    # every quartile holds ~25% of its group by construction; the resulting
    # percentages cannot show a shift between fire and no-fire days.
    df_fire["quartile"] = pd.qcut(df_fire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])
    df_nofire["quartile"] = pd.qcut(df_nofire["pollutant"], q=4, labels=["Q1","Q2","Q3","Q4"])

    df_quartiles = pd.concat([df_fire, df_nofire])

    # Summary: counts. "quartile" is categorical, so pass `observed`
    # explicitly: this keeps the current behaviour (all four quartile columns
    # are always present) and silences the pandas FutureWarning about the
    # changing default.
    summary_counts = (
        df_quartiles.groupby(["fire", "quartile"], observed=False)
        .size()
        .unstack(fill_value=0)
    )

    # Summary: percentages (each row is normalised by its own total)
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    # Add pollutant name for identification
    summary_counts["pollutant"] = pol_name
    summary_pct["pollutant"] = pol_name

    all_results.append((summary_counts, summary_pct))

# --- Combine all pollutants ---
counts_table = pd.concat([c for c, p in all_results], axis=0).reset_index()
percentages_table = pd.concat([p for c, p in all_results], axis=0).reset_index()

print("Counts table:")
print(counts_table)

print("\nPercentages table:")
print(percentages_table)


  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)
  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


Counts table:
quartile  fire    Q1    Q2    Q3    Q4 pollutant
0          0.0  4955  4954  4954  4954        CO
1          1.0    15    14    14    15        CO
2          0.0  4955  4954  4954  4954     PM2.5
3          1.0    15    14    14    15     PM2.5
4          0.0  4953  4952  4952  4952      PM10
5          1.0    15    14    14    15      PM10
6          0.0  4955  4954  4954  4954       NO2
7          1.0    15    14    14    15       NO2
8          0.0  4953  4953  4953  4953        NO
9          1.0    15    14    14    15        NO

Percentages table:
quartile  fire         Q1         Q2         Q3         Q4 pollutant
0          0.0  25.003785  24.998738  24.998738  24.998738        CO
1          1.0  25.862069  24.137931  24.137931  25.862069        CO
2          0.0  25.003785  24.998738  24.998738  24.998738     PM2.5
3          1.0  25.862069  24.137931  24.137931  25.862069     PM2.5
4          0.0  25.003786  24.998738  24.998738  24.998738      PM10
5          1.

  summary_counts = df_quartiles.groupby(["fire", "quartile"]).size().unstack(fill_value=0)


In [None]:
import xarray as xr
import pandas as pd

# Pollutant NetCDF files for Portugal
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# Per-pollutant summary tables, keyed by pollutant name
results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset; the context manager closes the file after extraction
    with xr.open_dataset(filepath) as ds:
        # Change "Mean" if variable name differs
        mean_da = ds["Mean"]
        df = mean_da.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant to keep the rows matched.
        df["fire"] = ds["fire_binary_Portugal"].to_dataframe(
            name="fire", dim_order=list(mean_da.dims)
        )["fire"].to_numpy()

    # Global quartiles (all days together); `quantile` ignores NaN values.
    # NOTE(review): the thresholds use every finite pollutant value, including
    # rows whose fire flag is NaN -- confirm this is the intended population.
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Rows without a pollutant value must be removed before the assignment:
    # NaN fails every `<=` comparison below and would be counted as "Q4".
    # Rows without a fire flag were already excluded by the groupby.
    df = df.dropna(subset=["pollutant", "fire"])

    # Assign quartile
    def assign_quartile(x):
        """Return the global quartile label ("Q1".."Q4") for a finite value."""
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    # Categorical with all four levels so that every table has the columns
    # Q1..Q4 even when a quartile is empty for this pollutant.
    df["quartile"] = pd.Categorical(
        df["pollutant"].apply(assign_quartile),
        categories=["Q1", "Q2", "Q3", "Q4"],
        ordered=True,
    )

    # Counts & percentages (row-wise, i.e. per fire status)
    summary_counts = (
        df.groupby(["fire", "quartile"], observed=False).size().unstack(fill_value=0)
    )
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


In [None]:
import xarray as xr
import pandas as pd

# Pollutant NetCDF files for Italy
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Italy.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Italy.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Italy.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Italy.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Italy.nc"
}

# Per-pollutant summary tables, keyed by pollutant name
results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset; the context manager closes the file after extraction
    with xr.open_dataset(filepath) as ds:
        # Change "Mean" if variable name differs
        mean_da = ds["Mean"]
        df = mean_da.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant to keep the rows matched.
        df["fire"] = ds["fire_binary_Italy"].to_dataframe(
            name="fire", dim_order=list(mean_da.dims)
        )["fire"].to_numpy()

    # Global quartiles (all days together); `quantile` ignores NaN values.
    # NOTE(review): the thresholds use every finite pollutant value, including
    # rows whose fire flag is NaN -- confirm this is the intended population.
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Rows without a pollutant value must be removed before the assignment:
    # NaN fails every `<=` comparison below and would be counted as "Q4".
    # Rows without a fire flag were already excluded by the groupby.
    df = df.dropna(subset=["pollutant", "fire"])

    # Assign quartile
    def assign_quartile(x):
        """Return the global quartile label ("Q1".."Q4") for a finite value."""
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    # Categorical with all four levels so that every table has the columns
    # Q1..Q4 even when a quartile is empty for this pollutant.
    df["quartile"] = pd.Categorical(
        df["pollutant"].apply(assign_quartile),
        categories=["Q1", "Q2", "Q3", "Q4"],
        ordered=True,
    )

    # Counts & percentages (row-wise, i.e. per fire status)
    summary_counts = (
        df.groupby(["fire", "quartile"], observed=False).size().unstack(fill_value=0)
    )
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


In [None]:
import xarray as xr
import pandas as pd

# Pollutant NetCDF files for Spain
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Spain.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Spain.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Spain.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Spain.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Spain.nc"
}

# Per-pollutant summary tables, keyed by pollutant name
results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset; the context manager closes the file after extraction
    with xr.open_dataset(filepath) as ds:
        # Change "Mean" if variable name differs
        mean_da = ds["Mean"]
        df = mean_da.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant to keep the rows matched.
        df["fire"] = ds["fire_binary_Spain"].to_dataframe(
            name="fire", dim_order=list(mean_da.dims)
        )["fire"].to_numpy()

    # Global quartiles (all days together); `quantile` ignores NaN values.
    # NOTE(review): the thresholds use every finite pollutant value, including
    # rows whose fire flag is NaN -- confirm this is the intended population.
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Rows without a pollutant value must be removed before the assignment:
    # NaN fails every `<=` comparison below and would be counted as "Q4".
    # Rows without a fire flag were already excluded by the groupby.
    df = df.dropna(subset=["pollutant", "fire"])

    # Assign quartile
    def assign_quartile(x):
        """Return the global quartile label ("Q1".."Q4") for a finite value."""
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    # Categorical with all four levels so that every table has the columns
    # Q1..Q4 even when a quartile is empty for this pollutant.
    df["quartile"] = pd.Categorical(
        df["pollutant"].apply(assign_quartile),
        categories=["Q1", "Q2", "Q3", "Q4"],
        ordered=True,
    )

    # Counts & percentages (row-wise, i.e. per fire status)
    summary_counts = (
        df.groupby(["fire", "quartile"], observed=False).size().unstack(fill_value=0)
    )
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


In [None]:
import xarray as xr
import pandas as pd

# Pollutant NetCDF files for Greece
files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Greece.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Greece.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Greece.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Greece.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Greece.nc"
}

# Per-pollutant summary tables, keyed by pollutant name
results_counts = {}
results_pct = {}

for pollutant, filepath in files.items():
    print(f"Processing {pollutant}...")

    # Open dataset; the context manager closes the file after extraction
    with xr.open_dataset(filepath) as ds:
        # Change "Mean" if variable name differs
        mean_da = ds["Mean"]
        df = mean_da.to_dataframe(name="pollutant").reset_index()
        # The fire flag is attached by row position, so flatten it in the same
        # dimension order as the pollutant to keep the rows matched.
        df["fire"] = ds["fire_binary_Greece"].to_dataframe(
            name="fire", dim_order=list(mean_da.dims)
        )["fire"].to_numpy()

    # Global quartiles (all days together); `quantile` ignores NaN values.
    # NOTE(review): the thresholds use every finite pollutant value, including
    # rows whose fire flag is NaN -- confirm this is the intended population.
    quantiles = df["pollutant"].quantile([0.25, 0.5, 0.75]).to_dict()

    # Rows without a pollutant value must be removed before the assignment:
    # NaN fails every `<=` comparison below and would be counted as "Q4".
    # Rows without a fire flag were already excluded by the groupby.
    df = df.dropna(subset=["pollutant", "fire"])

    # Assign quartile
    def assign_quartile(x):
        """Return the global quartile label ("Q1".."Q4") for a finite value."""
        if x <= quantiles[0.25]:
            return "Q1"
        elif x <= quantiles[0.5]:
            return "Q2"
        elif x <= quantiles[0.75]:
            return "Q3"
        else:
            return "Q4"

    # Categorical with all four levels so that every table has the columns
    # Q1..Q4 even when a quartile is empty for this pollutant.
    df["quartile"] = pd.Categorical(
        df["pollutant"].apply(assign_quartile),
        categories=["Q1", "Q2", "Q3", "Q4"],
        ordered=True,
    )

    # Counts & percentages (row-wise, i.e. per fire status)
    summary_counts = (
        df.groupby(["fire", "quartile"], observed=False).size().unstack(fill_value=0)
    )
    summary_pct = summary_counts.div(summary_counts.sum(axis=1), axis=0) * 100

    results_counts[pollutant] = summary_counts
    results_pct[pollutant] = summary_pct

# Combine into big tables
counts_table = pd.concat(results_counts, names=["Pollutant", "Fire"])
pct_table = pd.concat(results_pct, names=["Pollutant", "Fire"])

print("\nCounts of days in each quartile by pollutant & fire status:")
print(counts_table)

print("\nPercentages of days in each quartile by pollutant & fire status:")
print(pct_table)


Pearson correlation between meteorological variables and air pollutants, computed for all days together and separately for each of the fire labels defined above

In [None]:
import xarray as xr
import pandas as pd
import numpy as np
from scipy.stats import pearsonr
import glob
import os

# --- Paths ---
wind_folder = r"D:\IPMA\ERA5\UV_wind\2wind_speed_direction"
pollutants = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc"
}

# --- Load and combine all wind speed files ---
wind_files = sorted(glob.glob(os.path.join(wind_folder, "*.nc")))
wind_list = []

for f in wind_files:
    ds = xr.open_dataset(f)
    ds = ds.sel(latitude=slice(34,42), longitude=slice(19,29))  # Greece subset
    # Daily mean using 'valid_time'
    wind_daily = ds['wind_speed'].resample(valid_time='1D').mean(dim='valid_time')
    # Spatial mean
    wind_avg = wind_daily.mean(dim=['latitude','longitude'])
    wind_list.append(wind_avg)

wind_all = xr.concat(wind_list, dim='valid_time')
wind_series = pd.Series(wind_all.values, index=pd.to_datetime(wind_all['valid_time'].values))

# --- Load pollutant data and fire labels ---
poll_data = {}
fire_labels = None

for name, path in pollutants.items():
    ds = xr.open_dataset(path)
    # Daily mean over region
    data_avg = ds['Mean'].mean(dim=['latitude','longitude']).values
    poll_data[name] = pd.Series(data_avg, index=pd.to_datetime(ds['time'].values))
    
    # Fire labels (only once)
    if fire_labels is None:
        labels_avg = ds['fire_label_Portugal'].mean(dim=['latitude','longitude']).values
        fire_labels = pd.Series(labels_avg, index=pd.to_datetime(ds['time'].values))

# --- Combine into DataFrame ---
df = pd.DataFrame(poll_data)
df['wind_speed'] = wind_series
df = df.loc[df.index.intersection(fire_labels.index)]  # align indices

# --- Define subsets ---
def get_subset(label):
    return fire_labels[fire_labels==label].index

# --- Function to compute correlation with safety check ---
def compute_corr(idx):
    """Correlate wind speed with every pollutant column over the rows in *idx*.

    Reads the module-level DataFrame ``df``.

    Parameters
    ----------
    idx : index-like
        Row labels of ``df`` to include in the correlation.

    Returns
    -------
    pandas.Series
        Pearson r per pollutant column, named ``'wind_speed'``. A value is
        NaN when fewer than two complete (non-NaN) pairs are available.
    """
    sub = df.loc[idx]
    corr_dict = {}
    for col in df.columns:
        # Select pollutants by name rather than by column position.
        if col == 'wind_speed':
            continue
        # pearsonr has no missing-value handling: drop incomplete pairs so a
        # single NaN does not turn the whole coefficient into NaN.
        pair = sub[['wind_speed', col]].dropna()
        if len(pair) < 2:
            corr_dict[col] = np.nan  # Not enough data
            continue
        r, _ = pearsonr(pair['wind_speed'], pair[col])
        corr_dict[col] = r
    return pd.Series(corr_dict, name='wind_speed')

# --- Print subset sizes for info ---
# Fire labels: 0 = no-fire day (NFD), 1 = Day0 (fire outbreak), 2..6 = Day1..Day5.
# The same names are used for the printout and for the table columns so the
# two cannot drift apart.
day_names = ['Day0', 'Day1', 'Day2', 'Day3', 'Day4', 'Day5']

print("All days:", len(df))
print("NFD:", len(get_subset(0)))
for label, day_name in enumerate(day_names, start=1):
    print(f"{day_name}:", len(get_subset(label)))

# --- Build full correlation table ---
all_days = compute_corr(df.index)
nfd = compute_corr(get_subset(0))
# Labels 1 to 6 inclusive, computed once.
day_tables = [compute_corr(get_subset(label)) for label in range(1, len(day_names) + 1)]

full_table = pd.concat([all_days, nfd] + day_tables, axis=1)
full_table.columns = ['All_days', 'NFD'] + day_names

print(full_table)


In [None]:
import xarray as xr
import pandas as pd
import numpy as np
import glob, os

# -------------------------
# 1. Load wind dataset (hourly, 0.25° grid)
# -------------------------
wind_folder = r"D:\IPMA\ERA5\UV_wind\2wind_speed_direction"
wind_files = sorted(glob.glob(os.path.join(wind_folder, "*.nc")))

ds_wind = xr.open_mfdataset(wind_files, combine="by_coords")

# Detect correct time coordinate: the first candidate name that exists as a
# dimension or coordinate of the dataset wins.
time_coord = next(
    (
        cand
        for cand in ("time", "valid_time", "date", "time_counter")
        if cand in ds_wind.dims or cand in ds_wind.coords
    ),
    None,
)
if time_coord is None:
    raise ValueError("❌ Could not find a time coordinate in wind dataset")

# Regrid 0.25° → 0.75° (3x3 bins); cells that do not fill a whole bin are trimmed
wind_coarse = ds_wind.coarsen({"latitude": 3, "longitude": 3}, boundary="trim").mean()

# Hourly → daily mean
wind_daily = wind_coarse["wind_speed"].resample({time_coord: "1D"}).mean()

# Regional average (Portugal), converted to a pandas Series named "wind_speed"
wind_regional = wind_daily.mean(dim=["latitude", "longitude"])
wind_series = wind_regional.to_pandas().rename("wind_speed")

# -------------------------
# 2. Load pollutant datasets (already daily + fire labels)
# -------------------------
pollutants_files = {
    "CO": r"D:\IPMA\CAMS\co_fire_Portugal.nc",
    "PM2.5": r"D:\IPMA\CAMS\pm2p5_fire_Portugal.nc",
    "PM10": r"D:\IPMA\CAMS\pm10_fire_Portugal.nc",
    "NO2": r"D:\IPMA\CAMS\no2_fire_Portugal.nc",
    "NO": r"D:\IPMA\CAMS\no_fire_Portugal.nc",
}

pollutants = {}
fire_labels = None
spatial_dims = ["latitude", "longitude"]

for pol, path in pollutants_files.items():
    ds = xr.open_dataset(path)

    # Spatial mean over lat/lon gives a single series, named after the pollutant
    pollutants[pol] = ds["Mean"].mean(dim=spatial_dims).to_pandas().rename(pol)

    # Fire labels are read from the first file only
    if fire_labels is not None:
        continue

    # First data variable whose name contains "fire_label"
    label_var = [v for v in ds.data_vars if "fire_label" in v][0]
    # Spatial mean rounded to the nearest integer label
    # (labels are expected to be consistent across the grid)
    regional_label = ds[label_var].mean(dim=spatial_dims)
    fire_labels = regional_label.round().astype(int).to_pandas()

# -------------------------
# 3. Combine into one dataframe
# -------------------------
# Pollutant series side by side, then wind speed and fire label aligned on
# the same index; rows with any missing value are discarded.
pollutant_frame = pd.concat(pollutants.values(), axis=1)
df = pollutant_frame.assign(wind_speed=wind_series, fire_label=fire_labels).dropna()

# -------------------------
# 4. Correlation helper
# -------------------------
def compute_corr(subset):
    """Correlate ``wind_speed`` with each pollutant column of *subset*.

    Builds the full correlation matrix of *subset* and returns its
    ``wind_speed`` row restricted to CO, NO, NO2, PM2.5 and PM10,
    in that order.
    """
    pollutant_cols = ["CO", "NO", "NO2", "PM2.5", "PM10"]
    corr_matrix = subset.corr()
    return corr_matrix.loc["wind_speed", pollutant_cols]

# All data (no filtering)
all_days = compute_corr(df)

# Fire label -> output column name. Driving both the loop and the table from
# this one mapping guarantees every label read below was actually computed
# (the previous range(6) stopped at 5 while the table read label 6 -> KeyError).
label_columns = {
    0: "NFD",
    1: "Day0",
    2: "Day1",
    3: "Day2",
    4: "Day3",
    5: "Day4",
    6: "Day5",
}
pollutant_cols = ["CO", "NO", "NO2", "PM2.5", "PM10"]

# Label-based subsets
corrs_by_label = {}
for label in label_columns:
    subset = df[df["fire_label"] == label]
    if not subset.empty:
        corrs_by_label[label] = compute_corr(subset)
    else:
        # No rows carry this label: keep an all-NaN column for it in the table
        corrs_by_label[label] = pd.Series(np.nan, index=pollutant_cols)

# -------------------------
# 5. Build final correlation table
# -------------------------
corr_table = pd.concat(
    [all_days] + [corrs_by_label[label] for label in label_columns],
    axis=1,
)

corr_table.columns = ["All_days"] + list(label_columns.values())

print(corr_table)

# Save to Excel
corr_table.to_excel(r"D:\IPMA\Results\wind_pollutant_correlation_Portugal.xlsx")
