Main pollutants: PM10, PM2.5, NO2, CO2, O3


CAMS global reanalysis (EAC4) - Data preparation (following the MATLAB scripts provided by Virgilio)

In [None]:
# To separate monthly files when downloading the full year - DONE for 2023-2024

import xarray as xr
import pandas as pd

# Split a full-year EAC4 file into one NetCDF file per calendar month.
# Monthly files are written next to the source file, named ..._{YYYY}{MM}.nc.
file_path = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\CAMS_global_reanalysis_EAC4_chem_singlvl_2024.nc" #change path accordingly
dest_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl"

# The context manager releases the source file handle even if one of the
# monthly writes fails part-way through.
with xr.open_dataset(file_path) as ds:
    # Ensure valid_time is a datetime object
    time_index = pd.to_datetime(ds['valid_time'].values)
    ds['valid_time'] = time_index

    # One year-month label per time step. Iterating over the unique labels
    # visits only the months that actually have data, so no emptiness check
    # is needed.
    month_labels = time_index.to_period("M")

    for period in month_labels.unique():
        # Boolean mask over the time axis selecting this calendar month
        monthly_ds = ds.isel(valid_time=(month_labels == period))

        output_filename = rf"{dest_folder}\CAMS_global_reanalysis_EAC4_chem_singlvl_{period.year}{period.month:02d}.nc"
        monthly_ds.to_netcdf(output_filename)
        print(f"Saved {output_filename}")

In [None]:
# To change valid_time to time (so it matches) - DONE for 2023-2024

import xarray as xr
import os
import glob

# Copy every NetCDF file from input_dir into output_dir, renaming the
# 'valid_time' coordinate to 'time' where it is present. Source files are
# left untouched; files without 'valid_time' are copied as they are.

# Path to your directory containing the NetCDF files
input_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl"
output_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\final_2003_2024"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Get all NetCDF files in the input directory (non-recursive, so files already
# written to the output sub-folder are not picked up again); sorted so the
# processing order is reproducible
nc_files = sorted(glob.glob(os.path.join(input_dir, "*.nc")))

# Loop through each file
for file_path in nc_files:
    print(f"Processing {file_path}...")

    # Same file name, new directory
    output_file = os.path.join(output_dir, os.path.basename(file_path))

    # Open inside a context manager so the file handle is released after each
    # iteration instead of accumulating across the whole loop
    with xr.open_dataset(file_path) as ds:
        has_valid_time = 'valid_time' in ds
        if has_valid_time:
            ds = ds.rename({"valid_time": "time"})

        # Save the (possibly renamed) dataset to the output directory
        ds.to_netcdf(output_file)

    # Report what actually happened rather than always claiming a rename
    if has_valid_time:
        print(f"Renamed 'valid_time' to 'time' and saved to {output_file}")
    else:
        print(f"No 'valid_time' found; saved unchanged copy to {output_file}")

print("Renaming and saving to new folder complete!")


In [None]:
# To convert coords from 0 to 360 to -180 to 180 - DONE for 2003-2022

import os
import xarray as xr
import numpy as np

# Convert the longitude axis of every NetCDF file from the 0..360 convention
# to -180..180 and reorder the data so longitude is monotonically increasing.

# Define input and output folders
input_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\raw_2003_2022"
output_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\transformation_2003_2022"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get list of NetCDF files in the input folder (sorted for a reproducible order)
nc_files = sorted(f for f in os.listdir(input_folder) if f.endswith(".nc"))

# Process each file
for nc_file in nc_files:
    input_path = os.path.join(input_folder, nc_file)
    output_path = os.path.join(output_folder, nc_file)

    print(f"Processing: {nc_file}")

    # The context manager closes the source file once the result is written
    with xr.open_dataset(input_path) as ds:
        # Wrap longitude from 0..360 to -180..180; keep_attrs preserves the
        # coordinate's metadata (units, names) through the arithmetic
        with xr.set_options(keep_attrs=True):
            wrapped_lon = (ds["longitude"] + 180) % 360 - 180

        # Relabel the axis, then sort along it. Sorting moves the data of
        # EVERY variable that has a longitude dimension together with the
        # coordinate, so no variable is left misaligned with the new labels,
        # and it does not depend on the grid size or on the axis order.
        ds_shifted = ds.assign_coords(longitude=wrapped_lon).sortby("longitude")

        # Save the modified dataset
        ds_shifted.to_netcdf(output_path)

    print(f"Saved transformed file: {output_path}")



In [None]:
# To crop to study area - DONE for 2003-2022

import xarray as xr
import os
import glob

# Crop every NetCDF file in input_dir to the study-area bounding box and
# write the result, under the same file name, to output_dir.

# Define your study area (lat_max, lon_min, lat_min, lon_max)
lat_max, lon_min, lat_min, lon_max = 66, -12, 34, 36

# Input and output directories
input_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\transformation_2003_2022"
output_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\cropped_2003_2022"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Get all NetCDF files in the input directory (sorted for a reproducible order)
nc_files = sorted(glob.glob(os.path.join(input_dir, "*.nc")))

for file_path in nc_files:
    print(f"Processing {file_path}...")

    # Define output file path
    output_file = os.path.join(output_dir, os.path.basename(file_path))

    # The context manager releases the file handle after each iteration
    with xr.open_dataset(file_path) as ds:
        # Label-based slices must follow the coordinate's own ordering: a
        # descending latitude axis needs slice(max, min), an ascending one
        # needs slice(min, max). Using the wrong order selects nothing.
        lat_values = ds["latitude"].values
        if lat_values.size > 1 and lat_values[0] < lat_values[-1]:
            lat_slice = slice(lat_min, lat_max)
        else:
            lat_slice = slice(lat_max, lat_min)

        ds_cropped = ds.sel(
            latitude=lat_slice,
            longitude=slice(lon_min, lon_max)  # lon_min is less than lon_max
        )

        # An empty selection usually means the file still uses 0..360
        # longitudes or lies outside the box; do not write an empty file
        if ds_cropped.sizes["latitude"] == 0 or ds_cropped.sizes["longitude"] == 0:
            print(f"Skipped {file_path}: no grid points inside the study area")
            continue

        # Save the cropped dataset
        ds_cropped.to_netcdf(output_file)

    print(f"Saved cropped file to {output_file}")

print("Processing complete!")