Main pollutants PM10 & PM2.5, NO2, CO2, O3


CAMS global reanalysis EAC4 - Data Preparation (following the MATLAB scripts provided by Virgilio)

Single level

In [None]:
# To separate monthly files when downloading the full year - DONE for 2023-2024

import xarray as xr
import pandas as pd

# Yearly source file to split; change the path to the year being processed.
file_path = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\CAMS_global_reanalysis_EAC4_chem_singlvl_2024.nc" #change path accordingly

# Folder that receives one NetCDF file per (year, month).
dest_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl"

ds = xr.open_dataset(file_path)

# Make sure valid_time holds datetime values before using the .dt accessor.
ds['valid_time'] = pd.to_datetime(ds['valid_time'].values)

# Distinct calendar years found along valid_time.
year_values = ds['valid_time'].dt.year.values
years = pd.Series(year_values).unique()

for year in years:
    in_year = ds.valid_time.dt.year == year

    for month in range(1, 13):
        in_month = ds.valid_time.dt.month == month
        monthly_ds = ds.sel(valid_time=in_year & in_month)

        # Nothing to write when the month has no time steps.
        if monthly_ds.valid_time.size == 0:
            continue

        output_filename = rf"{dest_folder}\CAMS_global_reanalysis_EAC4_chem_singlvl_{year}{month:02d}.nc"
        monthly_ds.to_netcdf(output_filename)
        print(f"Saved {output_filename}")

# Release the source file once every month has been written.
ds.close()

In [None]:
# To change valid_time to time (so it matches) - DONE for 2023-2024

import xarray as xr
import os
import glob

# Folder with the source NetCDF files, and the folder that receives the
# copies whose 'valid_time' has been renamed to 'time'.
input_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl"
output_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\final_2003_2024"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# The pattern only matches files directly inside input_dir, so files already
# written to the output subfolder are not picked up again.
nc_files = glob.glob(os.path.join(input_dir, "*.nc"))

for file_path in nc_files:
    print(f"Processing {file_path}...")

    # Same file name, placed in the output folder.
    output_file = os.path.join(output_dir, os.path.basename(file_path))

    # The context manager closes the source file after writing, even when
    # to_netcdf raises, so file handles do not accumulate across the loop.
    with xr.open_dataset(file_path) as ds:
        has_valid_time = 'valid_time' in ds
        if has_valid_time:
            ds_out = ds.rename({"valid_time": "time"})
        else:
            ds_out = ds

        ds_out.to_netcdf(output_file)

    # Report what actually happened; files without 'valid_time' are copied as-is.
    if has_valid_time:
        print(f"Renamed 'valid_time' to 'time' and saved to {output_file}")
    else:
        print(f"No 'valid_time' found; saved unchanged to {output_file}")

print("Renaming and saving to new folder complete!")


In [None]:
# To convert longitudes from the 0 to 360 range to the -180 to 180 range - DONE for 2003-2022

import os
import xarray as xr
import numpy as np

# Define input and output folders
input_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\raw_2003_2022"
output_folder = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\transformation_2003_2022"

# Ensure output folder exists
os.makedirs(output_folder, exist_ok=True)

# Get list of NetCDF files in the input folder
nc_files = [f for f in os.listdir(input_folder) if f.endswith(".nc")]

# Process each file
for nc_file in nc_files:
    input_path = os.path.join(input_folder, nc_file)
    output_path = os.path.join(output_folder, nc_file)

    print(f"Processing: {nc_file}")

    # The context manager closes the source file once the output is written.
    with xr.open_dataset(input_path) as ds:
        # Map longitudes from [0, 360) onto [-180, 180).
        wrapped_lon = (ds['longitude'] + 180) % 360 - 180

        # Relabel the longitude axis, then sort it into ascending order.
        # Sorting reorders every variable that has a longitude dimension
        # together with the coordinate, so the data stay aligned with their
        # labels for any grid size and for all variables in the file.
        ds_out = ds.assign_coords(longitude=wrapped_lon).sortby('longitude')

        # Save the modified dataset
        ds_out.to_netcdf(output_path)

    print(f"Saved transformed file: {output_path}")



In [None]:
# To crop to study area - DONE for 2003-2022

import xarray as xr
import os
import glob

# Study-area bounds, unpacked in the order (lat_max, lon_min, lat_min, lon_max).
lat_max, lon_min, lat_min, lon_max = 66, -12, 34, 36

# Input and output directories
input_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\transformation_2003_2022"
output_dir = r"D:\CAMS\CAMS_global_reanalysis_EAC4\chem_singlvl\cropped_2003_2022"

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Get all NetCDF files in the input directory
nc_files = glob.glob(os.path.join(input_dir, "*.nc"))

for file_path in nc_files:
    print(f"Processing {file_path}...")

    # Same file name, placed in the output folder.
    output_file = os.path.join(output_dir, os.path.basename(file_path))

    # The context manager closes each source file after its cropped copy is
    # written, so file handles do not accumulate across the loop.
    with xr.open_dataset(file_path) as ds:
        # The latitude slice runs from lat_max down to lat_min because the
        # latitude axis is expected to be stored north-to-south (90 to -90);
        # the longitude slice runs from lon_min up to lon_max.
        ds_cropped = ds.sel(
            latitude=slice(lat_max, lat_min),
            longitude=slice(lon_min, lon_max),
        )

        # Save the cropped dataset
        ds_cropped.to_netcdf(output_file)

    print(f"Saved cropped file to {output_file}")

print("Processing complete!")

Multi level

ERA5 - all 5 variables

In [None]:
# To split downloaded files that span more than one year into one file per year - DONE for 1979-2024

import xarray as xr
import pandas as pd

# Multi-year source file to split.
file_path = r"E:\IPMA\ERA5\UV_wind\ERA5_hourly_uv_2003_1999.nc"

# Folder that receives one NetCDF file per year.
dest_folder = r"E:\IPMA\ERA5\UV_wind"

ds = xr.open_dataset(file_path)

# Make sure valid_time holds datetime values before using the .dt accessor.
ds['valid_time'] = pd.to_datetime(ds['valid_time'].values)

# Distinct calendar years found along valid_time.
year_values = ds['valid_time'].dt.year.values
years = pd.Series(year_values).unique()

for year in years:
    in_year = ds.valid_time.dt.year == year
    yearly_ds = ds.sel(valid_time=in_year)

    # Nothing to write when the year has no time steps.
    if yearly_ds.valid_time.size == 0:
        continue

    output_filename = rf"{dest_folder}\ERA5_hourly_uv_{year}.nc" #change accordingly to what file is being used
    yearly_ds.to_netcdf(output_filename)
    print(f"Saved {output_filename}")

# Release the source file once every year has been written.
ds.close()


In [None]:
# To separate monthly files when downloading the full year - DONE for 1979-2024

import os
import xarray as xr
import pandas as pd

# Directory where all .nc files are located
file_path = r"E:\IPMA\ERA5\UV_wind\1raw_year_1979_2024"

# Destination folder for monthly files
dest_folder = r"E:\IPMA\ERA5\UV_wind\1raw_month_1979_2024"

# Create the destination up front so the first write does not fail when the
# folder has not been created yet.
os.makedirs(dest_folder, exist_ok=True)

# Iterate over each file in the directory
for filename in os.listdir(file_path):
    # Only NetCDF files are split.
    if not filename.endswith(".nc"):
        continue

    file_full_path = os.path.join(file_path, filename)

    # The context manager closes the source file even if a write fails.
    with xr.open_dataset(file_full_path) as ds:
        # Ensure valid_time is a datetime object
        ds['valid_time'] = pd.to_datetime(ds['valid_time'].values)

        # Get unique years in the dataset
        years = pd.Series(ds['valid_time'].dt.year.values).unique()

        for year in years:
            in_year = ds.valid_time.dt.year == year

            for month in range(1, 13):
                # Filter dataset for the given year and month
                in_month = ds.valid_time.dt.month == month
                monthly_ds = ds.sel(valid_time=in_year & in_month)

                # Only save if data exists for the month
                if monthly_ds.valid_time.size == 0:
                    continue

                output_filename = rf"{dest_folder}\ERA5_hourly_uv_{year}{month:02d}.nc" #change accordingly to what file is being used
                monthly_ds.to_netcdf(output_filename)
                print(f"Saved {output_filename}")


FRP

In [None]:
# To put together all the csv files into one - DONE

import pandas as pd
import os

# Folder containing the CSV files
folder_path = r"E:\IPMA\FRP"

# Name of the merged file. It is written into the same folder that is scanned
# for inputs, so it must be excluded from the inputs; otherwise re-running
# this cell would read the previous result back in and duplicate every row.
merged_name = "FRP_2001_2023.csv"

# Collect the input CSV paths in sorted order so the row order of the merged
# file does not depend on the order in which the OS lists the directory.
csv_files = sorted(
    os.path.join(folder_path, f)
    for f in os.listdir(folder_path)
    if f.endswith(".csv") and f != merged_name
)

# Fail with a clear message instead of pandas' "No objects to concatenate".
if not csv_files:
    raise FileNotFoundError(f"No CSV files to merge in {folder_path}")

# Read and concatenate the files
df = pd.concat([pd.read_csv(file) for file in csv_files], ignore_index=True)

# Save the merged dataset
df.to_csv(os.path.join(folder_path, merged_name), index=False)

print("All CSV files in the folder merged successfully!")


SPEI

In [None]:
# To convert .npy to .nc files - RAQUEL SOURCE

import os
import numpy as np
import xarray as xr

# Folder holding the .npy arrays; the .nc files are written next to them.
npy_folder = r"E:\IPMA\SPEI\SPEI3"  # Update with the correct path

# Names of every .npy file found in the folder.
npy_files = [entry for entry in os.listdir(npy_folder) if entry.endswith('.npy')]

for npy_file in npy_files:
    # Read the array from disk.
    npy_path = os.path.join(npy_folder, npy_file)
    data = np.load(npy_path)

    # The three axes are labelled lat, lon and time, each with a plain
    # 0..n-1 integer index as its coordinate.
    index_coords = {
        "lat": np.arange(data.shape[0]),
        "lon": np.arange(data.shape[1]),
        "time": np.arange(data.shape[2]),
    }
    data_array = xr.DataArray(data, dims=["lat", "lon", "time"], coords=index_coords)

    # Output keeps the input's base name, with the extension swapped to .nc.
    stem = os.path.splitext(npy_file)[0]
    nc_file = stem + ".nc"
    nc_path = os.path.join(npy_folder, nc_file)

    # Write the array out as NetCDF.
    data_array.to_netcdf(nc_path)
    print(f"Converted {npy_file} to {nc_file}")


In [None]:
# To separate spei for 1979-2024 - WEB SOURCE - DONE

import pandas as pd
import xarray as xr

def extract_spei_data(input_file, start_year=1979, end_year=2024, output_dir=None):
    """
    Extract the time range start_year..end_year from a NetCDF file and save it as a new NetCDF file.

    The output is named "<input base name>_<start_year>-<end_year>.nc".

    Args:
        input_file (str): Path to the input NetCDF file.
        start_year (int): Start year for extraction (default 1979).
        end_year (int): End year for extraction (default 2024).
        output_dir (str | None): Folder that receives the output file. When None
            (default), the file is written next to the input file; an input path
            without a directory part is written to the current working directory.
    """
    # Local import: this cell's own import block does not bring in os.
    import os

    # Derive the output name with os.path so that directory separators and dots
    # inside directory names do not leak into, or truncate, the base name.
    base_filename = os.path.splitext(os.path.basename(input_file))[0]
    if output_dir is None:
        output_dir = os.path.dirname(input_file)
    output_file = os.path.join(output_dir, f"{base_filename}_{start_year}-{end_year}.nc")

    # The context manager closes the source file once the subset is written.
    with xr.open_dataset(input_file) as ds:
        # Ensure time is in datetime format (if it's not already in datetime format)
        ds['time'] = pd.to_datetime(ds['time'].values)

        # Keep the time steps from 1 January of start_year to 31 December of end_year.
        filtered_ds = ds.sel(time=slice(f"{start_year}-01-01", f"{end_year}-12-31"))

        # Save the filtered dataset to a new NetCDF file
        filtered_ds.to_netcdf(output_file)

    print(f"Saved filtered data to: {output_file}")

# Example usage:
input_file = r"E:\IPMA\SPEIbase_v2-10\SPEI12\spei12.nc"
extract_spei_data(input_file)


In [None]:
# To separate spei for each year between 1979-2024 - WEB SOURCE - DONE

import xarray as xr
import pandas as pd

def extract_spei_data_by_year(input_file, start_year=1979, end_year=2024, output_dir=None):
    """
    Split a NetCDF file into one NetCDF file per year between start_year and end_year.

    Each output is named "<input base name>_<year>.nc". Years for which the
    input holds no time steps are skipped rather than written as empty files.

    Args:
        input_file (str): Path to the input NetCDF file.
        start_year (int): Start year for extraction (default 1979).
        end_year (int): End year for extraction (default 2024), inclusive.
        output_dir (str | None): Folder that receives the output files. When None
            (default), files are written next to the input file; an input path
            without a directory part is written to the current working directory.
    """
    # Local import: this cell's own import block does not bring in os.
    import os

    # Derive the base name once, with os.path, so that directory separators and
    # dots inside directory names do not leak into, or truncate, the name.
    base_filename = os.path.splitext(os.path.basename(input_file))[0]
    if output_dir is None:
        output_dir = os.path.dirname(input_file)

    # The context manager closes the source file once all years are written.
    with xr.open_dataset(input_file) as ds:
        # Ensure time is in datetime format (if it's not already in datetime format)
        ds['time'] = pd.to_datetime(ds['time'].values)

        for year in range(start_year, end_year + 1):
            # Keep the time steps from 1 January to 31 December of this year.
            filtered_ds = ds.sel(time=slice(f"{year}-01-01", f"{year}-12-31"))

            # Only save if data exists for the year.
            if filtered_ds.time.size == 0:
                print(f"No data for {year}; skipped")
                continue

            output_file = os.path.join(output_dir, f"{base_filename}_{year}.nc")

            # Save the filtered data for this year to a new NetCDF file
            filtered_ds.to_netcdf(output_file)
            print(f"Saved filtered data for {year} to: {output_file}")

# Example usage:
input_file = r"E:\IPMA\SPEIbase_v2-10\SPEI12\spei12.nc"
extract_spei_data_by_year(input_file)


In [None]:
# To crop to study area - DONE for 1979-2023

import os
import xarray as xr

# Folder with the yearly files to crop, and the folder for the cropped copies.
input_dir = r"E:\IPMA\SPEIbase_v2-10\SPEI12\1raw_1979_2023"
output_dir = r"E:\IPMA\SPEIbase_v2-10\SPEI12\2cropped_1979_2023"

# Study-area bounds, unpacked in the order (lat_max, lon_min, lat_min, lon_max).
lat_max, lon_min, lat_min, lon_max = 66, -12, 34, 36  # Study area for Europe

# Create the output folder the first time this runs.
output_dir_missing = not os.path.exists(output_dir)
if output_dir_missing:
    os.makedirs(output_dir)

# Label ranges used for the crop; latitude runs from lat_min up to lat_max.
lat_range = slice(lat_min, lat_max)
lon_range = slice(lon_min, lon_max)

for file_name in os.listdir(input_dir):
    # Skip anything that is not a NetCDF file.
    if not file_name.endswith(".nc"):
        continue

    input_file_path = os.path.join(input_dir, file_name)
    # The cropped copy keeps the original file name.
    output_file_path = os.path.join(output_dir, file_name)

    with xr.open_dataset(input_file_path) as ds:
        # Keep only the grid cells inside the study area.
        ds_europe = ds.sel(lat=lat_range, lon=lon_range)

        # Write the cropped dataset while the source is still open.
        ds_europe.to_netcdf(output_file_path)
        print(f"Saved cropped file: {output_file_path}")
