Script to extract GloFAS reanalysis data at station locations stored in an s3 bucket. Metadata file is used to identify which station points to extract (use Lisflood x and y coordinates if available).

In [7]:
import s3fs
import dask
import xarray as xr
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [3]:
country = 'mozambique'  # define country of interest
directory = '/s3/scratch/jamie.towner/flood_aa'  # define main working directory

In [4]:
# Set up the S3 path for the Zarr files
store = f"s3://wfp-seasmon/input/cds/glofas-historical/saf/01/*.zarr"

# Set up connection to s3 store
s3 = s3fs.S3FileSystem.current()

# Fetch list of .zarr stores (files)
remote_files = s3.glob(store)
store = [
    s3fs.S3Map(root=f"s3://{file}", s3=s3, check=False) for file in remote_files
]

In [9]:
# Load the CSV file containing station information (i.e., station name, lat, lon)
# define paths to data
metadata_directory = os.path.join(directory, country, "data/metadata")
station_info_file = "metadata_observations.csv"
station_info_path = os.path.join(metadata_directory, station_info_file)
station_info = pd.read_csv(station_info_path)

# Create the output directory if it doesn't exist
out_dir = os.path.join(directory, country, "data/forecasts/glofas_reanalysis")
if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Initialize a dictionary to store data for each station
station_data = {}

# Initialize tqdm with the total number of iterations to track progress
total_iterations = len(remote_files) * len(station_info)
pbar = tqdm(total=total_iterations, desc="Extracting Data")

# Open multiple .zarr files with dask and xarray, setting chunk configuration
with dask.config.set(**{"array.slicing.split_large_chunks": True}):
    ds = xr.open_mfdataset(
        store,
        decode_coords="all",
        engine="zarr",
        parallel=True,  # Enable parallel processing for speed-up
        combine="by_coords"
    )

    # Loop over each station in the station_info CSV
    for index, row in station_info.iterrows():
        point_name = row['station name']
        latitude = row['lisflood_y']
        longitude = row['lisflood_x']
        if np.isnan(latitude) or np.isnan(longitude):
            latitude = row['latitude']
            longitude = row['longitude']

        # Replace 'lat' and 'lon' with 'latitude' and 'longitude'
        lat_index = ds['latitude'].sel(latitude=latitude, method='nearest').values
        lon_index = ds['longitude'].sel(longitude=longitude, method='nearest').values

        # Extract river discharge data for the nearest point
        data_at_point = ds['dis24'].sel(latitude=lat_index, longitude=lon_index).values
        dates = ds.time.values

        # Convert dates to DD/MM/YYYY format
        formatted_dates = pd.to_datetime(dates).strftime('%d/%m/%Y')

        # Create a DataFrame for the extracted data
        extracted_df = pd.DataFrame({'date': formatted_dates, 'river discharge': data_at_point})

        # Append the data to the station's DataFrame within the station_data dictionary
        if point_name not in station_data:
            station_data[point_name] = extracted_df
        else:
            # Merge with the existing data for the same station
            station_data[point_name] = pd.concat([station_data[point_name], extracted_df])
        
        pbar.update(len(remote_files))  # Update tqdm progress by number of files processed

# Close the tqdm progress bar
pbar.close()

# Save extracted data for each station to CSV files
for station, data in station_data.items():
    csv_file_name = os.path.join(out_dir, f"{station}.csv")
    data.to_csv(csv_file_name, index=False)

Extracting Data:   0%|          | 0/720 [00:50<?, ?it/s]
Extracting Data: 100%|██████████| 720/720 [04:03<00:00,  2.96it/s]


In [13]:
all_dfs = []
for station, data in station_data.items():
    name = "".join(c for c in station if c.isalnum() or c in (' ', '_')).replace(' ', '_')
    data = data.rename(columns={'river discharge':name})
    data = data.set_index('date')
    all_dfs.append(data)
pd.concat(all_dfs,axis=1)

Unnamed: 0_level_0,Limpopo_em_Mapai,Limpopo__Combomune,Limpopo_em_Chokwe,Limpopo_em_Sicacate,Limpopo_em_Mabalane,Limpopo_em_XaiXai,Changane_em_Chibuto,Limpopo_em_Macaretane,Zambeze_em_Marromeu_Sena_Sugar,Chire_em_Vila_Bocage,Chire_em_megaza_Mutamba,Zambeze_em_Caia_SS,Revubue_em_Chingodzi,Zambeze_em_Zumbo,Zambeze_em_Tete,LuenhaLuenha_I
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
01/01/1979,67.312500,78.109375,118.546875,162.015625,83.046875,172.546875,18.500000,112.390625,6006.078125,2158.187500,2149.546875,6039.031250,154.625000,4546.671875,155.062500,13.203125
02/01/1979,60.187500,68.015625,100.890625,139.140625,71.765625,147.703125,18.453125,95.875000,6121.359375,2141.906250,2137.546875,6207.343750,150.265625,4486.312500,150.484375,12.437500
03/01/1979,55.484375,61.140625,87.375000,121.140625,63.875000,128.187500,18.437500,83.453125,6335.562500,2132.890625,2132.718750,6458.640625,151.000000,4484.171875,151.718750,11.718750
04/01/1979,51.937500,56.500000,77.765625,107.468750,58.578125,113.203125,18.421875,74.984375,6596.234375,2146.218750,2157.296875,6720.765625,156.765625,4717.218750,156.781250,10.984375
05/01/1979,50.000000,52.906250,71.640625,97.609375,54.656250,102.046875,18.390625,69.828125,6833.203125,2173.812500,2182.031250,6938.046875,174.890625,5321.890625,174.000000,10.281250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27/12/2023,609.632812,351.343750,233.664062,292.671875,264.867188,315.960938,21.554688,232.429688,3544.843750,1696.757812,1698.617188,3555.710938,55.070312,1896.640625,56.187500,0.875000
28/12/2023,887.953125,733.156250,450.976562,313.484375,658.031250,296.648438,21.242188,528.476562,3575.375000,1707.015625,1714.359375,3597.179688,52.453125,2054.179688,53.351562,1.398438
29/12/2023,980.148438,937.101562,764.796875,620.421875,878.296875,541.796875,20.804688,826.335938,3622.257812,1726.132812,1733.398438,3655.976562,51.585938,3704.164062,52.671875,3.414062
30/12/2023,961.867188,1002.171875,1028.093750,769.640625,995.531250,722.507812,20.593750,1092.562500,3702.375000,1745.648438,1753.281250,3790.500000,51.679688,4879.015625,52.554688,7.218750


In [16]:
csv_file_name = os.path.join(out_dir, f"all_stations/glofas_reanalysis_complete_series.csv")
pd.concat(all_dfs,axis=1).to_csv(csv_file_name)

In [17]:
csv_file_name = os.path.join(out_dir, f"all_stations/glofas_reanalysis.csv")

df_all = pd.concat(all_dfs,axis=1)
df_all.index = pd.to_datetime(df_all.index,format='%d/%m/%Y')
df_all[df_all.index>='01/01/2003'].to_csv(csv_file_name)

'/s3/scratch/jamie.towner/flood_aa/mozambique/data/forecasts/glofas_reanalysis/all_stations/glofas_reanalysis_newstations.csv'

### get correlation of observed data with glofas

In [27]:
df_obs = pd.read_csv('/s3/scratch/jamie.towner/flood_aa/mozambique/data/observations/gauging_stations/all_stations/observations_newstations.csv')
df_obs = df_obs.rename(columns={'Unnamed: 0':'date'})
df_obs["date"] = pd.to_datetime(df_obs["date"], format='mixed')
df_obs = df_obs.set_index('date')
df_obs

Unnamed: 0_level_0,Limpopo_em_Mapai,Limpopo__Combomune,Limpopo_em_Chokwe,Limpopo_em_Sicacate,Limpopo_em_Mabalane,Limpopo_em_XaiXai,Changane_em_Chibuto,Limpopo_em_Macaretane,Zambeze_em_Marromeu_Sena_Sugar,Chire_em_Vila_Bocage,Chire_em_megaza_Mutamba,Zambeze_em_Caia_SS,Revubue_em_Chingodzi,Zambeze_em_Zumbo,Zambeze_em_Tete,LuenhaLuenha_I
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2003-01-01,,1.50,0.88,2.150000,0.930000,1.305,,96.870000,,,,3.023333,2.100000,,2.423333,
2003-01-02,,1.51,0.89,2.160000,0.926667,1.150,,96.880000,,,,3.063333,2.033333,,2.383333,
2003-01-03,,1.52,0.89,2.170000,0.920000,1.330,,96.903333,,,,3.080000,2.246667,,2.496667,
2003-01-04,,1.53,0.89,2.166667,0.920000,1.610,,96.906667,,,,3.100000,5.250000,,2.886667,
2003-01-05,,1.54,0.89,2.170000,0.910000,1.710,,96.920000,,,,3.066667,4.290000,,3.316667,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-12-27,,,,,,,,,,,,,,,,
2023-12-28,,,,,,,,,,,,,,,,
2023-12-29,,,,,,,,,,,,,,,,
2023-12-30,,,,,,,,,,,,,,,,


In [30]:
df_all[df_all.index>='01/01/2003'].corrwith(df_obs)

Limpopo_em_Mapai                  0.640104
Limpopo__Combomune                0.499416
Limpopo_em_Chokwe                 0.510731
Limpopo_em_Sicacate               0.476322
Limpopo_em_Mabalane               0.514672
Limpopo_em_XaiXai                 0.392785
Changane_em_Chibuto               0.348991
Limpopo_em_Macaretane            -0.138975
Zambeze_em_Marromeu_Sena_Sugar    0.601446
Chire_em_Vila_Bocage              0.133261
Chire_em_megaza_Mutamba          -0.067096
Zambeze_em_Caia_SS                0.609723
Revubue_em_Chingodzi              0.562018
Zambeze_em_Zumbo                  0.623901
Zambeze_em_Tete                   0.367926
LuenhaLuenha_I                    0.427382
dtype: float64