# Compiling SBR Wind Data

We want to compile all the wind data released from the SBR phases (currently phase 0 and phase 1) and use that to generate Gaussian plumes.  Since wind data is only released around overpasses and we want to be able to randomly sample a 5-10 min contiguous duration (i.e. no big gaps between data points), we keep the wind data for each overpass in a separate file.

Here, we want to extract the wind datasets for an overpass and the relevant columns.  Since the data is stored in a GDrive, the simplest way of extracting the data is to download the whole folders into `orbio/methane-cv/notebooks/data/sbr_data` and glob for the wind files.  We then load the relevant columns (wind vector components and direction) and save them into new files named `<satellite>_<date of first reading>.parquet`.

TODO: we need some way to group the contiguous wind data into time-ordered groups.  We can have multiple overpasses in a single day and even at the same time!

In [None]:
import tabulate

from copy import deepcopy
from datetime import datetime
from pathlib import Path
from tempfile import TemporaryDirectory, TemporaryFile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from src.azure_wrap.blob_storage_sdk_v2 import upload_dir

# Show every row when a DataFrame is displayed instead of truncating long outputs.
pd.set_option("display.max_rows", None)

In [None]:
# Recursively collect every raw wind/meteorological CSV below the local SBR download folder.
sbr_root = Path("./data/sbr_data")
wind_files = list(sbr_root.rglob("*wind+meteorological_data.csv"))
len(wind_files)

In [None]:
def wind_speed(u: float, v: float) -> float:
    """Return the magnitude of the wind vector with components ``u`` and ``v``.

    The computation is element-wise, so NumPy arrays and pandas Series are
    accepted as well as scalars.
    """
    squared_magnitude = np.square(u) + np.square(v)
    return np.sqrt(squared_magnitude)


def wind_direction(u: float, v: float) -> float:
    """Return the angle of the vector ``(u, v)`` in degrees, computed as ``atan2(u, v)``.

    The result lies in [-180, 180]: 0 for a vector along +v and 90 for a
    vector along +u.
    """
    angle_rad = np.arctan2(u, v)
    return np.rad2deg(angle_rad)

In [None]:
# Compile every raw wind CSV into a single long table with a common schema and persist it.
wind_dir = Path("data/")
wind_dir.mkdir(parents=True, exist_ok=True)

# Columns kept in the compiled table, in output order.
compiled_columns = [
    "satellite",
    "date",
    "timestamp",
    "velocity_x",
    "velocity_y",
    "direction_deg",
    "speed",
    "calculated_direction_deg",
]

wind_dfs = []
for file in wind_files:
    # the satellite tag is the last "_"-separated token of the containing folder name
    satellite = Path(file).parent.stem.split("_")[-1]
    df = (
        pd.read_csv(file)
        .assign(satellite=satellite)
        .assign(timestamp=lambda df: pd.to_datetime(df["TIMESTAMP UTC"]))
        .assign(date=lambda df: df.timestamp.dt.date)
        .assign(velocity_x=lambda df: df["Ux_Avg_10meter(m/s)"])
        .assign(velocity_y=lambda df: df["Uy_Avg_10meter(m/s)"])
        # NOTE(review): presumably flips the reported bearing by 180 degrees so it is comparable
        # with the arctan2-based direction below -- confirm the sensor's convention.
        .assign(direction_deg=lambda df: df["WndDir_10meter(degrees)"] - 180)
        # some overpasses have no data but the file exists; this runs before the column selection,
        # so a missing value in any CSV column drops the row
        .dropna()
        # the helpers are NumPy based, so they are applied to whole columns at once instead of
        # row by row; this is faster and still yields a Series when the frame is empty
        .assign(
            speed=lambda df: wind_speed(df.velocity_x, df.velocity_y),
            calculated_direction_deg=lambda df: wind_direction(df.velocity_x, df.velocity_y),
        )
    )[compiled_columns]
    wind_dfs.append(df)

wind_df = pd.concat(wind_dfs)
wind_df.to_parquet(wind_dir / "sbr_wind.parquet")

In [None]:
# Preview the first rows of the compiled wind table.
wind_df.head()

In [None]:
# Per-date, per-satellite summary: number of readings plus speed and direction statistics.
# NOTE(review): the direction mean/std are plain arithmetic statistics over angles in degrees,
# so readings on either side of the +/-180 degree wrap-around are not treated as neighbours.
group_keys = ["date", "satellite"]
(
    wind_df.sort_values(group_keys, ascending=True)
    .groupby(group_keys)
    .agg(
        num_readings=("timestamp", "count"),
        speed_max=("speed", "max"),
        speed_min=("speed", "min"),
        speed_mean=("speed", "mean"),
        speed_std=("speed", "std"),
        direction_mean=("calculated_direction_deg", "mean"),
        direction_std=("calculated_direction_deg", "std"),
    )
)

In [None]:
# Show the readings recorded on 2024-11-24 for the satellite tagged "LS9".
target_date = datetime.strptime("2024-11-24", "%Y-%m-%d").date()
on_target_date = wind_df["date"] == target_date
from_target_satellite = wind_df["satellite"] == "LS9"
wind_df[on_target_date & from_target_satellite]

## Wind Speed and Direction Distributions

In [None]:
# Wind speed histogram next to polar histograms of the calculated and the reported wind direction.
width = 2 * np.pi / 360  # one degree expressed in radians: the angular width of each polar bar


def _direction_density(directions_deg):
    """Return the fraction of readings falling on each whole-degree direction value."""
    return (
        directions_deg.round(0)
        .value_counts()
        .rename_axis("direction")
        .rename("counts")
        .reset_index()
        .assign(counts=lambda df: df.counts / df.counts.sum())
    )


def _plot_direction_rose(ax, directions_deg, title):
    """Draw the direction density as a polar bar chart with 0 at the top, increasing clockwise."""
    density = _direction_density(directions_deg)
    ax.set_theta_zero_location("N")
    ax.set_theta_direction(-1)
    # polar axes interpret theta in radians, so the degree values must be converted before plotting
    ax.bar(np.radians(density.direction), density.counts, bottom=0.0, width=width)
    ax.set_yticklabels([])
    ax.set_title(title)


plt.figure(figsize=(15, 4))
ax1 = plt.subplot(131)
ax2 = plt.subplot(132, projection="polar")
ax3 = plt.subplot(133, projection="polar")

# wind speed
ax1.hist(wind_df.speed, bins=360)
ax1.set_yticklabels([])
ax1.set_title("Wind Speed Density")
ax1.set_xlabel("meters per second", fontsize=10)
ax1.set_ylabel("", fontsize=14)

# wind direction calculated from the velocity components
_plot_direction_rose(ax2, wind_df.calculated_direction_deg, "Calculated Wind Direction Density")

# wind direction as reported in the source files
_plot_direction_rose(ax3, wind_df.direction_deg, "Reported Wind Direction Density")

In [None]:
# Signed angular difference between the reported and the calculated direction, wrapped into
# [-180, 180). Both values are angles, so a raw subtraction would report e.g. 179 vs -179 degrees
# as an error of 358 degrees instead of -2 degrees.
raw_direction_error = wind_df.direction_deg - wind_df.calculated_direction_deg
direction_error = (raw_direction_error + 180) % 360 - 180
print(f"Direction Error Mean: {direction_error.mean()}")
print(f"Direction Error STD: {direction_error.std()}")

In [None]:
# Side-by-side scatter plots of wind speed against each of the two direction estimates.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# one (axes, direction series, label) entry per panel; both panels share the same styling
panels = (
    (ax1, wind_df.direction_deg, "Reported"),
    (ax2, wind_df.calculated_direction_deg, "Calculated"),
)
for axis, directions, label in panels:
    axis.scatter(directions, wind_df.speed, alpha=0.25, s=4)
    axis.set_title(f"{label} Wind Direction vs Wind Speed")
    axis.set_xlabel(f"{label} Wind Direction")
    axis.set_ylabel("Wind Speed")
    axis.grid(True, linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Single scatter plot of the reported direction against the direction derived from the components.
fig, ax = plt.subplots(figsize=(8, 4))

ax.scatter(wind_df.direction_deg, wind_df.calculated_direction_deg, alpha=0.25, s=4)
ax.set_title("Reported Wind Direction vs Calculated Wind Direction")
ax.set_xlabel("Reported Wind Direction")
ax.set_ylabel("Calculated Wind Direction")
ax.grid(True, linestyle="--", alpha=0.7)

plt.tight_layout()
plt.show()

In [None]:
# Print the first 20 reported/calculated direction pairs as a markdown table.
# `DataFrame.to_markdown` needs the optional `tabulate` package: !pip install tabulate
direction_columns = ["direction_deg", "calculated_direction_deg"]
direction_preview = wind_df[direction_columns].head(20)
print(direction_preview.to_markdown())

## Extracting Wind Data and Uploading to BlobStore

Here, we want to extract the wind datasets for an overpass and the relevant columns.  Since the data is stored in a GDrive, the simplest way of extracting the data is to download the whole folders into `orbio/methane-cv/notebooks/data/sbr_data` and glob for the wind files.  We then load the relevant columns (wind vector components and direction) and save them into new files named `<satellite>_<date of first reading>.parquet`.

In [None]:
# Convert each raw wind CSV into a trimmed parquet file and upload it to the blob store.
wind_dir = Path("data/wind/")
wind_dir.mkdir(parents=True, exist_ok=True)

max_time_gap = 10  # seconds - files with larger gaps typically have irregular sampling
WIND_BLOB = "sbr_wind_data"

# the parquet files are only staging copies for the upload, so they live in a temporary directory
with TemporaryDirectory() as temp_dir:
    for file in wind_files:
        # the satellite tag is the last "_"-separated token of the containing folder name
        satellite = Path(file).parent.stem.split("_")[-1]
        df = (
            pd.read_csv(file)
            .assign(satellite=satellite)
            .assign(timestamp=lambda df: pd.to_datetime(df["TIMESTAMP UTC"]))
            .assign(speed_x=lambda df: df["Ux_Avg_10meter(m/s)"])
            .assign(speed_y=lambda df: df["Uy_Avg_10meter(m/s)"])
            .assign(direction_deg=lambda df: df["WndDir_10meter(degrees)"])[
                ["satellite", "timestamp", "speed_x", "speed_y", "direction_deg"]
            ]
            .dropna()  # some overpasses have no data but the file exists
            .sort_values(by="timestamp", ascending=True)
        )

        if df.empty:
            print(f"'{file}' has no data -- skipping")
            continue
        # skip files whose largest gap between consecutive readings exceeds `max_time_gap` seconds,
        # as that indicates irregular sampling
        if (gap := df.timestamp.diff().max()) > pd.Timedelta(seconds=max_time_gap):
            print(f"'{file}' has a large gap in time ({gap}) -- skipping")
            continue

        # name the staged file after the satellite and the date of the earliest reading
        date = df.timestamp.iloc[0].date().isoformat()
        filepath = Path(temp_dir) / f"{satellite}_{date}.parquet"
        df.to_parquet(filepath)

        ##############################
        # upload to blob store
        ##############################
        # since the files are on local disk and we want to preserve the original file / directory structure
        # we need to strip out the containing directory not part of the original structure
        # (this keeps the last two folder levels plus the file name)
        relative_path = file.parent.parent.parent
        blob_path = file.relative_to(relative_path)
        azure_blob_path = Path(WIND_BLOB) / blob_path

        try:
            upload_dir(filepath.as_posix(), azure_blob_path.as_posix(), recursive=False)
        except Exception as err:
            # not every exception carries an `error_code`; read it defensively so that unrelated
            # failures are re-raised as they are instead of being masked by an AttributeError
            if getattr(err, "error_code", None) == "ScriptExecution.WriteStreams.AlreadyExists":
                print(f"{file} already exists or is not empty -- skipping")
            else:
                raise

### Check if files were uploaded OK

In [None]:
from azureml.fsspec import AzureMachineLearningFileSystem

from src.azure_wrap.blob_storage_sdk_v2 import DATASTORE_URI

# List every parquet file found below the wind blob prefix in the datastore.
# NOTE: this rebinds `wind_files`, which until this point held the local raw CSV paths.
fs = AzureMachineLearningFileSystem(DATASTORE_URI)
wind_files = list(fs.glob(f"{WIND_BLOB}/**/*.parquet"))

# number of parquet files found
len(wind_files)

### Check for files with large time gaps

In [None]:
# Summarise, for every raw wind CSV, the time span it covers and its largest gap between
# consecutive readings.
# The raw CSVs are globbed from local disk again because `wind_files` is rebound to the uploaded
# parquet blob paths by the upload check, and those cannot be parsed by the `pd.read_csv` call below.
raw_wind_files = list(Path("./data/sbr_data").glob("**/*wind+meteorological_data.csv"))

files = []
durations = []
max_gaps = []

for file in raw_wind_files:
    satellite = Path(file).parent.stem.split("_")[-1]
    df = (
        pd.read_csv(file)
        .assign(satellite=satellite)
        .assign(timestamp=lambda df: pd.to_datetime(df["TIMESTAMP UTC"]))
        .assign(speed_x=lambda df: df["Ux_Avg_10meter(m/s)"])
        .assign(speed_y=lambda df: df["Uy_Avg_10meter(m/s)"])
        .assign(direction_deg=lambda df: df["WndDir_10meter(degrees)"])[
            ["satellite", "timestamp", "speed_x", "speed_y", "direction_deg"]
        ]
        .dropna()  # some overpasses have no data but the file exists
        # order the readings in time so that `diff()` measures gaps between consecutive readings,
        # matching the check applied before uploading
        .sort_values(by="timestamp", ascending=True)
    )

    duration = df.timestamp.max() - df.timestamp.min()
    durations.append(duration)
    max_gap = df.timestamp.diff().max()
    max_gaps.append(max_gap)
    files.append(file)

# one row per file, shortest duration first
df = pd.DataFrame({"file": files, "duration": durations, "max_gap": max_gaps}).sort_values(by="duration")

In [None]:
# Keep only the files whose largest gap between readings exceeds 10 seconds.
gap_threshold = pd.Timedelta(seconds=10)
exceeds_threshold = df.max_gap > gap_threshold
large_gaps_df = df.loc[exceeds_threshold].reset_index(drop=True)
large_gaps_df

In [None]:
# Show the per-file summary ordered from the shortest to the longest duration.
# Looks like all the files have at least an hour duration;
# the minimum duration of a file with gaps under 10 seconds is just over 4 hours
df.sort_values("duration")

### Inspecting Files with large gaps

We will exclude files with gaps larger than 10 seconds, as that typically indicates some weird sampling happening.

In [None]:
# First five files flagged as having a large gap.
large_gaps_df.file.head(5).tolist()

In [None]:
# Load a single raw wind file and plot its timestamps against row order to inspect its sampling.
df1_path = Path("data/sbr_data/Phase 1 (1-1-25-3-31-25)/02272025_LS8/2025-02-27_wind+meteorological_data.csv")
df1 = (
    pd.read_csv(df1_path)
    # take the satellite tag from this file's own folder name rather than from whatever value
    # an earlier loop happened to leave behind in `satellite`
    .assign(satellite=df1_path.parent.stem.split("_")[-1])
    .assign(timestamp=lambda df: pd.to_datetime(df["TIMESTAMP UTC"]))
    .assign(speed_x=lambda df: df["Ux_Avg_10meter(m/s)"])
    .assign(speed_y=lambda df: df["Uy_Avg_10meter(m/s)"])
    .assign(direction_deg=lambda df: df["WndDir_10meter(degrees)"])[
        ["satellite", "timestamp", "speed_x", "speed_y", "direction_deg"]
    ]
    .dropna()  # some overpasses have no data but the file exists
)
df1.timestamp.plot()

In [None]:
# Largest difference between consecutive timestamps in the inspected file (in row order).
df1.timestamp.diff().max()

In [None]:
# Display the inspected file's rows.
df1