# Evaluate Wind

In this notebook, the impact of the wind and gyro data in hte metadata is to be used to explore if these modalities allow the detection of critical phases onboard (e.g., a storm), during which we would expect a heightened stress level.

To do so, the metadata of the VDR files that remained on the local machine was extracted and saved under `data/interim-meta`.

In [1]:
import audeer
import pandas as pd
import numpy as np
import os
from datetime import datetime
from pathlib import Path

In [2]:
def read_metadata_formatter(dir_meta, str_channel, formatter, column_names):
    """
    Extract only lines matching a specific NMEA formatter (e.g., '$GPVTG') from a channel's .txt files.
    Returns a DataFrame with only those lines parsed.

    Parameters:
    - dir_meta: root directory
    - str_channel: channel string (e.g., 'SI04')
    - formatter: string, e.g. '$GPVTG' (or list of formatters)
    - column_names: list or dict for the formatter
    """
    from pathlib import Path
    import pandas as pd
    import warnings

    if isinstance(formatter, str):
        formatters = [formatter]
    else:
        formatters = formatter

    all_dataframes = []
    dir_meta_path = Path(dir_meta)
    if not dir_meta_path.exists():
        warnings.warn(f"Directory not found: {dir_meta}")
        return pd.DataFrame()
    subdirs = [d for d in dir_meta_path.iterdir() if d.is_dir()]
    if not subdirs:
        warnings.warn(f"No subdirectories found in {dir_meta}")
        return pd.DataFrame()
    for subdir in subdirs:
        pattern = f"*({str_channel}).txt"
        matching_files = list(subdir.glob(pattern))
        if len(matching_files) == 0:
            warnings.warn(f"No files matching pattern '{pattern}' found in {subdir}")
            continue
        elif len(matching_files) > 1:
            warnings.warn(
                f"Multiple files matching pattern '{pattern}' in {subdir.name}: {[f.name for f in matching_files]}"
            )
            raise ValueError(
                f"Multiple files matching pattern '{pattern}' in {subdir.name}"
            )
        file_path = matching_files[0]
        print(f"Processing: {file_path.name}")
        if not file_path.exists():
            warnings.warn(f"File not found: {file_path}")
            continue
        with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                if not line or " : " not in line:
                    continue
                timestamp_str, data_part = line.split(" : ", 1)
                if not any(data_part.startswith(fmt) for fmt in formatters):
                    continue
                # Remove checksum if present
                if "*" in data_part:
                    data_part = data_part.split("*")[0]
                fields = data_part.split(",")
                row_data = {
                    "timestamp": pd.to_datetime(
                        timestamp_str, format="%Y-%m-%d,%H:%M:%S.%f"
                    )
                }
                if isinstance(column_names, list):
                    for i, col_name in enumerate(column_names):
                        if col_name == "time":
                            row_data[col_name] = row_data["timestamp"]
                        elif i - 1 < len(fields) and col_name != "time":
                            field_idx = i - 1 if "time" in column_names else i
                            if field_idx < len(fields):
                                value = fields[field_idx].strip()
                                if value.replace(".", "").replace("-", "").isdigit():
                                    try:
                                        row_data[col_name] = (
                                            float(value) if "." in value else int(value)
                                        )
                                    except ValueError:
                                        row_data[col_name] = value
                                else:
                                    row_data[col_name] = value
                            else:
                                row_data[col_name] = None
                elif isinstance(column_names, dict):
                    for field_idx, col_name in column_names.items():
                        if field_idx < len(fields):
                            value = fields[field_idx].strip()
                            if value.replace(".", "").replace("-", "").isdigit():
                                try:
                                    row_data[col_name] = (
                                        float(value) if "." in value else int(value)
                                    )
                                except ValueError:
                                    row_data[col_name] = value
                            else:
                                row_data[col_name] = value
                        else:
                            row_data[col_name] = None
                row_data["raw_data"] = data_part
                row_data["source_file"] = file_path.name
                row_data["source_directory"] = subdir.name
                all_dataframes.append(row_data)
    if not all_dataframes:
        warnings.warn(f"No valid data found for channel {str_channel} in {dir_meta}")
        return pd.DataFrame()
    return pd.DataFrame(all_dataframes)


# Example usage for extracting only VTG from SI04 (GPS):
# vtg_cols = ["time", "modality", "cog_true", "T", "cog_magnetic", "M", "sog", "N", "mode"]
# df_vtg = read_metadata_formatter(metadata_dir, "SI04", "$GPVTG", vtg_cols)

# You can now use this function in your true wind calculation cell for each formatter/channel.

In [3]:
# Path to the directories
dir_metadata = "../../data/interim-meta"
dir_evaluated = "../../data/evaluated"
dir_out_wind = audeer.mkdir(os.path.join(dir_evaluated, "wind_speed"))

In [4]:
# Calculate true wind speed and direction using GPS COG (course over ground) with formatter-specific extraction

# 1. Read all required channels using formatter-specific extraction
# Wind (apparent, SI07, MWV)
print("Reading wind metadata from SI07 channel...")
wind_cols = [
    "time",
    "modality",
    "angle_in_degrees",
    "reference",
    "wind_speed",
    "unit",
    "status",
]
df_wind = read_metadata_formatter(dir_metadata, "SI07", "$WIMWV", wind_cols)

# Speed log (SI05, VBW: dual ground/water speed)
print("Reading speed metadata from SI05 channel...")
speed_cols = [
    "time",
    "modality",
    "longitudinal_water_speed",
    "transverse_water_speed",
    "status",
    "",
    "",
]
df_speed = read_metadata_formatter(dir_metadata, "SI05", "$VDVBW", speed_cols)

# GPS (SI04, VTG: COG and SOG)
print("Reading GPS metadata from SI04 channel...")
gps_cols = [
    "time",
    "modality",
    "cog_true",
    "T",
    "cog_magnetic",
    "M",
    "sog",
    "N",
    "mode",
]
df_gps = read_metadata_formatter(dir_metadata, "SI04", "$GPVTG", gps_cols)

# 2. Preprocess and merge on timestamp (nearest, within 2 seconds)
for df in [df_wind, df_speed, df_gps]:
    df["timestamp"] = pd.to_datetime(df["timestamp"])

# Merge wind and speed
df_merged = pd.merge_asof(
    df_wind.sort_values("timestamp"),
    df_speed.sort_values("timestamp"),
    on="timestamp",
    direction="nearest",
    tolerance=pd.Timedelta("2s"),
    suffixes=("", "_speed"),
)
# Merge with GPS
df_merged = pd.merge_asof(
    df_merged.sort_values("timestamp"),
    df_gps[["timestamp", "cog_true", "sog"]].sort_values("timestamp"),
    on="timestamp",
    direction="nearest",
    tolerance=pd.Timedelta("2s"),
)

# Convert relevant columns to numeric, coercing errors (empty strings, invalid values become NaN)
for col in ["wind_speed", "angle_in_degrees", "sog", "cog_true"]:
    df_merged[col] = pd.to_numeric(df_merged[col], errors="coerce")

# Keep a copy before dropping for diagnostics
df_merged_before = df_merged.copy()
before_drop = df_merged_before.shape[0]
df_merged = df_merged_before.dropna(
    subset=["wind_speed", "angle_in_degrees", "sog", "cog_true"]
)
after_drop = df_merged.shape[0]
print(f"After dropping missing/invalid data, merged shape: {df_merged.shape}")
print(
    f"Dropped {before_drop - after_drop} rows due to invalid or missing numeric data."
)

# Show a sample of problematic rows (if any were dropped)
if before_drop > after_drop:
    mask_invalid = df_merged_before.index.difference(df_merged.index)
    print("Sample of problematic rows (first 5):")
    print(
        df_merged_before.loc[
            mask_invalid,
            ["timestamp", "wind_speed", "angle_in_degrees", "sog", "cog_true"],
        ].head()
    )


# 3. Calculate true wind using COG and SOG from GPS (rotate apparent wind vector into earth coordinates)
def calc_true_wind_gps(row):
    aws = float(row["wind_speed"])  # apparent wind speed
    awa = np.deg2rad(
        float(row["angle_in_degrees"])
    )  # apparent wind angle (deg to rad, relative to bow)
    sog = float(row["sog"])
    cog = np.deg2rad(float(row["cog_true"]))  # use COG as heading proxy

    # Apparent wind in earth coordinates (rotate by COG)
    aws_x_earth = aws * np.cos(cog + awa)
    aws_y_earth = aws * np.sin(cog + awa)

    # Ship velocity in earth coordinates
    sog_x = sog * np.cos(cog)
    sog_y = sog * np.sin(cog)

    # True wind vector in earth coordinates
    tws_x = aws_x_earth + (-sog_x)
    tws_y = aws_y_earth + (-sog_y)
    tws = np.hypot(tws_x, tws_y)
    twd = (np.rad2deg(np.arctan2(tws_y, tws_x))) % 360
    return pd.Series({"true_wind_speed_gps": tws, "true_wind_direction_gps": twd})


print("Calculating true wind using GPS COG/SOG (corrected for reference frame)...")
df_true_wind_gps = df_merged.apply(calc_true_wind_gps, axis=1)
df_merged = pd.concat([df_merged, df_true_wind_gps], axis=1)

print(
    "Sample of calculated true wind using GPS COG/SOG (formatter-specific extraction, corrected):"
)
print(
    df_merged[
        [
            "timestamp",
            "wind_speed",
            "angle_in_degrees",
            "sog",
            "cog_true",
            "true_wind_speed_gps",
            "true_wind_direction_gps",
        ]
    ].head()
)

Reading wind metadata from SI07 channel...
Processing: SERIAL_23_01_2023-10_05_59.10_(SI07).txt
Processing: SERIAL_30_01_2023-13_58_09.40_(SI07).txt
Processing: SERIAL_09_01_2023-23_56_53.40_(SI07).txt
Processing: SERIAL_26_12_2022-23_56_48.20_(SI07).txt
Processing: SERIAL_28_12_2022-23_56_46.40_(SI07).txt
Processing: SERIAL_11_01_2023-23_56_52.50_(SI07).txt
Processing: SERIAL_07_01_2023-23_56_39.40_(SI07).txt
Processing: SERIAL_02_02_2023-09_39_44.60_Track_2_(SI07).txt
Processing: SERIAL_31_12_2022-23_56_44.60_(SI07).txt
Processing: SERIAL_30_12_2022-23_56_45.50_(SI07).txt
Processing: SERIAL_25_01_2023-13_40_58.30_(SI07).txt
Processing: SERIAL_04_01_2023-23_56_42.00_(SI07).txt
Processing: SERIAL_01_01_2023-23_56_43.80_(SI07).txt
Processing: SERIAL_26_01_2023-10_50_57.80_(SI07).txt
Processing: SERIAL_29_12_2022-23_56_46.50_(SI07).txt
Processing: SERIAL_25_12_2022-23_56_49.10_(SI07).txt
Processing: SERIAL_27_12_2022-23_56_47.30_(SI07).txt
Processing: SERIAL_06_01_2023-23_56_40.40_(SI07)

In [5]:
df_true_wind_gps

Unnamed: 0,true_wind_speed_gps,true_wind_direction_gps
0,8.434060,350.382892
1,8.618901,348.255906
2,8.434060,350.382892
3,9.491324,351.352425
4,8.434060,350.482892
...,...,...
2793875,11.459745,6.548890
2793876,11.459745,6.548890
2793877,12.512094,7.421926
2793878,11.651946,5.001464


In [6]:
# df_merged.to_csv(os.path.join(dir_out_wind, "true_wind_speed.csv"))

In [7]:
df_merged.sort_values(by="true_wind_speed_gps", ascending=False).head(10)

Unnamed: 0,timestamp,time,modality,angle_in_degrees,reference,wind_speed,unit,status,raw_data,source_file,...,transverse_water_speed,status_speed,Unnamed: 14,raw_data_speed,source_file_speed,source_directory_speed,cog_true,sog,true_wind_speed_gps,true_wind_direction_gps
2115555,2023-01-29 00:34:07.600,2023-01-29 00:34:07.600,$WIMWV,343,R,50,N,A,"$WIMWV,343,R,050,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.0,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.0,12.5,38.221317,359.513109
2115550,2023-01-29 00:34:02.400,2023-01-29 00:34:02.400,$WIMWV,344,R,50,N,A,"$WIMWV,344,R,050,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.1,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.2,12.5,38.140174,1.017008
2115553,2023-01-29 00:34:05.600,2023-01-29 00:34:05.600,$WIMWV,341,R,49,N,A,"$WIMWV,341,R,049,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.0,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.3,12.5,37.403071,357.053627
2115554,2023-01-29 00:34:06.600,2023-01-29 00:34:06.600,$WIMWV,341,R,49,N,A,"$WIMWV,341,R,049,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.0,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.1,12.5,37.403071,356.853627
2115549,2023-01-29 00:34:01.400,2023-01-29 00:34:01.400,$WIMWV,344,R,49,N,A,"$WIMWV,344,R,049,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.1,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.1,12.5,37.144373,0.777662
2115558,2023-01-29 00:34:10.800,2023-01-29 00:34:10.800,$WIMWV,340,R,48,N,A,"$WIMWV,340,R,048,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,12.9,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.2,12.5,36.505052,355.474429
2115559,2023-01-29 00:34:11.800,2023-01-29 00:34:11.800,$WIMWV,340,R,48,N,A,"$WIMWV,340,R,048,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,12.9,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.3,12.5,36.505052,355.574429
2115557,2023-01-29 00:34:09.800,2023-01-29 00:34:09.800,$WIMWV,343,R,48,N,A,"$WIMWV,343,R,048,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.0,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.1,12.5,36.230985,359.310686
2115548,2023-01-29 00:34:00.300,2023-01-29 00:34:00.300,$WIMWV,345,R,48,N,A,"$WIMWV,345,R,048,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.1,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.0,12.5,36.071305,1.854222
2115551,2023-01-29 00:34:03.400,2023-01-29 00:34:03.400,$WIMWV,345,R,48,N,A,"$WIMWV,345,R,048,N,A",SERIAL_28_01_2023-08_33_55.80_(SI07).txt,...,,A,,"$VDVBW,13.1,,A,,,V",SERIAL_28_01_2023-08_33_56.20_(SI05).txt,IMO_9510682_2023-01-28T08-33-55,22.4,12.5,36.071305,2.254222
