In [1]:
import pandas as pd
import numpy as np

BG_CSV     = "data/background_meta.csv"
OUT_CSV    = "data/background_none_split.csv"
SEED       = 42

N_TRAIN = 9909
N_VALID = 3411
N_TEST  = 6121

# Columns used for deduplication, so each selected row maps to a unique
# timeseries prefix for the downloader.
keys = ["deployment_id", "report_cycle_start_epoch"]


def build_none_split(bg, n_train, n_valid, n_test, seed=SEED, subset_keys=None):
    """Draw a seeded random train/valid/test split from the "none" background rows.

    Steps: keep rows whose ``category`` equals "none" (case-insensitive), keep the
    first row per unique ``subset_keys`` combination, shuffle with a seeded
    generator, take the first ``n_train + n_valid + n_test`` rows and overwrite
    their ``category`` with "train", "valid" and "test" (in that order).

    Parameters
    ----------
    bg : pandas.DataFrame
        Frame containing a ``category`` column and every column in ``subset_keys``.
        It is not modified.
    n_train, n_valid, n_test : int
        Number of rows to label for each split; must be non-negative.
    seed : int
        Seed passed to ``np.random.default_rng`` so the selection is reproducible.
    subset_keys : list of str, optional
        Columns that define uniqueness. Defaults to the module-level ``keys``.

    Returns
    -------
    pandas.DataFrame
        ``n_train + n_valid + n_test`` rows with a fresh 0..N-1 index, ordered
        train rows first, then valid, then test.

    Raises
    ------
    ValueError
        If a size is negative or there are fewer unique rows than requested.
    """
    subset_keys = list(keys) if subset_keys is None else list(subset_keys)
    if min(n_train, n_valid, n_test) < 0:
        raise ValueError("Split sizes must be non-negative.")

    # astype(str) makes the comparison safe for missing/non-string categories.
    is_none = bg["category"].astype(str).str.lower() == "none"
    dedup = bg.loc[is_none].drop_duplicates(subset=subset_keys)

    need = n_train + n_valid + n_test
    if len(dedup) < need:
        raise ValueError(f"Only {len(dedup)} unique (deployment_id, epoch); need {need}.")

    # Seeded shuffle, then keep only the rows that will receive a label.
    perm = np.random.default_rng(seed).permutation(len(dedup))
    split = dedup.iloc[perm].reset_index(drop=True).iloc[:need].copy()

    # Positional labels: avoids inclusive, label-based .loc slices whose
    # correctness would depend on the index having just been reset.
    split["category"] = np.repeat(["train", "valid", "test"], [n_train, n_valid, n_test])
    return split


# In a notebook kernel __name__ is "__main__", so this runs exactly as before;
# the guard only keeps the file I/O from firing if the code is imported.
if __name__ == "__main__":
    # NOTE(review): recent pandas versions list the literal "None" among the
    # default NA strings for read_csv; such rows would load as NaN and would NOT
    # match the "none" filter. TODO confirm how the file spells this category.
    bg = pd.read_csv(BG_CSV)

    need = N_TRAIN + N_VALID + N_TEST
    split_df = build_none_split(bg, N_TRAIN, N_VALID, N_TEST, seed=SEED)

    # Invariants the downloader relies on: exact size and unique key pairs.
    assert len(split_df) == need
    assert not split_df.duplicated(subset=keys).any()

    # Save the file the downloader will consume
    split_df.to_csv(OUT_CSV, index=False)
    print("Saved:", OUT_CSV, "rows:", len(split_df))
    print(split_df["category"].value_counts())


Saved: data/background_none_split.csv rows: 19441
category
train    9909
test     6121
valid    3411
Name: count, dtype: int64


In [None]:
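# Create a header-only wake CSV (column names, no data rows) to pass as the
# downloader's -w input:
# head -n 1 background_meta.csv > empty_wake.csv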

In [None]:
#  python download_timeseries.py \
#   -w empty_wake.csv \
#   -b background_none_split.csv \
#   -o vessel_wake_timeseries_bg_none \
#   --overwrite_existing

In [None]:
# SECURITY: real AWS credentials and account identifiers were pasted into this
# cell and have been redacted. Treat the exposed key pair as compromised:
# deactivate/rotate it in IAM and purge it from version-control history.
# Supply credentials via environment variables or a named profile instead.
# sh-4.2$ aws configure set aws_access_key_id <REDACTED>
# sh-4.2$ aws configure set aws_access_key_id <AWS_ACCESS_KEY_ID>
# sh-4.2$ aws configure set aws_secret_access_key <AWS_SECRET_ACCESS_KEY>
# sh-4.2$ aws configure set default.region us-west-2
# sh-4.2$ aws sts get-caller-identity
# {
#     "UserId": "<REDACTED>",
#     "Account": "<ACCOUNT_ID>",
#     "Arn": "arn:aws:iam::<ACCOUNT_ID>:user/<USER_NAME>"
# }

In [None]:
# sh-4.2$ python download_timeseries.py   -w ../data/empty_wake.csv   -b ../data/background_none_split.csv   -o ../data/vessel_NO_wake_timeseries 