In [39]:
import netCDF4
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.ma as ma

import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.optim import Adam
from torch import nn



In [40]:
# Calendar bounds as (year, month, day) parts; MIN_YEAR and MAX_YEAR are the
# values preprocess_date uses to scale the year feature.
MIN_YEAR, MIN_MONTH, MIN_DAY = 1993, 1, 1

MAX_YEAR, MAX_MONTH, MAX_DAY = 2013, 12, 31

In [41]:
# Open the 1993-01-01 map read-only so its coordinate variables can be read.
# NOTE(review): no close() for this handle is visible in the notebook; the name
# `dataset` is later rebound by the file-loading loop.
dataset = netCDF4.Dataset('dataset/Copernicus_ENA_Satelite_Maps_Training_Data/dt_ena_19930101_vDT2021.nc', mode='r')

In [42]:
# Read the complete latitude and longitude axes from the opened netCDF file.
latitude, longitude = (
    dataset.variables[axis_name][:] for axis_name in ('latitude', 'longitude')
)

In [43]:
# Evenly spaced grid axes: 100 latitude points and 160 longitude points.
# These assignments replace whatever `latitude` / `longitude` held before.
latitude = np.linspace(start=25.125, stop=49.875, num=100)
longitude = np.linspace(start=-99.875, stop=-60.125, num=160)

In [44]:
# Expand the two axes into full 2-D coordinate grids (one row per latitude,
# one column per longitude), then flatten each grid to a float32 vector with
# one entry per grid cell.
lon_grid, lat_grid = np.meshgrid(longitude, latitude)
lon_flat = lon_grid.ravel().astype(np.float32)
lat_flat = lat_grid.ravel().astype(np.float32)

In [45]:
directory = "dataset/Copernicus_ENA_Satelite_Maps_Training_Data"
results = []

# os.listdir() returns entries in arbitrary order. Sorting makes the order of
# `results` (and everything built from it) deterministic; assuming every file
# follows the dt_ena_<YYYYMMDD>_... naming seen above, name order is also
# chronological order.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".nc"):
        continue
    # The date is expected as the third "_"-separated field of the file name;
    # skip files that do not have one instead of raising IndexError.
    name_parts = filename.split("_")
    if len(name_parts) < 3:
        continue
    date_str = name_parts[2]
    if len(date_str) != 8:
        continue
    file_path = os.path.join(directory, filename)
    # The context manager closes the file even if reading "sla" raises.
    with netCDF4.Dataset(file_path, mode="r") as dataset:
        sla = dataset.variables["sla"][:]
    results.append([date_str, sla])

In [46]:
# Replace masked SLA cells with the sentinel value -10.
# ma.filled() hands back non-masked arrays unchanged, so this cell can be
# re-run safely; calling the .filled() method directly fails on a second run
# because the entries have already been converted to plain ndarrays.
for day in results:
    day[1] = ma.filled(day[1], -10)
           

In [47]:
def preprocess_date(data_str, all_dates=None):
    """Encode a ``YYYYMMDD`` string as numeric time features.

    Parameters
    ----------
    data_str : str
        Date written as exactly eight digits (``YYYYMMDD``).
    all_dates : list, optional
        List that every successfully parsed ``data_str`` is appended to (the
        list is modified in place). A new list is created when omitted.

    Returns
    -------
    tuple
        ``(features, all_dates)``. ``features`` is
        ``[year_norm, month_sin, month_cos, day_sin, day_cos]``: the year is
        scaled with ``MIN_YEAR`` / ``MAX_YEAR`` and month and day are encoded
        cyclically (periods of 12 and 31). If ``data_str`` is malformed the
        problem is printed, ``features`` is ``None`` and ``all_dates`` is
        returned untouched, so callers can always unpack two values.
    """
    # A literal [] default would be created once and shared by every call that
    # omits the argument, silently accumulating dates across calls.
    if all_dates is None:
        all_dates = []

    try:
        if len(data_str) != 8 or not data_str.isdigit():
            raise ValueError(f"Invalid date format: {data_str}")
        year = int(data_str[:4])
        month = int(data_str[4:6])
        day = int(data_str[6:8])
    except ValueError as e:
        print(e)
        # Keep the two-element shape that callers unpack.
        return None, all_dates

    year_norm = (year - MIN_YEAR) / (MAX_YEAR - MIN_YEAR)
    month_angle = 2 * np.pi * (month - 1) / 12
    # Every month is mapped onto a 31-step cycle, as in the original encoding.
    day_angle = 2 * np.pi * (day - 1) / 31
    features = [
        year_norm,
        np.sin(month_angle),
        np.cos(month_angle),
        np.sin(day_angle),
        np.cos(day_angle),
    ]
    all_dates.append(data_str)
    return features, all_dates

In [48]:
comb_data = []
all_dates = []

for day in results:
    # Validate the date before doing any array work. preprocess_date may
    # report a malformed date either as a bare None or as a (None, dates)
    # pair; unpacking a bare None directly would raise TypeError, so both
    # shapes are checked explicitly.
    outcome = preprocess_date(day[0], all_dates)
    if outcome is None:
        continue
    processed_date, all_dates = outcome
    if processed_date is None:
        continue

    sla_flat = day[1].ravel().astype(np.float32)
    # One row per grid cell: longitude, latitude, SLA.
    daily_data = np.column_stack((lon_flat, lat_flat, sla_flat))
    # Repeat the date features for every grid cell and put them in front, so
    # each row is [5 date features, longitude, latitude, SLA].
    date_array = np.tile(processed_date, (daily_data.shape[0], 1))
    daily_data_with_date = np.hstack((date_array, daily_data))
    comb_data.append(daily_data_with_date)

# Stack the per-day blocks in the same order as the entries of all_dates.
final_data = np.vstack(comb_data)


In [49]:
# Sanity check: (number of rows, number of columns) of the stacked array.
final_data.shape

(116832000, 8)

In [50]:
# Inspect the first row: five date features, then longitude, latitude and SLA.
final_data[0]

array([  0.   ,   0.   ,   1.   ,   0.   ,   1.   , -99.875,  25.125,
       -10.   ])

In [51]:
# Build a 0/1 mask with one entry per row of final_data: 1 where column 7
# (the SLA value) differs from the fill value, 0 where it equals it.
masked_value = -10
sla_values = final_data[:, 7]

boolean_mask = np.not_equal(sla_values, masked_value)
attention_mask = boolean_mask.astype(np.int32)

In [52]:
# Sizes shared by the windowing function and the model configuration:
# feature width, number of time features, and the context / prediction lengths.
DIM, TIME_FEAT = 8, 5
CONTEXT_LENGTH = PRED_LENGTH = 365

In [53]:
# Load the anomaly training CSV of each tide-gauge station into its own DataFrame.
# Paths are relative to the notebook's working directory; one variable per station
# because later cells refer to the frames by these names.
df_AtC = pd.read_csv('dataset/Training_Anomalies_Station_Data/Atlantic_City_1993_2013_training_data.csv')
df_Balt = pd.read_csv('dataset/Training_Anomalies_Station_Data/Baltimore_1993_2013_training_data.csv')
df_East = pd.read_csv('dataset/Training_Anomalies_Station_Data/Eastport_1993_2013_training_data.csv')
df_FP = pd.read_csv('dataset/Training_Anomalies_Station_Data/Fort_Pulaski_1993_2013_training_data.csv')
df_Lewes = pd.read_csv('dataset/Training_Anomalies_Station_Data/Lewes_1993_2013_training_data.csv')
df_NL = pd.read_csv('dataset/Training_Anomalies_Station_Data/New_London_1993_2013_training_data.csv')
df_Newp = pd.read_csv('dataset/Training_Anomalies_Station_Data/Newport_1993_2013_training_data.csv')
df_Port = pd.read_csv('dataset/Training_Anomalies_Station_Data/Portland_1993_2013_training_data.csv')
df_SH = pd.read_csv('dataset/Training_Anomalies_Station_Data/Sandy_Hook_1993_2013_training_data.csv')
df_SP = pd.read_csv('dataset/Training_Anomalies_Station_Data/Sewells_Point_1993_2013_training_data.csv')  
df_Batt = pd.read_csv('dataset/Training_Anomalies_Station_Data/The_Battery_1993_2013_training_data.csv')
df_Wash = pd.read_csv('dataset/Training_Anomalies_Station_Data/Washington_1993_2013_training_data.csv')


In [54]:
# check for nulls
# Each entry pairs the exact header text to print with the frame to inspect.
null_report_frames = [
    ("Nulls in Atlantic City: \n", df_AtC),
    ("Nulls in Baltimore: \n", df_Balt),
    ("Nulls in Eastport:\n", df_East),
    ("Nulls in Fort Pulaski: \n", df_FP),
    ("Nulls in Lewes: \n", df_Lewes),
    ("Nulls in New London: \n", df_NL),
    ("Nulls in Newport: \n", df_Newp),
    ("Nulls in Portland: \n", df_Port),
    ("Nulls in Sandy Hook: \n", df_SH),
    ("Nulls in Sewells Point: \n", df_SP),
    ("Nulls in The Battery: \n", df_Batt),
    ("Nulls in Washington: \n", df_Wash),
]

for position, (header, frame) in enumerate(null_report_frames):
    if position:
        # Blank separator between reports, none after the last one.
        print('\n')
    # isnull() must be applied exactly once: applying it to its own boolean
    # result always yields False, so the per-column count would be 0 no matter
    # how many values are actually missing.
    print(header + str(frame.isnull().sum()))

Nulls in Atlantic City: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Baltimore: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Eastport:
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Fort Pulaski: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Lewes: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in New London: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Newport: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Portland: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


Nulls in Sandy Hook: 
t            0
anomaly      0
location     0
latitude     0
longitude    0
dtype: int64


N

In [55]:
# Drop the three trailing station-metadata columns, keeping `t` and `anomaly`.
# The columns are removed by name rather than by position: a positional
# `iloc[:, :-3]` strips three more columns every time the cell is re-run,
# whereas drop(..., errors="ignore") is a no-op on an already trimmed frame.
station_meta_columns = ["location", "latitude", "longitude"]

df_AtC = df_AtC.drop(columns=station_meta_columns, errors="ignore")
df_Balt = df_Balt.drop(columns=station_meta_columns, errors="ignore")
df_East = df_East.drop(columns=station_meta_columns, errors="ignore")
df_FP = df_FP.drop(columns=station_meta_columns, errors="ignore")
df_Lewes = df_Lewes.drop(columns=station_meta_columns, errors="ignore")
df_NL = df_NL.drop(columns=station_meta_columns, errors="ignore")
df_Newp = df_Newp.drop(columns=station_meta_columns, errors="ignore")
df_Port = df_Port.drop(columns=station_meta_columns, errors="ignore")
df_SH = df_SH.drop(columns=station_meta_columns, errors="ignore")
df_SP = df_SP.drop(columns=station_meta_columns, errors="ignore")
df_Batt = df_Batt.drop(columns=station_meta_columns, errors="ignore")
df_Wash = df_Wash.drop(columns=station_meta_columns, errors="ignore")

In [59]:
# Remove every hyphen from the `t` column of each station frame so the values
# can be matched against the date strings collected in all_dates.
for station_frame in (
    df_AtC, df_Balt, df_East, df_FP, df_Lewes, df_NL,
    df_Newp, df_Port, df_SH, df_SP, df_Batt, df_Wash,
):
    station_frame['t'] = station_frame['t'].str.replace('-', '')

In [65]:
def df_to_sequences_labels(anom_data, dataset):
    """Build sliding context windows and binary look-ahead labels for a station.

    Uses the notebook globals ``all_dates``, ``CONTEXT_LENGTH`` and
    ``PRED_LENGTH``.

    Parameters
    ----------
    anom_data : pandas.DataFrame
        Station records with a ``t`` column (date strings in the same format
        as the entries of ``all_dates``) and an ``anomaly`` column.
        Assumes ``anomaly`` is numeric/boolean with non-zero meaning
        "anomaly" -- TODO confirm against the CSV contents.
    dataset : numpy.ndarray
        2-D array made of one equally sized block of rows per entry of
        ``all_dates``, stacked in the same order as ``all_dates``.

    Returns
    -------
    sequences : numpy.ndarray
        Shape ``(n_windows, CONTEXT_LENGTH, rows_per_day, n_features)``.
        This is a read-only strided view, so overlapping windows are not
        copied; copy a slice before modifying it.
    labels : numpy.ndarray
        ``int64`` array of shape ``(n_windows,)``; 1 when any of the
        ``PRED_LENGTH`` days following the window is flagged in ``anomaly``.

    Raises
    ------
    ValueError
        If ``dataset`` is not 2-D or its row count is not a multiple of
        ``len(all_dates)``.
    """
    sequence_length = CONTEXT_LENGTH
    prediction_length = PRED_LENGTH

    dataset = np.asarray(dataset)
    if dataset.ndim != 2:
        raise ValueError("dataset must be a 2-D array of stacked daily rows")
    n_features = dataset.shape[1]
    no_labels = np.empty(0, dtype=np.int64)

    day_keys = pd.Index(all_dates)
    n_days_loaded = len(day_keys)
    if n_days_loaded == 0:
        return np.empty((0, sequence_length, 0, n_features), dtype=dataset.dtype), no_labels
    if dataset.shape[0] % n_days_loaded:
        raise ValueError(
            "dataset row count is not a multiple of the number of dates in all_dates"
        )

    # Windows must span days, not individual rows: regroup the flat array
    # into (day, rows_per_day, feature).
    rows_per_day = dataset.shape[0] // n_days_loaded
    daily = dataset.reshape(n_days_loaded, rows_per_day, n_features)

    # Look anomalies up by date so labels stay aligned with the kept days
    # regardless of the row order of anom_data.
    anomaly_by_date = anom_data.drop_duplicates(subset='t').set_index('t')['anomaly']

    # Keep only days that have a station record, ordered by date string
    # (YYYYMMDD strings sort chronologically).
    day_idx = np.flatnonzero(day_keys.isin(anomaly_by_date.index))
    day_idx = day_idx[np.argsort(day_keys[day_idx].to_numpy(), kind="stable")]
    # Skip the fancy-indexing copy when nothing is dropped or reordered.
    if day_idx.size != n_days_loaded or np.any(day_idx != np.arange(n_days_loaded)):
        daily = daily[day_idx]
    kept_dates = day_keys[day_idx]
    anomaly_flags = anomaly_by_date.reindex(kept_dates).to_numpy().astype(bool)

    # A window needs sequence_length days of input followed by
    # prediction_length days of labels, all inside the kept range.
    n_windows = len(kept_dates) - sequence_length - prediction_length + 1
    if n_windows <= 0:
        empty_sequences = np.empty(
            (0, sequence_length, rows_per_day, n_features), dtype=dataset.dtype
        )
        return empty_sequences, no_labels

    # sliding_window_view appends the window axis last; move it next to the
    # window index and drop windows whose look-ahead would run past the data.
    windows = np.lib.stride_tricks.sliding_window_view(daily, sequence_length, axis=0)
    sequences = np.moveaxis(windows, -1, 1)[:n_windows]

    # Label i covers days [i + sequence_length, i + sequence_length + prediction_length).
    future = np.lib.stride_tricks.sliding_window_view(
        anomaly_flags[sequence_length:], prediction_length
    )
    labels = future.any(axis=1).astype(np.int64)

    return sequences, labels


In [None]:
# Build the input windows and labels for the Atlantic City station from the
# combined satellite array.
sequences, labels = df_to_sequences_labels(df_AtC, final_data)

In [20]:
# NOTE(review): these imports sit mid-notebook rather than in the top import
# cell, and hf_hub_download is not used anywhere in this cell.
from huggingface_hub import hf_hub_download
from transformers import TimeSeriesTransformerModel, TimeSeriesTransformerConfig

# Convert the full stacked array and the per-row 0/1 mask to tensors.
# NOTE(review): `attention_mask` is rebound from a NumPy array to a tensor here.
data = torch.tensor(final_data, dtype=torch.float32)
attention_mask = torch.tensor(attention_mask, dtype=torch.int32)

# NOTE(review): confirm these keyword names and values against the
# TimeSeriesTransformerConfig documentation (in particular whether
# `static_cardinalities` is a recognised option and whether input_size should
# count the time-feature columns).
config = TimeSeriesTransformerConfig(
    prediction_length=PRED_LENGTH,  # forecast horizon (PRED_LENGTH = 365)
    input_size=DIM,  # Matches your feature dimension (8)
    context_length=CONTEXT_LENGTH,  # Sequence length (1 year)
    num_time_features=TIME_FEAT,  # Number of time-related features (year_norm, month_sin, etc.)
    static_cardinalities=[]  # No static categorical features
)

# Freshly initialised model built from the config (no pretrained weights are loaded here).
model = TimeSeriesTransformerModel(config)

# NOTE(review): the model is called on the whole un-windowed array. Verify that
# forward() accepts `inputs_embeds` / `attention_mask`; the library documents
# arguments such as past_values, past_time_features and past_observed_mask.
output = model(
    inputs_embeds=data,         
    attention_mask=attention_mask  
)


train-batch.pt:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  batch = torch.load(file)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/151k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133k [00:00<?, ?B/s]

In [None]:
# Create a DataFrame from the results
# NOTE(review): `results` as filled by the file-loading loop holds two-field
# [date_str, sla] entries, while 14 column names are listed here -- confirm
# this cell still matches the current structure of `results`.
columns = [
    "Date",
    "Average_SLA",
    "Atlantic City",
    "Baltimore",
    "Eastport",
    "Fort Pulaski",
    "Lewes",
    "New London",
    "Newport",
    "Portland",
    "Sandy Hook",
    "Sewells Point",
    "The Battery",
    "Washington",
]
results_df = pd.DataFrame(results, columns=columns)

In [None]:
# Write results_df to a CSV file in the working directory, without the index column.
output_csv_path = r"demo_sla.csv"
results_df.to_csv(output_csv_path, index=False)