In [4]:
import netCDF4
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.ma as ma


In [35]:
# Calendar bounds used when encoding dates. The year bounds feed the year
# normalisation in `preprocess_date`.
# Lower bound: 1 January 1993.
MIN_YEAR, MIN_MONTH, MIN_DAY = 1993, 1, 1
# Upper bound: 31 December 2013.
MAX_YEAR, MAX_MONTH, MAX_DAY = 2013, 12, 31

In [5]:
# Open one sample file read-only; later cells use it to inspect the grid,
# the `sla` variable and its mask.
dataset = netCDF4.Dataset(
    'dataset/Copernicus_ENA_Satelite_Maps_Training_Data/dt_ena_19930101_vDT2021.nc',
    mode='r',
)

In [12]:
# Read both coordinate axes from the open sample file in one pass.
latitude, longitude = (
    dataset.variables[axis_name][:] for axis_name in ('latitude', 'longitude')
)

In [14]:
# Regular 0.25-degree coordinate axes, built explicitly.
# These assignments replace any `latitude` / `longitude` values set earlier.
latitude = np.linspace(start=25.125, stop=49.875, num=100)     # 100 latitude points
longitude = np.linspace(start=-99.875, stop=-60.125, num=160)  # 160 longitude points

In [46]:
# Expand the two 1-D axes into a full grid: one row per latitude, one column
# per longitude, i.e. shape (len(latitude), len(longitude)).
lon_grid, lat_grid = np.meshgrid(longitude, latitude)

# Flatten row by row and store as float32, giving one (lon, lat) pair per
# grid cell.
lon_flat = lon_grid.ravel().astype(np.float32)
lat_flat = lat_grid.ravel().astype(np.float32)

In [18]:
# Boolean mask of the first `sla` slice (True = masked / no data).
# `getmaskarray` always returns a full boolean array, whereas `getmask` can
# return the scalar `nomask` when nothing is masked.
attention_mask = ma.getmaskarray(dataset.variables['sla'][0])

In [19]:
# 1 = valid cell, 0 = masked cell.
# Derived directly from the dataset rather than by inverting the current
# `attention_mask`, so re-running this cell gives the same result instead of
# bitwise-inverting an already converted integer array.
attention_mask = (~ma.getmaskarray(dataset.variables['sla'][0])).astype(int)

In [20]:
# Display the integer mask: the inverted `sla` mask, so 1 = unmasked cell, 0 = masked cell.
attention_mask

array([[0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       ...,
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

In [21]:
# Number of entries along the file's longitude axis.
dataset.variables['longitude'][:].shape[0]

160

In [22]:
# Print every entry along the first axis of the `sla` variable.
for sla_slice in dataset.variables['sla'][:]:
    print(sla_slice)

[[-- -- -- ... -0.0594 -0.062400000000000004 -0.066]
 [-- -- -- ... -0.051300000000000005 -0.0579 -0.0623]
 [-- -- -- ... -0.0506 -0.057800000000000004 -0.0625]
 ...
 [-- -- -- ... -0.007200000000000001 -0.015000000000000001
  -0.023100000000000002]
 [-- -- -- ... -0.0048000000000000004 -0.010100000000000001 -0.0176]
 [-- -- -- ... -0.0091 -0.009300000000000001 -0.0176]]


In [48]:
directory = "dataset/Copernicus_ENA_Satelite_Maps_Training_Data"

# One [date_str, sla] entry per successfully read file.
results = []

# os.listdir() returns names in arbitrary order; sorting makes the load order
# (and therefore the row order of everything built from `results`)
# reproducible across runs and machines.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".nc"):
        continue

    # The date is taken from the third '_'-separated field of the file name,
    # e.g. 'dt_ena_19930101_vDT2021.nc' -> '19930101'. Names that do not have
    # such a field are skipped instead of raising IndexError.
    name_parts = filename.split("_")
    if len(name_parts) < 3:
        continue
    date_str = name_parts[2]
    if len(date_str) != 8:
        continue

    file_path = os.path.join(directory, filename)

    # The context manager closes the file even if reading 'sla' fails.
    # A dedicated name is used so the notebook-level `dataset` handle opened
    # earlier is not replaced by a closed file.
    with netCDF4.Dataset(file_path, mode="r") as nc_file:
        # `[:]` reads the whole variable into memory, so the data stays
        # usable after the file is closed.
        sla = nc_file.variables["sla"][:]

    results.append([date_str, sla])

In [49]:
# Type of a single grid element of the first loaded day
# (first-axis index 0, row 99, column 0).
type(results[0][1][0, 99, 0])

numpy.ma.core.MaskedConstant

In [50]:
# Inspect the first entry: a [date_str, sla] pair as appended by the loading loop.
results[0]

['19930101',
 masked_array(
   data=[[[--, --, --, ..., -0.0594, -0.062400000000000004, -0.066],
          [--, --, --, ..., -0.051300000000000005, -0.0579, -0.0623],
          [--, --, --, ..., -0.0506, -0.057800000000000004, -0.0625],
          ...,
          [--, --, --, ..., -0.007200000000000001, -0.015000000000000001,
           -0.023100000000000002],
          [--, --, --, ..., -0.0048000000000000004,
           -0.010100000000000001, -0.0176],
          [--, --, --, ..., -0.0091, -0.009300000000000001, -0.0176]]],
   mask=[[[ True,  True,  True, ..., False, False, False],
          [ True,  True,  True, ..., False, False, False],
          [ True,  True,  True, ..., False, False, False],
          ...,
          [ True,  True,  True, ..., False, False, False],
          [ True,  True,  True, ..., False, False, False],
          [ True,  True,  True, ..., False, False, False]]],
   fill_value=-2147483647)]

In [51]:
# Replace masked cells with the sentinel value -10.
# `ma.filled` accepts both masked arrays and plain ndarrays (which it returns
# unchanged), so re-running this cell is harmless. Calling `.filled` on the
# array itself would fail on a second run, because the entries are plain
# ndarrays after the first pass.
for day in results:
    day[1] = ma.filled(day[1], -10)
           

In [52]:
# Sanity check: total number of values stored for the first loaded day.
day = results[0]
day[1].size

16000

In [57]:
def preprocess_date(data_str, min_year=None, max_year=None):
    """Encode a ``YYYYMMDD`` date string as five numeric features.

    The year is scaled linearly between ``min_year`` and ``max_year``; month
    and day are encoded as sine/cosine pairs with periods of 12 and 31.

    Parameters
    ----------
    data_str : str
        Date written as exactly eight digits, e.g. ``"19930101"``.
    min_year, max_year : int, optional
        Year bounds for the linear scaling. When omitted, the notebook-level
        ``MIN_YEAR`` / ``MAX_YEAR`` constants are used.

    Returns
    -------
    list or None
        ``[year_norm, month_sin, month_cos, day_sin, day_cos]``. ``None`` is
        returned (after printing the reason) when the input is not a string
        of eight digits, is not a real calendar date, or the year bounds are
        equal.
    """
    from datetime import date  # local import: only needed for calendar validation

    if min_year is None:
        min_year = MIN_YEAR
    if max_year is None:
        max_year = MAX_YEAR

    try:
        # The isinstance check keeps non-string input (e.g. None) on the same
        # "print and return None" path instead of raising TypeError from len().
        if not isinstance(data_str, str) or len(data_str) != 8 or not data_str.isdigit():
            raise ValueError(f"Invalid date format: {data_str}")
        year = int(data_str[:4])
        month = int(data_str[4:6])
        day = int(data_str[6:8])

        # Reject impossible dates (month 13, day 00, 30 February, ...) which
        # would otherwise be encoded silently.
        try:
            date(year, month, day)
        except ValueError:
            raise ValueError(f"Invalid calendar date: {data_str}") from None

        year_span = max_year - min_year
        if year_span == 0:
            raise ValueError("min_year and max_year must differ to normalise the year")

        year_norm = (year - min_year) / year_span
        month_sin = np.sin(2 * np.pi * (month - 1) / 12)
        month_cos = np.cos(2 * np.pi * (month - 1) / 12)
        day_sin = np.sin(2 * np.pi * (day - 1) / 31)
        day_cos = np.cos(2 * np.pi * (day - 1) / 31)
        return [year_norm, month_sin, month_cos, day_sin, day_cos]
    except ValueError as e:
        print(e)
        return None

In [58]:
# Build one long table with a row per (day, grid cell):
#   columns 0-4 : date features returned by preprocess_date
#   column  5   : longitude
#   column  6   : latitude
#   column  7   : SLA value
n_points = lon_flat.shape[0]

# Pass 1: encode the dates so the number of usable days is known up front.
# Days whose date string cannot be encoded are skipped.
encoded_days = []
for date_str, sla in results:
    processed_date = preprocess_date(date_str)
    if processed_date is None:
        continue
    encoded_days.append((processed_date, sla))

if not encoded_days:
    raise ValueError("No valid daily SLA grids found in `results`.")

n_date_cols = len(encoded_days[0][0])

# Pass 2: fill a preallocated table in place. This avoids keeping a list of
# per-day blocks alive while a second, stacked copy of the whole table is built.
# float64 matches what stacking the float64 date columns onto the float32
# data columns produces.
final_data = np.empty((len(encoded_days) * n_points, n_date_cols + 3), dtype=np.float64)

for day_index, (processed_date, sla) in enumerate(encoded_days):
    rows = slice(day_index * n_points, (day_index + 1) * n_points)
    final_data[rows, :n_date_cols] = processed_date  # same date features on every row of the day
    final_data[rows, n_date_cols] = lon_flat
    final_data[rows, n_date_cols + 1] = lat_flat
    # Round through float32 first so the stored SLA values are float32-rounded.
    final_data[rows, n_date_cols + 2] = sla.ravel().astype(np.float32)


In [70]:
# (rows, columns): one row per (day, grid cell); 5 date features + longitude, latitude, SLA.
final_data.shape

(116832000, 8)

In [72]:
# First row; column order is
# [year_norm, month_sin, month_cos, day_sin, day_cos, longitude, latitude, sla].
final_data[0]

array([  0.   ,   0.   ,   1.   ,   0.   ,   1.   , -99.875,  25.125,
       -10.   ])

In [66]:
# Sentinel written into masked cells when the masked arrays were filled.
masked_value = -10
# Column 7 of `final_data` holds the SLA value of each row.
sla_values = final_data[:, 7]

# 1 where a real SLA value is present, 0 where the row carries the sentinel.
boolean_mask = np.not_equal(sla_values, masked_value)
attention_mask = boolean_mask.astype(np.int32)

In [69]:
# Display the per-row mask (1 = SLA value present, 0 = sentinel).
attention_mask

array([0, 0, 0, ..., 1, 1, 1])

In [16]:
# Create a DataFrame from the results, one named column per field of each row.
# NOTE(review): 14 column names are declared here, but the rows appended to
# `results` by the loading loop in this notebook are [date_str, sla] pairs
# (2 fields). This cell looks like it belongs to an earlier version of
# `results` that carried per-station values; it is expected to fail on a
# fresh Restart & Run All. TODO confirm and restore the station data source.
columns = [
    "Date",
    "Average_SLA",
    "Atlantic City",
    "Baltimore",
    "Eastport",
    "Fort Pulaski",
    "Lewes",
    "New London",
    "Newport",
    "Portland",
    "Sandy Hook",
    "Sewells Point",
    "The Battery",
    "Washington",
]
results_df = pd.DataFrame(results, columns=columns)

In [17]:
# Drop the helper column. errors="ignore" makes the cell safe to re-run:
# without it a second run raises KeyError, because the column is already gone.
results_df = results_df.drop(columns=["Average_SLA"], errors="ignore")


In [20]:
# Destination of the exported table, relative to the working directory.
output_csv_path = "demo_sla.csv"

In [21]:
# Write the table to disk without the DataFrame index column.
results_df.to_csv(path_or_buf=output_csv_path, index=False)

In [1]:
# NOTE(review): the recorded run of this cell (execution count 1) failed with
# NameError, i.e. it was executed before `results` existed in that kernel.
# It only works after the file-loading cell has been run.
results

NameError: name 'results' is not defined

In [20]:
# NOTE(review): these imports sit in the middle of the notebook; consider
# moving them to the import cell at the top.
from huggingface_hub import hf_hub_download
import torch
from transformers import TimeSeriesTransformerModel

# Fetch the example training batch file from the Hugging Face Hub dataset repo;
# `file` is what hf_hub_download returns and is passed straight to torch.load.
file = hf_hub_download(
    repo_id="hf-internal-testing/tourism-monthly-batch", filename="train-batch.pt", repo_type="dataset"
)
# NOTE(review): torch.load on a downloaded file may unpickle arbitrary objects
# (depending on the PyTorch version's default). Consider `weights_only=True`
# and confirm this batch file still loads with it.
batch = torch.load(file)

# Load the pretrained time-series transformer checkpoint by its Hub id.
model = TimeSeriesTransformerModel.from_pretrained("huggingface/time-series-transformer-tourism-monthly")


train-batch.pt:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  batch = torch.load(file)


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/151k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133k [00:00<?, ?B/s]