In [1]:
import numpy as np
import pandas as pd
from ray import tune
from ray.tune.search.hyperopt import HyperOptSearch
import matplotlib.pyplot as plt

from neuralforecast.core import NeuralForecast
from neuralforecast.models import NHITS
from neuralforecast.losses.pytorch import HuberLoss
from neuralforecast.losses.numpy import mae

In [2]:
def evaluate(Y_hat_df, av_mask, model_name, low_threshold=70, high_threshold=180):
    """Compute the MAE of one model's forecasts, overall and on critical target values.

    A forecast row is scored only if both conditions hold:

    * the ``sum_av_mask`` looked up in ``av_mask`` for the row's
      ``(unique_id, cutoff)`` pair is strictly positive, and
    * the row's own ``available_mask`` equals 1.

    Rows whose ``(unique_id, cutoff)`` pair is missing from ``av_mask`` get a NaN
    ``sum_av_mask`` from the left join and are therefore dropped as well.

    Parameters
    ----------
    Y_hat_df : pandas.DataFrame
        Forecasts. Must contain the columns ``unique_id``, ``cutoff``, ``y``,
        ``available_mask`` and a prediction column named ``model_name``.
        The frame is not modified.
    av_mask : pandas.DataFrame
        Must contain the columns ``unique_id``, ``cutoff`` and ``sum_av_mask``.
    model_name : str
        Name of the prediction column in ``Y_hat_df`` to score.
    low_threshold, high_threshold : float, optional
        A row counts as "critical" when ``y <= low_threshold`` or
        ``y >= high_threshold``. The defaults (70 and 180) reproduce the values
        that used to be hard-coded here.
        NOTE(review): presumably hypo-/hyperglycemia limits in mg/dL - confirm.

    Returns
    -------
    tuple
        ``(mae_all, mae_critical)``: the MAE over every scored row and the MAE
        restricted to the critical rows.
    """
    # ``merge`` already returns a new frame, so the caller's Y_hat_df stays untouched
    # without an explicit copy.
    results_df = Y_hat_df.merge(
        av_mask[['unique_id', 'cutoff', 'sum_av_mask']],
        on=['unique_id', 'cutoff'],
        how='left',
    )
    # Filter values with at least 1 available mask in input window
    results_df = results_df[results_df['sum_av_mask'] > 0].reset_index(drop=True)
    # Filter ffill values of y
    results_df = results_df[results_df['available_mask'] == 1]
    # Keep critical values of y
    is_critical = (results_df['y'] <= low_threshold) | (results_df['y'] >= high_threshold)
    results_critical_df = results_df[is_critical]
    return (
        mae(results_df['y'], results_df[model_name]),
        mae(results_critical_df['y'], results_critical_df[model_name]),
    )

In [5]:
# Load the test split and convert the timestamp column to datetimes.
data = pd.read_csv('/home/scratch/wpotosna/data/ohiot1dm_exog_9_day_test.csv')
data['ds'] = pd.to_datetime(data['ds'])

# For every series separately, add the rolling sum of `available_mask` over the
# trailing 120 rows (fewer at the start of a series, since min_periods=1).
unique_ids = data['unique_id'].unique()
df = [
    data[data['unique_id'] == unique_id]
    .reset_index(drop=True)
    .assign(
        sum_av_mask=lambda df_uid: df_uid['available_mask']
        .rolling(window=120, min_periods=1)
        .sum()
    )
    for unique_id in unique_ids
]

# Stack the per-series frames and expose the timestamp as 'cutoff', the key that
# evaluate() merges on.
av_mask = (
    pd.concat(df)
    .reset_index(drop=True)
    .rename(columns={'ds': 'cutoff'})
)
av_mask.head()

Unnamed: 0,unique_id,cutoff,y,available_mask,CHO,insulin,sum_av_mask
0,#559,2021-12-07 01:20:00,101.0,1,0.0,0.0,1.0
1,#559,2021-12-07 01:25:00,98.0,1,0.0,0.0,2.0
2,#559,2021-12-07 01:30:00,104.0,1,0.0,0.0,3.0
3,#559,2021-12-07 01:35:00,112.0,1,0.0,0.0,4.0
4,#559,2021-12-07 01:40:00,120.0,1,0.0,0.0,5.0


In [6]:
# Score one saved forecast file per configuration and print both MAE figures.
# Each entry: (forecast CSV, prediction column, label for overall MAE, label for critical MAE).
forecast_runs = [
    ('results_glucose/h_6/baselines/nhits_20230829.csv', 'AutoNHITS',
     'NHITS baseline ALL: ', 'NHITS baseline CRITICAL: '),
    ('results_glucose/h_6/baselines/tft_20230829.csv', 'AutoTFT',
     'TFT baseline ALL: ', 'TFT baseline CRITICAL: '),
    ('results_glucose/h_6/exogenous/nhits_20230829.csv', 'AutoNHITS',
     'NHITS exog ALL: ', 'NHITS exog CRITICAL: '),
    ('results_glucose/h_6/treat/nhits_20230829.csv', 'AutoNHITS_TREAT',
     'NHITS TREAT ALL: ', 'NHITS TREAT CRITICAL: '),
]

for csv_path, model_col, all_label, critical_label in forecast_runs:
    Y_hat_df = pd.read_csv(csv_path)
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])

    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, model_col)
    print(all_label, mae_all)
    print(critical_label, mae_critical)

FileNotFoundError: [Errno 2] No such file or directory: 'results_glucose/h_6/baselines/nhits_20230829.csv'

In [14]:
# Score one saved forecast file per configuration and print both MAE figures.
# Each entry: (forecast CSV, prediction column, label for overall MAE, label for critical MAE).
# NOTE(review): the last two entries (treat/nhits_20230829 and treat/nhits_20230829_cho)
# print under the same "NHITS TREAT" labels, so they can only be told apart by order.
forecast_runs = [
    ('results_glucose/h_6/baselines/nhits_20230829.csv', 'AutoNHITS',
     'NHITS baseline ALL: ', 'NHITS baseline CRITICAL: '),
    ('results_glucose/h_6/baselines/tft_20230829.csv', 'AutoTFT',
     'TFT baseline ALL: ', 'TFT baseline CRITICAL: '),
    ('results_glucose/h_6/exogenous/tft_20230829.csv', 'AutoTFT',
     'TFT exog ALL: ', 'TFT exog CRITICAL: '),
    ('results_glucose/h_6/exogenous/nhits_20230829.csv', 'AutoNHITS',
     'NHITS exog ALL: ', 'NHITS exog CRITICAL: '),
    ('results_glucose/h_6/treat/nhits_20230829.csv', 'AutoNHITS_TREAT',
     'NHITS TREAT ALL: ', 'NHITS TREAT CRITICAL: '),
    ('results_glucose/h_6/treat/nhits_20230829_cho.csv', 'AutoNHITS_TREAT',
     'NHITS TREAT ALL: ', 'NHITS TREAT CRITICAL: '),
]

for csv_path, model_col, all_label, critical_label in forecast_runs:
    Y_hat_df = pd.read_csv(csv_path)
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])

    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, model_col)
    print(all_label, mae_all)
    print(critical_label, mae_critical)

NHITS baseline ALL:  9.319583626742984
NHITS baseline CRITICAL:  10.50554280553876
TFT baseline ALL:  9.259035800329098
TFT baseline CRITICAL:  10.371936204513847
TFT exog ALL:  9.464974399592458
TFT exog CRITICAL:  11.417223404672573
NHITS exog ALL:  8.972649823354212
NHITS exog CRITICAL:  10.100923448845085
NHITS TREAT ALL:  8.94415280577869
NHITS TREAT CRITICAL:  10.288121691332083
NHITS TREAT ALL:  8.902128148961392
NHITS TREAT CRITICAL:  9.885354577064941


In [15]:
# Collect the overall and critical-range MAE of every saved h_6 "treat" run;
# the two lists are summarised by the following cell.
experiment_ids = [
    'nhits_20230829_cho',
    'nhits_20230831_1',
    'nhits_20230831_2',
    'nhits_20230831_3',
    'nhits_20230831_4',
    'nhits_20230831_5',
    'nhits_20230831_6',
    'nhits_20230831_7',
    'nhits_20230831_8',
]
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/treat/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS_TREAT')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

In [16]:
# Print the per-run scores gathered in `mae_alls` / `mae_criticals`, followed by
# their mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [8.902128148961392, 8.898860100422286, 8.731333180774268, 8.908003669432745, 8.97602519511188, 8.953040576677637, 8.93725488928601, 8.69443896291447, 8.955262941965394]
NHITS TREAT CRITICAL:  [9.885354577064941, 10.067645513756197, 9.938478664001089, 10.066430550051397, 10.141582577246343, 10.183065959562825, 10.163202115990446, 9.969622231980892, 10.262580547768776]
NHITS TEST ALL AVERAGE:  8.88403862950512
NHITS TEST CRITICAL AVERAGE:  10.07532919304699
NHITS TEST ALL STD:  0.09516995759204688
NHITS TEST CRITICAL STD:  0.11780966316668567


# GOOD ONES

# h= 6

In [19]:
# Score the saved h_6 baseline runs (column 'AutoNHITS') and summarise them.
# NOTE(review): the labels below say "TREAT"/"TEST" although the files come from
# the baselines folder; they are kept verbatim so the code matches the recorded output.
experiment_ids = ['nhits_20230901_1', 'nhits_20230901_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/baselines/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [9.294324594925511, 9.29302931093339]
NHITS TREAT CRITICAL:  [10.46137866366852, 10.351821920168703]
NHITS TEST ALL AVERAGE:  9.29367695292945
NHITS TEST CRITICAL AVERAGE:  10.406600291918611
NHITS TEST ALL STD:  0.0006476419960605995
NHITS TEST CRITICAL STD:  0.05477837174990885


In [20]:
# Score the saved h_6 "treat" runs (column 'AutoNHITS_TREAT') and summarise them.
experiment_ids = [
    'nhits_20230901_2_1',
    'nhits_20230901_2_2',
    'nhits_20230901_2_3',
    'nhits_20230901_2_4',
]
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/treat/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS_TREAT')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [8.770153566167709, 8.825971526894087, 8.820030653921464, 8.929680516670206]
NHITS TREAT CRITICAL:  [9.85622752483674, 9.965233632996737, 9.87923582234853, 10.091018492184665]
NHITS TEST ALL AVERAGE:  8.836459065913367
NHITS TEST CRITICAL AVERAGE:  9.947928868091667
NHITS TEST ALL STD:  0.05802270971996216
NHITS TEST CRITICAL STD:  0.09206243302743222


# h = 24

In [21]:
# Score the saved h_24 baseline runs (column 'AutoNHITS') and summarise them.
# NOTE(review): the labels below say "TREAT"/"TEST" although the files come from
# the baselines folder; they are kept verbatim so the code matches the recorded output.
experiment_ids = ['nhits_20230901_1', 'nhits_20230901_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_24/baselines/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [24.448452659358573, 24.842071815832718]
NHITS TREAT CRITICAL:  [30.777990970817207, 30.94805512464986]
NHITS TEST ALL AVERAGE:  24.645262237595645
NHITS TEST CRITICAL AVERAGE:  30.863023047733535
NHITS TEST ALL STD:  0.19680957823707246
NHITS TEST CRITICAL STD:  0.08503207691632575


In [24]:
# Score the saved h_24 exogenous runs (column 'AutoNHITS') and summarise them.
# NOTE(review): the labels below say "TREAT"/"TEST" although the files come from
# the exogenous folder; they are kept verbatim so the code matches the recorded output.
experiment_ids = ['nhits_20230901_1', 'nhits_20230901_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_24/exogenous/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [23.331997588782052, 23.853627117784963]
NHITS TREAT CRITICAL:  [28.555594490093043, 29.01315113623619]
NHITS TEST ALL AVERAGE:  23.592812353283506
NHITS TEST CRITICAL AVERAGE:  28.784372813164616
NHITS TEST ALL STD:  0.2608147645014558
NHITS TEST CRITICAL STD:  0.2287783230715732


In [23]:
# Score the saved h_24 "treat" runs (column 'AutoNHITS_TREAT') and summarise them.
experiment_ids = ['nhits_20230901_1', 'nhits_20230901_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_24/treat/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS_TREAT')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [23.598199761300105, 23.904786123717106]
NHITS TREAT CRITICAL:  [28.958337218544642, 29.10686700828887]
NHITS TEST ALL AVERAGE:  23.751492942508605
NHITS TEST CRITICAL AVERAGE:  29.032602113416758
NHITS TEST ALL STD:  0.15329318120850033
NHITS TEST CRITICAL STD:  0.07426489487211363


# 10 days of data

In [26]:
# Score the saved h_6 baseline runs with the "_10_" experiment ids (column 'AutoNHITS')
# and summarise them.
# NOTE(review): the labels below say "TREAT"/"TEST" although the files come from
# the baselines folder; they are kept verbatim so the code matches the recorded output.
experiment_ids = ['nhits_20230901_10_1', 'nhits_20230901_10_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/baselines/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [9.551207755900776, 10.03209366250457]
NHITS TREAT CRITICAL:  [10.763396118152134, 11.552775866126495]
NHITS TEST ALL AVERAGE:  9.791650709202674
NHITS TEST CRITICAL AVERAGE:  11.158085992139315
NHITS TEST ALL STD:  0.2404429533018968
NHITS TEST CRITICAL STD:  0.3946898739871809


In [27]:
# Score the saved h_6 exogenous runs with the "_10_" experiment ids (column 'AutoNHITS')
# and summarise them.
# NOTE(review): the labels below say "TREAT"/"TEST" although the files come from
# the exogenous folder; they are kept verbatim so the code matches the recorded output.
experiment_ids = ['nhits_20230901_10_1', 'nhits_20230901_10_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/exogenous/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [9.203094116470272, 9.411317519309483]
NHITS TREAT CRITICAL:  [10.356497463205951, 10.495450755653646]
NHITS TEST ALL AVERAGE:  9.307205817889876
NHITS TEST CRITICAL AVERAGE:  10.425974109429799
NHITS TEST ALL STD:  0.10411170141960557
NHITS TEST CRITICAL STD:  0.06947664622384764


In [28]:
# Score the saved h_6 "treat" runs with the "_10_" experiment ids
# (column 'AutoNHITS_TREAT') and summarise them.
experiment_ids = ['nhits_20230901_10_1', 'nhits_20230901_10_2']
mae_alls, mae_criticals = [], []
for experiment_id in experiment_ids:
    Y_hat_df = pd.read_csv(f'results_glucose/h_6/treat/{experiment_id}.csv')
    # Both timestamp columns come back from CSV as strings.
    for date_col in ('ds', 'cutoff'):
        Y_hat_df[date_col] = pd.to_datetime(Y_hat_df[date_col])
    mae_all, mae_critical = evaluate(Y_hat_df, av_mask, 'AutoNHITS_TREAT')
    mae_alls.append(mae_all)
    mae_criticals.append(mae_critical)

# Per-run scores first, then mean and (population) standard deviation across runs.
for label, value in [
    ('NHITS TREAT ALL: ', mae_alls),
    ('NHITS TREAT CRITICAL: ', mae_criticals),
    ('NHITS TEST ALL AVERAGE: ', np.mean(mae_alls)),
    ('NHITS TEST CRITICAL AVERAGE: ', np.mean(mae_criticals)),
    ('NHITS TEST ALL STD: ', np.std(mae_alls)),
    ('NHITS TEST CRITICAL STD: ', np.std(mae_criticals)),
]:
    print(label, value)

NHITS TREAT ALL:  [9.530075877390095, 9.149628339832736]
NHITS TREAT CRITICAL:  [10.430569400123957, 10.283910443010038]
NHITS TEST ALL AVERAGE:  9.339852108611415
NHITS TEST CRITICAL AVERAGE:  10.357239921566997
NHITS TEST ALL STD:  0.19022376877867941
NHITS TEST CRITICAL STD:  0.07332947855695959


Job Title: Time Series Data Collector

### Job Summary:
The Time Series Data Collector will be responsible for identifying, scraping, and processing time series datasets from various online sources. This role will work closely with data scientists and machine learning engineers to provide high-quality data that will contribute to analytics and predictive modeling projects.

### Responsibilities:
- **Identify Sources**: Search for reliable data sources on the Internet that provide time series datasets relevant to the business objectives. An initial list of examples will be provided.

   Preferable characteristics:
 * Have a reliable source. Some examples include: Kaggle competitions, public academic papers, official statistics or data published by governments, companies, or organizations.
 * Diversity in key characteristics including: sampling frequency, domain, size (number of series and observations).
 * Large volume in both number of time series and data points.

- **Data Scraping**: Some datasets might require tools such as Python libraries (e.g., Beautiful Soup, Selenium, Scrapy) to scrape time series data from websites, APIs, and other platforms.

- **Data Processing**: Clean and preprocess raw data to handle missing values and transform the data into a provided standardized format. Compute basic statistics for the dataset and create short reports describing the data and required information (eg. frequency, domain, number of series). 
  
- **Data Storage**: Work with database administrators to store processed data in AWS S3. Each dataset should be stored separately in individual parquet files.
  
- **Documentation**: Maintain comprehensive documentation that describes data sources, data transformations, and any challenges encountered during the process.
  
- **Adherence to Legal Guidelines**: Ensure that data scraping and usage adhere to legal and ethical guidelines, including copyright laws and data protection regulations.


## 1. Identify Sources

The first step of the process is to identify useful time series data sources. A time series is a sequence of observations recorded over time. We focus on time series with discrete and regular sampling intervals (e.g., daily, weekly, or monthly data). A time series dataset will usually contain the following information:

- Datestamps: when the value was recorded
- Values: values of the time series
- Identifier: some datasets contain several time series (eg. different products), and are identified with an id or name.

Some examples of time series domains we are interested in are:

1. Demand (eg. sales of a product, quantity sold)
2. Weather data (eg. temperature, sensor readings, pollution)
3. Finance data (eg. prices of a stock or asset)
4. Medicine and epidemics (eg. ECG and sensor data, number of patients with a disease)
5. Macro-economics and demographics (eg. population, GDP, indicators)
6. Sensor data and IoT (eg. usage of a server, sensors readings on machines, usually high frequency)
7. Electricity (eg. demand of a market or house, prices, output of wind turbines)
8. Transportation (eg. number of airline passengers, traffic at intersections)
9. Web traffic (eg. traffic on a website, Google trends)

Avoid time series from the following domains: audio data, text (parsed), videos or images (parsed), and occurrences of sparse events.

Some potential sources are:

1. Kaggle competitions (focus mostly on time series forecasting and anomaly detection)
2. Academic papers (focus mostly on time series forecasting and anomaly detection)
3. Online repositories
4. Public data from companies and government

Sources 1-3 will probably require minimal effort to obtain as they are usually already preprocessed and placed in files. In contrast, datasets published by organizations might require scraping or developing scripts to parse the data (eg: FRED https://fred.stlouisfed.org/).

Finally, prioritize first:
- Larger datasets (both number of time series and datestamps).
- High-frequency data (e.g., sampled every minute, every second, or at even shorter intervals).
- Sensor data

Time series datasets we have already parsed:
1. M-competitions (1,2,3,4,5)
2. Monash repository and all its datasets (https://zenodo.org/communities/forecasting/?page=1&size=20)
3. Wikipedia traffic

## 2. Data Processing

After downloading the data, parse it into the following standardized format. Each dataset should be contained in an individual parquet file, and should contain only the following four columns:
- `unique_id`: identifier of each time series. If not provided in the raw data, define it as `unique_id_1`, `unique_id_2`, etc.
- `ds`: column with the datestamp in Pandas format (no UTC).
- `y`: value of the time series. If the original dataset contains multiple values per datestamp (commonly known as a multivariate time series), split them into separate series with different values for `unique_id`.
- `available_mask`: column with 0 or 1 indicating the time series has a value for the particular `ds`. This is used to identify missing values as 0. Do not impute missing data or fill the missing values with zeros.

The following link contains examples of parsed data (without `available_mask`): https://nixtla.github.io/neuralforecast/examples/data_format.html.

## 3. Data Storage and Documentation

Finally, upload each dataset in a separate folder to S3 (bucket will be provided later). Each folder should contain:
    1. Parquet with the parsed dataset
    2. Markdown or text file with documentation of the dataset, including: source and general comments or challenges encountered.
    3. Python script parsing the original data to the required format (use a separate script for each dataset).
    4. If applicable, auxiliary scripts used to download the data.
