In [1]:
from datetime import datetime
from distutils.util import strtobool

import numpy as np
import pandas as pd
import hashlib

In [3]:
def _parse_tsf_bool(text):
    """Translate a truth token from a .tsf header into a bool.

    Accepts, case-insensitively, y/yes/t/true/on/1 and n/no/f/false/off/0 and
    raises ValueError for anything else. This mirrors the behaviour of
    ``distutils.util.strtobool`` so that the parser itself does not depend on
    the ``distutils`` package.
    """
    token = text.lower()
    if token in ("y", "yes", "t", "true", "on", "1"):
        return True
    if token in ("n", "no", "f", "false", "off", "0"):
        return False
    raise ValueError("invalid truth value %r" % (token,))


def convert_tsf_to_dataframe(
    full_file_path_and_name,
    replace_missing_vals_with="NaN",
    value_column_name="series_value",
):
    """Convert the contents of a .tsf file into a dataframe plus its meta-data.

    The file is read as cp1252 text. Blank lines are skipped, lines starting
    with "#" are ignored, lines starting with "@" are meta-data, and every
    other line is a data record of colon-separated attribute values followed by
    one comma-separated list of series values ("?" marks a missing value).

    Parameters
    ----------
    full_file_path_and_name : complete .tsf file path
    replace_missing_vals_with : term used for missing values ("?") in the
        series stored in the returned dataframe
    value_column_name : name of the dataframe column holding the series values

    Returns
    -------
    tuple
        (loaded_data, frequency, forecast_horizon, contain_missing_values,
        contain_equal_length). The four meta-data entries stay None when the
        corresponding @frequency / @horizon / @missing / @equallength tag is
        not present in the file.

    Raises
    ------
    Exception
        For structural problems: malformed meta-data, data before the
        attribute section or the @data tag, a record with the wrong number of
        fields, an unsupported attribute type, a series whose values are all
        missing, an empty file, or a file without any data record.
    ValueError
        For an empty series field, a series value that is not a float, a
        numeric attribute that is not an int, a date attribute not matching
        "%Y-%m-%d %H-%M-%S", or an invalid @missing / @equallength token.
    """
    col_names = []
    col_types = []
    all_data = {}
    all_series = []
    line_count = 0
    frequency = None
    forecast_horizon = None
    contain_missing_values = None
    contain_equal_length = None
    found_data_tag = False
    found_data_section = False

    with open(full_file_path_and_name, "r", encoding="cp1252") as file:
        for line in file:
            # Strip white space from start/end of line
            line = line.strip()
            if not line:
                continue

            # Only non-blank lines count towards the "Empty file." check.
            line_count += 1

            if line.startswith("@"):  # Read meta-data
                if line.startswith("@data"):
                    if len(col_names) == 0:
                        raise Exception(
                            "Missing attribute section. Attribute section must come before data."
                        )
                    found_data_tag = True
                    continue

                line_content = line.split(" ")

                if line.startswith("@attribute"):
                    # Attributes have both name and type
                    if len(line_content) != 3:
                        raise Exception("Invalid meta-data specification.")
                    col_names.append(line_content[1])
                    col_types.append(line_content[2])
                    continue

                # Other meta-data have only values
                if len(line_content) != 2:
                    raise Exception("Invalid meta-data specification.")

                if line.startswith("@frequency"):
                    frequency = line_content[1]
                elif line.startswith("@horizon"):
                    forecast_horizon = int(line_content[1])
                elif line.startswith("@missing"):
                    contain_missing_values = _parse_tsf_bool(line_content[1])
                elif line.startswith("@equallength"):
                    contain_equal_length = _parse_tsf_bool(line_content[1])
                continue

            if line.startswith("#"):  # Comment line
                continue

            # Everything else is a data record.
            if len(col_names) == 0:
                raise Exception(
                    "Missing attribute section. Attribute section must come before data."
                )
            if not found_data_tag:
                raise Exception("Missing @data tag.")

            if not found_data_section:
                # First record: create one value list per declared attribute.
                found_data_section = True
                for col in col_names:
                    all_data[col] = []

            full_info = line.split(":")

            if len(full_info) != (len(col_names) + 1):
                raise Exception("Missing attributes/values in series.")

            series_text = full_info[-1]

            # str.split never returns an empty list, so emptiness has to be
            # tested on the text itself. ValueError is what float("") raised
            # here before, so the exception type is unchanged.
            if series_text == "":
                raise ValueError(
                    "A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series. Missing values should be indicated with ? symbol"
                )

            numeric_series = []
            missing_count = 0

            for val in series_text.split(","):
                if val == "?":
                    numeric_series.append(replace_missing_vals_with)
                    missing_count += 1
                else:
                    numeric_series.append(float(val))

            # Count the "?" markers themselves rather than occurrences of the
            # replacement term, so real values that happen to equal the
            # replacement (e.g. 0.0) are not mistaken for missing ones.
            if missing_count == len(numeric_series):
                raise Exception(
                    "All series values are missing. A given series should contains a set of comma separated numeric values. At least one numeric value should be there in a series."
                )

            all_series.append(pd.Series(numeric_series).array)

            # zip stops after the declared attributes; the trailing series
            # field of full_info is handled above.
            for col_name, col_type, raw_value in zip(col_names, col_types, full_info):
                if col_type == "numeric":
                    att_val = int(raw_value)
                elif col_type == "string":
                    att_val = str(raw_value)
                elif col_type == "date":
                    att_val = datetime.strptime(raw_value, "%Y-%m-%d %H-%M-%S")
                else:
                    # Currently, the code supports only numeric, string and date types. Extend this as required.
                    raise Exception("Invalid attribute type.")

                all_data[col_name].append(att_val)

    if line_count == 0:
        raise Exception("Empty file.")
    if len(col_names) == 0:
        raise Exception("Missing attribute section.")
    if not found_data_section:
        raise Exception("Missing series information under data section.")

    all_data[value_column_name] = all_series
    loaded_data = pd.DataFrame(all_data)

    return (
        loaded_data,
        frequency,
        forecast_horizon,
        contain_missing_values,
        contain_equal_length,
    )

In [4]:
# def _create_time_series(unique_id, start, periods, values, freq):
#     print('start', start)
#     assert 1<0, 'STOP'
#     total_dates = pd.date_range(start=start, periods=periods, freq=freq)
#     time_series = pd.DataFrame({'ts_name': unique_id, 'ds': total_dates, 'y': values})
#     return time_series

# def parse_data(loaded_data, frequency):
#     loaded_data['count'] = loaded_data['series_value'].apply(lambda x: len(x))
#     data = pd.concat([_create_time_series(row[1]['series_name'],
#                                           row[1]['start_timestamp'],
#                                           row[1]['count'],
#                                           row[1]['series_value'],
#                                           frequency) for row in loaded_data.iterrows()])
#     return data

In [5]:
def _create_time_series(unique_id, periods, values, freq):
    """Build the long-format frame for a single series.

    The frame has columns 'ts_name' (``unique_id`` repeated on every row),
    'ds' (the positions 0 .. len(values) - 1) and 'y' (``values``).
    ``periods`` and ``freq`` are accepted but not used by this version.
    """
    step_index = range(len(values))
    columns = {'ts_name': unique_id, 'ds': step_index, 'y': values}
    return pd.DataFrame(columns)


def parse_data(loaded_data, frequency):
    """Stack every series of ``loaded_data`` into one long-format frame.

    ``loaded_data`` must provide 'series_name' and 'series_value' columns.
    Side effect: a 'count' column holding each series' length is written onto
    ``loaded_data`` itself. The per-series frames are concatenated without
    resetting the index, so index values repeat across series.
    """
    loaded_data['count'] = loaded_data['series_value'].apply(len)

    frames = []
    for _, record in loaded_data.iterrows():
        frames.append(
            _create_time_series(
                record['series_name'],
                record['count'],
                record['series_value'],
                frequency,
            )
        )
    return pd.concat(frames)

In [6]:
# Frequency string for each dataset, keyed by the .tsf file stem.
# The value is handed to parse_data and stored in the 'frequency' column.
frequency_dict = dict(
    nn5_daily_dataset_without_missing_values='D',
    nn5_weekly_dataset='W-MON',
    london_smart_meters_dataset_without_missing_values='30min',
    car_parts_dataset_without_missing_values='MS',
    covid_deaths_dataset='D',
    solar_10_minutes_dataset='10min',
    hospital_dataset='MS',
    electricity_hourly_dataset='H',
    electricity_weekly_dataset='W-SUN',
    pedestrian_counts_dataset='H',
    kdd_cup_2018_dataset_without_missing_values='H',
    wind_farms_minutely_dataset_without_missing_values='T',
    fred_md_dataset='MS',
    sunspot_dataset_without_missing_values='D',
    dominick_dataset='W',
    kaggle_web_traffic_dataset_without_missing_values='D',
)

# Short label for each dataset, keyed by the same file stems.
# The value is stored in the 'dataset' column of the combined frame.
data_dict = dict(
    nn5_daily_dataset_without_missing_values='nn5_daily',
    nn5_weekly_dataset='nn5_weekly',
    london_smart_meters_dataset_without_missing_values='london_smart_meters',
    car_parts_dataset_without_missing_values='car_parts',
    covid_deaths_dataset='covid_deaths',
    solar_10_minutes_dataset='solar_alabama',
    hospital_dataset='hospital',
    electricity_hourly_dataset='electricity_hourly',
    electricity_weekly_dataset='electricity_weekly',
    pedestrian_counts_dataset='pedestrian_counts',
    kdd_cup_2018_dataset_without_missing_values='kdd_cup_2018',
    wind_farms_minutely_dataset_without_missing_values='wind_farms',
    fred_md_dataset='fred_md',
    sunspot_dataset_without_missing_values='sunspot',
    dominick_dataset='dominick',
    kaggle_web_traffic_dataset_without_missing_values='web_traffic',
)

# Dataset selection for this run. Alternative selections are kept commented
# out below; exactly one `datasets = ...` line should be active.
# datasets = ['nn5_daily_dataset_without_missing_values',
#             'nn5_weekly_dataset',
#             'london_smart_meters_dataset_without_missing_values',
#             'car_parts_dataset_without_missing_values',
#             'covid_deaths_dataset',
#             'solar_10_minutes_dataset',
#             'hospital_dataset',
#             'electricity_hourly_dataset',
#             'electricity_weekly_dataset',
#             'pedestrian_counts_dataset',
#             'kdd_cup_2018_dataset_without_missing_values']
# datasets = ['wind_farms_minutely_dataset_without_missing_values']
# datasets = ['fred_md_dataset']
# datasets = ['sunspot_dataset_without_missing_values']
datasets = [
    'kaggle_web_traffic_dataset_without_missing_values',
]

In [7]:
# Load every selected dataset, convert it to long format and tag each row
# with the dataset's short label and frequency string.
data_list = []
for data_name in datasets:
    print('data: ', data_name)
    (
        loaded_data,
        frequency,
        forecast_horizon,
        contain_missing_values,
        contain_equal_length,
    ) = convert_tsf_to_dataframe(f"data_monash/{data_name}.tsf")
    data = parse_data(
        loaded_data=loaded_data,
        frequency=frequency_dict[data_name],
    )
    data['dataset'] = data_dict[data_name]
    data['frequency'] = frequency_dict[data_name]
    data_list.append(data)

# One frame for all datasets, renumbered 0..n-1.
complete_data = pd.concat(data_list, ignore_index=True)
# Series names are only unique within a dataset, so the hashing key
# is "<dataset>_<ts_name>".
complete_data['to_hash'] = complete_data['dataset'] + '_' + complete_data['ts_name']

data:  kaggle_web_traffic_dataset_without_missing_values


## timetnet

In [None]:
# Stable identifier per series: the SHA-1 hex digest of "<dataset>_<ts_name>".
# The key is rebuilt from the 'dataset' and 'ts_name' columns instead of being
# read from 'to_hash', because the column selection below drops 'to_hash' and
# re-running this cell would otherwise fail with a KeyError.
complete_data['unique_id'] = (
    complete_data['dataset'] + '_' + complete_data['ts_name']
).apply(lambda x: hashlib.sha1(x.encode()).hexdigest())
# Keep only the output columns, in their final order.
complete_data = complete_data[['unique_id', 'ds', 'y', 'dataset', 'ts_name', 'frequency']]
complete_data

In [None]:
# Persist the combined long-format frame; index=False leaves the row index out of the file.
complete_data.to_parquet('monash_2.parquet', index=False)

In [None]:
# Deliberate stop: `1<0` is always false, so this cell always raises
# AssertionError('STOP'). NOTE(review): presumably placed here so that running
# all cells halts before the cells below — confirm before removing.
assert 1<0, 'STOP'

In [None]:
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/nn5_daily_dataset_without_missing_values.tsf")            # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/nn5_weekly_dataset.tsf")                                  # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/london_smart_meters_dataset_without_missing_values.tsf")  # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/car_parts_dataset_without_missing_values.tsf")            # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/covid_deaths_dataset.tsf")                                # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/solar_10_minutes_dataset.tsf")                            # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/hospital_dataset.tsf")                                    # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/electricity_hourly_dataset.tsf")                          # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/wind_farms_minutely_dataset_without_missing_values.tsf")  # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/kdd_cup_2018_dataset_without_missing_values.tsf")         # DONE
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/pedestrian_counts_dataset.tsf")                           # DONE

#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/weather_dataset.tsf")                                     # no date
#loaded_data, frequency, forecast_horizon, contain_missing_values, contain_equal_length = convert_tsf_to_dataframe("data_monash/dominick_dataset.tsf")                                    # no date

In [None]:
# NOTE: this rebinds `frequency_dict`. The mapping defined earlier in the
# notebook is keyed by dataset file stem; this one is keyed by a frequency name
# (the lookup below it is `frequency_dict[frequency]`).
frequency_dict = dict([
    ('half_hourly', '30min'),
    ('daily', 'D'),
    ('10_minutes', '10min'),
    ('hourly', 'H'),
    ('monthly', 'MS'),   # CAUTION: month-start alias — verify it suits the dataset
    ('minutely', 'T'),
    ('weekly', 'W-SUN'),  # CAUTION: Sunday-anchored weeks — verify it suits the dataset
])

In [None]:
def _create_time_series(unique_id, start, periods, values, freq):
    """Build the long-format frame for a single series with real timestamps.

    'ds' is ``pd.date_range(start=start, periods=periods, freq=freq)``,
    'unique_id' repeats ``unique_id`` on every row and 'y' holds ``values``.

    NOTE: this rebinds the `_create_time_series` defined earlier in the
    notebook, which takes no ``start`` argument and names its id column
    'ts_name'.
    """
    timestamps = pd.date_range(start=start, periods=periods, freq=freq)
    columns = {'unique_id': unique_id, 'ds': timestamps, 'y': values}
    return pd.DataFrame(columns)

In [None]:
def parse_data(loaded_data, frequency):
    """Stack every series of ``loaded_data`` into one timestamped long frame.

    ``loaded_data`` must provide 'series_name', 'start_timestamp' and
    'series_value' columns. Side effect: a 'count' column holding each series'
    length is written onto ``loaded_data`` itself. The per-series frames are
    concatenated without resetting the index.

    NOTE: this rebinds the `parse_data` defined earlier in the notebook, which
    does not read 'start_timestamp'.
    """
    loaded_data['count'] = loaded_data['series_value'].apply(len)

    frames = []
    for _, record in loaded_data.iterrows():
        frames.append(
            _create_time_series(
                record['series_name'],
                record['start_timestamp'],
                record['count'],
                record['series_value'],
                frequency,
            )
        )
    return pd.concat(frames)

In [None]:
# NOTE(review): `loaded_data` and `frequency` are not assigned in this cell;
# they presumably come from the most recent convert_tsf_to_dataframe call, with
# `frequency` being the file's @frequency value used as a key into the
# frequency-name mapping — confirm before re-running.
data = parse_data(loaded_data=loaded_data, frequency=frequency_dict[frequency])

In [None]:
# Draw a reproducible subset of series names and keep only their rows.
uniques = complete_data.ts_name.unique()
# A seeded generator makes the subset repeatable across runs. replace=False
# avoids drawing the same name twice, so the subset holds exactly
# min(1000, number of series) distinct series; the cap keeps the draw valid
# when fewer than 1000 series exist.
sample = np.random.default_rng(0).choice(
    uniques, size=min(1000, len(uniques)), replace=False
)
sample_data = complete_data[complete_data.ts_name.isin(sample)]

In [None]:
# Export the sampled series as (unique_id, ds, y): 'ts_name' is written out
# under the column name 'unique_id'.
data = sample_data[['ts_name', 'ds', 'y']].rename(columns={'ts_name': 'unique_id'})
data.to_csv('dominick.csv', index=False)

In [14]:
# Export all series as (unique_id, ds, y): 'ts_name' is written out under the
# column name 'unique_id'. The frame is built column by column from
# complete_data, so it keeps the same row index and a single copy of the data.
wiki_transfer = pd.DataFrame({
    'unique_id': complete_data['ts_name'],
    'ds': complete_data['ds'],
    'y': complete_data['y'],
})
wiki_transfer.to_csv('wiki_daily.csv', index=False)