In [1]:
import hashlib
import numpy as np
import pandas as pd

from datasetsforecast.m4 import M4

In [2]:
def _create_ds(unique_id, start, periods, freq):
    total_dates = pd.date_range(start=start, periods=periods, freq=freq)
    time_series = pd.DataFrame({'ts_name': unique_id, 'ds': total_dates})
    return time_series

In [3]:
def parse_monthly(Y_df, dates, freq='MS'):
    """Overwrite ``Y_df['ds']`` with calendar dates rebuilt from the info table.

    For every series id in ``Y_df`` the matching ``'Monthly'`` row of ``dates``
    supplies a starting date; a date range with one entry per row of that
    series is generated from it and written to the ``ds`` column.

    Parameters
    ----------
    Y_df : pandas.DataFrame
        Needs a ``unique_id`` column. Its rows must already be grouped by
        ``unique_id`` in ascending id order, with each series in
        chronological order, because dates are assigned by position.
        The frame is modified in place.
    dates : pandas.DataFrame
        Needs the columns ``M4id``, ``SP`` and ``StartingDate``. Only rows
        whose ``SP`` equals ``'Monthly'`` are used. This frame is not modified.
    freq : str, default 'MS'
        Frequency alias handed to ``pandas.date_range``. The default is the
        value the notebook's ``frequency_map`` holds for ``'Monthly'``, so
        the function no longer depends on that global being defined.

    Returns
    -------
    pandas.DataFrame
        The same ``Y_df`` object, with ``ds`` replaced.

    Raises
    ------
    AssertionError
        If a ``StartingDate`` string is neither 14 nor 19 characters long.
    ValueError
        If no monthly series matches, or if the generated ids do not line up
        row by row with ``Y_df['unique_id']``.
    """
    # Two-digit years below this pivot are expanded to 20xx, all others to 19xx.
    century_pivot = 17

    def parse_date(x):
        """Normalise a raw StartingDate string to 'YYYY-MM-DD', or None."""
        if len(x) == 14:
            # Month is read from x[3:5] and the two-digit year from x[6:8]
            # (e.g. '01-07-75 12:00'); the day is always forced to 01.
            year, month = x[6:8], x[3:5]
            century = '20' if int(year) < century_pivot else '19'
            return century + year + '-' + month + '-01'
        if len(x) == 19:
            # The first ten characters are used as the date as they stand.
            return x[:10]
        return None

    # Number of rows per series. size() is used so that a missing 'y' value
    # cannot make the generated calendar shorter than the series itself.
    counts = Y_df.groupby('unique_id').size().reset_index(name='count')

    monthly = dates[dates['SP'] == 'Monthly'].merge(
        counts, left_on='M4id', right_on='unique_id')
    monthly['StartingDate'] = monthly['StartingDate'].apply(parse_date)
    assert not monthly['StartingDate'].isnull().any(), \
        'unrecognised StartingDate format in `dates`'

    if monthly.empty:
        raise ValueError('no Monthly series in `dates` match the ids in `Y_df`')

    # Sorting the per-series table once gives the same (id, date) ordering as
    # sorting every generated row, without building one frame per series.
    monthly = monthly.sort_values('M4id')
    ts_names = np.repeat(monthly['M4id'].to_numpy(), monthly['count'].to_numpy())
    ds = np.concatenate([
        pd.date_range(start=start, periods=int(n), freq=freq).to_numpy()
        for start, n in zip(monthly['StartingDate'], monthly['count'])
    ])

    # The previous version computed this comparison and threw the result away,
    # so a mis-ordered Y_df silently received another series' dates.
    if not np.array_equal(Y_df['unique_id'].to_numpy(), ts_names):
        raise ValueError(
            'Y_df rows do not line up with the generated dates; '
            'sort Y_df by unique_id and check every id is a Monthly series')

    # Positional assignment: independent of whatever index Y_df carries.
    Y_df['ds'] = ds

    return Y_df

In [4]:
# Frequency string used for each M4 group name.
frequency_map = {
    'Monthly': 'MS',
    'Quarterly': 'Q',
    'Yearly': 'Y',
    'Weekly': 'W',
}

# M4.load returns three objects; only the first one is kept.
Y_df, _, _ = M4.load(directory='./', group='Monthly', cache=True)

# Per-series info table, fetched over the network; parse_monthly reads its
# M4id, SP and StartingDate columns.
dates = pd.read_csv(
    'https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/M4-info.csv'
)

In [5]:
# parse_monthly writes the rebuilt 'ds' column into the frame it is given and
# returns that same frame, so rebinding Y_df keeps the name on the same object.
Y_df = parse_monthly(Y_df=Y_df, dates=dates)

In [6]:
# Replace each series id with the SHA-1 hex digest of '<dataset>_<id>'.
# The digest is computed once per distinct id and then mapped onto the rows,
# instead of hashing the same string again for every observation.
hashed_ids = {
    uid: hashlib.sha1(('M4_monthly' + '_' + uid).encode()).hexdigest()
    for uid in Y_df['unique_id'].unique()
}

# .assign builds a new frame, so no column is ever written onto a column
# subset of another frame (the pattern that triggers SettingWithCopyWarning).
Y_df = Y_df.assign(
    unique_id=Y_df['unique_id'].map(hashed_ids),
    dataset='M4_monthly',
    frequency='MS',
)[['unique_id', 'ds', 'y', 'dataset', 'frequency']]

In [7]:
# Write the final frame to the given relative path; index=False leaves the
# row index out of the file.
Y_df.to_parquet('M4_monthly.parquet', index=False)