diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..753b249 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.ipynb merge=nbdev-merge diff --git a/.gitconfig b/.gitconfig new file mode 100644 index 0000000..9054574 --- /dev/null +++ b/.gitconfig @@ -0,0 +1,11 @@ +# Generated by nbdev_install_hooks +# +# If you need to disable this instrumentation do: +# git config --local --unset include.path +# +# To restore: +# git config --local include.path ../.gitconfig +# +[merge "nbdev-merge"] + name = resolve conflicts with nbdev_fix + driver = nbdev_merge %O %A %B %P diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..38afe05 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,43 @@ +name: CI + +on: + push: + branches: [main] + pull_request: + branches: [main] + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + run-tests: + strategy: + fail-fast: false + matrix: + python-version: ["3.10"] #["3.8", "3.9", "3.10", "3.11"] + os: [ubuntu-latest, windows-latest] #[macos-latest, windows-latest, ubuntu-latest] + runs-on: ${{ matrix.os }} + steps: + - name: Clone repo + uses: actions/checkout@v3 + + - name: Set up environment + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + + - name: setup R + uses: r-lib/actions/setup-r@v2 + with: + r-version: "4.3.3" + windows-path-include-rtools: true # Whether to add Rtools to the PATH. + install-r: true # If “false” use the existing installation in the GitHub Action image + update-rtools: true # Update rtools40 compilers and libraries to the latest builds. + + - name: Install the library + run: pip install ".[dev]" + + - name: Run tests + run: nbdev_test --do_print --timing --n_workers 1 diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 1fd42a8..d579549 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -5,35 +5,34 @@ name: Python package on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: build: - runs-on: ubuntu-latest strategy: matrix: - python-version: [3.7, 3.8, 3.9] + python-version: ["3.8", "3.9", "3.10", "3.11"] steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install flake8 pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - - name: Lint with flake8 - run: | - # stop the build if there are Python syntax errors or undefined names - flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics - - name: Test with pytest - run: | - pytest + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . 
--count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + - name: Test with pytest + run: | + pytest diff --git a/.gitignore b/.gitignore index d722dc0..183357a 100644 --- a/.gitignore +++ b/.gitignore @@ -134,4 +134,6 @@ Untitle*.ipynb #files *.csv -*.ipynb + + +_proc \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..5c0e7ce --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,5 @@ +include settings.ini +include LICENSE +include CONTRIBUTING.md +include README.md +recursive-exclude * __pycache__ diff --git a/README.md b/README.md index 0520204..5faa87a 100644 --- a/README.md +++ b/README.md @@ -48,16 +48,16 @@ tsfeatures(panel, dict_freqs={'D': 7, 'W': 52}) ## List of available features -| Features ||| -|:--------|:------|:-------------| -|acf_features|heterogeneity|series_length| -|arch_stat|holt_parameters|sparsity| -|count_entropy|hurst|stability| -|crossing_points|hw_parameters|stl_features| -|entropy|intervals|unitroot_kpss| -|flat_spots|lumpiness|unitroot_pp| -|frequency|nonlinearity|| -|guerrero|pacf_features|| +| Features | | | +| :-------------- | :-------------- | :------------ | +| acf_features | heterogeneity | series_length | +| arch_stat | holt_parameters | sparsity | +| count_entropy | hurst | stability | +| crossing_points | hw_parameters | stl_features | +| entropy | intervals | unitroot_kpss | +| flat_spots | lumpiness | unitroot_pp | +| frequency | nonlinearity | | +| guerrero | pacf_features | | See the docs for a description of the features. To use a particular feature included in the package you need to import it: @@ -96,18 +96,18 @@ Observe that this function receives a list of strings instead of a list of funct ### Non-seasonal data (100 Daily M4 time series) -| feature | diff | feature | diff | feature | diff | feature | diff | -|:----------------|-------:|:----------------|-------:|:----------------|-------:|:----------------|-------:| -| e_acf10 | 0 | e_acf1 | 0 | diff2_acf1 | 0 | alpha | 3.2 | -| seasonal_period | 0 | spike | 0 | diff1_acf10 | 0 | arch_acf | 3.3 | -| nperiods | 0 | curvature | 0 | x_acf1 | 0 | beta | 4.04 | -| linearity | 0 | crossing_points | 0 | nonlinearity | 0 | garch_r2 | 4.74 | -| hw_gamma | 0 | lumpiness | 0 | diff2x_pacf5 | 0 | hurst | 5.45 | -| hw_beta | 0 | diff1x_pacf5 | 0 | unitroot_kpss | 0 | garch_acf | 5.53 | -| hw_alpha | 0 | diff1_acf10 | 0 | x_pacf5 | 0 | entropy | 11.65 | -| trend | 0 | arch_lm | 0 | x_acf10 | 0 | -| flat_spots | 0 | diff1_acf1 | 0 | unitroot_pp | 0 | -| series_length | 0 | stability | 0 | arch_r2 | 1.37 | +| feature | diff | feature | diff | feature | diff | feature | diff | +| :-------------- | ---: | :-------------- | ---: | :------------ | ---: | :-------- | ----: | +| e_acf10 | 0 | e_acf1 | 0 | diff2_acf1 | 0 | alpha | 3.2 | +| seasonal_period | 0 | spike | 0 | diff1_acf10 | 0 | arch_acf | 3.3 | +| nperiods | 0 | curvature | 0 | x_acf1 | 0 | beta | 4.04 | +| linearity | 0 | crossing_points | 0 | nonlinearity | 0 | garch_r2 | 4.74 | +| hw_gamma | 0 | lumpiness | 0 | diff2x_pacf5 | 0 | hurst | 5.45 | +| hw_beta | 0 | diff1x_pacf5 | 0 | unitroot_kpss | 0 | garch_acf | 5.53 | +| hw_alpha | 0 | diff1_acf10 | 0 | x_pacf5 | 0 | entropy | 11.65 | +| trend | 0 | arch_lm | 0 | x_acf10 | 0 | +| flat_spots | 0 | diff1_acf1 | 0 | unitroot_pp | 0 | +| series_length | 0 | stability | 0 | arch_r2 | 1.37 | To 
replicate these results, use:

@@ -118,20 +118,20 @@ python -m tsfeatures.compare_with_r --results_directory /some/path
 
 ### Seasonal data (100 Hourly M4 time series)
 
-| feature            | diff   | feature      | diff | feature   | diff    | feature    | diff    |
-|:------------------|-------:|:-------------|-----:|:----------|--------:|:-----------|--------:|
-| series_length      | 0      |seas_acf1     | 0    | trend     | 2.28    | hurst      | 26.02   |
-| flat_spots         | 0      |x_acf1|0| arch_r2   | 2.29    | hw_beta    | 32.39   |
-| nperiods           | 0      |unitroot_kpss|0| alpha     | 2.52    | trough     | 35      |
-| crossing_points    | 0      |nonlinearity|0| beta      | 3.67    | peak       | 69      |
-| seasonal_period    | 0      |diff1_acf10|0| linearity | 3.97    |
-| lumpiness          | 0      |x_acf10|0| curvature | 4.8     |
-| stability          | 0      |seas_pacf|0| e_acf10   | 7.05    |
-| arch_lm            | 0      |unitroot_pp|0| garch_r2  | 7.32    |
-| diff2_acf1         | 0      |spike|0| hw_gamma  | 7.32    |
-| diff2_acf10        | 0      |seasonal_strength|0.79| hw_alpha  | 7.47    |
-| diff1_acf1         | 0      |e_acf1|1.67| garch_acf | 7.53    |
-| diff2x_pacf5       | 0      |arch_acf|2.18| entropy   | 9.45    |
+| feature         | diff | feature           | diff | feature   | diff | feature | diff  |
+| :-------------- | ---: | :---------------- | ---: | :-------- | ---: | :------ | ----: |
+| series_length   | 0    | seas_acf1         | 0    | trend     | 2.28 | hurst   | 26.02 |
+| flat_spots      | 0    | x_acf1            | 0    | arch_r2   | 2.29 | hw_beta | 32.39 |
+| nperiods        | 0    | unitroot_kpss     | 0    | alpha     | 2.52 | trough  | 35    |
+| crossing_points | 0    | nonlinearity      | 0    | beta      | 3.67 | peak    | 69    |
+| seasonal_period | 0    | diff1_acf10       | 0    | linearity | 3.97 |
+| lumpiness       | 0    | x_acf10           | 0    | curvature | 4.8  |
+| stability       | 0    | seas_pacf         | 0    | e_acf10   | 7.05 |
+| arch_lm         | 0    | unitroot_pp       | 0    | garch_r2  | 7.32 |
+| diff2_acf1      | 0    | spike             | 0    | hw_gamma  | 7.32 |
+| diff2_acf10     | 0    | seasonal_strength | 0.79 | hw_alpha  | 7.47 |
+| diff1_acf1      | 0    | e_acf1            | 1.67 | garch_acf | 7.53 |
+| diff2x_pacf5    | 0    | arch_acf          | 2.18 | entropy   | 9.45 |
 
 To replicate these results, use:
diff --git a/nbs/.gitignore b/nbs/.gitignore
new file mode 100644
index 0000000..075b254
--- /dev/null
+++ b/nbs/.gitignore
@@ -0,0 +1 @@
+/.quarto/
diff --git a/nbs/_quarto.yml b/nbs/_quarto.yml
new file mode 100644
index 0000000..006b406
--- /dev/null
+++ b/nbs/_quarto.yml
@@ -0,0 +1,20 @@
+project:
+  type: website
+
+format:
+  html:
+    theme: cosmo
+    css: styles.css
+    toc: true
+
+website:
+  twitter-card: true
+  open-graph: true
+  repo-actions: [issue]
+  navbar:
+    background: primary
+    search: true
+  sidebar:
+    style: floating
+
+metadata-files: [nbdev.yml, sidebar.yml]
diff --git a/nbs/index.ipynb b/nbs/index.ipynb
new file mode 100644
index 0000000..5998625
--- /dev/null
+++ b/nbs/index.ipynb
@@ -0,0 +1,406 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "\n",
+    "# tsfeatures\n",
+    "\n",
+    "Calculates various features from time series data. Python implementation of the R package _[tsfeatures](https://github.com/robjhyndman/tsfeatures)_.\n",
+    "\n",
+    "# Installation\n",
+    "\n",
+    "You can install the *released* version of `tsfeatures` from the [Python Package Index](https://pypi.org) with:\n",
+    "\n",
+    "``` python\n",
+    "pip install tsfeatures\n",
+    "```\n",
+    "\n",
+    "# Usage\n",
+    "\n",
+    "The `tsfeatures` main function calculates by default the features used by Montero-Manso, Talagala, Hyndman and Athanasopoulos in [their implementation of the FFORMA model](https://htmlpreview.github.io/?https://github.com/robjhyndman/M4metalearning/blob/master/docs/M4_methodology.html#features).\n",
+    "\n",
+    "```python\n",
+    "from tsfeatures import tsfeatures\n",
+    "```\n",
+    "\n",
+    "This function receives a panel pandas df with columns `unique_id`, `ds`, `y` and optionally the frequency of the data.\n",
+    "\n",
+    "\n",
+    "\n",
+    "```python\n",
+    "tsfeatures(panel, freq=7)\n",
+    "```\n",
+    "\n",
+    "By default (`freq=None`) the function will try to infer the frequency of each time series (using `infer_freq` from `pandas` on the `ds` column) and assign a seasonal period according to the built-in dictionary `FREQS`:\n",
+    "\n",
+    "```python\n",
+    "FREQS = {'H': 24, 'D': 1,\n",
+    "         'M': 12, 'Q': 4,\n",
+    "         'W': 1, 'Y': 1}\n",
+    "```\n",
+    "\n",
+    "You can use your own dictionary using the `dict_freqs` argument:\n",
+    "\n",
+    "```python\n",
+    "tsfeatures(panel, dict_freqs={'D': 7, 'W': 52})\n",
+    "```\n",
+    "\n",
+    "## List of available features\n",
+    "\n",
+    "| Features        |                 |               |\n",
+    "| :-------------- | :-------------- | :------------ |\n",
+    "| acf_features    | heterogeneity   | series_length |\n",
+    "| arch_stat       | holt_parameters | sparsity      |\n",
+    "| count_entropy   | hurst           | stability     |\n",
+    "| crossing_points | hw_parameters   | stl_features  |\n",
+    "| entropy         | intervals       | unitroot_kpss |\n",
+    "| flat_spots      | lumpiness       | unitroot_pp   |\n",
+    "| frequency       | nonlinearity    |               |\n",
+    "| guerrero        | pacf_features   |               |\n",
+    "\n",
+    "See the docs for a description of the features. To use a particular feature included in the package you need to import it:\n",
+    "\n",
+    "```python\n",
+    "from tsfeatures import acf_features\n",
+    "\n",
+    "tsfeatures(panel, freq=7, features=[acf_features])\n",
+    "```\n",
+    "\n",
+    "You can also define your own function and use it together with the included features:\n",
+    "\n",
+    "```python\n",
+    "def number_zeros(x, freq):\n",
+    "\n",
+    "    number = (x == 0).sum()\n",
+    "    return {'number_zeros': number}\n",
+    "\n",
+    "tsfeatures(panel, freq=7, features=[acf_features, number_zeros])\n",
+    "```\n",
+    "\n",
+    "`tsfeatures` can handle functions that receive a numpy array `x` and a frequency `freq` (this parameter is needed even if you don't use it) and return a dictionary with the feature name as a key and its value.\n",
+    "\n",
+    "## R implementation\n",
+    "\n",
+    "You can use this package to call `tsfeatures` from R inside Python (you need to have R installed, with the R packages `forecast` and `tsfeatures`, plus the Python package `rpy2`):\n",
+    "\n",
+    "```python\n",
+    "from tsfeatures.tsfeatures_r import tsfeatures_r\n",
+    "\n",
+    "tsfeatures_r(panel, freq=7, features=[\"acf_features\"])\n",
+    "```\n",
+    "\n",
+    "Observe that this function receives a list of strings instead of a list of functions.\n",
+    "\n",
+    "## Comparison with the R implementation (sum of absolute differences)\n",
+    "\n",
+    "### Non-seasonal data (100 Daily M4 time series)\n",
+    "\n",
+    "| feature         | diff | feature         | diff | feature       | diff | feature   | diff  |\n",
+    "| :-------------- | ---: | :-------------- | ---: | :------------ | ---: | :-------- | ----: |\n",
+    "| e_acf10         | 0    | e_acf1          | 0    | diff2_acf1    | 0    | alpha     | 3.2   |\n",
+    "| seasonal_period | 0    | spike           | 0    | diff1_acf10   | 0    | arch_acf  | 3.3   |\n",
+    "| nperiods        | 0    | curvature       | 0    | x_acf1        | 0    | beta      | 4.04  |\n",
+    "| linearity       | 0    | crossing_points | 0    | nonlinearity  | 0    | garch_r2  | 4.74  |\n",
+    "| hw_gamma        | 0    | lumpiness       | 0    | diff2x_pacf5  | 0    | hurst     | 5.45  |\n",
+    "| hw_beta         | 0    | diff1x_pacf5    | 0    | unitroot_kpss | 0    | garch_acf | 5.53  |\n",
+    "| hw_alpha        | 0    | diff1_acf10     | 0    | x_pacf5       | 0    | entropy   | 11.65 |\n",
+    "| trend           | 0    | arch_lm         | 0    | x_acf10       | 0    |\n",
+    "| flat_spots      | 0    | diff1_acf1      | 0    | unitroot_pp   | 0    |\n",
+    "| series_length   | 0    | stability       | 0    | arch_r2       | 1.37 |\n",
+    "\n",
+    "To replicate these results, use:\n",
+    "\n",
+    "``` console\n",
+    "python -m tsfeatures.compare_with_r --results_directory /some/path\n",
+    "  --dataset_name Daily --num_obs 100\n",
+    "```\n",
+    "\n",
+    "### Seasonal data (100 Hourly M4 time series)\n",
+    "\n",
+    "| feature         | diff | feature           | diff | feature   | diff | feature | diff  |\n",
+    "| :-------------- | ---: | :---------------- | ---: | :-------- | ---: | :------ | ----: |\n",
+    "| series_length   | 0    | seas_acf1         | 0    | trend     | 2.28 | hurst   | 26.02 |\n",
+    "| flat_spots      | 0    | x_acf1            | 0    | arch_r2   | 2.29 | hw_beta | 32.39 |\n",
+    "| nperiods        | 0    | unitroot_kpss     | 0    | alpha     | 2.52 | trough  | 35    |\n",
+    "| crossing_points | 0    | nonlinearity      | 0    | beta      | 3.67 | peak    | 69    |\n",
+    "| seasonal_period | 0    | diff1_acf10       | 0    | linearity | 3.97 |\n",
+    "| lumpiness       | 0    | x_acf10           | 0    | curvature | 4.8  |\n",
+    "| stability       | 0    | seas_pacf         | 0    | e_acf10   | 7.05 |\n",
+    "| arch_lm         | 0    | unitroot_pp       | 0    | garch_r2  | 7.32 |\n",
+    "| diff2_acf1      | 0    | spike             | 0    | hw_gamma  | 7.32 |\n",
+    "| diff2_acf10     | 0    | seasonal_strength | 0.79 | hw_alpha  | 7.47 |\n",
+    "| diff1_acf1      | 0    | e_acf1            | 1.67 | garch_acf | 7.53 |\n",
+    "| diff2x_pacf5    | 0    | arch_acf          | 2.18 | entropy   | 9.45 |\n"
+   ]
+  },
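+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick end-to-end sketch, the cell below builds a small synthetic panel with the required `unique_id`, `ds`, `y` columns and runs `tsfeatures` on it. The data is random noise, so the resulting feature values are illustrative only.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from tsfeatures import tsfeatures\n",
+    "\n",
+    "# two synthetic daily series of 60 observations each\n",
+    "rng = np.random.RandomState(0)\n",
+    "panel = pd.DataFrame({\n",
+    "    'unique_id': np.repeat(['ts_1', 'ts_2'], 60),\n",
+    "    'ds': np.tile(pd.date_range('2020-01-01', periods=60, freq='D'), 2),\n",
+    "    'y': rng.rand(120),\n",
+    "})\n",
+    "\n",
+    "# weekly seasonal period for the daily data\n",
+    "tsfeatures(panel, freq=7)"
+   ]
+  },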
"markdown", + "metadata": {}, + "source": [ + "[![CI](https://github.com/Nixtla/mlforecast/actions/workflows/ci.yaml/badge.svg)](https://github.com/Nixtla/mlforecast/actions/workflows/ci.yaml)\n", + "[![Python](https://img.shields.io/pypi/pyversions/mlforecast)](https://pypi.org/project/mlforecast/)\n", + "[![PyPi](https://img.shields.io/pypi/v/mlforecast?color=blue)](https://pypi.org/project/mlforecast/)\n", + "[![conda-forge](https://img.shields.io/conda/vn/conda-forge/mlforecast?color=blue)](https://anaconda.org/conda-forge/mlforecast)\n", + "[![License](https://img.shields.io/github/license/Nixtla/mlforecast)](https://github.com/Nixtla/mlforecast/blob/main/LICENSE)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# | hide\n", + "import nbdev\n", + "\n", + "nbdev.nbdev_export()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "# tsfeatures\n", + "\n", + "Calculates various features from time series data. Python implementation of the R package _[tsfeatures](https://github.com/robjhyndman/tsfeatures)_.\n", + "\n", + "# Installation\n", + "\n", + "You can install the *released* version of `tsfeatures` from the [Python package index](pypi.org) with:\n", + "\n", + "``` python\n", + "pip install tsfeatures\n", + "```\n", + "\n", + "# Usage\n", + "\n", + "The `tsfeatures` main function calculates by default the features used by Montero-Manso, Talagala, Hyndman and Athanasopoulos in [their implementation of the FFORMA model](https://htmlpreview.github.io/?https://github.com/robjhyndman/M4metalearning/blob/master/docs/M4_methodology.html#features).\n", + "\n", + "```python\n", + "from tsfeatures import tsfeatures\n", + "```\n", + "\n", + "This function receives a panel pandas df with columns `unique_id`, `ds`, `y` and optionally the frequency of the data.\n", + "\n", + "\n", + "\n", + "```python\n", + "tsfeatures(panel, freq=7)\n", + "```\n", + "\n", + "By default (`freq=None`) the function will try to infer the frequency of each time series (using `infer_freq` from `pandas` on the `ds` column) and assign a seasonal period according to the built-in dictionary `FREQS`:\n", + "\n", + "```python\n", + "FREQS = {'H': 24, 'D': 1,\n", + " 'M': 12, 'Q': 4,\n", + " 'W':1, 'Y': 1}\n", + "```\n", + "\n", + "You can use your own dictionary using the `dict_freqs` argument:\n", + "\n", + "```python\n", + "tsfeatures(panel, dict_freqs={'D': 7, 'W': 52})\n", + "```\n", + "\n", + "## List of available features\n", + "\n", + "| Features |||\n", + "|:--------|:------|:-------------|\n", + "|acf_features|heterogeneity|series_length|\n", + "|arch_stat|holt_parameters|sparsity|\n", + "|count_entropy|hurst|stability|\n", + "|crossing_points|hw_parameters|stl_features|\n", + "|entropy|intervals|unitroot_kpss|\n", + "|flat_spots|lumpiness|unitroot_pp|\n", + "|frequency|nonlinearity||\n", + "|guerrero|pacf_features||\n", + "\n", + "See the docs for a description of the features. 
To use a particular feature included in the package you need to import it:\n", + "\n", + "```python\n", + "from tsfeatures import acf_features\n", + "\n", + "tsfeatures(panel, freq=7, features=[acf_features])\n", + "```\n", + "\n", + "You can also define your own function and use it together with the included features:\n", + "\n", + "```python\n", + "def number_zeros(x, freq):\n", + "\n", + " number = (x == 0).sum()\n", + " return {'number_zeros': number}\n", + "\n", + "tsfeatures(panel, freq=7, features=[acf_features, number_zeros])\n", + "```\n", + "\n", + "`tsfeatures` can handle functions that receives a numpy array `x` and a frequency `freq` (this parameter is needed even if you don't use it) and returns a dictionary with the feature name as a key and its value.\n", + "\n", + "## R implementation\n", + "\n", + "You can use this package to call `tsfeatures` from R inside python (you need to have installed R, the packages `forecast` and `tsfeatures`; also the python package `rpy2`):\n", + "\n", + "```python\n", + "from tsfeatures.tsfeatures_r import tsfeatures_r\n", + "\n", + "tsfeatures_r(panel, freq=7, features=[\"acf_features\"])\n", + "```\n", + "\n", + "Observe that this function receives a list of strings instead of a list of functions.\n", + "\n", + "## Comparison with the R implementation (sum of absolute differences)\n", + "\n", + "### Non-seasonal data (100 Daily M4 time series)\n", + "\n", + "| feature | diff | feature | diff | feature | diff | feature | diff |\n", + "|:----------------|-------:|:----------------|-------:|:----------------|-------:|:----------------|-------:|\n", + "| e_acf10 | 0 | e_acf1 | 0 | diff2_acf1 | 0 | alpha | 3.2 |\n", + "| seasonal_period | 0 | spike | 0 | diff1_acf10 | 0 | arch_acf | 3.3 |\n", + "| nperiods | 0 | curvature | 0 | x_acf1 | 0 | beta | 4.04 |\n", + "| linearity | 0 | crossing_points | 0 | nonlinearity | 0 | garch_r2 | 4.74 |\n", + "| hw_gamma | 0 | lumpiness | 0 | diff2x_pacf5 | 0 | hurst | 5.45 |\n", + "| hw_beta | 0 | diff1x_pacf5 | 0 | unitroot_kpss | 0 | garch_acf | 5.53 |\n", + "| hw_alpha | 0 | diff1_acf10 | 0 | x_pacf5 | 0 | entropy | 11.65 |\n", + "| trend | 0 | arch_lm | 0 | x_acf10 | 0 |\n", + "| flat_spots | 0 | diff1_acf1 | 0 | unitroot_pp | 0 |\n", + "| series_length | 0 | stability | 0 | arch_r2 | 1.37 |\n", + "\n", + "To replicate this results use:\n", + "\n", + "``` console\n", + "python -m tsfeatures.compare_with_r --results_directory /some/path\n", + " --dataset_name Daily --num_obs 100\n", + "```\n", + "\n", + "### Sesonal data (100 Hourly M4 time series)\n", + "\n", + "| feature | diff | feature | diff | feature | diff | feature | diff |\n", + "|:------------------|-------:|:-------------|-----:|:----------|--------:|:-----------|--------:|\n", + "| series_length | 0 |seas_acf1 | 0 | trend | 2.28 | hurst | 26.02 |\n", + "| flat_spots | 0 |x_acf1|0| arch_r2 | 2.29 | hw_beta | 32.39 |\n", + "| nperiods | 0 |unitroot_kpss|0| alpha | 2.52 | trough | 35 |\n", + "| crossing_points | 0 |nonlinearity|0| beta | 3.67 | peak | 69 |\n", + "| seasonal_period | 0 |diff1_acf10|0| linearity | 3.97 |\n", + "| lumpiness | 0 |x_acf10|0| curvature | 4.8 |\n", + "| stability | 0 |seas_pacf|0| e_acf10 | 7.05 |\n", + "| arch_lm | 0 |unitroot_pp|0| garch_r2 | 7.32 |\n", + "| diff2_acf1 | 0 |spike|0| hw_gamma | 7.32 |\n", + "| diff2_acf10 | 0 |seasonal_strength|0.79| hw_alpha | 7.47 |\n", + "| diff1_acf1 | 0 |e_acf1|1.67| garch_acf | 7.53 |\n", + "| diff2x_pacf5 | 0 |arch_acf|2.18| entropy | 9.45 |\n", + "\n", + "\n" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "[![Build](https://github.com/FedericoGarza/tsfeatures/workflows/Python%20package/badge.svg)](https://github.com/FedericoGarza/tsfeatures/tree/master)\n", + "[![PyPI version fury.io](https://badge.fury.io/py/tsfeatures.svg)](https://pypi.python.org/pypi/tsfeatures/)\n", + "[![Downloads](https://pepy.tech/badge/tsfeatures)](https://pepy.tech/project/tsfeatures)\n", + "[![Python 3.6+](https://img.shields.io/badge/python-3.7+-blue.svg)](https://www.python.org/downloads/release/python-370+/)\n", + "[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://github.com/FedericoGarza/tsfeatures/blob/master/LICENSE)\n", + "\n", + "# tsfeatures\n", + "\n", + "Calculates various features from time series data. Python implementation of the R package _[tsfeatures](https://github.com/robjhyndman/tsfeatures)_.\n", + "\n", + "# Installation\n", + "\n", + "You can install the *released* version of `tsfeatures` from the [Python package index](pypi.org) with:\n", + "\n", + "``` python\n", + "pip install tsfeatures\n", + "```\n", + "\n", + "# Usage\n", + "\n", + "The `tsfeatures` main function calculates by default the features used by Montero-Manso, Talagala, Hyndman and Athanasopoulos in [their implementation of the FFORMA model](https://htmlpreview.github.io/?https://github.com/robjhyndman/M4metalearning/blob/master/docs/M4_methodology.html#features).\n", + "\n", + "```python\n", + "from tsfeatures import tsfeatures\n", + "```\n", + "\n", + "This function receives a panel pandas df with columns `unique_id`, `ds`, `y` and optionally the frequency of the data.\n", + "\n", + "\n", + "\n", + "```python\n", + "tsfeatures(panel, freq=7)\n", + "```\n", + "\n", + "By default (`freq=None`) the function will try to infer the frequency of each time series (using `infer_freq` from `pandas` on the `ds` column) and assign a seasonal period according to the built-in dictionary `FREQS`:\n", + "\n", + "```python\n", + "FREQS = {'H': 24, 'D': 1,\n", + " 'M': 12, 'Q': 4,\n", + " 'W':1, 'Y': 1}\n", + "```\n", + "\n", + "You can use your own dictionary using the `dict_freqs` argument:\n", + "\n", + "```python\n", + "tsfeatures(panel, dict_freqs={'D': 7, 'W': 52})\n", + "```\n", + "\n", + "## List of available features\n", + "\n", + "| Features |||\n", + "|:--------|:------|:-------------|\n", + "|acf_features|heterogeneity|series_length|\n", + "|arch_stat|holt_parameters|sparsity|\n", + "|count_entropy|hurst|stability|\n", + "|crossing_points|hw_parameters|stl_features|\n", + "|entropy|intervals|unitroot_kpss|\n", + "|flat_spots|lumpiness|unitroot_pp|\n", + "|frequency|nonlinearity||\n", + "|guerrero|pacf_features||\n", + "\n", + "See the docs for a description of the features. 
To use a particular feature included in the package you need to import it:\n", + "\n", + "```python\n", + "from tsfeatures import acf_features\n", + "\n", + "tsfeatures(panel, freq=7, features=[acf_features])\n", + "```\n", + "\n", + "You can also define your own function and use it together with the included features:\n", + "\n", + "```python\n", + "def number_zeros(x, freq):\n", + "\n", + " number = (x == 0).sum()\n", + " return {'number_zeros': number}\n", + "\n", + "tsfeatures(panel, freq=7, features=[acf_features, number_zeros])\n", + "```\n", + "\n", + "`tsfeatures` can handle functions that receives a numpy array `x` and a frequency `freq` (this parameter is needed even if you don't use it) and returns a dictionary with the feature name as a key and its value.\n", + "\n", + " \n", + "\n", + "# Authors\n", + "\n", + "* **Federico Garza** - [FedericoGarza](https://github.com/FedericoGarza)\n", + "* **Kin Gutierrez** - [kdgutier](https://github.com/kdgutier)\n", + "* **Cristian Challu** - [cristianchallu](https://github.com/cristianchallu)\n", + "* **Jose Moralez** - [jose-moralez](https://github.com/jose-moralez)\n", + "* **Ricardo Olivares** - [rolivaresar](https://github.com/rolivaresar)\n", + "* **Max Mergenthaler** - [mergenthaler](https://github.com/mergenthaler)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/m4_data.ipynb b/nbs/m4_data.ipynb new file mode 100644 index 0000000..03089b2 --- /dev/null +++ b/nbs/m4_data.ipynb @@ -0,0 +1,292 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# features\n", + "\n", + "> Fill in a module description here\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |default_exp m4_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "import os\n", + "import urllib\n", + "\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "\n", + "seas_dict = {\n", + " \"Hourly\": {\"seasonality\": 24, \"input_size\": 24, \"output_size\": 48, \"freq\": \"H\"},\n", + " \"Daily\": {\"seasonality\": 7, \"input_size\": 7, \"output_size\": 14, \"freq\": \"D\"},\n", + " \"Weekly\": {\"seasonality\": 52, \"input_size\": 52, \"output_size\": 13, \"freq\": \"W\"},\n", + " \"Monthly\": {\"seasonality\": 12, \"input_size\": 12, \"output_size\": 18, \"freq\": \"M\"},\n", + " \"Quarterly\": {\"seasonality\": 4, \"input_size\": 4, \"output_size\": 8, \"freq\": \"Q\"},\n", + " \"Yearly\": {\"seasonality\": 1, \"input_size\": 4, \"output_size\": 6, \"freq\": \"D\"},\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "\n", + "SOURCE_URL = (\n", + " \"https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "\n", + "\n", + "def maybe_download(filename, directory):\n", + " \"\"\"Download the data from M4's website, unless it's already here.\n", + "\n", + " Parameters\n", + " ----------\n", + " filename: str\n", + " Filename of M4 data with format /Type/Frequency.csv. Example: /Test/Daily-train.csv\n", + " directory: str\n", + " Custom directory where data will be downloaded.\n", + " \"\"\"\n", + " data_directory = directory + \"/m4\"\n", + " train_directory = data_directory + \"/Train/\"\n", + " test_directory = data_directory + \"/Test/\"\n", + "\n", + " os.makedirs(data_directory, exist_ok=True)\n", + " os.makedirs(train_directory, exist_ok=True)\n", + " os.makedirs(test_directory, exist_ok=True)\n", + "\n", + " filepath = os.path.join(data_directory, filename)\n", + " if not os.path.exists(filepath):\n", + " filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath)\n", + "\n", + " size = os.path.getsize(filepath)\n", + " print(\"Successfully downloaded\", filename, size, \"bytes.\")\n", + "\n", + " return filepath" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "\n", + "\n", + "def m4_parser(dataset_name, directory, num_obs=1000000):\n", + " \"\"\"Transform M4 data into a panel.\n", + "\n", + " Parameters\n", + " ----------\n", + " dataset_name: str\n", + " Frequency of the data. 
Example: 'Yearly'.\n", + " directory: str\n", + " Custom directory where data will be saved.\n", + " num_obs: int\n", + " Number of time series to return.\n", + " \"\"\"\n", + " data_directory = directory + \"/m4\"\n", + " train_directory = data_directory + \"/Train/\"\n", + " test_directory = data_directory + \"/Test/\"\n", + " freq = seas_dict[dataset_name][\"freq\"]\n", + "\n", + " m4_info = pd.read_csv(data_directory + \"/M4-info.csv\", usecols=[\"M4id\", \"category\"])\n", + " m4_info = m4_info[m4_info[\"M4id\"].str.startswith(dataset_name[0])].reset_index(\n", + " drop=True\n", + " )\n", + "\n", + " # Train data\n", + " train_path = \"{}{}-train.csv\".format(train_directory, dataset_name)\n", + "\n", + " train_df = pd.read_csv(train_path, nrows=num_obs)\n", + " train_df = train_df.rename(columns={\"V1\": \"unique_id\"})\n", + "\n", + " train_df = pd.wide_to_long(\n", + " train_df, stubnames=[\"V\"], i=\"unique_id\", j=\"ds\"\n", + " ).reset_index()\n", + " train_df = train_df.rename(columns={\"V\": \"y\"})\n", + " train_df = train_df.dropna()\n", + " train_df[\"split\"] = \"train\"\n", + " train_df[\"ds\"] = train_df[\"ds\"] - 1\n", + " # Get len of series per unique_id\n", + " len_series = train_df.groupby(\"unique_id\").agg({\"ds\": \"max\"}).reset_index()\n", + " len_series.columns = [\"unique_id\", \"len_serie\"]\n", + "\n", + " # Test data\n", + " test_path = \"{}{}-test.csv\".format(test_directory, dataset_name)\n", + "\n", + " test_df = pd.read_csv(test_path, nrows=num_obs)\n", + " test_df = test_df.rename(columns={\"V1\": \"unique_id\"})\n", + "\n", + " test_df = pd.wide_to_long(\n", + " test_df, stubnames=[\"V\"], i=\"unique_id\", j=\"ds\"\n", + " ).reset_index()\n", + " test_df = test_df.rename(columns={\"V\": \"y\"})\n", + " test_df = test_df.dropna()\n", + " test_df[\"split\"] = \"test\"\n", + " test_df = test_df.merge(len_series, on=\"unique_id\")\n", + " test_df[\"ds\"] = test_df[\"ds\"] + test_df[\"len_serie\"] - 1\n", + " test_df = test_df[[\"unique_id\", \"ds\", \"y\", \"split\"]]\n", + "\n", + " df = pd.concat((train_df, test_df))\n", + " df = df.sort_values(by=[\"unique_id\", \"ds\"]).reset_index(drop=True)\n", + "\n", + " # Create column with dates with freq of dataset\n", + " len_series = df.groupby(\"unique_id\").agg({\"ds\": \"max\"}).reset_index()\n", + " dates = []\n", + " for i in range(len(len_series)):\n", + " len_serie = len_series.iloc[i, 1]\n", + " ranges = pd.date_range(start=\"1970/01/01\", periods=len_serie, freq=freq)\n", + " dates += list(ranges)\n", + " df.loc[:, \"ds\"] = dates\n", + "\n", + " df = df.merge(m4_info, left_on=[\"unique_id\"], right_on=[\"M4id\"])\n", + " df.drop(columns=[\"M4id\"], inplace=True)\n", + " df = df.rename(columns={\"category\": \"x\"})\n", + "\n", + " X_train_df = df[df[\"split\"] == \"train\"].filter(items=[\"unique_id\", \"ds\", \"x\"])\n", + " y_train_df = df[df[\"split\"] == \"train\"].filter(items=[\"unique_id\", \"ds\", \"y\"])\n", + " X_test_df = df[df[\"split\"] == \"test\"].filter(items=[\"unique_id\", \"ds\", \"x\"])\n", + " y_test_df = df[df[\"split\"] == \"test\"].filter(items=[\"unique_id\", \"ds\", \"y\"])\n", + "\n", + " X_train_df = X_train_df.reset_index(drop=True)\n", + " y_train_df = y_train_df.reset_index(drop=True)\n", + " X_test_df = X_test_df.reset_index(drop=True)\n", + " y_test_df = y_test_df.reset_index(drop=True)\n", + "\n", + " return X_train_df, y_train_df, X_test_df, y_test_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + 
"# |export\n", + "\n", + "\n", + "def prepare_m4_data(dataset_name, directory, num_obs):\n", + " \"\"\"Pipeline that obtains M4 times series, tranforms it and\n", + " gets naive2 predictions.\n", + "\n", + " Parameters\n", + " ----------\n", + " dataset_name: str\n", + " Frequency of the data. Example: 'Yearly'.\n", + " directory: str\n", + " Custom directory where data will be saved.\n", + " num_obs: int\n", + " Number of time series to return.\n", + " \"\"\"\n", + " m4info_filename = maybe_download(\"M4-info.csv\", directory)\n", + "\n", + " dailytrain_filename = maybe_download(\"Train/Daily-train.csv\", directory)\n", + " hourlytrain_filename = maybe_download(\"Train/Hourly-train.csv\", directory)\n", + " monthlytrain_filename = maybe_download(\"Train/Monthly-train.csv\", directory)\n", + " quarterlytrain_filename = maybe_download(\"Train/Quarterly-train.csv\", directory)\n", + " weeklytrain_filename = maybe_download(\"Train/Weekly-train.csv\", directory)\n", + " yearlytrain_filename = maybe_download(\"Train/Yearly-train.csv\", directory)\n", + "\n", + " dailytest_filename = maybe_download(\"Test/Daily-test.csv\", directory)\n", + " hourlytest_filename = maybe_download(\"Test/Hourly-test.csv\", directory)\n", + " monthlytest_filename = maybe_download(\"Test/Monthly-test.csv\", directory)\n", + " quarterlytest_filename = maybe_download(\"Test/Quarterly-test.csv\", directory)\n", + " weeklytest_filename = maybe_download(\"Test/Weekly-test.csv\", directory)\n", + " yearlytest_filename = maybe_download(\"Test/Yearly-test.csv\", directory)\n", + " print(\"\\n\")\n", + "\n", + " X_train_df, y_train_df, X_test_df, y_test_df = m4_parser(\n", + " dataset_name, directory, num_obs\n", + " )\n", + "\n", + " return X_train_df, y_train_df, X_test_df, y_test_df" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "import nbdev\n", + "\n", + "nbdev.nbdev_export()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml new file mode 100644 index 0000000..9085f8f --- /dev/null +++ b/nbs/nbdev.yml @@ -0,0 +1,9 @@ +project: + output-dir: _docs + +website: + title: "tsfeatures" + site-url: "https://Nixtla.github.io/tsfeatures/" + description: "Calculates various features from time series data. Python implementation of the R package tsfeatures." 
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |hide\n",
+    "from nbdev.showdoc import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |hide\n",
+    "import nbdev\n",
+    "\n",
+    "nbdev.nbdev_export()"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/nbs/nbdev.yml b/nbs/nbdev.yml
new file mode 100644
index 0000000..9085f8f
--- /dev/null
+++ b/nbs/nbdev.yml
@@ -0,0 +1,9 @@
+project:
+  output-dir: _docs
+
+website:
+  title: "tsfeatures"
+  site-url: "https://Nixtla.github.io/tsfeatures/"
+  description: "Calculates various features from time series data. Python implementation of the R package tsfeatures."
+  repo-branch: main
+  repo-url: "https://github.com/Nixtla/tsfeatures"
diff --git a/nbs/styles.css b/nbs/styles.css
new file mode 100644
index 0000000..66ccc49
--- /dev/null
+++ b/nbs/styles.css
@@ -0,0 +1,37 @@
+.cell {
+  margin-bottom: 1rem;
+}
+
+.cell > .sourceCode {
+  margin-bottom: 0;
+}
+
+.cell-output > pre {
+  margin-bottom: 0;
+}
+
+.cell-output > pre, .cell-output > .sourceCode > pre, .cell-output-stdout > pre {
+  margin-left: 0.8rem;
+  margin-top: 0;
+  background: none;
+  border-left: 2px solid lightsalmon;
+  border-top-left-radius: 0;
+  border-top-right-radius: 0;
+}
+
+.cell-output > .sourceCode {
+  border: none;
+}
+
+.cell-output > .sourceCode {
+  background: none;
+  margin-top: 0;
+}
+
+div.description {
+  padding-left: 2px;
+  padding-top: 5px;
+  font-style: italic;
+  font-size: 135%;
+  opacity: 70%;
+}
diff --git a/nbs/tsfeatures.ipynb b/nbs/tsfeatures.ipynb
new file mode 100644
index 0000000..d4b3371
--- /dev/null
+++ b/nbs/tsfeatures.ipynb
@@ -0,0 +1,1753 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# tsfeatures\n",
+    "\n",
+    "> Time series feature functions, ported from the R package tsfeatures.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |default_exp tsfeatures"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "import os\n",
+    "import warnings\n",
+    "from collections import ChainMap\n",
+    "from functools import partial\n",
+    "from multiprocessing import Pool\n",
+    "from typing import Callable, List, Optional\n",
+    "\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "# silence warnings raised by the statistical models while fitting\n",
+    "warnings.warn = lambda *a, **kw: False"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "os.environ[\"MKL_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"NUMEXPR_NUM_THREADS\"] = \"1\"\n",
+    "os.environ[\"OMP_NUM_THREADS\"] = \"1\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "from itertools import groupby\n",
+    "from math import e  # could equivalently use np.e\n",
+    "from typing import Dict\n",
+    "\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from antropy import spectral_entropy\n",
+    "from arch import arch_model\n",
+    "from scipy.optimize import minimize_scalar\n",
+    "from sklearn.linear_model import LinearRegression\n",
+    "from statsmodels.api import OLS, add_constant\n",
+    "from statsmodels.tsa.ar_model import AR\n",
+    "from statsmodels.tsa.holtwinters import ExponentialSmoothing\n",
+    "from statsmodels.tsa.seasonal import STL\n",
+    "from statsmodels.tsa.stattools import acf, kpss, pacf\n",
+    "from supersmoother import SuperSmoother\n",
+    "\n",
+    "from tsfeatures.utils import *"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from math import isclose\n",
+    "\n",
+    "from fastcore.test import *\n",
+    "\n",
+    "from tsfeatures.m4_data import *\n",
+    "from tsfeatures.utils import USAccDeaths, WWWusage"
+   ]
+  },
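+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Every feature below follows the same contract: a callable that takes a numpy array `x` and a seasonal period `freq`, and returns a dict mapping feature names to values. A minimal sketch of that contract, using the `number_zeros` toy feature from the README (not an exported function):\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def number_zeros(x: np.ndarray, freq: int = 1) -> Dict[str, float]:\n",
+    "    # freq is required by the contract even when unused\n",
+    "    return {'number_zeros': float((x == 0).sum())}\n",
+    "\n",
+    "number_zeros(np.array([0.0, 1.0, 0.0, 2.0]))"
+   ]
+  },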
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "def acf_features(x: np.array, freq: int = 1) -> Dict[str, float]:\n",
+    "    \"\"\"Calculates autocorrelation function features.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x: numpy array\n",
+    "        The time series.\n",
+    "    freq: int\n",
+    "        Frequency of the time series\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        'x_acf1': First autocorrelation coefficient.\n",
+    "        'x_acf10': Sum of squares of first 10 autocorrelation coefficients.\n",
+    "        'diff1_acf1': First autocorrelation coefficient of differenced series.\n",
+    "        'diff1_acf10': Sum of squares of first 10 autocorrelation coefficients\n",
+    "                       of differenced series.\n",
+    "        'diff2_acf1': First autocorrelation coefficient of twice-differenced series.\n",
+    "        'diff2_acf10': Sum of squares of first 10 autocorrelation coefficients of\n",
+    "                       twice-differenced series.\n",
+    "\n",
+    "        Only for seasonal data (freq > 1).\n",
+    "        'seas_acf1': Autocorrelation coefficient at the first seasonal lag.\n",
+    "    \"\"\"\n",
+    "    m = freq\n",
+    "    size_x = len(x)\n",
+    "\n",
+    "    acfx = acf(x, nlags=max(m, 10), fft=False)\n",
+    "    if size_x > 10:\n",
+    "        acfdiff1x = acf(np.diff(x, n=1), nlags=10, fft=False)\n",
+    "    else:\n",
+    "        acfdiff1x = [np.nan] * 2\n",
+    "\n",
+    "    if size_x > 11:\n",
+    "        acfdiff2x = acf(np.diff(x, n=2), nlags=10, fft=False)\n",
+    "    else:\n",
+    "        acfdiff2x = [np.nan] * 2\n",
+    "\n",
+    "    # first autocorrelation coefficient\n",
+    "    try:\n",
+    "        acf_1 = acfx[1]\n",
+    "    except:\n",
+    "        acf_1 = np.nan\n",
+    "\n",
+    "    # sum of squares of first 10 autocorrelation coefficients\n",
+    "    sum_of_sq_acf10 = np.sum((acfx[1:11]) ** 2) if size_x > 10 else np.nan\n",
+    "    # first autocorrelation coefficient of differenced series\n",
+    "    diff1_acf1 = acfdiff1x[1]\n",
+    "    # sum of squares of first 10 autocorrelation coefficients of differenced series\n",
+    "    diff1_acf10 = np.sum((acfdiff1x[1:11]) ** 2) if size_x > 10 else np.nan\n",
+    "    # first autocorrelation coefficient of twice-differenced series\n",
+    "    diff2_acf1 = acfdiff2x[1]\n",
+    "    # sum of squares of first 10 autocorrelation coefficients of twice-differenced series\n",
+    "    diff2_acf10 = np.sum((acfdiff2x[1:11]) ** 2) if size_x > 11 else np.nan\n",
+    "\n",
+    "    output = {\n",
+    "        \"x_acf1\": acf_1,\n",
+    "        \"x_acf10\": sum_of_sq_acf10,\n",
+    "        \"diff1_acf1\": diff1_acf1,\n",
+    "        \"diff1_acf10\": diff1_acf10,\n",
+    "        \"diff2_acf1\": diff2_acf1,\n",
+    "        \"diff2_acf10\": diff2_acf10,\n",
+    "    }\n",
+    "\n",
+    "    if m > 1:\n",
+    "        output[\"seas_acf1\"] = acfx[m] if len(acfx) > m else np.nan\n",
+    "\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def test_acf_features_seasonal():\n",
+    "    z = acf_features(USAccDeaths, 12)\n",
+    "    assert isclose(len(z), 7)\n",
+    "    assert isclose(z[\"x_acf1\"], 0.70, abs_tol=0.01)\n",
+    "    assert isclose(z[\"x_acf10\"], 1.20, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff1_acf1\"], 0.023, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff1_acf10\"], 0.27, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff2_acf1\"], -0.48, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff2_acf10\"], 0.74, abs_tol=0.01)\n",
+    "    assert isclose(z[\"seas_acf1\"], 0.62, abs_tol=0.01)\n",
+    "\n",
+    "\n",
+    "test_acf_features_seasonal()\n",
+    "\n",
+    "\n",
+    "def test_acf_features_non_seasonal():\n",
+    "    z = acf_features(WWWusage, 1)\n",
+    "    assert isclose(len(z), 6)\n",
+    "    assert isclose(z[\"x_acf1\"], 0.96, abs_tol=0.01)\n",
+    "    assert isclose(z[\"x_acf10\"], 4.19, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff1_acf1\"], 0.79, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff1_acf10\"], 1.40, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff2_acf1\"], 0.17, abs_tol=0.01)\n",
+    "    assert isclose(z[\"diff2_acf10\"], 0.33, abs_tol=0.01)\n",
+    "\n",
+    "\n",
+    "test_acf_features_non_seasonal()"
+   ]
+  },
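+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A quick sanity sketch (not a formal test): for iid Gaussian noise the autocorrelations should be close to zero, so `x_acf1` and `x_acf10` come out small; the exact values vary with the seed.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# white noise: all autocorrelation-based features should be near zero\n",
+    "rng = np.random.RandomState(1)\n",
+    "acf_features(rng.randn(200), 1)"
+   ]
+  },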
isclose(z[\"diff1_acf1\"], 0.79, abs_tol=0.01)\n", + " assert isclose(z[\"diff1_acf10\"], 1.40, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf1\"], 0.17, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf10\"], 0.33, abs_tol=0.01)\n", + "\n", + "\n", + "test_acf_features_non_seasonal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def arch_stat(\n", + " x: np.array, freq: int = 1, lags: int = 12, demean: bool = True\n", + ") -> Dict[str, float]:\n", + " \"\"\"Arch model features.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'arch_lm': R^2 value of an autoregressive model of order lags applied to x**2.\n", + " \"\"\"\n", + " if len(x) <= lags + 1:\n", + " return {\"arch_lm\": np.nan}\n", + " if demean:\n", + " x = x - np.mean(x)\n", + "\n", + " size_x = len(x)\n", + " mat = embed(x**2, lags + 1)\n", + " X = mat[:, 1:]\n", + " y = np.vstack(mat[:, 0])\n", + "\n", + " try:\n", + " r_squared = LinearRegression().fit(X, y).score(X, y)\n", + " except:\n", + " r_squared = np.nan\n", + "\n", + " return {\"arch_lm\": r_squared}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def test_arch_stat_seasonal():\n", + " z = arch_stat(USAccDeaths, 12)\n", + " test_close(len(z), 1)\n", + " test_close(z[\"arch_lm\"], 0.54, eps=0.01)\n", + "\n", + "\n", + "test_arch_stat_seasonal()\n", + "\n", + "\n", + "def test_arch_stat_non_seasonal():\n", + " z = arch_stat(WWWusage, 12)\n", + " test_close(len(z), 1)\n", + " test_close(z[\"arch_lm\"], 0.98, eps=0.01)\n", + "\n", + "\n", + "test_arch_stat_non_seasonal()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def count_entropy(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Count entropy.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'count_entropy': Entropy using only positive data.\n", + " \"\"\"\n", + " entropy = x[x > 0] * np.log(x[x > 0])\n", + " entropy = -entropy.sum()\n", + "\n", + " return {\"count_entropy\": entropy}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def crossing_points(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Crossing points.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'crossing_points': Number of times that x crosses the median.\n", + " \"\"\"\n", + " midline = np.median(x)\n", + " ab = x <= midline\n", + " lenx = len(x)\n", + " p1 = ab[: (lenx - 1)]\n", + " p2 = ab[1:]\n", + " cross = (p1 & (~p2)) | (p2 & (~p1))\n", + "\n", + " return {\"crossing_points\": cross.sum()}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def entropy(x: np.array, freq: int = 1, base: float = e) -> Dict[str, float]:\n", + " \"\"\"Calculates sample entropy.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The 
time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'entropy': Wrapper of the function spectral_entropy.\n", + " \"\"\"\n", + " try:\n", + " with np.errstate(divide=\"ignore\"):\n", + " entropy = spectral_entropy(x, 1, normalize=True)\n", + " except:\n", + " entropy = np.nan\n", + "\n", + " return {\"entropy\": entropy}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def flat_spots(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Flat spots.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'flat_spots': Number of flat spots in x.\n", + " \"\"\"\n", + " try:\n", + " cutx = pd.cut(x, bins=10, include_lowest=True, labels=False) + 1\n", + " except:\n", + " return {\"flat_spots\": np.nan}\n", + "\n", + " rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max()\n", + " return {\"flat_spots\": rlex}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def frequency(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Frequency.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'frequency': Wrapper of freq.\n", + " \"\"\"\n", + "\n", + " return {\"frequency\": freq}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def guerrero(\n", + " x: np.array, freq: int = 1, lower: int = -1, upper: int = 2\n", + ") -> Dict[str, float]:\n", + " \"\"\"Applies Guerrero's (1993) method to select the lambda which minimises the\n", + " coefficient of variation for subseries of x.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series.\n", + " lower: float\n", + " The lower bound for lambda.\n", + " upper: float\n", + " The upper bound for lambda.\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'guerrero': Minimum coefficient of variation for subseries of x.\n", + "\n", + " References\n", + " ----------\n", + " [1] Guerrero, V.M. 
(1993) Time-series analysis supported by power transformations.\n",
+    "        Journal of Forecasting, 12, 37–48.\n",
+    "    \"\"\"\n",
+    "    func_to_min = lambda lambda_par: lambda_coef_var(lambda_par, x=x, period=freq)\n",
+    "\n",
+    "    min_ = minimize_scalar(func_to_min, bounds=[lower, upper])\n",
+    "    min_ = min_[\"fun\"]\n",
+    "\n",
+    "    return {\"guerrero\": min_}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "def heterogeneity(x: np.array, freq: int = 1) -> Dict[str, float]:\n",
+    "    \"\"\"Heterogeneity.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x: numpy array\n",
+    "        The time series.\n",
+    "    freq: int\n",
+    "        Frequency of the time series\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        'arch_acf': Sum of squares of the first 12 autocorrelations of the\n",
+    "                    residuals of the AR model applied to x\n",
+    "        'garch_acf': Sum of squares of the first 12 autocorrelations of the\n",
+    "                     residuals of the GARCH model applied to x\n",
+    "        'arch_r2': Function arch_stat applied to the residuals of the\n",
+    "                   AR model applied to x.\n",
+    "        'garch_r2': Function arch_stat applied to the residuals of the GARCH\n",
+    "                    model applied to x.\n",
+    "    \"\"\"\n",
+    "    m = freq\n",
+    "\n",
+    "    size_x = len(x)\n",
+    "    order_ar = min(size_x - 1, np.floor(10 * np.log10(size_x)))\n",
+    "    order_ar = int(order_ar)\n",
+    "\n",
+    "    try:\n",
+    "        x_whitened = AR(x).fit(maxlag=order_ar, ic=\"aic\", trend=\"c\").resid\n",
+    "    except:\n",
+    "        try:\n",
+    "            x_whitened = AR(x).fit(maxlag=order_ar, ic=\"aic\", trend=\"nc\").resid\n",
+    "        except:\n",
+    "            output = {\n",
+    "                \"arch_acf\": np.nan,\n",
+    "                \"garch_acf\": np.nan,\n",
+    "                \"arch_r2\": np.nan,\n",
+    "                \"garch_r2\": np.nan,\n",
+    "            }\n",
+    "\n",
+    "            return output\n",
+    "\n",
+    "    # ARCH and Box tests on the whitened series\n",
+    "    x_archtest = arch_stat(x_whitened, m)[\"arch_lm\"]\n",
+    "    LBstat = (acf(x_whitened**2, nlags=12, fft=False)[1:] ** 2).sum()\n",
+    "    # fit a GARCH model\n",
+    "    garch_fit = arch_model(x_whitened, vol=\"GARCH\", rescale=False).fit(disp=\"off\")\n",
+    "    # compare the ARCH test before and after fitting the GARCH model\n",
+    "    garch_fit_std = garch_fit.resid\n",
+    "    x_garch_archtest = arch_stat(garch_fit_std, m)[\"arch_lm\"]\n",
+    "    # compare the Box test of squared residuals before and after fitting the GARCH model\n",
+    "    LBstat2 = (acf(garch_fit_std**2, nlags=12, fft=False)[1:] ** 2).sum()\n",
+    "\n",
+    "    output = {\n",
+    "        \"arch_acf\": LBstat,\n",
+    "        \"garch_acf\": LBstat2,\n",
+    "        \"arch_r2\": x_archtest,\n",
+    "        \"garch_r2\": x_garch_archtest,\n",
+    "    }\n",
+    "\n",
+    "    return output"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# |export\n",
+    "def holt_parameters(x: np.array, freq: int = 1) -> Dict[str, float]:\n",
+    "    \"\"\"Fitted parameters of a Holt model.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    x: numpy array\n",
+    "        The time series.\n",
+    "    freq: int\n",
+    "        Frequency of the time series\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        'alpha': Level parameter of the Holt model.\n",
+    "        'beta': Trend parameter of the Holt model.\n",
+    "    \"\"\"\n",
+    "    try:\n",
+    "        fit = ExponentialSmoothing(x, trend=\"add\", seasonal=None).fit()\n",
+    "        params = {\n",
+    "            \"alpha\": fit.params[\"smoothing_level\"],\n",
+    "            \"beta\": fit.params[\"smoothing_trend\"],\n",
+    "        }\n",
+    "    except:\n",
+    "        params = {\"alpha\": np.nan, \"beta\": np.nan}\n",
+    "\n",
+    "    return params"
+   ]
+  },
+  {
+   "cell_type": "code",
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def hurst(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Hurst index.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'hurst': Hurst exponent.\n", + " \"\"\"\n", + " try:\n", + " hurst_index = hurst_exponent(x)\n", + " except:\n", + " hurst_index = np.nan\n", + "\n", + " return {\"hurst\": hurst_index}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def hw_parameters(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Fitted parameters of a Holt-Winters model.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'hw_alpha': Level parameter of the HW model.\n", + " 'hw_beta': Trend parameter of the HW model.\n", + " 'hw_gamma': Seasonal parameter of the HW model.\n", + " \"\"\"\n", + " try:\n", + " fit = ExponentialSmoothing(\n", + " x, seasonal_periods=freq, trend=\"add\", seasonal=\"add\"\n", + " ).fit()\n", + " params = {\n", + " \"hw_alpha\": fit.params[\"smoothing_level\"],\n", + " \"hw_beta\": fit.params[\"smoothing_trend\"],\n", + " \"hw_gamma\": fit.params[\"smoothing_seasonal\"],\n", + " }\n", + " except:\n", + " params = {\"hw_alpha\": np.nan, \"hw_beta\": np.nan, \"hw_gamma\": np.nan}\n", + "\n", + " return params" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def intervals(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Intervals with demand.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'intervals_mean': Mean of intervals with positive values.\n", + " 'intervals_sd': SD of intervals with positive values.\n", + " \"\"\"\n", + " x[x > 0] = 1\n", + "\n", + " y = [sum(val) for keys, val in groupby(x, key=lambda k: k != 0) if keys != 0]\n", + " y = np.array(y)\n", + "\n", + " return {\"intervals_mean\": np.mean(y), \"intervals_sd\": np.std(y, ddof=1)}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def lumpiness(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"lumpiness.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'lumpiness': Variance of the variances of tiled windows.\n", + " \"\"\"\n", + " if freq == 1:\n", + " width = 10\n", + " else:\n", + " width = freq\n", + "\n", + " nr = len(x)\n", + " lo = np.arange(0, nr, width)\n", + " up = lo + width\n", + " nsegs = nr / width\n", + " varx = [np.nanvar(x[lo[idx] : up[idx]], ddof=1) for idx in np.arange(int(nsegs))]\n", + "\n", + " if len(x) < 2 * width:\n", + " lumpiness = 0\n", + " else:\n", + " lumpiness = np.nanvar(varx, ddof=1)\n", + "\n", + " return {\"lumpiness\": lumpiness}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 
|export\n", + "def nonlinearity(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Nonlinearity.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'nonlinearity': 10 t**2/len(x) where t is the statistic used in\n", + " Terasvirta's test.\n", + " \"\"\"\n", + " try:\n", + " test = terasvirta_test(x)\n", + " test = 10 * test / len(x)\n", + " except:\n", + " test = np.nan\n", + "\n", + " return {\"nonlinearity\": test}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def pacf_features(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Calculates partial autocorrelation function features.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'x_pacf5': Sum of squares of the first 5 partial autocorrelation\n", + " coefficients.\n", + " 'diff1x_pacf5': Sum of squares of the first 5 partial autocorrelation\n", + " coefficients of differenced series.\n", + " 'diff2x_pacf5': Sum of squares of the first 5 partial autocorrelation\n", + " coefficients of twice-differenced series.\n", + "\n", + " Only for seasonal data (freq > 1).\n", + " 'seas_pacf': Partial autocorrelation\n", + " coefficient at the first seasonal lag.\n", + " \"\"\"\n", + " m = freq\n", + "\n", + " nlags_ = max(m, 5)\n", + "\n", + " if len(x) > 1:\n", + " try:\n", + " pacfx = pacf(x, nlags=nlags_, method=\"ldb\")\n", + " except:\n", + " pacfx = np.nan\n", + " else:\n", + " pacfx = np.nan\n", + " # Sum of first 6 PACs squared\n", + " if len(x) > 5 and not np.all(np.isnan(pacfx)):\n", + " pacf_5 = np.sum(pacfx[1:6] ** 2)\n", + " else:\n", + " pacf_5 = np.nan\n", + " # Sum of first 5 PACs of difference series squared\n", + " if len(x) > 6:\n", + " try:\n", + " diff1_pacf = pacf(np.diff(x, n=1), nlags=5, method=\"ldb\")[1:6]\n", + " diff1_pacf_5 = np.sum(diff1_pacf**2)\n", + " except:\n", + " diff1_pacf_5 = np.nan\n", + " else:\n", + " diff1_pacf_5 = np.nan\n", + " # Sum of first 5 PACs of twice differenced series squared\n", + " if len(x) > 7:\n", + " try:\n", + " diff2_pacf = pacf(np.diff(x, n=2), nlags=5, method=\"ldb\")[1:6]\n", + " diff2_pacf_5 = np.sum(diff2_pacf**2)\n", + " except:\n", + " diff2_pacf_5 = np.nan\n", + " else:\n", + " diff2_pacf_5 = np.nan\n", + "\n", + " output = {\n", + " \"x_pacf5\": pacf_5,\n", + " \"diff1x_pacf5\": diff1_pacf_5,\n", + " \"diff2x_pacf5\": diff2_pacf_5,\n", + " }\n", + "\n", + " if m > 1:\n", + " output[\"seas_pacf\"] = pacfx[m] if len(pacfx) > m else np.nan\n", + "\n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def series_length(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Series length.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'series_length': Wrapper of len(x).\n", + " \"\"\"\n", + "\n", + " return {\"series_length\": len(x)}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def sparsity(x: np.array, freq: 
int = 1) -> Dict[str, float]:\n", + " \"\"\"Sparsity.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'sparsity': Fraction of observations with zero values.\n", + " \"\"\"\n", + "\n", + " return {\"sparsity\": np.mean(x == 0)}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def stability(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Stability.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'stability': Variance of the means of tiled windows.\n", + " \"\"\"\n", + " if freq == 1:\n", + " width = 10\n", + " else:\n", + " width = freq\n", + "\n", + " nr = len(x)\n", + " lo = np.arange(0, nr, width)\n", + " up = lo + width\n", + " nsegs = nr / width\n", + " meanx = [np.nanmean(x[lo[idx] : up[idx]]) for idx in np.arange(int(nsegs))]\n", + "\n", + " if len(x) < 2 * width:\n", + " stability = 0\n", + " else:\n", + " stability = np.nanvar(meanx, ddof=1)\n", + "\n", + " return {\"stability\": stability}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Calculates seasonal trend using loess decomposition.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'nperiods': Number of seasonal periods in x.\n", + " 'seasonal_period': Frequency of the time series.\n", + " 'trend': Strength of trend.\n", + " 'spike': Measures \"spikiness\" of x.\n", + " 'linearity': Linearity of x based on the coefficients of an\n", + " orthogonal quadratic regression.\n", + " 'curvature': Curvature of x based on the coefficients of an\n", + " orthogonal quadratic regression.\n", + " 'e_acf1': First autocorrelation coefficient of the remainder series.\n", + " 'e_acf10': Sum of squares of the first 10 autocorrelation\n", + " coefficients of the remainder series.\n", + "\n", + " Only for seasonal data (freq > 1).\n", + " 'seasonal_strength': Strength of seasonality.\n", + " 'peak': Strength of peaks.\n", + " 'trough': Strength of trough.\n", + " \"\"\"\n", + " m = freq\n", + " nperiods = int(m > 1)\n", + " # STL fits\n", + " if m > 1:\n", + " try:\n", + " stlfit = STL(x, m, 13).fit()\n", + " except:\n", + " output = {\n", + " \"nperiods\": nperiods,\n", + " \"seasonal_period\": m,\n", + " \"trend\": np.nan,\n", + " \"spike\": np.nan,\n", + " \"linearity\": np.nan,\n", + " \"curvature\": np.nan,\n", + " \"e_acf1\": np.nan,\n", + " \"e_acf10\": np.nan,\n", + " \"seasonal_strength\": np.nan,\n", + " \"peak\": np.nan,\n", + " \"trough\": np.nan,\n", + " }\n", + "\n", + " return output\n", + "\n", + " trend0 = stlfit.trend\n", + " remainder = stlfit.resid\n", + " seasonal = stlfit.seasonal\n", + " else:\n", + " deseas = x\n", + " t = np.arange(len(x)) + 1\n", + " try:\n", + " trend0 = SuperSmoother().fit(t, deseas).predict(t)\n", + " except:\n", + " output = {\n", + " \"nperiods\": nperiods,\n", + " \"seasonal_period\": m,\n", + " \"trend\": np.nan,\n", + " \"spike\": np.nan,\n", + " \"linearity\": np.nan,\n", + " \"curvature\": np.nan,\n", + " \"e_acf1\": np.nan,\n", + " \"e_acf10\": np.nan,\n", + " }\n", + "\n", + 
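A hedged sketch of the tiled-window computation behind `stability` above: split the series into non-overlapping windows, take the mean of each tile, and report the sample variance of those means (toy data, window width of 10 assumed):

```python
import numpy as np

rng = np.random.default_rng(1)
x = np.sin(np.arange(100) / 5.0) + rng.normal(0.0, 0.1, size=100)

# Non-overlapping (tiled) windows; lumpiness is the analogous computation
# with the variance of each tile instead of its mean.
width = 10
means = [np.nanmean(x[lo:lo + width]) for lo in range(0, len(x), width)]
stability = float(np.nanvar(means, ddof=1))
print(round(stability, 4))
```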
" return output\n", + "\n", + " remainder = deseas - trend0\n", + " seasonal = np.zeros(len(x))\n", + " # De-trended and de-seasonalized data\n", + " detrend = x - trend0\n", + " deseason = x - seasonal\n", + " fits = x - remainder\n", + " # Summay stats\n", + " n = len(x)\n", + " varx = np.nanvar(x, ddof=1)\n", + " vare = np.nanvar(remainder, ddof=1)\n", + " vardetrend = np.nanvar(detrend, ddof=1)\n", + " vardeseason = np.nanvar(deseason, ddof=1)\n", + " # Measure of trend strength\n", + " if varx < np.finfo(float).eps:\n", + " trend = 0\n", + " elif vardeseason / varx < 1e-10:\n", + " trend = 0\n", + " else:\n", + " trend = max(0, min(1, 1 - vare / vardeseason))\n", + " # Measure of seasonal strength\n", + " if m > 1:\n", + " if varx < np.finfo(float).eps:\n", + " season = 0\n", + " elif np.nanvar(remainder + seasonal, ddof=1) < np.finfo(float).eps:\n", + " season = 0\n", + " else:\n", + " season = max(0, min(1, 1 - vare / np.nanvar(remainder + seasonal, ddof=1)))\n", + "\n", + " peak = (np.argmax(seasonal) + 1) % m\n", + " peak = m if peak == 0 else peak\n", + "\n", + " trough = (np.argmin(seasonal) + 1) % m\n", + " trough = m if trough == 0 else trough\n", + " # Compute measure of spikiness\n", + " d = (remainder - np.nanmean(remainder)) ** 2\n", + " varloo = (vare * (n - 1) - d) / (n - 2)\n", + " spike = np.nanvar(varloo, ddof=1)\n", + " # Compute measures of linearity and curvature\n", + " time = np.arange(n) + 1\n", + " poly_m = poly(time, 2)\n", + " time_x = add_constant(poly_m)\n", + " coefs = OLS(trend0, time_x).fit().params\n", + "\n", + " try:\n", + " linearity = coefs[1]\n", + " except:\n", + " linearity = np.nan\n", + " try:\n", + " curvature = -coefs[2]\n", + " except:\n", + " curvature = np.nan\n", + " # ACF features\n", + " acfremainder = acf_features(remainder, m)\n", + " # Assemble features\n", + " output = {\n", + " \"nperiods\": nperiods,\n", + " \"seasonal_period\": m,\n", + " \"trend\": trend,\n", + " \"spike\": spike,\n", + " \"linearity\": linearity,\n", + " \"curvature\": curvature,\n", + " \"e_acf1\": acfremainder[\"x_acf1\"],\n", + " \"e_acf10\": acfremainder[\"x_acf10\"],\n", + " }\n", + "\n", + " if m > 1:\n", + " output[\"seasonal_strength\"] = season\n", + " output[\"peak\"] = peak\n", + " output[\"trough\"] = trough\n", + "\n", + " return output" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def unitroot_kpss(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Unit root kpss.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'unitroot_kpss': Statistic for the Kwiatowski et al unit root test.\n", + " \"\"\"\n", + " n = len(x)\n", + " nlags = int(4 * (n / 100) ** (1 / 4))\n", + "\n", + " try:\n", + " test_kpss, _, _, _ = kpss(x, nlags=nlags)\n", + " except:\n", + " test_kpss = np.nan\n", + "\n", + " return {\"unitroot_kpss\": test_kpss}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def unitroot_pp(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Unit root pp.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'unitroot_pp': Statistic for the 
Phillips-Perron unit root test.\n", + " \"\"\"\n", + " try:\n", + " test_pp = ur_pp(x)\n", + " except:\n", + " test_pp = np.nan\n", + "\n", + " return {\"unitroot_pp\": test_pp}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def statistics(x: np.array, freq: int = 1) -> Dict[str, float]:\n", + " \"\"\"Computes basic statistics of x.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " The time series.\n", + " freq: int\n", + " Frequency of the time series\n", + "\n", + " Returns\n", + " -------\n", + " dict\n", + " 'total_sum': Total sum of the series.\n", + " 'mean': Mean value.\n", + " 'variance': Variance of the time series.\n", + " 'median': Median value.\n", + " 'p2point5': 2.5th percentile.\n", + " 'p5': 5th percentile.\n", + " 'p25': 25th percentile.\n", + " 'p75': 75th percentile.\n", + " 'p95': 95th percentile.\n", + " 'p97point5': 97.5th percentile.\n", + " 'max': Maximum value.\n", + " 'min': Minimum value.\n", + " \"\"\"\n", + " res = dict(\n", + " total_sum=np.sum(x),\n", + " mean=np.mean(x),\n", + " variance=np.var(x, ddof=1),\n", + " median=np.median(x),\n", + " p2point5=np.quantile(x, q=0.025),\n", + " p5=np.quantile(x, q=0.05),\n", + " p25=np.quantile(x, q=0.25),\n", + " p75=np.quantile(x, q=0.75),\n", + " p95=np.quantile(x, q=0.95),\n", + " p97point5=np.quantile(x, q=0.975),\n", + " max=np.max(x),\n", + " min=np.min(x),\n", + " )\n", + "\n", + " return res" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def _get_feats(\n", + " index,\n", + " ts,\n", + " freq,\n", + " scale=True,\n", + " features=[\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " heterogeneity,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " ],\n", + " dict_freqs=FREQS,\n", + "):\n", + " if freq is None:\n", + " inf_freq = pd.infer_freq(ts[\"ds\"])\n", + " if inf_freq is None:\n", + " raise Exception(\n", + " \"Failed to infer frequency from the `ds` column, \"\n", + " \"please provide the frequency using the `freq` argument.\"\n", + " )\n", + "\n", + " freq = dict_freqs.get(inf_freq)\n", + " if freq is None:\n", + " raise Exception(\n", + " \"Error trying to convert inferred frequency from the `ds` column \"\n", + " \"to integer. Please provide a dictionary with that frequency \"\n", + " \"as key and the integer frequency as value. 
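A sketch of the trend/seasonal strength definitions used by `stl_features` above, on a toy monthly series; the `period` and `seasonal` arguments mirror the positional `STL(x, m, 13)` call, and the data are illustrative:

```python
import numpy as np
from statsmodels.tsa.seasonal import STL

rng = np.random.default_rng(2)
t = np.arange(120)
x = 0.05 * t + np.sin(2 * np.pi * t / 12) + rng.normal(0.0, 0.2, size=120)

res = STL(x, period=12, seasonal=13).fit()
vare = np.nanvar(res.resid, ddof=1)
# Deseasonalized variation = trend + remainder; detrended = seasonal + remainder.
trend = max(0.0, min(1.0, 1 - vare / np.nanvar(res.resid + res.trend, ddof=1)))
season = max(0.0, min(1.0, 1 - vare / np.nanvar(res.resid + res.seasonal, ddof=1)))
print(round(trend, 2), round(season, 2))
```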
\"\n", + " f\"Infered frequency: {inf_freq}\"\n", + " )\n", + "\n", + " if isinstance(ts, pd.DataFrame):\n", + " assert \"y\" in ts.columns\n", + " ts = ts[\"y\"].values\n", + "\n", + " if isinstance(ts, pd.Series):\n", + " ts = ts.values\n", + "\n", + " if scale:\n", + " ts = scalets(ts)\n", + "\n", + " c_map = ChainMap(\n", + " *[dict_feat for dict_feat in [func(ts, freq) for func in features]]\n", + " )\n", + "\n", + " return pd.DataFrame(dict(c_map), index=[index])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def tsfeatures(\n", + " ts: pd.DataFrame,\n", + " freq: Optional[int] = None,\n", + " features: List[Callable] = [\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " heterogeneity,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " ],\n", + " dict_freqs: Dict[str, int] = FREQS,\n", + " scale: bool = True,\n", + " threads: Optional[int] = None,\n", + ") -> pd.DataFrame:\n", + " \"\"\"Calculates features for time series.\n", + "\n", + " Parameters\n", + " ----------\n", + " ts: pandas df\n", + " Pandas DataFrame with columns ['unique_id', 'ds', 'y'].\n", + " Long panel of time series.\n", + " freq: int\n", + " Frequency of the time series. If None the frequency of\n", + " each time series is infered and assigns the seasonal periods according to\n", + " dict_freqs.\n", + " features: iterable\n", + " Iterable of features functions.\n", + " scale: bool\n", + " Whether (mean-std)scale data.\n", + " dict_freqs: dict\n", + " Dictionary that maps string frequency of int. Ex: {'D': 7, 'W': 1}\n", + " threads: int\n", + " Number of threads to use. 
If None (default), all available cores are used.\n", + "\n", + " Returns\n", + " -------\n", + " pandas df\n", + " Pandas DataFrame where each column is a feature and each row\n", + " a time series.\n", + " \"\"\"\n", + " partial_get_feats = partial(\n", + " _get_feats, freq=freq, scale=scale, features=features, dict_freqs=dict_freqs\n", + " )\n", + "\n", + " with Pool(threads) as pool:\n", + " ts_features = pool.starmap(partial_get_feats, ts.groupby(\"unique_id\"))\n", + "\n", + " ts_features = pd.concat(ts_features).rename_axis(\"unique_id\")\n", + " ts_features = ts_features.reset_index()\n", + "\n", + " return ts_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def _get_feats_wide(\n", + " index,\n", + " ts,\n", + " scale=True,\n", + " features=[\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " heterogeneity,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " ],\n", + "):\n", + " seasonality = ts[\"seasonality\"].item()\n", + " y = ts[\"y\"].item()\n", + " y = np.array(y)\n", + "\n", + " if scale:\n", + " y = scalets(y)\n", + "\n", + " c_map = ChainMap(\n", + " *[dict_feat for dict_feat in [func(y, seasonality) for func in features]]\n", + " )\n", + "\n", + " return pd.DataFrame(dict(c_map), index=[index])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def tsfeatures_wide(\n", + " ts: pd.DataFrame,\n", + " features: List[Callable] = [\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " heterogeneity,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " ],\n", + " scale: bool = True,\n", + " threads: Optional[int] = None,\n", + ") -> pd.DataFrame:\n", + " \"\"\"Calculates features for time series.\n", + "\n", + " Parameters\n", + " ----------\n", + " ts: pandas df\n", + " Pandas DataFrame with columns ['unique_id', 'seasonality', 'y'].\n", + " Wide panel of time series.\n", + " features: iterable\n", + " Iterable of feature functions.\n", + " scale: bool\n", + " Whether to (mean-std) scale the data.\n", + " threads: int\n", + " Number of threads to use. 
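A hypothetical end-to-end call on a tiny long panel, matching the `['unique_id', 'ds', 'y']` layout documented above; the data, seed, and feature subset are illustrative. Since `tsfeatures` parallelizes with `multiprocessing.Pool`, wrap the call in a `__main__` guard when running it as a script on Windows:

```python
import numpy as np
import pandas as pd

from tsfeatures import series_length, sparsity, tsfeatures

rng = np.random.default_rng(3)
panel = pd.DataFrame({
    "unique_id": ["s1"] * 30 + ["s2"] * 30,
    "ds": np.tile(pd.date_range("2020-01-01", periods=30, freq="D"), 2),
    "y": rng.normal(size=60),
})

if __name__ == "__main__":
    # freq=7 treats the daily series as weekly-seasonal; scale=False keeps raw units.
    feats = tsfeatures(panel, freq=7, features=[series_length, sparsity], scale=False)
    print(feats)
```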
If None (default), all available cores are used.\n", + "\n", + " Returns\n", + " -------\n", + " pandas df\n", + " Pandas DataFrame where each column is a feature and each row\n", + " a time series.\n", + " \"\"\"\n", + " partial_get_feats = partial(_get_feats_wide, scale=scale, features=features)\n", + "\n", + " with Pool(threads) as pool:\n", + " ts_features = pool.starmap(partial_get_feats, ts.groupby(\"unique_id\"))\n", + "\n", + " ts_features = pd.concat(ts_features).rename_axis(\"unique_id\")\n", + " ts_features = ts_features.reset_index()\n", + "\n", + " return ts_features" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_pipeline\n", + "def test_pipeline():\n", + " def calculate_features_m4(dataset_name, directory, num_obs=1000000):\n", + " _, y_train_df, _, _ = prepare_m4_data(\n", + " dataset_name=dataset_name, directory=directory, num_obs=num_obs\n", + " )\n", + "\n", + " freq = FREQS[dataset_name[0]]\n", + "\n", + " py_feats = tsfeatures(\n", + " y_train_df, freq=freq, features=[count_entropy]\n", + " ).set_index(\"unique_id\")\n", + "\n", + " calculate_features_m4(\"Hourly\", \"data\", 100)\n", + " calculate_features_m4(\"Daily\", \"data\", 100)\n", + "\n", + "\n", + "test_pipeline()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_statistics\n", + "# TODO apply an assert to the test case\n", + "def test_scale():\n", + " z = np.zeros(10)\n", + " z[-1] = 1\n", + " df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 11), \"y\": z})\n", + " features = tsfeatures(df, freq=7, scale=True, features=[statistics])\n", + " print(features)\n", + "\n", + "\n", + "# TODO apply an assert to the test case\n", + "def test_no_scale():\n", + " z = np.zeros(10)\n", + " z[-1] = 1\n", + " df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 11), \"y\": z})\n", + " features = tsfeatures(df, freq=7, scale=False, features=[statistics])\n", + " print(features)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_sparsity\n", + "def test_non_zero_sparsity():\n", + " # if we scale the data, the sparsity should be zero\n", + " z = np.zeros(10)\n", + " z[-1] = 1\n", + " df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 11), \"y\": z})\n", + " features = tsfeatures(df, freq=7, scale=True, features=[sparsity])\n", + " z_sparsity = features[\"sparsity\"].values[0]\n", + " assert z_sparsity == 0.0\n", + "\n", + "\n", + "def test_sparsity():\n", + " z = np.zeros(10)\n", + " z[-1] = 1\n", + " df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 11), \"y\": z})\n", + " features = tsfeatures(df, freq=7, scale=False, features=[sparsity])\n", + " print(features)\n", + " z_sparsity = features[\"sparsity\"].values[0]\n", + " assert z_sparsity == 0.9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_small_ts\n", + "# TODO apply an assert to the test case\n", + "def test_small():\n", + " z = np.zeros(2)\n", + " z[-1] = 1\n", + " z_df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 3), \"y\": z})\n", + " feats = [\n", + " sparsity,\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", 
+ " series_length,\n", + " hurst,\n", + " statistics,\n", + " ]\n", + " feats_df = tsfeatures(z_df, freq=12, features=feats, scale=False)\n", + "\n", + "\n", + "# TODO apply an assert to the test case\n", + "def test_small_1():\n", + " z = np.zeros(1)\n", + " z[-1] = 1\n", + " z_df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 2), \"y\": z})\n", + " feats = [\n", + " sparsity,\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " statistics,\n", + " ]\n", + " feats_df = tsfeatures(z_df, freq=12, features=feats, scale=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_pacf_features\n", + "# TODO apply an assert to the test case\n", + "def test_pacf_features_seasonal_short():\n", + " z = np.random.normal(size=15)\n", + " pacf_features(z, freq=7)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_mutability\n", + "def test_mutability():\n", + " z = np.zeros(100)\n", + " z[-1] = 1\n", + " z_df = pd.DataFrame({\"unique_id\": 1, \"ds\": range(1, 101), \"y\": z})\n", + " feats = [\n", + " sparsity,\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " ]\n", + " feats_2 = [\n", + " acf_features,\n", + " arch_stat,\n", + " crossing_points,\n", + " entropy,\n", + " flat_spots,\n", + " holt_parameters,\n", + " lumpiness,\n", + " nonlinearity,\n", + " pacf_features,\n", + " stl_features,\n", + " stability,\n", + " hw_parameters,\n", + " unitroot_kpss,\n", + " unitroot_pp,\n", + " series_length,\n", + " hurst,\n", + " sparsity,\n", + " ]\n", + " feats_df = tsfeatures(z_df, freq=7, features=feats, scale=False)\n", + " feats_2_df = tsfeatures(z_df, freq=7, features=feats_2, scale=False)\n", + " pd.testing.assert_frame_equal(feats_df, feats_2_df[feats_df.columns])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_holt_parameters\n", + "def test_holt_parameters_seasonal():\n", + " z = holt_parameters(USAccDeaths, 12)\n", + " assert isclose(len(z), 2)\n", + " assert isclose(z[\"alpha\"], 0.96, abs_tol=0.07)\n", + " assert isclose(z[\"beta\"], 0.00, abs_tol=0.1)\n", + "\n", + "\n", + "def test_holt_parameters_non_seasonal():\n", + " z = holt_parameters(WWWusage, 1)\n", + " assert isclose(len(z), 2)\n", + " assert isclose(z[\"alpha\"], 0.99, abs_tol=0.02)\n", + " assert isclose(z[\"beta\"], 0.99, abs_tol=0.02)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_arch_stat\n", + "def test_arch_stat_seasonal():\n", + " z = arch_stat(USAccDeaths, 12)\n", + " assert isclose(len(z), 1)\n", + " assert isclose(z[\"arch_lm\"], 0.54, abs_tol=0.01)\n", + "\n", + "\n", + "def test_arch_stat_non_seasonal():\n", + " z = arch_stat(WWWusage, 12)\n", + " assert isclose(len(z), 1)\n", + " assert isclose(z[\"arch_lm\"], 0.98, abs_tol=0.01)" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# test_acf_features\n", + "def test_acf_features_seasonal():\n", + " z = acf_features(USAccDeaths, 12)\n", + " assert isclose(len(z), 7)\n", + " assert isclose(z[\"x_acf1\"], 0.70, abs_tol=0.01)\n", + " assert isclose(z[\"x_acf10\"], 1.20, abs_tol=0.01)\n", + " assert isclose(z[\"diff1_acf1\"], 0.023, abs_tol=0.01)\n", + " assert isclose(z[\"diff1_acf10\"], 0.27, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf1\"], -0.48, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf10\"], 0.74, abs_tol=0.01)\n", + " assert isclose(z[\"seas_acf1\"], 0.62, abs_tol=0.01)\n", + "\n", + "\n", + "def test_acf_features_non_seasonal():\n", + " z = acf_features(WWWusage, 1)\n", + " assert isclose(len(z), 6)\n", + " assert isclose(z[\"x_acf1\"], 0.96, abs_tol=0.01)\n", + " assert isclose(z[\"x_acf10\"], 4.19, abs_tol=0.01)\n", + " assert isclose(z[\"diff1_acf1\"], 0.79, abs_tol=0.01)\n", + " assert isclose(z[\"diff1_acf10\"], 1.40, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf1\"], 0.17, abs_tol=0.01)\n", + " assert isclose(z[\"diff2_acf10\"], 0.33, abs_tol=0.01)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "import nbdev\n", + "\n", + "nbdev.nbdev_export()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/tsfeatures_r.ipynb b/nbs/tsfeatures_r.ipynb new file mode 100644 index 0000000..6a62f6a --- /dev/null +++ b/nbs/tsfeatures_r.ipynb @@ -0,0 +1,280 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# utils\n", + "\n", + "> supporting utils for tsfeatures\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |default_exp tsfeatures_r" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. 
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# |hide\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "from typing import List\n", + "\n", + "import pandas as pd\n", + "import rpy2.robjects as robjects\n", + "from rpy2.robjects import pandas2ri" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def tsfeatures_r(\n", + " ts: pd.DataFrame,\n", + " freq: int,\n", + " features: List[str] = [\n", + " \"length\",\n", + " \"acf_features\",\n", + " \"arch_stat\",\n", + " \"crossing_points\",\n", + " \"entropy\",\n", + " \"flat_spots\",\n", + " \"heterogeneity\",\n", + " \"holt_parameters\",\n", + " \"hurst\",\n", + " \"hw_parameters\",\n", + " \"lumpiness\",\n", + " \"nonlinearity\",\n", + " \"pacf_features\",\n", + " \"stability\",\n", + " \"stl_features\",\n", + " \"unitroot_kpss\",\n", + " \"unitroot_pp\",\n", + " ],\n", + " **kwargs,\n", + ") -> pd.DataFrame:\n", + " \"\"\"tsfeatures wrapper using r.\n", + "\n", + " Parameters\n", + " ----------\n", + " ts: pandas df\n", + " Pandas DataFrame with columns ['unique_id', 'ds', 'y'].\n", + " Long panel of time series.\n", + " freq: int\n", + " Frequency of the time series.\n", + " features: List[str]\n", + " String list of features to calculate.\n", + " **kwargs:\n", + " Arguments used by the original tsfeatures function.\n", + "\n", + " References\n", + " ----------\n", + " https://pkg.robjhyndman.com/tsfeatures/reference/tsfeatures.html\n", + " \"\"\"\n", + " rstring = \"\"\"\n", + " function(df, freq, features, ...){\n", + " suppressMessages(library(data.table))\n", + " suppressMessages(library(tsfeatures))\n", + "\n", + " dt <- as.data.table(df)\n", + " setkey(dt, unique_id)\n", + "\n", + " series_list <- split(dt, by = \"unique_id\", keep.by = FALSE)\n", + " series_list <- lapply(series_list,\n", + " function(serie) serie[, ts(y, frequency = freq)])\n", + "\n", + " if(\"hw_parameters\" %in% features){\n", + " features <- setdiff(features, \"hw_parameters\")\n", + "\n", + " if(length(features)>0){\n", + " hw_series_features <- suppressMessages(tsfeatures(series_list, \"hw_parameters\", ...))\n", + " names(hw_series_features) <- paste0(\"hw_\", names(hw_series_features))\n", + "\n", + " series_features <- suppressMessages(tsfeatures(series_list, features, ...))\n", + " series_features <- cbind(series_features, hw_series_features)\n", + " } else {\n", + " series_features <- suppressMessages(tsfeatures(series_list, \"hw_parameters\", ...))\n", + " names(series_features) <- paste0(\"hw_\", names(series_features))\n", + " }\n", + " } else {\n", + " series_features <- suppressMessages(tsfeatures(series_list, features, ...))\n", + " }\n", + "\n", + " setDT(series_features)\n", + "\n", + " series_features[, unique_id := names(series_list)]\n", + "\n", + " }\n", + " \"\"\"\n", + " pandas2ri.activate()\n", + " rfunc = robjects.r(rstring)\n", + "\n", + " feats = rfunc(ts, freq, features, **kwargs)\n", + " pandas2ri.deactivate()\n", + "\n", + " renamer = {\"ARCH.LM\": \"arch_lm\", \"length\": \"series_length\"}\n", + " feats = feats.rename(columns=renamer)\n", + "\n", + " return feats\n", + "\n", + "\n", + "def tsfeatures_r_wide(\n", + " ts: pd.DataFrame,\n", + " features: List[str] = [\n", + " \"length\",\n", + " \"acf_features\",\n", + " \"arch_stat\",\n", + " \"crossing_points\",\n", + " \"entropy\",\n", 
+ " \"flat_spots\",\n", + " \"heterogeneity\",\n", + " \"holt_parameters\",\n", + " \"hurst\",\n", + " \"hw_parameters\",\n", + " \"lumpiness\",\n", + " \"nonlinearity\",\n", + " \"pacf_features\",\n", + " \"stability\",\n", + " \"stl_features\",\n", + " \"unitroot_kpss\",\n", + " \"unitroot_pp\",\n", + " ],\n", + " **kwargs,\n", + ") -> pd.DataFrame:\n", + " \"\"\"tsfeatures wrapper using r.\n", + "\n", + " Parameters\n", + " ----------\n", + " ts: pandas df\n", + " Pandas DataFrame with columns ['unique_id', 'seasonality', 'y'].\n", + " Wide panel of time series.\n", + " features: List[str]\n", + " String list of features to calculate.\n", + " **kwargs:\n", + " Arguments used by the original tsfeatures function.\n", + "\n", + " References\n", + " ----------\n", + " https://pkg.robjhyndman.com/tsfeatures/reference/tsfeatures.html\n", + " \"\"\"\n", + " rstring = \"\"\"\n", + " function(uids, seasonalities, ys, features, ...){\n", + " suppressMessages(library(data.table))\n", + " suppressMessages(library(tsfeatures))\n", + " suppressMessages(library(purrr))\n", + "\n", + " series_list <- pmap(\n", + " list(uids, seasonalities, ys),\n", + " function(uid, seasonality, y) ts(y, frequency=seasonality)\n", + " )\n", + " names(series_list) <- uids\n", + "\n", + " if(\"hw_parameters\" %in% features){\n", + " features <- setdiff(features, \"hw_parameters\")\n", + "\n", + " if(length(features)>0){\n", + " hw_series_features <- suppressMessages(tsfeatures(series_list, \"hw_parameters\", ...))\n", + " names(hw_series_features) <- paste0(\"hw_\", names(hw_series_features))\n", + "\n", + " series_features <- suppressMessages(tsfeatures(series_list, features, ...))\n", + " series_features <- cbind(series_features, hw_series_features)\n", + " } else {\n", + " series_features <- suppressMessages(tsfeatures(series_list, \"hw_parameters\", ...))\n", + " names(series_features) <- paste0(\"hw_\", names(series_features))\n", + " }\n", + " } else {\n", + " series_features <- suppressMessages(tsfeatures(series_list, features, ...))\n", + " }\n", + "\n", + " setDT(series_features)\n", + "\n", + " series_features[, unique_id := names(series_list)]\n", + "\n", + " }\n", + " \"\"\"\n", + " pandas2ri.activate()\n", + " rfunc = robjects.r(rstring)\n", + "\n", + " uids = ts[\"unique_id\"].to_list()\n", + " seasonalities = ts[\"seasonality\"].to_list()\n", + " ys = ts[\"y\"].to_list()\n", + "\n", + " feats = rfunc(uids, seasonalities, ys, features, **kwargs)\n", + " pandas2ri.deactivate()\n", + "\n", + " renamer = {\"ARCH.LM\": \"arch_lm\", \"length\": \"series_length\"}\n", + " feats = feats.rename(columns=renamer)\n", + "\n", + " return feats" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "import nbdev\n", + "\n", + "nbdev.nbdev_export()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/nbs/utils.ipynb b/nbs/utils.ipynb new file mode 100644 index 0000000..a782375 --- /dev/null +++ b/nbs/utils.ipynb @@ -0,0 +1,432 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# utils\n", + "\n", + "> supporting utils for 
tsfeatures\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |default_exp utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The autoreload extension is already loaded. To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "# |hide\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "import numpy as np\n", + "import statsmodels.api as sm" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "from fastcore.test import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'divide': 'ignore', 'over': 'warn', 'under': 'ignore', 'invalid': 'ignore'}" + ] + }, + "execution_count": null, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# |export\n", + "np.seterr(divide=\"ignore\", invalid=\"ignore\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "FREQS = {\"H\": 24, \"D\": 1, \"M\": 12, \"Q\": 4, \"W\": 1, \"Y\": 1}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def scalets(x: np.array) -> np.array:\n", + " \"\"\"Mean-std scale a time series.\n", + "\n", + " Scales the time series x by removing the mean and dividing by the standard deviation.\n", + "\n", + " Parameters\n", + " ----------\n", + " x : np.array\n", + " The input time series data.\n", + "\n", + " Returns\n", + " -------\n", + " np.array\n", + " The scaled time series values.\n", + " \"\"\"\n", + " return (x - x.mean()) / x.std(ddof=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def poly(x: np.array, p: int) -> np.array:\n", + " \"\"\"Returns or evaluates orthogonal polynomials of degree 1 to degree over the\n", + " specified set of points x:\n", + " these are all orthogonal to the constant polynomial of degree 0.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " Time series.\n", + " p: int\n", + " Degree of the polynomial.\n", + "\n", + " References\n", + " ----------\n", + " https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/poly\n", + " \"\"\"\n", + " X = np.transpose(np.vstack([x**k for k in range(p + 1)]))\n", + "\n", + " return np.linalg.qr(X)[0][:, 1:]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def embed(x: np.array, p: int) -> np.array:\n", + " \"\"\"Embeds the time series x into a low-dimensional Euclidean space.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " Time series.\n", + " p: int\n", + " Embedding dimension.\n", + "\n", + " References\n", + " ----------\n", + " https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/embed\n", + " \"\"\"\n", + " x = np.transpose(np.vstack([np.roll(x, k) for k in range(p)]))\n", + " return x[p - 1 :]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def 
terasvirta_test(x: np.array, lag: int = 1, scale: bool = True) -> float:\n", + " \"\"\"Generically computes Teraesvirta's neural network test for neglected\n", + " nonlinearity either for the time series x or the regression y~x.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " Time series.\n", + " lag: int\n", + " Specifies the model order in terms of lags.\n", + " scale: bool\n", + " Whether the data should be scaled before computing the test.\n", + "\n", + " Returns\n", + " -------\n", + " float\n", + " Terasvirta statistic.\n", + "\n", + " References\n", + " ----------\n", + " https://www.rdocumentation.org/packages/tseries/versions/0.10-47/topics/terasvirta.test\n", + " \"\"\"\n", + " if scale:\n", + " x = scalets(x)\n", + "\n", + " size_x = len(x)\n", + " y = embed(x, lag + 1)\n", + "\n", + " X = y[:, 1:]\n", + " X = sm.add_constant(X)\n", + "\n", + " y = y[:, 0]\n", + "\n", + " ols = sm.OLS(y, X).fit()\n", + "\n", + " u = ols.resid\n", + " ssr0 = (u**2).sum()\n", + "\n", + " X_nn_list = []\n", + "\n", + " for i in range(lag):\n", + " for j in range(i, lag):\n", + " element = X[:, i + 1] * X[:, j + 1]\n", + " element = np.vstack(element)\n", + " X_nn_list.append(element)\n", + "\n", + " for i in range(lag):\n", + " for j in range(i, lag):\n", + " for k in range(j, lag):\n", + " element = X[:, i + 1] * X[:, j + 1] * X[:, k + 1]\n", + " element = np.vstack(element)\n", + " X_nn_list.append(element)\n", + "\n", + " X_nn = np.concatenate(X_nn_list, axis=1)\n", + " X_nn = np.concatenate([X, X_nn], axis=1)\n", + " ols_nn = sm.OLS(u, X_nn).fit()\n", + "\n", + " v = ols_nn.resid\n", + " ssr = (v**2).sum()\n", + "\n", + " stat = size_x * np.log(ssr0 / ssr)\n", + "\n", + " return stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def hurst_exponent(x: np.array) -> float:\n", + " \"\"\"Computes hurst exponent.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " Time series.\n", + "\n", + " References\n", + " ----------\n", + " [1] Taken from https://gist.github.com/alexvorndran/aad69fa741e579aad093608ccaab4fe1\n", + " [2] Based on https://codereview.stackexchange.com/questions/224360/hurst-exponent-calculator\n", + " \"\"\"\n", + " n = x.size # num timesteps\n", + " t = np.arange(1, n + 1)\n", + " y = x.cumsum() # marginally more efficient than: np.cumsum(sig)\n", + " mean_t = y / t # running mean\n", + "\n", + " s_t = np.sqrt(np.array([np.mean((x[: i + 1] - mean_t[i]) ** 2) for i in range(n)]))\n", + " r_t = np.array([np.ptp(y[: i + 1] - t[: i + 1] * mean_t[i]) for i in range(n)])\n", + "\n", + " with np.errstate(invalid=\"ignore\"):\n", + " r_s = r_t / s_t\n", + "\n", + " r_s = np.log(r_s)[1:]\n", + " n = np.log(t)[1:]\n", + " a = np.column_stack((n, np.ones(n.size)))\n", + " hurst_exponent, _ = np.linalg.lstsq(a, r_s, rcond=-1)[0]\n", + "\n", + " return hurst_exponent" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def ur_pp(x: np.array) -> float:\n", + " \"\"\"Performs the Phillips and Perron unit root test.\n", + "\n", + " Parameters\n", + " ----------\n", + " x: numpy array\n", + " Time series.\n", + "\n", + " References\n", + " ----------\n", + " https://www.rdocumentation.org/packages/urca/versions/1.3-0/topics/ur.pp\n", + " \"\"\"\n", + " n = len(x)\n", + " lmax = 4 * (n / 100) ** (1 / 4)\n", + "\n", + " lmax, _ = divmod(lmax, 1)\n", + " lmax = int(lmax)\n", + "\n", 
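A sanity check of the R/S-based Hurst estimate defined above (import path per `_modidx.py`): white noise should land roughly near 0.5, while a random walk should sit closer to 1.0 for finite samples; the seed and sample size are illustrative:

```python
import numpy as np

from tsfeatures.utils import hurst_exponent

rng = np.random.default_rng(5)
noise = rng.normal(size=1000)  # no long-range dependence: expect ~0.5
walk = noise.cumsum()          # strongly trending: expect closer to 1.0
print(round(hurst_exponent(noise), 2), round(hurst_exponent(walk), 2))
```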
+ " y, y_l1 = x[1:], x[: n - 1]\n", + "\n", + " n -= 1\n", + "\n", + " y_l1 = sm.add_constant(y_l1)\n", + "\n", + " model = sm.OLS(y, y_l1).fit()\n", + " my_tstat, res = model.tvalues[0], model.resid\n", + " s = 1 / (n * np.sum(res**2))\n", + " myybar = (1 / n**2) * (((y - y.mean()) ** 2).sum())\n", + " myy = (1 / n**2) * ((y**2).sum())\n", + " my = (n ** (-3 / 2)) * (y.sum())\n", + "\n", + " idx = np.arange(lmax)\n", + " coprods = []\n", + " for i in idx:\n", + " first_del = res[i + 1 :]\n", + " sec_del = res[: n - i - 1]\n", + " prod = first_del * sec_del\n", + " coprods.append(prod.sum())\n", + " coprods = np.array(coprods)\n", + "\n", + " weights = 1 - (idx + 1) / (lmax + 1)\n", + " sig = s + (2 / n) * ((weights * coprods).sum())\n", + " lambda_ = 0.5 * (sig - s)\n", + " lambda_prime = lambda_ / sig\n", + "\n", + " alpha = model.params[1]\n", + "\n", + " test_stat = n * (alpha - 1) - lambda_ / myybar\n", + "\n", + " return test_stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "def lambda_coef_var(lambda_par: float, x: np.array, period: int = 2):\n", + " \"\"\"Calculates coefficient of variation for subseries of x.\n", + "\n", + " Parameters\n", + " ----------\n", + " lambda_par: float\n", + " Lambda Box-cox transformation parameter.\n", + " Must be greater than zero.\n", + " x: numpy array\n", + " Time series.\n", + " period: int\n", + " The length of each subseries (usually the length of seasonal period).\n", + "\n", + " Returns\n", + " -------\n", + " float\n", + " Coefficient of variation.\n", + " \"\"\"\n", + " if len(np.unique(x)) == 1:\n", + " return 1\n", + "\n", + " split_size = divmod(len(x) - 1, period)\n", + " split_size, _ = split_size\n", + "\n", + " split = np.array_split(x, split_size)\n", + "\n", + " mu_h = np.array([np.nanmean(sub) for sub in split])\n", + " sig_h = np.array([np.nanstd(sub, ddof=1) for sub in split])\n", + "\n", + " rat = sig_h / mu_h ** (1 - lambda_par)\n", + "\n", + " value = np.nanstd(rat, ddof=1) / np.nanmean(rat)\n", + "\n", + " return value" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |export\n", + "\n", + "# fmt: off\n", + "WWWusage = [88, 84, 85, 85, 84, 85, 83, 85, 88, 89, 91, 99, 104, 112, 126, 138, 146, 151, 150, 148, 147, 149, 143, 132, 131, 139, 147, 150, 148, 145, 140, 134, 131, 131, 129, 126, 126, 132, 137, 140, 142, 150, 159, 167, 170, 171, 172, 172, 174, 175, 172, 172, 174, 174, 169, 165, 156, 142, 131, 121, 112, 104, 102, 99, 99, 95, 88, 84, 84, 87, 89, 88, 85, 86, 89, 91, 91, 94, 101, 110, 121, 135, 145, 149, 156, 165, 171, 175, 177, 182, 193, 204, 208, 210, 215, 222, 228, 226, 222, 220,]\n", + "\n", + "USAccDeaths = [9007,8106,8928,9137,10017,10826,11317,10744,9713,9938,9161,8927,7750,6981,8038,8422,8714,9512,10120,9823,8743,9129,8710,8680,8162,7306,8124,7870,9387,9556,10093,9620,8285,8466,8160,8034,7717,7461,7767,7925,8623,8945,10078,9179,8037,8488,7874,8647,7792,6957,7726,8106,8890,9299,10625,9302,8314,8850,8265,8796,7836,6892,7791,8192,9115,9434,10484,9827,9110,9070,8633,9240,]\n", + "\n", + "# fmt: on" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "from nbdev.showdoc import *" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# |hide\n", + "import nbdev\n", + "\n", + "nbdev.nbdev_export()" + ] + } + ], + "metadata": { + 
"kernelspec": { + "display_name": "python3", + "language": "python", + "name": "python3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index c2e88d9..0000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -antropy>=0.1.4 -arch>=4.14 -pandas>=1.0.5 -scikit-learn>=0.23.1 -statsmodels>=0.12.2 -supersmoother>=0.4 diff --git a/scripts/compare_with_r.py b/scripts/compare_with_r.py new file mode 100644 index 0000000..2a1311e --- /dev/null +++ b/scripts/compare_with_r.py @@ -0,0 +1,84 @@ +#!/usr/bin/env python +# coding: utf-8 + +import argparse +import sys +import time + +from tsfeatures import tsfeatures +from .tsfeatures_r import tsfeatures_r +from .m4_data import prepare_m4_data +from .utils import FREQS + + +def compare_features_m4(dataset_name, directory, num_obs=1000000): + _, y_train_df, _, _ = prepare_m4_data( + dataset_name=dataset_name, directory=directory, num_obs=num_obs + ) + + freq = FREQS[dataset_name[0]] + + print("Calculating python features...") + init = time.time() + py_feats = tsfeatures(y_train_df, freq=freq).set_index("unique_id") + print("Total time: ", time.time() - init) + + print("Calculating r features...") + init = time.time() + r_feats = tsfeatures_r(y_train_df, freq=freq, parallel=True).set_index("unique_id") + print("Total time: ", time.time() - init) + + diff = py_feats.sub(r_feats, 1).abs().sum(0).sort_values() + + return diff + + +def main(args): + if args.num_obs: + num_obs = args.num_obs + else: + num_obs = 100000 + + if args.dataset_name: + datasets = [args.dataset_name] + else: + datasets = ["Daily", "Hourly", "Yearly", "Quarterly", "Weekly", "Monthly"] + + for dataset_name in datasets: + diff = compare_features_m4(dataset_name, args.results_directory, num_obs) + diff.name = "diff" + diff = diff.rename_axis("feature") + diff = diff.reset_index() + diff["diff"] = diff["diff"].map("{:.2f}".format) + save_dir = args.results_directory + "/" + dataset_name + "_comparison_" + save_dir += str(num_obs) + ".csv" + diff.to_csv(save_dir, index=False) + + print("Comparison saved at: ", save_dir) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Get features for M4 data") + + parser.add_argument( + "--results_directory", + required=True, + type=str, + help="directory where M4 data will be downloaded", + ) + parser.add_argument( + "--num_obs", + required=False, + type=int, + help="number of M4 time series to be tested (uses all data by default)", + ) + parser.add_argument( + "--dataset_name", + required=False, + type=str, + help="type of dataset to get features", + ) + + args = parser.parse_args() + + main(args) diff --git a/settings.ini b/settings.ini new file mode 100644 index 0000000..7cf372f --- /dev/null +++ b/settings.ini @@ -0,0 +1,48 @@ +[DEFAULT] +# All sections below are required unless otherwise specified. +# See https://github.com/fastai/nbdev/blob/master/settings.ini for examples. + +### Python library ### +host=github +repo = tsfeatures +lib_name = %(repo)s +description = Calculates various features from time series data. Python implementation of the R package tsfeatures. 
+keywords = time-series feature engineering forecasting +version = 0.4.5 +min_python = 3.7 +license = apache2 + +### nbdev ### +doc_path = _docs +lib_path = %(lib_name)s +nbs_path = nbs +recursive = True +tst_flags = notest +put_version_in_init = True + +### Docs ### +branch = main +custom_sidebar = False +doc_host = https://%(user)s.github.io +doc_baseurl = /%(repo)s/ +git_url = https://github.com/%(user)s/%(repo)s +title = %(lib_name)s + +### PyPI ### +audience = Developers +author = Nixtla +author_email = business@nixtla.io +copyright = Nixtla Inc. +language = English +status = 3 +user = Nixtla + +### Optional ### +requirements = rpy2==3.5.11 antropy>=0.1.4 arch>=4.11 pandas>=1.0.5 scikit-learn>=0.23.1 statsmodels>=0.13.2 supersmoother>=0.4 numba>=0.55.0 numpy>=1.21.6 tqdm +dev_requirements = nbdev fastcore + +black_formatting = True +jupyter_hooks = True +clean_ids = True +clear_all = False +readme_nb = index.ipynb diff --git a/setup.py b/setup.py index a847a2b..1737381 100644 --- a/setup.py +++ b/setup.py @@ -1,28 +1,94 @@ +import shlex +from configparser import ConfigParser + import setuptools +from pkg_resources import parse_version + +assert parse_version(setuptools.__version__) >= parse_version("36.2") + +# note: all settings are in settings.ini; edit there, not here +config = ConfigParser(delimiters=["="]) +config.read("settings.ini", encoding="utf-8") +cfg = config["DEFAULT"] + +cfg_keys = "version description keywords author author_email".split() +expected = ( + cfg_keys + + "lib_name user branch license status min_python audience language".split() +) +for o in expected: + assert o in cfg, "missing expected setting: {}".format(o) +setup_cfg = {o: cfg[o] for o in cfg_keys} + +licenses = { + "apache2": ( + "Apache Software License 2.0", + "OSI Approved :: Apache Software License", + ), + "mit": ("MIT License", "OSI Approved :: MIT License"), + "gpl2": ( + "GNU General Public License v2", + "OSI Approved :: GNU General Public License v2 (GPLv2)", + ), + "gpl3": ( + "GNU General Public License v3", + "OSI Approved :: GNU General Public License v3 (GPLv3)", + ), + "agpl3": ( + "GNU Affero General Public License v3", + "OSI Approved :: GNU Affero General Public License (AGPLv3)", + ), + "bsd3": ("BSD License", "OSI Approved :: BSD License"), +} +statuses = [ + "0 - Pre-Planning", + "1 - Planning", + "2 - Pre-Alpha", + "3 - Alpha", + "4 - Beta", + "5 - Production/Stable", + "6 - Mature", + "7 - Inactive", +] +py_versions = "3.7 3.8 3.9 3.10 3.11".split() -with open("README.md", "r") as fh: - long_description = fh.read() +requirements = shlex.split(cfg.get("requirements", "")) +if cfg.get("pip_requirements"): + requirements += shlex.split(cfg.get("pip_requirements", "")) +min_python = cfg["min_python"] +lic = licenses.get(cfg["license"].lower(), (cfg["license"], None)) +dev_requirements = (cfg.get("dev_requirements") or "").split() +project_urls = {} +if cfg.get("doc_host"): + project_urls["Documentation"] = cfg["doc_host"] + cfg.get("doc_baseurl", "") setuptools.setup( - name="tsfeatures", - version="0.4.5", - description="Calculates various features from time series data.", - long_description=long_description, - long_description_content_type="text/markdown", - url="https://github.com/Nixtla/tsfeatures", - packages=setuptools.find_packages(), + name=cfg["lib_name"], + license=lic[0], classifiers=[ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: MIT License", - "Operating System :: OS Independent", - ], - python_requires='>=3.7', - install_requires=[ - 
"antropy>=0.1.4", - "arch>=4.11", - "pandas>=1.0.5", - "scikit-learn>=0.23.1", - "statsmodels>=0.13.2", - "supersmoother>=0.4" + "Development Status :: " + statuses[int(cfg["status"])], + "Intended Audience :: " + cfg["audience"].title(), + "Natural Language :: " + cfg["language"].title(), + ] + + [ + "Programming Language :: Python :: " + o + for o in py_versions[py_versions.index(min_python) :] ] + + (["License :: " + lic[1]] if lic[1] else []), + url=cfg["git_url"], + packages=setuptools.find_packages(), + include_package_data=True, + install_requires=requirements, + extras_require={"dev": dev_requirements}, + dependency_links=cfg.get("dep_links", "").split(), + python_requires=">=" + cfg["min_python"], + long_description=open("README.md", encoding="utf8").read(), + long_description_content_type="text/markdown", + zip_safe=False, + entry_points={ + "console_scripts": cfg.get("console_scripts", "").split(), + "nbdev": [f'{cfg.get("lib_path")}={cfg.get("lib_path")}._modidx:d'], + }, + project_urls=project_urls, + **setup_cfg, ) diff --git a/tsfeatures/__init__.py b/tsfeatures/__init__.py index 26574de..9632ea4 100644 --- a/tsfeatures/__init__.py +++ b/tsfeatures/__init__.py @@ -1,4 +1,2 @@ -#!/usr/bin/env python -# coding: utf-8 - +__version__ = "0.4.5" from .tsfeatures import * diff --git a/tsfeatures/_modidx.py b/tsfeatures/_modidx.py new file mode 100644 index 0000000..a0c227a --- /dev/null +++ b/tsfeatures/_modidx.py @@ -0,0 +1,52 @@ +# Autogenerated by nbdev + +d = { 'settings': { 'branch': 'main', + 'doc_baseurl': '/tsfeatures/', + 'doc_host': 'https://Nixtla.github.io', + 'git_url': 'https://github.com/Nixtla/tsfeatures', + 'lib_path': 'tsfeatures'}, + 'syms': { 'tsfeatures.m4_data': { 'tsfeatures.m4_data.m4_parser': ('m4_data.html#m4_parser', 'tsfeatures/m4_data.py'), + 'tsfeatures.m4_data.maybe_download': ('m4_data.html#maybe_download', 'tsfeatures/m4_data.py'), + 'tsfeatures.m4_data.prepare_m4_data': ('m4_data.html#prepare_m4_data', 'tsfeatures/m4_data.py')}, + 'tsfeatures.tsfeatures': { 'tsfeatures.tsfeatures._get_feats': ('tsfeatures.html#_get_feats', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures._get_feats_wide': ( 'tsfeatures.html#_get_feats_wide', + 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.acf_features': ('tsfeatures.html#acf_features', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.arch_stat': ('tsfeatures.html#arch_stat', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.count_entropy': ('tsfeatures.html#count_entropy', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.crossing_points': ( 'tsfeatures.html#crossing_points', + 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.entropy': ('tsfeatures.html#entropy', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.flat_spots': ('tsfeatures.html#flat_spots', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.frequency': ('tsfeatures.html#frequency', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.guerrero': ('tsfeatures.html#guerrero', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.heterogeneity': ('tsfeatures.html#heterogeneity', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.holt_parameters': ( 'tsfeatures.html#holt_parameters', + 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.hurst': ('tsfeatures.html#hurst', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.hw_parameters': ('tsfeatures.html#hw_parameters', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.intervals': ('tsfeatures.html#intervals', 'tsfeatures/tsfeatures.py'), + 
'tsfeatures.tsfeatures.lumpiness': ('tsfeatures.html#lumpiness', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.nonlinearity': ('tsfeatures.html#nonlinearity', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.pacf_features': ('tsfeatures.html#pacf_features', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.series_length': ('tsfeatures.html#series_length', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.sparsity': ('tsfeatures.html#sparsity', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.stability': ('tsfeatures.html#stability', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.statistics': ('tsfeatures.html#statistics', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.stl_features': ('tsfeatures.html#stl_features', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.tsfeatures': ('tsfeatures.html#tsfeatures', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.tsfeatures_wide': ( 'tsfeatures.html#tsfeatures_wide', + 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.unitroot_kpss': ('tsfeatures.html#unitroot_kpss', 'tsfeatures/tsfeatures.py'), + 'tsfeatures.tsfeatures.unitroot_pp': ('tsfeatures.html#unitroot_pp', 'tsfeatures/tsfeatures.py')}, + 'tsfeatures.tsfeatures_r': { 'tsfeatures.tsfeatures_r.tsfeatures_r': ( 'tsfeatures_r.html#tsfeatures_r', + 'tsfeatures/tsfeatures_r.py'), + 'tsfeatures.tsfeatures_r.tsfeatures_r_wide': ( 'tsfeatures_r.html#tsfeatures_r_wide', + 'tsfeatures/tsfeatures_r.py')}, + 'tsfeatures.utils': { 'tsfeatures.utils.embed': ('utils.html#embed', 'tsfeatures/utils.py'), + 'tsfeatures.utils.hurst_exponent': ('utils.html#hurst_exponent', 'tsfeatures/utils.py'), + 'tsfeatures.utils.lambda_coef_var': ('utils.html#lambda_coef_var', 'tsfeatures/utils.py'), + 'tsfeatures.utils.poly': ('utils.html#poly', 'tsfeatures/utils.py'), + 'tsfeatures.utils.scalets': ('utils.html#scalets', 'tsfeatures/utils.py'), + 'tsfeatures.utils.terasvirta_test': ('utils.html#terasvirta_test', 'tsfeatures/utils.py'), + 'tsfeatures.utils.ur_pp': ('utils.html#ur_pp', 'tsfeatures/utils.py')}}} diff --git a/tsfeatures/compare_with_r.py b/tsfeatures/compare_with_r.py deleted file mode 100644 index 3937a42..0000000 --- a/tsfeatures/compare_with_r.py +++ /dev/null @@ -1,71 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import argparse -import sys -import time - -from tsfeatures import tsfeatures -from .tsfeatures_r import tsfeatures_r -from .m4_data import prepare_m4_data -from .utils import FREQS - - -def compare_features_m4(dataset_name, directory, num_obs=1000000): - _, y_train_df, _, _ = prepare_m4_data(dataset_name=dataset_name, - directory = directory, - num_obs=num_obs) - - freq = FREQS[dataset_name[0]] - - print('Calculating python features...') - init = time.time() - py_feats = tsfeatures(y_train_df, freq=freq).set_index('unique_id') - print('Total time: ', time.time() - init) - - print('Calculating r features...') - init = time.time() - r_feats = tsfeatures_r(y_train_df, freq=freq, parallel=True).set_index('unique_id') - print('Total time: ', time.time() - init) - - diff = py_feats.sub(r_feats, 1).abs().sum(0).sort_values() - - return diff - -def main(args): - if args.num_obs: - num_obs = args.num_obs - else: - num_obs = 100000 - - if args.dataset_name: - datasets = [args.dataset_name] - else: - datasets = ['Daily', 'Hourly', 'Yearly', 'Quarterly', 'Weekly', 'Monthly'] - - for dataset_name in datasets: - diff = compare_features_m4(dataset_name, args.results_directory, num_obs) - diff.name = 'diff' - diff = diff.rename_axis('feature') - diff = 
diff.reset_index() - diff['diff'] = diff['diff'].map('{:.2f}'.format) - save_dir = args.results_directory + '/' + dataset_name + '_comparison_' - save_dir += str(num_obs) + '.csv' - diff.to_csv(save_dir, index=False) - - print('Comparison saved at: ', save_dir) - -if __name__=='__main__': - - parser = argparse.ArgumentParser(description='Get features for M4 data') - - parser.add_argument("--results_directory", required=True, type=str, - help="directory where M4 data will be downloaded") - parser.add_argument("--num_obs", required=False, type=int, - help="number of M4 time series to be tested (uses all data by default)") - parser.add_argument("--dataset_name", required=False, type=str, - help="type of dataset to get features") - - args = parser.parse_args() - - main(args) diff --git a/tsfeatures/m4_data.py b/tsfeatures/m4_data.py index ac3ce68..5c52123 100644 --- a/tsfeatures/m4_data.py +++ b/tsfeatures/m4_data.py @@ -1,33 +1,32 @@ -#!/usr/bin/env python -# coding: utf-8 +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/m4_data.ipynb. +# %% auto 0 +__all__ = ['seas_dict', 'SOURCE_URL', 'maybe_download', 'm4_parser', 'prepare_m4_data'] + +# %% ../nbs/m4_data.ipynb 3 import os -import subprocess import urllib -import numpy as np import pandas as pd -#from six.moves import urllib - -seas_dict = {'Hourly': {'seasonality': 24, 'input_size': 24, - 'output_size': 48, 'freq': 'H'}, - 'Daily': {'seasonality': 7, 'input_size': 7, - 'output_size': 14, 'freq': 'D'}, - 'Weekly': {'seasonality': 52, 'input_size': 52, - 'output_size': 13, 'freq': 'W'}, - 'Monthly': {'seasonality': 12, 'input_size': 12, - 'output_size':18, 'freq': 'M'}, - 'Quarterly': {'seasonality': 4, 'input_size': 4, - 'output_size': 8, 'freq': 'Q'}, - 'Yearly': {'seasonality': 1, 'input_size': 4, - 'output_size': 6, 'freq': 'D'}} - -SOURCE_URL = 'https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/' - - +# %% ../nbs/m4_data.ipynb 4 +seas_dict = { + "Hourly": {"seasonality": 24, "input_size": 24, "output_size": 48, "freq": "H"}, + "Daily": {"seasonality": 7, "input_size": 7, "output_size": 14, "freq": "D"}, + "Weekly": {"seasonality": 52, "input_size": 52, "output_size": 13, "freq": "W"}, + "Monthly": {"seasonality": 12, "input_size": 12, "output_size": 18, "freq": "M"}, + "Quarterly": {"seasonality": 4, "input_size": 4, "output_size": 8, "freq": "Q"}, + "Yearly": {"seasonality": 1, "input_size": 4, "output_size": 6, "freq": "D"}, +} + +# %% ../nbs/m4_data.ipynb 5 +SOURCE_URL = ( + "https://raw.githubusercontent.com/Mcompetitions/M4-methods/master/Dataset/" +) + +# %% ../nbs/m4_data.ipynb 6 def maybe_download(filename, directory): - """ Download the data from M4's website, unless it's already here. + """Download the data from M4's website, unless it's already here. Parameters ---------- @@ -49,12 +48,13 @@ def maybe_download(filename, directory): filepath, _ = urllib.request.urlretrieve(SOURCE_URL + filename, filepath) size = os.path.getsize(filepath) - print('Successfully downloaded', filename, size, 'bytes.') + print("Successfully downloaded", filename, size, "bytes.") return filepath +# %% ../nbs/m4_data.ipynb 7 def m4_parser(dataset_name, directory, num_obs=1000000): - """ Transform M4 data into a panel. + """Transform M4 data into a panel. 
Parameters ---------- @@ -68,60 +68,66 @@ def m4_parser(dataset_name, directory, num_obs=1000000): data_directory = directory + "/m4" train_directory = data_directory + "/Train/" test_directory = data_directory + "/Test/" - freq = seas_dict[dataset_name]['freq'] + freq = seas_dict[dataset_name]["freq"] - m4_info = pd.read_csv(data_directory+'/M4-info.csv', usecols=['M4id','category']) - m4_info = m4_info[m4_info['M4id'].str.startswith(dataset_name[0])].reset_index(drop=True) + m4_info = pd.read_csv(data_directory + "/M4-info.csv", usecols=["M4id", "category"]) + m4_info = m4_info[m4_info["M4id"].str.startswith(dataset_name[0])].reset_index( + drop=True + ) # Train data - train_path='{}{}-train.csv'.format(train_directory, dataset_name) + train_path = "{}{}-train.csv".format(train_directory, dataset_name) train_df = pd.read_csv(train_path, nrows=num_obs) - train_df = train_df.rename(columns={'V1':'unique_id'}) + train_df = train_df.rename(columns={"V1": "unique_id"}) - train_df = pd.wide_to_long(train_df, stubnames=["V"], i="unique_id", j="ds").reset_index() - train_df = train_df.rename(columns={'V':'y'}) + train_df = pd.wide_to_long( + train_df, stubnames=["V"], i="unique_id", j="ds" + ).reset_index() + train_df = train_df.rename(columns={"V": "y"}) train_df = train_df.dropna() - train_df['split'] = 'train' - train_df['ds'] = train_df['ds']-1 + train_df["split"] = "train" + train_df["ds"] = train_df["ds"] - 1 # Get len of series per unique_id - len_series = train_df.groupby('unique_id').agg({'ds': 'max'}).reset_index() - len_series.columns = ['unique_id', 'len_serie'] + len_series = train_df.groupby("unique_id").agg({"ds": "max"}).reset_index() + len_series.columns = ["unique_id", "len_serie"] # Test data - test_path='{}{}-test.csv'.format(test_directory, dataset_name) + test_path = "{}{}-test.csv".format(test_directory, dataset_name) test_df = pd.read_csv(test_path, nrows=num_obs) - test_df = test_df.rename(columns={'V1':'unique_id'}) + test_df = test_df.rename(columns={"V1": "unique_id"}) - test_df = pd.wide_to_long(test_df, stubnames=["V"], i="unique_id", j="ds").reset_index() - test_df = test_df.rename(columns={'V':'y'}) + test_df = pd.wide_to_long( + test_df, stubnames=["V"], i="unique_id", j="ds" + ).reset_index() + test_df = test_df.rename(columns={"V": "y"}) test_df = test_df.dropna() - test_df['split'] = 'test' - test_df = test_df.merge(len_series, on='unique_id') - test_df['ds'] = test_df['ds'] + test_df['len_serie'] - 1 - test_df = test_df[['unique_id','ds','y','split']] + test_df["split"] = "test" + test_df = test_df.merge(len_series, on="unique_id") + test_df["ds"] = test_df["ds"] + test_df["len_serie"] - 1 + test_df = test_df[["unique_id", "ds", "y", "split"]] - df = pd.concat((train_df,test_df)) - df = df.sort_values(by=['unique_id', 'ds']).reset_index(drop=True) + df = pd.concat((train_df, test_df)) + df = df.sort_values(by=["unique_id", "ds"]).reset_index(drop=True) # Create column with dates with freq of dataset - len_series = df.groupby('unique_id').agg({'ds': 'max'}).reset_index() + len_series = df.groupby("unique_id").agg({"ds": "max"}).reset_index() dates = [] for i in range(len(len_series)): - len_serie = len_series.iloc[i,1] - ranges = pd.date_range(start='1970/01/01', periods=len_serie, freq=freq) - dates += list(ranges) - df.loc[:,'ds'] = dates + len_serie = len_series.iloc[i, 1] + ranges = pd.date_range(start="1970/01/01", periods=len_serie, freq=freq) + dates += list(ranges) + df.loc[:, "ds"] = dates - df = df.merge(m4_info, left_on=['unique_id'], 
right_on=['M4id']) - df.drop(columns=['M4id'], inplace=True) - df = df.rename(columns={'category': 'x'}) + df = df.merge(m4_info, left_on=["unique_id"], right_on=["M4id"]) + df.drop(columns=["M4id"], inplace=True) + df = df.rename(columns={"category": "x"}) - X_train_df = df[df['split']=='train'].filter(items=['unique_id', 'ds', 'x']) - y_train_df = df[df['split']=='train'].filter(items=['unique_id', 'ds', 'y']) - X_test_df = df[df['split']=='test'].filter(items=['unique_id', 'ds', 'x']) - y_test_df = df[df['split']=='test'].filter(items=['unique_id', 'ds', 'y']) + X_train_df = df[df["split"] == "train"].filter(items=["unique_id", "ds", "x"]) + y_train_df = df[df["split"] == "train"].filter(items=["unique_id", "ds", "y"]) + X_test_df = df[df["split"] == "test"].filter(items=["unique_id", "ds", "x"]) + y_test_df = df[df["split"] == "test"].filter(items=["unique_id", "ds", "y"]) X_train_df = X_train_df.reset_index(drop=True) y_train_df = y_train_df.reset_index(drop=True) @@ -130,6 +136,7 @@ def m4_parser(dataset_name, directory, num_obs=1000000): return X_train_df, y_train_df, X_test_df, y_test_df +# %% ../nbs/m4_data.ipynb 8 def prepare_m4_data(dataset_name, directory, num_obs): """Pipeline that obtains M4 time series, transforms them and gets naive2 predictions. Parameters ---------- dataset_name: str Name of the dataset. directory: str Directory where data will be downloaded. num_obs: int Number of time series to return. """ - m4info_filename = maybe_download('M4-info.csv', directory) - - dailytrain_filename = maybe_download('Train/Daily-train.csv', directory) - hourlytrain_filename = maybe_download('Train/Hourly-train.csv', directory) - monthlytrain_filename = maybe_download('Train/Monthly-train.csv', directory) - quarterlytrain_filename = maybe_download('Train/Quarterly-train.csv', directory) - weeklytrain_filename = maybe_download('Train/Weekly-train.csv', directory) - yearlytrain_filename = maybe_download('Train/Yearly-train.csv', directory) - - dailytest_filename = maybe_download('Test/Daily-test.csv', directory) - hourlytest_filename = maybe_download('Test/Hourly-test.csv', directory) - monthlytest_filename = maybe_download('Test/Monthly-test.csv', directory) - quarterlytest_filename = maybe_download('Test/Quarterly-test.csv', directory) - weeklytest_filename = maybe_download('Test/Weekly-test.csv', directory) - yearlytest_filename = maybe_download('Test/Yearly-test.csv', directory) - print('\n') - - X_train_df, y_train_df, X_test_df, y_test_df = m4_parser(dataset_name, directory, num_obs) + m4info_filename = maybe_download("M4-info.csv", directory) + + dailytrain_filename = maybe_download("Train/Daily-train.csv", directory) + hourlytrain_filename = maybe_download("Train/Hourly-train.csv", directory) + monthlytrain_filename = maybe_download("Train/Monthly-train.csv", directory) + quarterlytrain_filename = maybe_download("Train/Quarterly-train.csv", directory) + weeklytrain_filename = maybe_download("Train/Weekly-train.csv", directory) + yearlytrain_filename = maybe_download("Train/Yearly-train.csv", directory) + + dailytest_filename = maybe_download("Test/Daily-test.csv", directory) + hourlytest_filename = maybe_download("Test/Hourly-test.csv", directory) + monthlytest_filename = maybe_download("Test/Monthly-test.csv", directory) + quarterlytest_filename = maybe_download("Test/Quarterly-test.csv", directory) + weeklytest_filename = maybe_download("Test/Weekly-test.csv", directory) + yearlytest_filename = maybe_download("Test/Yearly-test.csv", directory) + print("\n") + + X_train_df, y_train_df, X_test_df, y_test_df = 
m4_parser( + dataset_name, directory, num_obs + ) return X_train_df, y_train_df, X_test_df, y_test_df diff --git a/tsfeatures/metrics/__init__.py b/tsfeatures/metrics/__init__.py deleted file mode 100644 index 91847eb..0000000 --- a/tsfeatures/metrics/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from .metrics import * diff --git a/tsfeatures/metrics/metrics.py b/tsfeatures/metrics/metrics.py deleted file mode 100644 index f27ea75..0000000 --- a/tsfeatures/metrics/metrics.py +++ /dev/null @@ -1,345 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import numpy as np -import pandas as pd - -from functools import partial -from math import sqrt -from multiprocessing import Pool -from typing import Callable, Optional - -AVAILABLE_METRICS = ['mse', 'rmse', 'mape', 'smape', 'mase', 'rmsse', - 'mini_owa', 'pinball_loss'] - -###################################################################### -# METRICS -###################################################################### - -def mse(y: np.array, y_hat:np.array) -> float: - """Calculates Mean Squared Error. - - MSE measures the prediction accuracy of a - forecasting method by calculating the squared deviation - of the prediction and the true value at a given time and - averages these deviations over the length of the series. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array - predicted values - - Returns - ------- - scalar: - MSE - """ - mse = np.mean(np.square(y - y_hat)) - - return mse - -def rmse(y: np.array, y_hat:np.array) -> float: - """Calculates Root Mean Squared Error. - - RMSE measures the prediction accuracy of a - forecasting method by calculating the squared deviation - of the prediction and the true value at a given time and - averages these deviations over the length of the series. - Finally the RMSE will be in the same scale - as the original time series so its comparison with other - series is possible only if they share a common scale. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array - predicted values - - Returns - ------- - scalar: RMSE - """ - rmse = sqrt(np.mean(np.square(y - y_hat))) - - return rmse - -def mape(y: np.array, y_hat:np.array) -> float: - """Calculates Mean Absolute Percentage Error. - - MAPE measures the relative prediction accuracy of a - forecasting method by calculating the percentage deviation - of the prediction and the true value at a given time and - averages these deviations over the length of the series. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array - predicted values - - Returns - ------- - scalar: MAPE - """ - mape = np.mean(np.abs(y - y_hat) / np.abs(y)) - mape = 100 * mape - - return mape - -def smape(y: np.array, y_hat:np.array) -> float: - """Calculates Symmetric Mean Absolute Percentage Error. - - SMAPE measures the relative prediction accuracy of a - forecasting method by calculating the relative deviation - of the prediction and the true value scaled by the sum of the - absolute values for the prediction and true value at a - given time, then averages these deviations over the length - of the series. This allows the SMAPE to have bounds between - 0% and 200%, which is desirable compared to the normal MAPE, - which may be undefined when the series contains zeros. 
- - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array - predicted values - - Returns - ------- - scalar: SMAPE - """ - scale = np.abs(y) + np.abs(y_hat) - scale[scale == 0] = 1e-3 - smape = np.mean(np.abs(y - y_hat) / scale) - smape = 200 * smape - - return smape - -def mase(y: np.array, y_hat: np.array, - y_train: np.array, seasonality: int = 1) -> float: - """Calculates the M4 Mean Absolute Scaled Error. - - MASE measures the relative prediction accuracy of a - forecasting method by comparing the mean absolute errors - of the prediction and the true value against the mean - absolute errors of the seasonal naive model. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array - predicted values - y_train: numpy array - actual train values for Naive1 predictions - seasonality: int - main frequency of the time series - Hourly 24, Daily 7, Weekly 52, - Monthly 12, Quarterly 4, Yearly 1 - - Returns - ------- - scalar: MASE - """ - scale = np.mean(abs(y_train[seasonality:] - y_train[:-seasonality])) - mase = np.mean(abs(y - y_hat)) / scale - mase = 100 * mase - - return mase - -def rmsse(y: np.array, y_hat: np.array, - y_train: np.array, seasonality: int = 1) -> float: - """Calculates the M5 Root Mean Squared Scaled Error. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array of len h (forecasting horizon) - predicted values - y_train: numpy array - actual train values - seasonality: int - main frequency of the time series - Hourly 24, Daily 7, Weekly 52, - Monthly 12, Quarterly 4, Yearly 1 - - Returns - ------- - scalar: RMSSE - """ - scale = np.mean(np.square(y_train[seasonality:] - y_train[:-seasonality])) - rmsse = sqrt(mse(y, y_hat) / scale) - rmsse = 100 * rmsse - - return rmsse - -def mini_owa(y: np.array, y_hat: np.array, - y_train: np.array, - seasonality: int, - y_bench: np.array): - """Calculates the Overall Weighted Average for a single series. - - Computes MASE and sMAPE for the Naive2 benchmark and the - current model, then calculates the Overall Weighted Average. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array of len h (forecasting horizon) - predicted values - y_train: numpy array - insample values of the series for scale - seasonality: int - main frequency of the time series - Hourly 24, Daily 7, Weekly 52, - Monthly 12, Quarterly 4, Yearly 1 - y_bench: numpy array of len h (forecasting horizon) - predicted values of the benchmark model - - Returns - ------- - return: mini_OWA - """ - mase_y = mase(y, y_hat, y_train, seasonality) - mase_bench = mase(y, y_bench, y_train, seasonality) - - smape_y = smape(y, y_hat) - smape_bench = smape(y, y_bench) - - mini_owa = ((mase_y/mase_bench) + (smape_y/smape_bench))/2 - - return mini_owa - -def pinball_loss(y: np.array, y_hat: np.array, tau: float = 0.5): - """Calculates the Pinball Loss. - - The Pinball loss measures the deviation of a quantile forecast. - By weighting the absolute deviation in a non-symmetric way, the - loss pays more attention to under- or over-estimation. - A common value for tau is 0.5 for the deviation from the median. - - Parameters - ---------- - y: numpy array - actual test values - y_hat: numpy array of len h (forecasting horizon) - predicted values - tau: float - Fixes the quantile against which the predictions are compared. 
- - Returns - ------- - return: pinball_loss - """ - delta_y = y - y_hat - pinball = np.maximum(tau * delta_y, (tau-1) * delta_y) - pinball = pinball.mean() - - return pinball - -###################################################################### -# PANEL EVALUATION -###################################################################### - -def _evaluate_ts(uid, y_test, y_hat, - y_train, metric, - seasonality, y_bench, metric_name): - y_test_uid = y_test.loc[uid].y.values - y_hat_uid = y_hat.loc[uid].y_hat.values - - if metric_name in ['mase', 'rmsse']: - y_train_uid = y_train.loc[uid].y.values - evaluation_uid = metric(y=y_test_uid, y_hat=y_hat_uid, - y_train=y_train_uid, - seasonality=seasonality) - elif metric_name in ['mini_owa']: - y_train_uid = y_train.loc[uid].y.values - y_bench_uid = y_bench.loc[uid].y_hat.values - evaluation_uid = metric(y=y_test_uid, y_hat=y_hat_uid, - y_train=y_train_uid, - seasonality=seasonality, - y_bench=y_bench_uid) - - else: - evaluation_uid = metric(y=y_test_uid, y_hat=y_hat_uid) - - return uid, evaluation_uid - -def evaluate_panel(y_test: pd.DataFrame, - y_hat: pd.DataFrame, - y_train: pd.DataFrame, - metric: Callable, - seasonality: Optional[int] = None, - y_bench: Optional[pd.DataFrame] = None, - threads: Optional[int] = None): - """Calculates a specific metric for y and y_hat (and y_train, if needed). - - Parameters - ---------- - y_test: pandas df - df with columns ['unique_id', 'ds', 'y'] - y_hat: pandas df - df with columns ['unique_id', 'ds', 'y_hat'] - y_train: pandas df - df with columns ['unique_id', 'ds', 'y'] (train) - This is used in the scaled metrics ('mase', 'rmsse'). - metric: callable - loss function - seasonality: int - Main frequency of the time series. - Used in ('mase', 'rmsse'). - Commonly used seasonalities: - Hourly: 24, - Daily: 7, - Weekly: 52, - Monthly: 12, - Quarterly: 4, - Yearly: 1. - y_bench: pandas df - df with columns ['unique_id', 'ds', 'y_hat'] - predicted values of the benchmark model - This is used in 'mini_owa'. - threads: int - Number of threads to use. Use None (default) for parallel processing. 
- - Returns ------- pandas dataframe: - loss for each unique_id in the panel data - """ - metric_name = metric.__code__.co_name - uids = y_test['unique_id'].unique() - y_hat_uids = y_hat['unique_id'].unique() - - assert len(y_test)==len(y_hat), "not same length" - assert all(uids == y_hat_uids), "not same u_ids" - - y_test = y_test.set_index(['unique_id', 'ds']) - y_hat = y_hat.set_index(['unique_id', 'ds']) - - if metric_name in ['mase', 'rmsse']: - y_train = y_train.set_index(['unique_id', 'ds']) - - elif metric_name in ['mini_owa']: - y_train = y_train.set_index(['unique_id', 'ds']) - y_bench = y_bench.set_index(['unique_id', 'ds']) - - partial_evaluation = partial(_evaluate_ts, y_test=y_test, y_hat=y_hat, - y_train=y_train, metric=metric, - seasonality=seasonality, - y_bench=y_bench, - metric_name=metric_name) - - with Pool(threads) as pool: - evaluations = pool.map(partial_evaluation, uids) - - evaluations = pd.DataFrame(evaluations, columns=['unique_id', 'error']) - - return evaluations diff --git a/tsfeatures/tests/__init__.py b/tsfeatures/tests/__init__.py deleted file mode 100644 index 8b13789..0000000 --- a/tsfeatures/tests/__init__.py +++ /dev/null @@ -1 +0,0 @@ - diff --git a/tsfeatures/tests/test_acf_features.py b/tsfeatures/tests/test_acf_features.py deleted file mode 100644 index 833f328..0000000 --- a/tsfeatures/tests/test_acf_features.py +++ /dev/null @@ -1,27 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from math import isclose -from tsfeatures import acf_features -from tsfeatures.utils import WWWusage, USAccDeaths - -def test_acf_features_seasonal(): - z = acf_features(USAccDeaths, 12) - assert isclose(len(z), 7) - assert isclose(z['x_acf1'], 0.70, abs_tol=0.01) - assert isclose(z['x_acf10'], 1.20, abs_tol=0.01) - assert isclose(z['diff1_acf1'], 0.023, abs_tol=0.01) - assert isclose(z['diff1_acf10'], 0.27, abs_tol=0.01) - assert isclose(z['diff2_acf1'], -0.48, abs_tol=0.01) - assert isclose(z['diff2_acf10'], 0.74, abs_tol=0.01) - assert isclose(z['seas_acf1'], 0.62, abs_tol=0.01) - -def test_acf_features_non_seasonal(): - z = acf_features(WWWusage, 1) - assert isclose(len(z), 6) - assert isclose(z['x_acf1'], 0.96, abs_tol=0.01) - assert isclose(z['x_acf10'], 4.19, abs_tol=0.01) - assert isclose(z['diff1_acf1'], 0.79, abs_tol=0.01) - assert isclose(z['diff1_acf10'], 1.40, abs_tol=0.01) - assert isclose(z['diff2_acf1'], 0.17, abs_tol=0.01) - assert isclose(z['diff2_acf10'], 0.33, abs_tol=0.01) diff --git a/tsfeatures/tests/test_arch_stat.py b/tsfeatures/tests/test_arch_stat.py deleted file mode 100644 index 18a95d2..0000000 --- a/tsfeatures/tests/test_arch_stat.py +++ /dev/null @@ -1,16 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from math import isclose -from tsfeatures import arch_stat -from tsfeatures.utils import WWWusage, USAccDeaths - -def test_arch_stat_seasonal(): - z = arch_stat(USAccDeaths, 12) - assert isclose(len(z), 1) - assert isclose(z['arch_lm'], 0.54, abs_tol=0.01) - -def test_arch_stat_non_seasonal(): - z = arch_stat(WWWusage, 12) - assert isclose(len(z), 1) - assert isclose(z['arch_lm'], 0.98, abs_tol=0.01) diff --git a/tsfeatures/tests/test_holt_parameters.py b/tsfeatures/tests/test_holt_parameters.py deleted file mode 100644 index 2454564..0000000 --- a/tsfeatures/tests/test_holt_parameters.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from math import isclose -from tsfeatures import holt_parameters -from tsfeatures.utils import WWWusage, USAccDeaths - -def test_holt_parameters_seasonal(): - z = 
holt_parameters(USAccDeaths, 12) - assert isclose(len(z), 2) - assert isclose(z['alpha'], 0.96, abs_tol=0.07) - assert isclose(z['beta'], 0.00, abs_tol=0.1) - -def test_holt_parameters_non_seasonal(): - z = holt_parameters(WWWusage, 1) - assert isclose(len(z), 2) - assert isclose(z['alpha'], 0.99, abs_tol=0.02) - assert isclose(z['beta'], 0.99, abs_tol=0.02) diff --git a/tsfeatures/tests/test_mutability.py b/tsfeatures/tests/test_mutability.py deleted file mode 100644 index 5dbcc55..0000000 --- a/tsfeatures/tests/test_mutability.py +++ /dev/null @@ -1,31 +0,0 @@ -import numpy as np -import pandas as pd -from tsfeatures import ( - tsfeatures, acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, sparsity, hurst -) - - -def test_mutability(): - z = np.zeros(100) - z[-1] = 1 - z_df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 101), 'y': z}) - feats=[sparsity, acf_features, arch_stat, crossing_points, - entropy, flat_spots, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst] - feats_2=[acf_features, arch_stat, crossing_points, - entropy, flat_spots, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst, sparsity] - feats_df = tsfeatures(z_df, freq=7, features=feats, scale=False) - feats_2_df = tsfeatures(z_df, freq=7, features=feats_2, scale=False) - pd.testing.assert_frame_equal(feats_df, feats_2_df[feats_df.columns]) - -if __name__=="__main__": - test_mutability() diff --git a/tsfeatures/tests/test_pacf_features.py b/tsfeatures/tests/test_pacf_features.py deleted file mode 100644 index 40d4d37..0000000 --- a/tsfeatures/tests/test_pacf_features.py +++ /dev/null @@ -1,10 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import numpy as np -from tsfeatures import pacf_features - - -def test_pacf_features_seasonal_short(): - z = np.random.normal(size=15) - pacf_features(z, freq=7) diff --git a/tsfeatures/tests/test_pipeline.py b/tsfeatures/tests/test_pipeline.py deleted file mode 100644 index 58a07b0..0000000 --- a/tsfeatures/tests/test_pipeline.py +++ /dev/null @@ -1,18 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -from tsfeatures import tsfeatures -from tsfeatures.m4_data import prepare_m4_data -from tsfeatures.utils import FREQS - -def test_pipeline(): - def calculate_features_m4(dataset_name, directory, num_obs=1000000): - _, y_train_df, _, _ = prepare_m4_data(dataset_name=dataset_name, - directory = directory, - num_obs=num_obs) - - freq = FREQS[dataset_name[0]] - py_feats = tsfeatures(y_train_df, freq=freq).set_index('unique_id') - - calculate_features_m4('Hourly', 'data', 100) - calculate_features_m4('Daily', 'data', 100) diff --git a/tsfeatures/tests/test_small_ts.py b/tsfeatures/tests/test_small_ts.py deleted file mode 100644 index 8cefeb4..0000000 --- a/tsfeatures/tests/test_small_ts.py +++ /dev/null @@ -1,36 +0,0 @@ -import numpy as np -import pandas as pd -from tsfeatures import ( - tsfeatures, acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, sparsity, hurst, statistics -) - - -def test_small(): - z = np.zeros(2) - z[-1] = 1 - z_df = 
pd.DataFrame({'unique_id': 1, 'ds': range(1, 3), 'y': z}) - feats=[sparsity, acf_features, arch_stat, crossing_points, - entropy, flat_spots, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst, statistics] - feats_df = tsfeatures(z_df, freq=12, features=feats, scale=False) - -def test_small_1(): - z = np.zeros(1) - z[-1] = 1 - z_df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 2), 'y': z}) - feats=[sparsity, acf_features, arch_stat, crossing_points, - entropy, flat_spots, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst, statistics] - feats_df = tsfeatures(z_df, freq=12, features=feats, scale=False) - -if __name__=="__main__": - test_small() - test_small_1() diff --git a/tsfeatures/tests/test_sparsity.py b/tsfeatures/tests/test_sparsity.py deleted file mode 100644 index 59706bd..0000000 --- a/tsfeatures/tests/test_sparsity.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import numpy as np -import pandas as pd -from tsfeatures import sparsity, tsfeatures - -def test_non_zero_sparsity(): - # if we scale the data, the sparsity should be zero - z = np.zeros(10) - z[-1] = 1 - df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 11), 'y': z}) - features = tsfeatures(df, freq=7, scale=True, features=[sparsity]) - z_sparsity = features['sparsity'].values[0] - assert z_sparsity == 0. - - -def test_sparsity(): - z = np.zeros(10) - z[-1] = 1 - df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 11), 'y': z}) - features = tsfeatures(df, freq=7, scale=False, features=[sparsity]) - print(features) - z_sparsity = features['sparsity'].values[0] - assert z_sparsity == 0.9 diff --git a/tsfeatures/tests/test_statistics.py b/tsfeatures/tests/test_statistics.py deleted file mode 100644 index cc452c9..0000000 --- a/tsfeatures/tests/test_statistics.py +++ /dev/null @@ -1,25 +0,0 @@ -#!/usr/bin/env python -# coding: utf-8 - -import numpy as np -import pandas as pd -from tsfeatures import statistics, tsfeatures - -def test_scale(): - z = np.zeros(10) - z[-1] = 1 - df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 11), 'y': z}) - features = tsfeatures(df, freq=7, scale=True, features=[statistics]) - print(features) - -def test_no_scale(): - z = np.zeros(10) - z[-1] = 1 - df = pd.DataFrame({'unique_id': 1, 'ds': range(1, 11), 'y': z}) - features = tsfeatures(df, freq=7, scale=False, features=[statistics]) - print(features) - - -if __name__=="__main__": - test_scale() - test_no_scale() diff --git a/tsfeatures/tsfeatures.py b/tsfeatures/tsfeatures.py index d762075..60abede 100644 --- a/tsfeatures/tsfeatures.py +++ b/tsfeatures/tsfeatures.py @@ -1,18 +1,33 @@ -#!/usr/bin/env python -# coding: utf-8 +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/tsfeatures.ipynb. 
+# %% auto 0 +__all__ = ['acf_features', 'arch_stat', 'count_entropy', 'crossing_points', 'entropy', 'flat_spots', 'frequency', 'guerrero', + 'heterogeneity', 'holt_parameters', 'hurst', 'hw_parameters', 'intervals', 'lumpiness', 'nonlinearity', + 'pacf_features', 'series_length', 'sparsity', 'stability', 'stl_features', 'unitroot_kpss', 'unitroot_pp', + 'statistics', 'tsfeatures', 'tsfeatures_wide'] + +# %% ../nbs/tsfeatures.ipynb 3 +import os import warnings +from collections import ChainMap +from functools import partial +from multiprocessing import Pool +from typing import Callable, List, Optional + +import pandas as pd + +# %% ../nbs/tsfeatures.ipynb 4 warnings.warn = lambda *a, **kw: False -import os + +# %% ../nbs/tsfeatures.ipynb 5 os.environ["MKL_NUM_THREADS"] = "1" os.environ["NUMEXPR_NUM_THREADS"] = "1" os.environ["OMP_NUM_THREADS"] = "1" -from collections import ChainMap -from functools import partial + +# %% ../nbs/tsfeatures.ipynb 6 from itertools import groupby -from math import log, e -from multiprocessing import cpu_count, Pool -from typing import List, Dict, Optional, Callable +from math import e # maybe replace with numpy's e +from typing import Dict import numpy as np import pandas as pd @@ -20,18 +35,16 @@ from arch import arch_model from scipy.optimize import minimize_scalar from sklearn.linear_model import LinearRegression -from statsmodels.api import add_constant, OLS +from statsmodels.api import OLS, add_constant from statsmodels.tsa.ar_model import AR from statsmodels.tsa.holtwinters import ExponentialSmoothing from statsmodels.tsa.seasonal import STL -from statsmodels.tsa.stattools import acf, pacf, kpss +from statsmodels.tsa.stattools import acf, kpss, pacf from supersmoother import SuperSmoother -from .utils import (embed, FREQS, hurst_exponent, - lambda_coef_var, poly, - scalets, terasvirta_test, ur_pp) - +from .utils import * +# %% ../nbs/tsfeatures.ipynb 8 def acf_features(x: np.array, freq: int = 1) -> Dict[str, float]: """Calculates autocorrelation function features. @@ -64,17 +77,19 @@ def acf_features(x: np.array, freq: int = 1) -> Dict[str, float]: if size_x > 10: acfdiff1x = acf(np.diff(x, n=1), nlags=10, fft=False) else: - acfdiff1x = [np.nan]*2 + acfdiff1x = [np.nan] * 2 if size_x > 11: acfdiff2x = acf(np.diff(x, n=2), nlags=10, fft=False) else: acfdiff2x = [np.nan] * 2 # first autocorrelation coefficient + try: acf_1 = acfx[1] except: acf_1 = np.nan + # sum of squares of first 10 autocorrelation coefficients sum_of_sq_acf10 = np.sum((acfx[1:11]) ** 2) if size_x > 10 else np.nan # first autocorrelation coefficient of differenced series diff1_acf1 = acfdiff1x[1] if size_x > 10 else np.nan @@ -87,21 +102,23 @@ def acf_features(x: np.array, freq: int = 1) -> Dict[str, float]: diff2_acf10 = np.sum((acfdiff2x[1:11]) ** 2) if size_x > 11 else np.nan output = { - 'x_acf1': acf_1, - 'x_acf10': sum_of_sq_acf10, - 'diff1_acf1': diff1_acf1, - 'diff1_acf10': diff1_acf10, - 'diff2_acf1': diff2_acf1, - 'diff2_acf10': diff2_acf10 + "x_acf1": acf_1, + "x_acf10": sum_of_sq_acf10, + "diff1_acf1": diff1_acf1, + "diff1_acf10": diff1_acf10, + "diff2_acf1": diff2_acf1, + "diff2_acf10": diff2_acf10, } if m > 1: - output['seas_acf1'] = acfx[m] if len(acfx) > m else np.nan + output["seas_acf1"] = acfx[m] if len(acfx) > m else np.nan return output -def arch_stat(x: np.array, freq: int = 1, - lags: int = 12, demean: bool = True) -> Dict[str, float]: +# %% ../nbs/tsfeatures.ipynb 10 +def arch_stat( + x: np.array, freq: int = 1, lags: int = 12, demean: bool = True +) -> Dict[str, float]: """Arch model features. 
Parameters @@ -117,12 +134,12 @@ def arch_stat(x: np.array, freq: int = 1, 'arch_lm': R^2 value of an autoregressive model of order lags applied to x**2. """ if len(x) <= lags + 1: - return {'arch_lm': np.nan} + return {"arch_lm": np.nan} if demean: x = x - np.mean(x) size_x = len(x) - mat = embed(x ** 2, lags + 1) + mat = embed(x**2, lags + 1) X = mat[:, 1:] y = np.vstack(mat[:, 0]) @@ -131,8 +148,9 @@ def arch_stat(x: np.array, freq: int = 1, except: r_squared = np.nan - return {'arch_lm': r_squared} + return {"arch_lm": r_squared} +# %% ../nbs/tsfeatures.ipynb 12 def count_entropy(x: np.array, freq: int = 1) -> Dict[str, float]: """Count entropy. @@ -151,8 +169,9 @@ def count_entropy(x: np.array, freq: int = 1) -> Dict[str, float]: entropy = x[x > 0] * np.log(x[x > 0]) entropy = -entropy.sum() - return {'count_entropy': entropy} + return {"count_entropy": entropy} +# %% ../nbs/tsfeatures.ipynb 13 def crossing_points(x: np.array, freq: int = 1) -> Dict[str, float]: """Crossing points. @@ -171,12 +190,13 @@ def crossing_points(x: np.array, freq: int = 1) -> Dict[str, float]: midline = np.median(x) ab = x <= midline lenx = len(x) - p1 = ab[:(lenx - 1)] + p1 = ab[: (lenx - 1)] p2 = ab[1:] cross = (p1 & (~p2)) | (p2 & (~p1)) - return {'crossing_points': cross.sum()} + return {"crossing_points": cross.sum()} +# %% ../nbs/tsfeatures.ipynb 14 def entropy(x: np.array, freq: int = 1, base: float = e) -> Dict[str, float]: """Calculates sample entropy. @@ -193,13 +213,14 @@ def entropy(x: np.array, freq: int = 1, base: float = e) -> Dict[str, float]: 'entropy': Wrapper of the function spectral_entropy. """ try: - with np.errstate(divide='ignore'): + with np.errstate(divide="ignore"): entropy = spectral_entropy(x, 1, normalize=True) except: entropy = np.nan - return {'entropy': entropy} + return {"entropy": entropy} +# %% ../nbs/tsfeatures.ipynb 15 def flat_spots(x: np.array, freq: int = 1) -> Dict[str, float]: """Flat spots. @@ -218,12 +239,12 @@ def flat_spots(x: np.array, freq: int = 1) -> Dict[str, float]: try: cutx = pd.cut(x, bins=10, include_lowest=True, labels=False) + 1 except: - return {'flat_spots': np.nan} - - rlex = np.array([sum(1 for i in g) for k,g in groupby(cutx)]).max() + return {"flat_spots": np.nan} - return {'flat_spots': rlex} + rlex = np.array([sum(1 for i in g) for k, g in groupby(cutx)]).max() + return {"flat_spots": rlex} +# %% ../nbs/tsfeatures.ipynb 16 def frequency(x: np.array, freq: int = 1) -> Dict[str, float]: """Frequency. @@ -240,10 +261,12 @@ def frequency(x: np.array, freq: int = 1) -> Dict[str, float]: 'frequency': Wrapper of freq. """ - return {'frequency': freq} + return {"frequency": freq} -def guerrero(x: np.array, freq: int = 1, - lower: int = -1, upper: int = 2) -> Dict[str, float]: +# %% ../nbs/tsfeatures.ipynb 17 +def guerrero( + x: np.array, freq: int = 1, lower: int = -1, upper: int = 2 +) -> Dict[str, float]: """Applies Guerrero's (1993) method to select the lambda which minimises the coefficient of variation for subseries of x. @@ -271,10 +294,11 @@ def guerrero(x: np.array, freq: int = 1, func_to_min = lambda lambda_par: lambda_coef_var(lambda_par, x=x, period=freq) min_ = minimize_scalar(func_to_min, bounds=[lower, upper]) - min_ = min_['fun'] + min_ = min_["fun"] - return {'guerrero': min_} + return {"guerrero": min_} +# %% ../nbs/tsfeatures.ipynb 18 def heterogeneity(x: np.array, freq: int = 1) -> Dict[str, float]: """Heterogeneity. 
@@ -304,39 +328,40 @@ def heterogeneity(x: np.array, freq: int = 1) -> Dict[str, float]: order_ar = int(order_ar) try: - x_whitened = AR(x).fit(maxlag=order_ar, ic='aic', trend='c').resid + x_whitened = AR(x).fit(maxlag=order_ar, ic="aic", trend="c").resid except: try: - x_whitened = AR(x).fit(maxlag=order_ar, ic='aic', trend='nc').resid + x_whitened = AR(x).fit(maxlag=order_ar, ic="aic", trend="nc").resid except: output = { - 'arch_acf': np.nan, - 'garch_acf': np.nan, - 'arch_r2': np.nan, - 'garch_r2': np.nan + "arch_acf": np.nan, + "garch_acf": np.nan, + "arch_r2": np.nan, + "garch_r2": np.nan, } return output # arch and box test - x_archtest = arch_stat(x_whitened, m)['arch_lm'] - LBstat = (acf(x_whitened ** 2, nlags=12, fft=False)[1:] ** 2).sum() - #Fit garch model - garch_fit = arch_model(x_whitened, vol='GARCH', rescale=False).fit(disp='off') + x_archtest = arch_stat(x_whitened, m)["arch_lm"] + LBstat = (acf(x_whitened**2, nlags=12, fft=False)[1:] ** 2).sum() + # Fit garch model + garch_fit = arch_model(x_whitened, vol="GARCH", rescale=False).fit(disp="off") # compare arch test before and after fitting garch garch_fit_std = garch_fit.resid - x_garch_archtest = arch_stat(garch_fit_std, m)['arch_lm'] + x_garch_archtest = arch_stat(garch_fit_std, m)["arch_lm"] # compare Box test of squared residuals before and after fitting garch - LBstat2 = (acf(garch_fit_std ** 2, nlags=12, fft=False)[1:] ** 2).sum() + LBstat2 = (acf(garch_fit_std**2, nlags=12, fft=False)[1:] ** 2).sum() output = { - 'arch_acf': LBstat, - 'garch_acf': LBstat2, - 'arch_r2': x_archtest, - 'garch_r2': x_garch_archtest + "arch_acf": LBstat, + "garch_acf": LBstat2, + "arch_r2": x_archtest, + "garch_r2": x_garch_archtest, } return output +# %% ../nbs/tsfeatures.ipynb 19 def holt_parameters(x: np.array, freq: int = 1) -> Dict[str, float]: """Fitted parameters of a Holt model. @@ -353,20 +378,18 @@ def holt_parameters(x: np.array, freq: int = 1) -> Dict[str, float]: 'alpha': Level parameter of the Holt model. 'beta': Trend parameter of the Holt model. """ - try : - fit = ExponentialSmoothing(x, trend='add', seasonal=None).fit() + try: + fit = ExponentialSmoothing(x, trend="add", seasonal=None).fit() params = { - 'alpha': fit.params['smoothing_level'], - 'beta': fit.params['smoothing_trend'] + "alpha": fit.params["smoothing_level"], + "beta": fit.params["smoothing_trend"], } except: - params = { - 'alpha': np.nan, - 'beta': np.nan - } + params = {"alpha": np.nan, "beta": np.nan} return params +# %% ../nbs/tsfeatures.ipynb 20 def hurst(x: np.array, freq: int = 1) -> Dict[str, float]: """Hurst index. @@ -387,8 +410,9 @@ def hurst(x: np.array, freq: int = 1) -> Dict[str, float]: except: hurst_index = np.nan - return {'hurst': hurst_index} + return {"hurst": hurst_index} +# %% ../nbs/tsfeatures.ipynb 21 def hw_parameters(x: np.array, freq: int = 1) -> Dict[str, float]: """Fitted parameters of a Holt-Winters model. @@ -407,21 +431,20 @@ def hw_parameters(x: np.array, freq: int = 1) -> Dict[str, float]: 'hw_gamma': Seasonal parameter of the HW model. 
""" try: - fit = ExponentialSmoothing(x, seasonal_periods=freq, trend='add', seasonal='add').fit() + fit = ExponentialSmoothing( + x, seasonal_periods=freq, trend="add", seasonal="add" + ).fit() params = { - 'hw_alpha': fit.params['smoothing_level'], - 'hw_beta': fit.params['smoothing_trend'], - 'hw_gamma': fit.params['smoothing_seasonal'] + "hw_alpha": fit.params["smoothing_level"], + "hw_beta": fit.params["smoothing_trend"], + "hw_gamma": fit.params["smoothing_seasonal"], } except: - params = { - 'hw_alpha': np.nan, - 'hw_beta': np.nan, - 'hw_gamma': np.nan - } + params = {"hw_alpha": np.nan, "hw_beta": np.nan, "hw_gamma": np.nan} return params +# %% ../nbs/tsfeatures.ipynb 22 def intervals(x: np.array, freq: int = 1) -> Dict[str, float]: """Intervals with demand. @@ -443,8 +466,9 @@ def intervals(x: np.array, freq: int = 1) -> Dict[str, float]: y = [sum(val) for keys, val in groupby(x, key=lambda k: k != 0) if keys != 0] y = np.array(y) - return {'intervals_mean': np.mean(y), 'intervals_sd': np.std(y, ddof=1)} + return {"intervals_mean": np.mean(y), "intervals_sd": np.std(y, ddof=1)} +# %% ../nbs/tsfeatures.ipynb 23 def lumpiness(x: np.array, freq: int = 1) -> Dict[str, float]: """lumpiness. @@ -469,15 +493,16 @@ def lumpiness(x: np.array, freq: int = 1) -> Dict[str, float]: lo = np.arange(0, nr, width) up = lo + width nsegs = nr / width - varx = [np.nanvar(x[lo[idx]:up[idx]], ddof=1) for idx in np.arange(int(nsegs))] + varx = [np.nanvar(x[lo[idx] : up[idx]], ddof=1) for idx in np.arange(int(nsegs))] if len(x) < 2 * width: lumpiness = 0 else: lumpiness = np.nanvar(varx, ddof=1) - return {'lumpiness': lumpiness} + return {"lumpiness": lumpiness} +# %% ../nbs/tsfeatures.ipynb 24 def nonlinearity(x: np.array, freq: int = 1) -> Dict[str, float]: """Nonlinearity. @@ -500,8 +525,9 @@ def nonlinearity(x: np.array, freq: int = 1) -> Dict[str, float]: except: test = np.nan - return {'nonlinearity': test} + return {"nonlinearity": test} +# %% ../nbs/tsfeatures.ipynb 25 def pacf_features(x: np.array, freq: int = 1) -> Dict[str, float]: """Calculates partial autocorrelation function features. 
@@ -532,7 +558,7 @@ def pacf_features(x: np.array, freq: int = 1) -> Dict[str, float]: if len(x) > 1: try: - pacfx = pacf(x, nlags=nlags_, method='ldb') + pacfx = pacf(x, nlags=nlags_, method="ldb") except: pacfx = np.nan else: @@ -545,8 +571,8 @@ def pacf_features(x: np.array, freq: int = 1) -> Dict[str, float]: # Sum of first 5 PACs of difference series squared if len(x) > 6: try: - diff1_pacf = pacf(np.diff(x, n=1), nlags=5, method='ldb')[1:6] - diff1_pacf_5 = np.sum(diff1_pacf ** 2) + diff1_pacf = pacf(np.diff(x, n=1), nlags=5, method="ldb")[1:6] + diff1_pacf_5 = np.sum(diff1_pacf**2) except: diff1_pacf_5 = np.nan else: @@ -554,27 +580,25 @@ def pacf_features(x: np.array, freq: int = 1) -> Dict[str, float]: # Sum of first 5 PACs of twice differenced series squared if len(x) > 7: try: - diff2_pacf = pacf(np.diff(x, n = 2), nlags = 5, method='ldb')[1:6] - diff2_pacf_5 = np.sum(diff2_pacf ** 2) + diff2_pacf = pacf(np.diff(x, n=2), nlags=5, method="ldb")[1:6] + diff2_pacf_5 = np.sum(diff2_pacf**2) except: diff2_pacf_5 = np.nan else: diff2_pacf_5 = np.nan output = { - 'x_pacf5': pacf_5, - 'diff1x_pacf5': diff1_pacf_5, - 'diff2x_pacf5': diff2_pacf_5 + "x_pacf5": pacf_5, + "diff1x_pacf5": diff1_pacf_5, + "diff2x_pacf5": diff2_pacf_5, } if m > 1: - try: - output['seas_pacf'] = pacfx[m] if len(pacfx) > m else np.nan - except: - output['seas_pacf'] = np.nan + output["seas_pacf"] = pacfx[m] if len(pacfx) > m else np.nan return output +# %% ../nbs/tsfeatures.ipynb 26 def series_length(x: np.array, freq: int = 1) -> Dict[str, float]: """Series length. @@ -591,8 +615,9 @@ def series_length(x: np.array, freq: int = 1) -> Dict[str, float]: 'series_length': Wrapper of len(x). """ - return {'series_length': len(x)} + return {"series_length": len(x)} +# %% ../nbs/tsfeatures.ipynb 27 def sparsity(x: np.array, freq: int = 1) -> Dict[str, float]: """Sparsity. @@ -609,8 +634,9 @@ def sparsity(x: np.array, freq: int = 1) -> Dict[str, float]: 'sparsity': Average obs with zero values. """ - return {'sparsity': np.mean(x == 0)} + return {"sparsity": np.mean(x == 0)} +# %% ../nbs/tsfeatures.ipynb 28 def stability(x: np.array, freq: int = 1) -> Dict[str, float]: """Stability. @@ -635,15 +661,16 @@ def stability(x: np.array, freq: int = 1) -> Dict[str, float]: lo = np.arange(0, nr, width) up = lo + width nsegs = nr / width - meanx = [np.nanmean(x[lo[idx]:up[idx]]) for idx in np.arange(int(nsegs))] + meanx = [np.nanmean(x[lo[idx] : up[idx]]) for idx in np.arange(int(nsegs))] if len(x) < 2 * width: stability = 0 else: stability = np.nanvar(meanx, ddof=1) - return {'stability': stability} + return {"stability": stability} +# %% ../nbs/tsfeatures.ipynb 29 def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: """Calculates seasonal trend using loess decomposition. 
@@ -681,17 +708,17 @@ def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: stlfit = STL(x, m, 13).fit() except: output = { - 'nperiods': nperiods, - 'seasonal_period': m, - 'trend': np.nan, - 'spike': np.nan, - 'linearity': np.nan, - 'curvature': np.nan, - 'e_acf1': np.nan, - 'e_acf10': np.nan, - 'seasonal_strength': np.nan, - 'peak': np.nan, - 'trough': np.nan + "nperiods": nperiods, + "seasonal_period": m, + "trend": np.nan, + "spike": np.nan, + "linearity": np.nan, + "curvature": np.nan, + "e_acf1": np.nan, + "e_acf10": np.nan, + "seasonal_strength": np.nan, + "peak": np.nan, + "trough": np.nan, } return output @@ -706,14 +733,14 @@ def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: trend0 = SuperSmoother().fit(t, deseas).predict(t) except: output = { - 'nperiods': nperiods, - 'seasonal_period': m, - 'trend': np.nan, - 'spike': np.nan, - 'linearity': np.nan, - 'curvature': np.nan, - 'e_acf1': np.nan, - 'e_acf10': np.nan + "nperiods": nperiods, + "seasonal_period": m, + "trend": np.nan, + "spike": np.nan, + "linearity": np.nan, + "curvature": np.nan, + "e_acf1": np.nan, + "e_acf10": np.nan, } return output @@ -730,13 +757,13 @@ def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: vare = np.nanvar(remainder, ddof=1) vardetrend = np.nanvar(detrend, ddof=1) vardeseason = np.nanvar(deseason, ddof=1) - #Measure of trend strength + # Measure of trend strength if varx < np.finfo(float).eps: trend = 0 - elif (vardeseason/varx < 1e-10): + elif vardeseason / varx < 1e-10: trend = 0 else: - trend = max(0, min(1, 1 - vare/vardeseason)) + trend = max(0, min(1, 1 - vare / vardeseason)) # Measure of seasonal strength if m > 1: if varx < np.finfo(float).eps: @@ -753,7 +780,7 @@ def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: trough = m if trough == 0 else trough # Compute measure of spikiness d = (remainder - np.nanmean(remainder)) ** 2 - varloo = (vare * (n-1) - d) / (n - 2) + varloo = (vare * (n - 1) - d) / (n - 2) spike = np.nanvar(varloo, ddof=1) # Compute measures of linearity and curvature time = np.arange(n) + 1 @@ -773,23 +800,24 @@ def stl_features(x: np.array, freq: int = 1) -> Dict[str, float]: acfremainder = acf_features(remainder, m) # Assemble features output = { - 'nperiods': nperiods, - 'seasonal_period': m, - 'trend': trend, - 'spike': spike, - 'linearity': linearity, - 'curvature': curvature, - 'e_acf1': acfremainder['x_acf1'], - 'e_acf10': acfremainder['x_acf10'] + "nperiods": nperiods, + "seasonal_period": m, + "trend": trend, + "spike": spike, + "linearity": linearity, + "curvature": curvature, + "e_acf1": acfremainder["x_acf1"], + "e_acf10": acfremainder["x_acf10"], } if m > 1: - output['seasonal_strength'] = season - output['peak'] = peak - output['trough'] = trough + output["seasonal_strength"] = season + output["peak"] = peak + output["trough"] = trough return output +# %% ../nbs/tsfeatures.ipynb 30 def unitroot_kpss(x: np.array, freq: int = 1) -> Dict[str, float]: """Unit root kpss. @@ -813,8 +841,9 @@ def unitroot_kpss(x: np.array, freq: int = 1) -> Dict[str, float]: except: test_kpss = np.nan - return {'unitroot_kpss': test_kpss} + return {"unitroot_kpss": test_kpss} +# %% ../nbs/tsfeatures.ipynb 31 def unitroot_pp(x: np.array, freq: int = 1) -> Dict[str, float]: """Unit root pp. 
@@ -835,8 +864,9 @@ except: test_pp = np.nan - return {'unitroot_pp': test_pp} + return {"unitroot_pp": test_pp} +# %% ../nbs/tsfeatures.ipynb 32 def statistics(x: np.array, freq: int = 1) -> Dict[str, float]: """Computes basic statistics of x. @@ -861,7 +891,7 @@ def statistics(x: np.array, freq: int = 1) -> Dict[str, float]: 'p95': 95 percentile. 'p97point5': 97.5 percentile. 'max': Max value. - 'min': Min value. + 'min': Min value. """ res = dict( total_sum=np.sum(x), @@ -880,42 +910,53 @@ def statistics(x: np.array, freq: int = 1) -> Dict[str, float]: return res -############################################################################### -#### MAIN FUNCTIONS ########################################################### -############################################################################### - -def _get_feats(index, - ts, - freq, - scale = True, - features = [acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst], - dict_freqs = FREQS): - +# %% ../nbs/tsfeatures.ipynb 33 +def _get_feats( + index, + ts, + freq, + scale=True, + features=[ + acf_features, + arch_stat, + crossing_points, + entropy, + flat_spots, + heterogeneity, + holt_parameters, + lumpiness, + nonlinearity, + pacf_features, + stl_features, + stability, + hw_parameters, + unitroot_kpss, + unitroot_pp, + series_length, + hurst, + ], + dict_freqs=FREQS, +): if freq is None: - inf_freq = pd.infer_freq(ts['ds']) + inf_freq = pd.infer_freq(ts["ds"]) if inf_freq is None: raise Exception( - 'Failed to infer frequency from the `ds` column, ' - 'please provide the frequency using the `freq` argument.' + "Failed to infer frequency from the `ds` column, " + "please provide the frequency using the `freq` argument." ) freq = dict_freqs.get(inf_freq) if freq is None: raise Exception( - 'Error trying to convert infered frequency from the `ds` column ' - 'to integer. Please provide a dictionary with that frequency ' - 'as key and the integer frequency as value. ' - f'Infered frequency: {inf_freq}' + "Error trying to convert inferred frequency from the `ds` column " + "to integer. Please provide a dictionary with that frequency " + "as key and the integer frequency as value. 
" f"Inferred frequency: {inf_freq}" ) - if isinstance(ts, pd.DataFrame): - assert 'y' in ts.columns - ts = ts['y'].values + assert "y" in ts.columns + ts = ts["y"].values if isinstance(ts, pd.Series): ts = ts.values @@ -923,21 +964,39 @@ def _get_feats(index, if scale: ts = scalets(ts) - c_map = ChainMap(*[dict_feat for dict_feat in [func(ts, freq) for func in features]]) - - return pd.DataFrame(dict(c_map), index = [index]) - -def tsfeatures(ts: pd.DataFrame, - freq: Optional[int] = None, - features: List[Callable] = [acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, - holt_parameters, lumpiness, nonlinearity, - pacf_features, stl_features, stability, - hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst], - dict_freqs: Dict[str, int] = FREQS, - scale: bool = True, - threads: Optional[int] = None) -> pd.DataFrame: + c_map = ChainMap( + *[dict_feat for dict_feat in [func(ts, freq) for func in features]] + ) + + return pd.DataFrame(dict(c_map), index=[index]) + +# %% ../nbs/tsfeatures.ipynb 34 +def tsfeatures( + ts: pd.DataFrame, + freq: Optional[int] = None, + features: List[Callable] = [ + acf_features, + arch_stat, + crossing_points, + entropy, + flat_spots, + heterogeneity, + holt_parameters, + lumpiness, + nonlinearity, + pacf_features, + stl_features, + stability, + hw_parameters, + unitroot_kpss, + unitroot_pp, + series_length, + hurst, + ], + dict_freqs: Dict[str, int] = FREQS, + scale: bool = True, + threads: Optional[int] = None, +) -> pd.DataFrame: """Calculates features for time series. Parameters @@ -964,49 +1023,81 @@ def tsfeatures(ts: pd.DataFrame, Pandas DataFrame where each column is a feature and each row a time series. """ - partial_get_feats = partial(_get_feats, freq=freq, scale=scale, - features=features, dict_freqs=dict_freqs) + partial_get_feats = partial( + _get_feats, freq=freq, scale=scale, features=features, dict_freqs=dict_freqs + ) with Pool(threads) as pool: - ts_features = pool.starmap(partial_get_feats, ts.groupby('unique_id')) + ts_features = pool.starmap(partial_get_feats, ts.groupby("unique_id")) - ts_features = pd.concat(ts_features).rename_axis('unique_id') + ts_features = pd.concat(ts_features).rename_axis("unique_id") ts_features = ts_features.reset_index() return ts_features -################################################################################ -#### MAIN WIDE FUNCTION ######################################################## -################################################################################ - -def _get_feats_wide(index, - ts, - scale = True, - features = [acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, holt_parameters, - lumpiness, nonlinearity, pacf_features, stl_features, - stability, hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst]): - seasonality = ts['seasonality'].item() - y = ts['y'].item() +# %% ../nbs/tsfeatures.ipynb 35 +def _get_feats_wide( + index, + ts, + scale=True, + features=[ + acf_features, + arch_stat, + crossing_points, + entropy, + flat_spots, + heterogeneity, + holt_parameters, + lumpiness, + nonlinearity, + pacf_features, + stl_features, + stability, + hw_parameters, + unitroot_kpss, + unitroot_pp, + series_length, + hurst, + ], +): + seasonality = ts["seasonality"].item() + y = ts["y"].item() y = np.array(y) if scale: y = scalets(y) - c_map = ChainMap(*[dict_feat for dict_feat in [func(y, seasonality) for func in features]]) - - return pd.DataFrame(dict(c_map), index = [index]) + c_map = ChainMap( 
*[dict_feat for dict_feat in [func(y, seasonality) for func in features]] + ) -def tsfeatures_wide(ts: pd.DataFrame, - features: List[Callable] = [acf_features, arch_stat, crossing_points, - entropy, flat_spots, heterogeneity, - holt_parameters, lumpiness, nonlinearity, - pacf_features, stl_features, stability, - hw_parameters, unitroot_kpss, unitroot_pp, - series_length, hurst], - scale: bool = True, - threads: Optional[int] = None) -> pd.DataFrame: + return pd.DataFrame(dict(c_map), index=[index]) + +# %% ../nbs/tsfeatures.ipynb 36 +def tsfeatures_wide( + ts: pd.DataFrame, + features: List[Callable] = [ + acf_features, + arch_stat, + crossing_points, + entropy, + flat_spots, + heterogeneity, + holt_parameters, + lumpiness, + nonlinearity, + pacf_features, + stl_features, + stability, + hw_parameters, + unitroot_kpss, + unitroot_pp, + series_length, + hurst, + ], + scale: bool = True, + threads: Optional[int] = None, +) -> pd.DataFrame: """Calculates features for time series. Parameters @@ -1027,13 +1118,12 @@ def tsfeatures_wide(ts: pd.DataFrame, Pandas DataFrame where each column is a feature and each row a time series. """ - partial_get_feats = partial(_get_feats_wide, scale=scale, - features=features) + partial_get_feats = partial(_get_feats_wide, scale=scale, features=features) with Pool(threads) as pool: - ts_features = pool.starmap(partial_get_feats, ts.groupby('unique_id')) + ts_features = pool.starmap(partial_get_feats, ts.groupby("unique_id")) - ts_features = pd.concat(ts_features).rename_axis('unique_id') + ts_features = pd.concat(ts_features).rename_axis("unique_id") ts_features = ts_features.reset_index() return ts_features diff --git a/tsfeatures/tsfeatures_r.py b/tsfeatures/tsfeatures_r.py index 669f4cf..2a84962 100644 --- a/tsfeatures/tsfeatures_r.py +++ b/tsfeatures/tsfeatures_r.py @@ -1,21 +1,40 @@ -#!/usr/bin/env python -# coding: utf-8 +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/tsfeatures_r.ipynb. +# %% auto 0 +__all__ = ['tsfeatures_r', 'tsfeatures_r_wide'] + +# %% ../nbs/tsfeatures_r.ipynb 3 from typing import List import pandas as pd import rpy2.robjects as robjects from rpy2.robjects import pandas2ri -def tsfeatures_r(ts: pd.DataFrame, - freq: int, - features: List[str] = ["length", "acf_features", "arch_stat", - "crossing_points", "entropy", "flat_spots", - "heterogeneity", "holt_parameters", - "hurst", "hw_parameters", "lumpiness", - "nonlinearity", "pacf_features", "stability", - "stl_features", "unitroot_kpss", "unitroot_pp"], - **kwargs) -> pd.DataFrame: +# %% ../nbs/tsfeatures_r.ipynb 4 +def tsfeatures_r( + ts: pd.DataFrame, + freq: int, + features: List[str] = [ + "length", + "acf_features", + "arch_stat", + "crossing_points", + "entropy", + "flat_spots", + "heterogeneity", + "holt_parameters", + "hurst", + "hw_parameters", + "lumpiness", + "nonlinearity", + "pacf_features", + "stability", + "stl_features", + "unitroot_kpss", + "unitroot_pp", + ], + **kwargs, +) -> pd.DataFrame: """tsfeatures wrapper using r. 
Parameters @@ -75,19 +94,35 @@ def tsfeatures_r(ts: pd.DataFrame, feats = rfunc(ts, freq, features, **kwargs) pandas2ri.deactivate() - renamer={'ARCH.LM': 'arch_lm', 'length': 'series_length'} + renamer = {"ARCH.LM": "arch_lm", "length": "series_length"} feats = feats.rename(columns=renamer) return feats -def tsfeatures_r_wide(ts: pd.DataFrame, - features: List[str] = ["length", "acf_features", "arch_stat", - "crossing_points", "entropy", "flat_spots", - "heterogeneity", "holt_parameters", - "hurst", "hw_parameters", "lumpiness", - "nonlinearity", "pacf_features", "stability", - "stl_features", "unitroot_kpss", "unitroot_pp"], - **kwargs) -> pd.DataFrame: + +def tsfeatures_r_wide( + ts: pd.DataFrame, + features: List[str] = [ + "length", + "acf_features", + "arch_stat", + "crossing_points", + "entropy", + "flat_spots", + "heterogeneity", + "holt_parameters", + "hurst", + "hw_parameters", + "lumpiness", + "nonlinearity", + "pacf_features", + "stability", + "stl_features", + "unitroot_kpss", + "unitroot_pp", + ], + **kwargs, +) -> pd.DataFrame: """tsfeatures wrapper using r. Parameters @@ -142,14 +177,14 @@ def tsfeatures_r_wide(ts: pd.DataFrame, pandas2ri.activate() rfunc = robjects.r(rstring) - uids = ts['unique_id'].to_list() - seasonalities = ts['seasonality'].to_list() - ys = ts['y'].to_list() + uids = ts["unique_id"].to_list() + seasonalities = ts["seasonality"].to_list() + ys = ts["y"].to_list() feats = rfunc(uids, seasonalities, ys, features, **kwargs) pandas2ri.deactivate() - renamer={'ARCH.LM': 'arch_lm', 'length': 'series_length'} + renamer = {"ARCH.LM": "arch_lm", "length": "series_length"} feats = feats.rename(columns=renamer) return feats diff --git a/tsfeatures/utils.py b/tsfeatures/utils.py index 40c8e6f..0fc2983 100644 --- a/tsfeatures/utils.py +++ b/tsfeatures/utils.py @@ -1,27 +1,38 @@ -#!/usr/bin/env python -# coding: utf-8 +# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/utils.ipynb. +# %% auto 0 +__all__ = ['FREQS', 'WWWusage', 'USAccDeaths', 'scalets', 'poly', 'embed', 'terasvirta_test', 'hurst_exponent', 'ur_pp', + 'lambda_coef_var'] + +# %% ../nbs/utils.ipynb 3 import numpy as np import statsmodels.api as sm -from scipy.signal import periodogram, welch +# %% ../nbs/utils.ipynb 5 +np.seterr(divide="ignore", invalid="ignore") -np.seterr(divide='ignore', invalid='ignore') +# %% ../nbs/utils.ipynb 6 +FREQS = {"H": 24, "D": 1, "M": 12, "Q": 4, "W": 1, "Y": 1} -################################################################################ -########### GENERAL UTILS ###################################################### -################################################################################ +# %% ../nbs/utils.ipynb 7 +def scalets(x: np.array) -> np.array: + """Mean-std scale a time series. -FREQS = {'H': 24, 'D': 1, - 'M': 12, 'Q': 4, - 'W': 1, 'Y': 1} + Scales the time series x by removing the mean and dividing by the standard deviation. -def scalets(x: np.array) -> float: - """Mean-std scale.""" - scaledx = (x - x.mean()) / x.std(ddof=1) + Parameters + ---------- + x : np.array + The input time series data. - return scaledx + Returns + ------- + np.array + The scaled time series values. 
+ """ + return (x - x.mean()) / x.std(ddof=1) +# %% ../nbs/utils.ipynb 8 def poly(x: np.array, p: int) -> np.array: """Returns or evaluates orthogonal polynomials of degree 1 to degree over the specified set of points x: @@ -38,10 +49,11 @@ def poly(x: np.array, p: int) -> np.array: ---------- https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/poly """ - X = np.transpose(np.vstack(list((x ** k for k in range(p + 1))))) + X = np.transpose(np.vstack([x**k for k in range(p + 1)])) return np.linalg.qr(X)[0][:, 1:] +# %% ../nbs/utils.ipynb 9 def embed(x: np.array, p: int) -> np.array: """Embeds the time series x into a low-dimensional Euclidean space. @@ -56,15 +68,10 @@ def embed(x: np.array, p: int) -> np.array: ---------- https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/embed """ - x = np.transpose(np.vstack(list((np.roll(x, k) for k in range(p))))) - x = x[p - 1:] - - return x - -################################################################################ -####### CUSTOM FUNCS ########################################################### -################################################################################ + x = np.transpose(np.vstack([np.roll(x, k) for k in range(p)])) + return x[p - 1 :] +# %% ../nbs/utils.ipynb 10 def terasvirta_test(x: np.array, lag: int = 1, scale: bool = True) -> float: """Generically computes Teraesvirta's neural network test for neglected nonlinearity either for the time series x or the regression y~x. @@ -87,7 +94,8 @@ def terasvirta_test(x: np.array, lag: int = 1, scale: bool = True) -> float: ---------- https://www.rdocumentation.org/packages/tseries/versions/0.10-47/topics/terasvirta.test """ - if scale: x = scalets(x) + if scale: + x = scalets(x) size_x = len(x) y = embed(x, lag + 1) @@ -100,7 +108,7 @@ def terasvirta_test(x: np.array, lag: int = 1, scale: bool = True) -> float: ols = sm.OLS(y, X).fit() u = ols.resid - ssr0 = (u ** 2).sum() + ssr0 = (u**2).sum() X_nn_list = [] @@ -122,12 +130,13 @@ def terasvirta_test(x: np.array, lag: int = 1, scale: bool = True) -> float: ols_nn = sm.OLS(u, X_nn).fit() v = ols_nn.resid - ssr = (v ** 2).sum() + ssr = (v**2).sum() stat = size_x * np.log(ssr0 / ssr) return stat +# %% ../nbs/utils.ipynb 11 def hurst_exponent(x: np.array) -> float: """Computes hurst exponent. @@ -146,12 +155,10 @@ def hurst_exponent(x: np.array) -> float: y = x.cumsum() # marginally more efficient than: np.cumsum(sig) mean_t = y / t # running mean - s_t = np.sqrt( - np.array([np.mean((x[:i + 1] - mean_t[i]) ** 2) for i in range(n)]) - ) - r_t = np.array([np.ptp(y[:i + 1] - t[:i + 1] * mean_t[i]) for i in range(n)]) + s_t = np.sqrt(np.array([np.mean((x[: i + 1] - mean_t[i]) ** 2) for i in range(n)])) + r_t = np.array([np.ptp(y[: i + 1] - t[: i + 1] * mean_t[i]) for i in range(n)]) - with np.errstate(invalid='ignore'): + with np.errstate(invalid="ignore"): r_s = r_t / s_t r_s = np.log(r_s)[1:] @@ -161,6 +168,7 @@ def hurst_exponent(x: np.array) -> float: return hurst_exponent +# %% ../nbs/utils.ipynb 12 def ur_pp(x: np.array) -> float: """Performs the Phillips and Perron unit root test. 
@@ -179,7 +187,7 @@ def ur_pp(x: np.array) -> float: lmax, _ = divmod(lmax, 1) lmax = int(lmax) - y, y_l1 = x[1:], x[:n - 1] + y, y_l1 = x[1:], x[: n - 1] n -= 1 @@ -187,16 +195,16 @@ def ur_pp(x: np.array) -> float: model = sm.OLS(y, y_l1).fit() my_tstat, res = model.tvalues[0], model.resid - s = 1 / (n * np.sum(res ** 2)) - myybar = (1 / n ** 2) * (((y - y.mean()) ** 2).sum()) - myy = (1 / n ** 2) * ((y ** 2).sum()) + s = 1 / (n * np.sum(res**2)) + myybar = (1 / n**2) * (((y - y.mean()) ** 2).sum()) + myy = (1 / n**2) * ((y**2).sum()) my = (n ** (-3 / 2)) * (y.sum()) idx = np.arange(lmax) coprods = [] for i in idx: - first_del = res[i + 1:] - sec_del = res[:n - i - 1] + first_del = res[i + 1 :] + sec_del = res[: n - i - 1] prod = first_del * sec_del coprods.append(prod.sum()) coprods = np.array(coprods) @@ -212,6 +220,7 @@ def ur_pp(x: np.array) -> float: return test_stat +# %% ../nbs/utils.ipynb 13 def lambda_coef_var(lambda_par: float, x: np.array, period: int = 2): """Calculates coefficient of variation for subseries of x. @@ -247,22 +256,10 @@ def lambda_coef_var(lambda_par: float, x: np.array, period: int = 2): return value -################################################################################ -####### TS ##################################################################### -################################################################################ - -WWWusage = [88,84,85,85,84,85,83,85,88,89,91,99,104,112,126, - 138,146,151,150,148,147,149,143,132,131,139,147,150, - 148,145,140,134,131,131,129,126,126,132,137,140,142,150,159, - 167,170,171,172,172,174,175,172,172,174,174,169,165,156,142, - 131,121,112,104,102,99,99,95,88,84,84,87,89,88,85,86,89,91, - 91,94,101,110,121,135,145,149,156,165,171,175,177, - 182,193,204,208,210,215,222,228,226,222,220] - -USAccDeaths = [9007,8106,8928,9137,10017,10826,11317,10744,9713,9938,9161, - 8927,7750,6981,8038,8422,8714,9512,10120,9823,8743,9129,8710, - 8680,8162,7306,8124,7870,9387,9556,10093,9620,8285,8466,8160, - 8034,7717,7461,7767,7925,8623,8945,10078,9179,8037,8488,7874, - 8647,7792,6957,7726,8106,8890,9299,10625,9302,8314, - 8850,8265,8796,7836,6892,7791,8192,9115,9434,10484, - 9827,9110,9070,8633,9240] +# %% ../nbs/utils.ipynb 14 +# fmt: off +WWWusage = [88, 84, 85, 85, 84, 85, 83, 85, 88, 89, 91, 99, 104, 112, 126, 138, 146, 151, 150, 148, 147, 149, 143, 132, 131, 139, 147, 150, 148, 145, 140, 134, 131, 131, 129, 126, 126, 132, 137, 140, 142, 150, 159, 167, 170, 171, 172, 172, 174, 175, 172, 172, 174, 174, 169, 165, 156, 142, 131, 121, 112, 104, 102, 99, 99, 95, 88, 84, 84, 87, 89, 88, 85, 86, 89, 91, 91, 94, 101, 110, 121, 135, 145, 149, 156, 165, 171, 175, 177, 182, 193, 204, 208, 210, 215, 222, 228, 226, 222, 220,] + +USAccDeaths = [9007,8106,8928,9137,10017,10826,11317,10744,9713,9938,9161,8927,7750,6981,8038,8422,8714,9512,10120,9823,8743,9129,8710,8680,8162,7306,8124,7870,9387,9556,10093,9620,8285,8466,8160,8034,7717,7461,7767,7925,8623,8945,10078,9179,8037,8488,7874,8647,7792,6957,7726,8106,8890,9299,10625,9302,8314,8850,8265,8796,7836,6892,7791,8192,9115,9434,10484,9827,9110,9070,8633,9240,] + +# fmt: on