# TimeGPT

In [None]:
#| default_exp timegpt

In [None]:
#| hide 
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import logging
import inspect
import json
import requests
import warnings
from typing import Dict, List, Optional, Union

import numpy as np
import pandas as pd

from nixtlats.client import Nixtla, SingleSeriesForecast

# Configure root logging at INFO level when this module is imported.
logging.basicConfig(level=logging.INFO)
# Logger used by this module for progress and warning messages.
main_logger = logging.getLogger(__name__)
# Only let ERROR (and above) records from the 'httpx' logger through.
httpx_logger = logging.getLogger('httpx')
httpx_logger.setLevel(logging.ERROR)

In [None]:
#| hide
import os
import warnings
from itertools import product

from dotenv import load_dotenv
from fastcore.test import test_eq, test_fail, test_warns
from nbdev.showdoc import show_doc
from tqdm import TqdmExperimentalWarning

# Notebook-only setup (this cell is marked `#| hide`).
# Load variables from a dotenv file into the environment.
load_dotenv()
# Only let ERROR (and above) records from the 'statsforecast' logger through.
logging.getLogger('statsforecast').setLevel(logging.ERROR)
# Silence tqdm's experimental-feature warnings.
warnings.filterwarnings("ignore", category=TqdmExperimentalWarning)

In [None]:
#| exporti
# Default date features for each pandas offset alias.
# Aliases sharing a feature list are grouped together; every alias still
# receives its own independent list, and key order follows the groups below.
date_features_by_freq = {
    alias: list(features)
    for aliases, features in (
        # Daily frequencies
        (('B', 'C', 'D'), ('year', 'month', 'day', 'weekday')),
        # Weekly
        (('W',), ('year', 'week', 'weekday')),
        # Monthly
        (('M',), ('year', 'month')),
        (('SM',), ('year', 'month', 'day')),
        (('BM', 'CBM', 'MS'), ('year', 'month')),
        (('SMS',), ('year', 'month', 'day')),
        (('BMS', 'CBMS'), ('year', 'month')),
        # Quarterly
        (('Q', 'BQ', 'QS', 'BQS'), ('year', 'quarter')),
        # Yearly
        (('A', 'Y', 'BA', 'BY', 'AS', 'YS', 'BAS', 'BYS'), ('year',)),
        # Hourly
        (('BH',), ('year', 'month', 'day', 'hour', 'weekday')),
        (('H',), ('year', 'month', 'day', 'hour')),
        # Minutely
        (('T', 'min'), ('year', 'month', 'day', 'hour', 'minute')),
        # Secondly
        (('S',), ('year', 'month', 'day', 'hour', 'minute', 'second')),
        # Milliseconds
        (('L', 'ms'), ('year', 'month', 'day', 'hour', 'minute', 'second', 'millisecond')),
        # Microseconds
        (('U', 'us'), ('year', 'month', 'day', 'hour', 'minute', 'second', 'microsecond')),
        # Nanoseconds
        (('N',), ()),
    )
    for alias in aliases
}

In [None]:
#| exporti
class _TimeGPT:
    """
    A class used to interact with the TimeGPT API.
    """

    def __init__(self, token: str, environment: Optional[str] = None):
        """
        Set up the API client used by every TimeGPT call.

        Parameters
        ----------
        token : str
            Authorization token for the TimeGPT API.
        environment : str, optional (default=None)
            Base URL of a custom environment. When None, the default
            Nixtla endpoint is used.
        """
        base_url = "https://dashboard.nixtla.io/api" if environment is None else environment
        self.client = Nixtla(base_url=base_url, token=token)
        # populated by endpoint calls whose response contains 'weights_x'
        self.weights_x: pd.DataFrame = None

    @property
    def request_headers(self):
        headers = {
            "accept": "application/json",
            "content-type": "application/json",
            "authorization": f"Bearer {self.client._client_wrapper._token}"
        }
        return headers
        
    def _parse_response(self, response) -> Dict:
        """Parses responde."""
        response.raise_for_status()
        try:
            resp = response.json()
        except Exception as e:
            raise Exception(response)
        return resp
    
    def validate_token(self, log: bool = True) -> bool:
        """Returns True if your token is valid."""
        validation = self.client.validate_token()
        valid = False
        if 'message' in validation:
            if validation['message'] == 'success':
                valid = True
        elif 'detail' in validation:
            if 'Forecasting! :)' in validation['detail']:
                valid = True
        if 'support' in validation and log:
            main_logger.info(f'Happy Forecasting! :), {validation["support"]}')
        return valid

    def _validate_inputs(
            self,
            df: pd.DataFrame,
            X_df: pd.DataFrame,
            freq: str,
            id_col: str,
            time_col: str,
            target_col: str,
        ):
        main_logger.info('Validating inputs...')
        if freq is None and hasattr(df.index, 'freq'):
            freq = df.index.freq
            if freq is not None:
                freq = freq.rule_code
                main_logger.info(f'Inferred freq: {freq}')
            time_col = df.index.name
            if time_col is None:
                time_col = 'ds'
                df.index.name = time_col
            df = df.reset_index()
        renamer = {
            id_col: 'unique_id',
            time_col: 'ds',
            target_col: 'y',
        }
        df = df.rename(columns=renamer)
        if df.dtypes.ds != 'object':
            df['ds'] = df['ds'].astype(str)
        drop_uid = False
        if 'unique_id' not in df.columns:
            # Insert unique_id column
            df = df.assign(unique_id='ts_0')
            drop_uid = True
        if X_df is not None:
            X_df = X_df.rename(columns=renamer)
            if 'unique_id' not in X_df.columns:
                X_df = X_df.assign(unique_id='ts_0')
            if X_df.dtypes.ds != 'object':
                X_df['ds'] = X_df['ds'].astype(str)
        return df, X_df, drop_uid, freq

    def _validate_outputs(
            self,
            fcst_df: pd.DataFrame,
            id_col: str,
            time_col: str,
            target_col: str,
            drop_uid: bool,
        ):
        renamer = {
            'unique_id': id_col,
            'ds': time_col,
            'target_col': target_col,
        }
        if drop_uid:
            fcst_df = fcst_df.drop(columns='unique_id')
        fcst_df = fcst_df.rename(columns=renamer)
        return fcst_df

    def _infer_freq(self, df: pd.DataFrame, freq: Optional[str] = None):
        # special freqs that need to be checked
        # for example to ensure 'W'-> 'W-MON'
        special_freqs = ['W', 'M', 'Q', 'Y', 'A']
        if freq is None or freq in special_freqs:
            unique_id = df.iloc[0]['unique_id']
            df_id = df.query('unique_id == @unique_id')
            inferred_freq = pd.infer_freq(df_id['ds'])
            if inferred_freq is None:
                raise Exception(
                    'Could not infer frequency of ds column. This could be due to '
                    'inconsistent intervals. Please check your data for missing, '
                    'duplicated or irregular timestamps'
                )
            if freq is not None:
                # check we have the same base frequency
                # except when we have yearly frequency (A, and Y means the same)
                if (freq != inferred_freq[0] and freq != 'Y') or (freq == 'Y' and inferred_freq[0] != 'A'):
                    raise Exception(f'Failed to infer special date, inferred freq {inferred_freq}')
            main_logger.info(f'Inferred freq: {inferred_freq}')
            return inferred_freq
        return freq
    
    def _resample_dataframe(
            self,
            df: pd.DataFrame,
            freq: str,
        ):
        df = df.copy()
        df['ds'] = pd.to_datetime(df['ds'])
        resampled_df = df.set_index('ds').groupby('unique_id').resample(freq).bfill()
        resampled_df = resampled_df.drop(columns='unique_id').reset_index()
        resampled_df['ds'] = resampled_df['ds'].astype(str)
        return resampled_df
    
    def _compute_date_feature(self, dates, feature):
        if callable(feature):
            feat_name = feature.__name__
            feat_vals = feature(dates)
        else:
            feat_name = feature
            if feature in ("week", "weekofyear"):
                dates = dates.isocalendar()
            feat_vals = getattr(dates, feature)
        if not isinstance(feat_vals, pd.DataFrame):
            vals = np.asarray(feat_vals)
            feat_vals = pd.DataFrame({feat_name: vals})
        feat_vals['ds'] = dates
        return feat_vals
    
    def _make_future_dataframe(self, df: pd.DataFrame, h: int, freq: str, reconvert: bool = True):
        last_dates = df.groupby('unique_id')['ds'].max()
        def _future_date_range(last_date):
            future_dates = pd.date_range(last_date, freq=freq, periods=h+1)[-h:]
            return future_dates
        future_df = last_dates.apply(_future_date_range).reset_index()
        future_df = future_df.explode('ds').reset_index(drop=True)
        if reconvert and df.dtypes['ds'] == 'object':
            # avoid date 000
            future_df['ds'] = future_df['ds'].astype(str)
        return future_df
    
    def _add_date_features( 
            self,
            df: pd.DataFrame,
            X_df: Optional[pd.DataFrame],
            h: int,
            freq: str,
            date_features: List[str],
            date_features_to_one_hot: Optional[List[str]],
        ):
        # df contains exogenous variables
        # X_df are the future values of the exogenous variables
        # construct dates
        train_dates = df['ds'].unique().tolist()
        # if we dont have future exogenos variables
        # we need to compute the future dates
        if (h is not None) and X_df is None:
            X_df = self._make_future_dataframe(df=df, h=h, freq=freq)
            future_dates = X_df['ds'].unique().tolist()
        elif X_df is not None:
            future_dates = X_df['ds'].unique().tolist()
        else:
            future_dates = []
        dates = pd.DatetimeIndex(train_dates + future_dates)
        date_features_df = pd.DataFrame({'ds': dates})
        for feature in date_features:
            feat_df = self._compute_date_feature(dates, feature)
            date_features_df = date_features_df.merge(feat_df, on=['ds'], how='left')
        if df.dtypes['ds'] == 'object':
            date_features_df['ds'] = date_features_df['ds'].astype(str)
        if date_features_to_one_hot is not None:
            date_features_df = pd.get_dummies(
                date_features_df, 
                columns=date_features_to_one_hot, 
                dtype=int,
            )
        # remove duplicated columns if any
        date_features_df = date_features_df.drop(
            columns=[col for col in date_features_df.columns if col in df.columns and col not in ['unique_id', 'ds']]
        )
        # add date features to df
        df = df.merge(date_features_df, on='ds', how='left')
        # add date features to X_df
        if X_df is not None:
            X_df = X_df.merge(date_features_df, on='ds', how='left')
        return df, X_df
    
    def _preprocess_X_df(self, X_df: pd.DataFrame, freq: str):
        if X_df.isna().any().any():
            raise Exception('Some of your exogenous variables contain NA, please check')
        X_df = X_df.sort_values(['unique_id', 'ds']).reset_index(drop=True)
        X_df = self._resample_dataframe(X_df, freq)
        return X_df
        
    def _preprocess_dataframes(
            self, 
            df: pd.DataFrame, 
            h: int,
            X_df: Optional[pd.DataFrame],
            freq: str,
            date_features: Union[bool, List[str]],
            date_features_to_one_hot: Union[bool, List[str]],
        ):
        """Returns Y_df and X_df dataframes in the structure expected by the endpoints."""
        # add date features logic
        if isinstance(date_features, bool):
            if date_features:
                date_features = date_features_by_freq.get(freq)
                if date_features is None:
                    warnings.warn(
                        f'Non default date features for {freq} '
                        'please pass a list of date features'
                    )
            else:
                date_features = None
                
        if date_features is not None:
            if isinstance(date_features_to_one_hot, bool):
                if date_features_to_one_hot:
                    date_features_to_one_hot = [feat for feat in date_features if not callable(feat)]
                    date_features_to_one_hot = None if not date_features_to_one_hot else date_features_to_one_hot
                else:
                    date_features_to_one_hot = None
            df, X_df = self._add_date_features(
                df=df, X_df=X_df, 
                h=h, freq=freq,
                date_features=date_features,
                date_features_to_one_hot=date_features_to_one_hot,
            )
        y_cols = ['unique_id', 'ds', 'y']
        Y_df = df[y_cols]
        if Y_df['y'].isna().any():
            raise Exception('Your target variable contains NA, please check')
        # Azul: efficient this code
        # and think about returning dates that are not in the training set
        Y_df = self._resample_dataframe(Y_df, freq)
        x_cols = []
        if X_df is not None:
            x_cols = X_df.drop(columns=['unique_id', 'ds']).columns.to_list()
            if not all(col in df.columns for col in x_cols):
                raise Exception(
                    'You must include the exogenous variables in the `df` object, '
                    f'exogenous variables {",".join(x_cols)}'
                )
            if (h is not None) and (len(X_df) != df['unique_id'].nunique() * h):
                raise Exception(
                    f'You have to pass the {h} future values of your '
                    'exogenous variables for each time series'
                )
            X_df_history = df[['unique_id', 'ds'] + x_cols]
            X_df = pd.concat([X_df_history, X_df])
            X_df = self._preprocess_X_df(X_df, freq)
        elif (X_df is None) and (h is None) and (len(y_cols) < df.shape[1]):
            # case for just insample, 
            # we dont need h
            X_df = df.drop(columns='y')
            x_cols = X_df.drop(columns=['unique_id', 'ds']).columns.to_list()
            X_df = self._preprocess_X_df(X_df, freq)
        return Y_df, X_df, x_cols
    
    def _get_to_dict_args(self):
        to_dict_args = {'orient': 'split'}
        if 'index' in inspect.signature(pd.DataFrame.to_dict).parameters:
            to_dict_args['index'] = False
        return to_dict_args
    
    def _transform_dataframes(self, Y_df: pd.DataFrame, X_df: pd.DataFrame):
        # contruction of y and x for the payload
        to_dict_args = self._get_to_dict_args()
        y = Y_df.to_dict(**to_dict_args)
        x = X_df.to_dict(**to_dict_args) if X_df is not None else None
        return y, x
    
    def _get_model_params(self, freq: str):
        """Fetch the model's input size and horizon for `freq` from the API.

        Returns
        -------
        input_size, model_horizon
            The 'input_size' and 'horizon' entries of the response's
            'detail' (looked up under 'data' when that key is present).
        """
        request = SingleSeriesForecast(freq=freq)
        response = self.client.timegpt_model_params(request=request)
        if 'data' in response:
            response = response['data']
        detail = response['detail']
        input_size = detail['input_size']
        model_horizon = detail['horizon']
        return input_size, model_horizon
    
    def _validate_input_size(
            self, Y_df: pd.DataFrame, 
            input_size: int,
            model_horizon: int,
            require_history: bool,
        ):
        if require_history:
            min_history = Y_df.groupby('unique_id').size().min()
            if min_history < input_size + model_horizon:
                raise Exception(
                    'Your time series data is too short '
                    'Please be sure that your unique time series contain '
                    f'at least {input_size + model_horizon} observations'
                )
        return True     
        
    def _hit_multi_series_endpoint(
            self,
            Y_df: pd.DataFrame,
            X_df: pd.DataFrame,
            x_cols: List[str],
            h: int,
            freq: str,
            finetune_steps: int,
            clean_ex_first: bool,
            level: Optional[List[Union[int, float]]],
            input_size: int,
            model_horizon: int,
        ):
        if h > model_horizon:
            main_logger.warning(
                'The specified horizon "h" exceeds the model horizon. '
                'This may lead to less accurate forecasts. '
                'Please consider using a smaller horizon.'
            )
        # restrict input if
        # - we dont want to finetune
        # - we dont have exogenous regegressors
        # - and we dont want to produce pred intervals
        restrict_input = finetune_steps == 0 and X_df is None and level is not None
        if restrict_input:
            # add sufficient info to compute
            # conformal interval
            new_input_size = 3 * input_size + max(model_horizon, h)
            Y_df = Y_df.groupby('unique_id').tail(new_input_size)
            if X_df is not None:
                X_df = X_df.groupby('unique_id').tail(new_input_size + h) # history plus exogenous
        self._validate_input_size(
            Y_df=Y_df, 
            input_size=input_size,
            model_horizon=model_horizon,
            require_history=finetune_steps > 0 or level is not None,
        )
        y, x = self._transform_dataframes(Y_df, X_df)
        response_timegpt = self.client.timegpt_multi_series(
            y=y,
            x=x,
            fh=h,
            freq=freq,
            level=level,
            finetune_steps=finetune_steps,
            clean_ex_first=clean_ex_first,
        )
        if 'data' in response_timegpt:
            response_timegpt = response_timegpt['data']
        if 'weights_x' in response_timegpt:
            self.weights_x = pd.DataFrame({
                'features': x_cols,
                'weights': response_timegpt['weights_x'],
            })
        return pd.DataFrame(**response_timegpt['forecast'])
    
    def _hit_multi_series_historic_endpoint(
            self,
            Y_df: pd.DataFrame,
            X_df: pd.DataFrame,
            freq: str,
            level: Optional[List[Union[int, float]]],
            input_size: int,
            model_horizon: int,
            clean_ex_first: bool,
        ):
        self._validate_input_size(
            Y_df=Y_df,  
            input_size=input_size,
            model_horizon=model_horizon,
            require_history=True,
        )
        y, x = self._transform_dataframes(Y_df, X_df)
        response_timegpt = self.client.timegpt_multi_series_historic(
            freq=freq,
            level=level,
            y=y,
            x=x,
            clean_ex_first=clean_ex_first,
        )
        return pd.DataFrame(**response_timegpt['data']['forecast'])
    
    def _multi_series_forecast(
            self,
            df: pd.DataFrame,
            h: int,
            freq: str,
            X_df: Optional[pd.DataFrame],
            level: Optional[List[Union[int, float]]],
            finetune_steps: int,
            clean_ex_first: bool,
            add_history: bool,
            date_features: Union[bool, List[str]],
            date_features_to_one_hot: Union[bool, List[str]],
        ):
        """Run the forecasting pipeline on already-normalized dataframes.

        Resolves the frequency, preprocesses the data, fetches the model
        parameters and calls the forecast endpoint. With `add_history`, the
        historic endpoint is called too and its values (without 'y') are
        stacked with the forecasts, sorted by 'unique_id' and 'ds'.

        Returns
        -------
        pandas.DataFrame
            Forecasts, preceded by fitted values when `add_history` is True.
        """
        freq = self._infer_freq(df, freq)
        main_logger.info('Preprocessing dataframes...')
        Y_df, X_df, x_cols = self._preprocess_dataframes(
            df=df,
            h=h,
            X_df=X_df,
            freq=freq,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
        )
        input_size, model_horizon = self._get_model_params(freq)
        main_logger.info('Calling Forecast Endpoint...')
        fcst_df = self._hit_multi_series_endpoint(
            Y_df=Y_df,
            X_df=X_df,
            x_cols=x_cols,
            h=h,
            freq=freq,
            finetune_steps=finetune_steps,
            clean_ex_first=clean_ex_first,
            level=level,
            input_size=input_size,
            model_horizon=model_horizon,
        )
        if not add_history:
            return fcst_df
        main_logger.info('Calling Historical Forecast Endpoint...')
        fitted_df = self._hit_multi_series_historic_endpoint(
            Y_df=Y_df,
            X_df=X_df,
            freq=freq,
            level=level,
            input_size=input_size,
            model_horizon=model_horizon,
            clean_ex_first=clean_ex_first,
        )
        fitted_df = fitted_df.drop(columns='y')
        stacked = pd.concat([fitted_df, fcst_df])
        return stacked.sort_values(['unique_id', 'ds'])

    def _hit_multi_series_anomalies_endpoint(
            self,
            Y_df: pd.DataFrame,
            X_df: pd.DataFrame,
            x_cols: List[str],
            freq: str,
            level: Union[int, float],
            clean_ex_first: bool,
        ):
        y, x = self._transform_dataframes(Y_df, X_df)
        response_timegpt = self.client.timegpt_multi_series_anomalies(
            freq=freq,
            level=[level] if (isinstance(level, int) or isinstance(level, float)) else [level[0]],
            y=y,
            x=x,
            clean_ex_first=clean_ex_first,
        )
        if 'data' in response_timegpt:
            response_timegpt = response_timegpt['data']
        if 'weights_x' in response_timegpt:
            self.weights_x = pd.DataFrame({
                'features': x_cols,
                'weights': response_timegpt['weights_x'],
            })
        return pd.DataFrame(**response_timegpt['forecast'])

    def _multi_series_detect_anomalies(
            self,
            df: pd.DataFrame,
            freq: str,
            level: Union[int, float],
            clean_ex_first: bool,
            date_features: Union[bool, List[str]],
            date_features_to_one_hot: Union[bool, List[str]],
        ):
        """Run the anomaly-detection pipeline on an already-normalized frame.

        Resolves the frequency, preprocesses `df` without a horizon or
        future exogenous values, calls the anomalies endpoint and drops the
        'y' column from its result.

        Returns
        -------
        pandas.DataFrame
            Endpoint result without the 'y' column.
        """
        freq = self._infer_freq(df, freq)
        main_logger.info('Preprocessing dataframes...')
        Y_df, X_df, x_cols = self._preprocess_dataframes(
            df=df,
            h=None,
            X_df=None,
            freq=freq,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
        )
        main_logger.info('Calling Anomaly Detector Endpoint...')
        detected = self._hit_multi_series_anomalies_endpoint(
            Y_df=Y_df,
            X_df=X_df,
            x_cols=x_cols,
            freq=freq,
            level=level,
            clean_ex_first=clean_ex_first,
        )
        return detected.drop(columns='y')
        
    def _forecast(
            self,
            df: pd.DataFrame,
            h: int,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            X_df: Optional[pd.DataFrame] = None,
            level: Optional[List[Union[int, float]]] = None,
            finetune_steps: int = 0,
            clean_ex_first: bool = True,
            validate_token: bool = False,
            add_history: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
        ):
        """Forecast your time series using TimeGPT.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data. It must contain:
            - time_col:
                Column with the time index of the series, typically
                datetimes at regular intervals (hourly, daily, monthly...).
            - target_col:
                Column with the variable to forecast.
            Several series can be stacked in the same frame by adding:
            - id_col:
                Column whose unique values identify each series.
        h : int
            Forecast horizon.
        freq : str, optional (default=None)
            Frequency of the data. When None it is inferred automatically.
            See [pandas' available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        id_col : str (default='unique_id')
            Name of the column identifying each series.
        time_col : str (default='ds')
            Name of the column identifying each timestep.
        target_col : str (default='y')
            Name of the column holding the target.
        X_df : pandas.DataFrame, optional (default=None)
            Future values of the exogenous variables of `df`, together with
            the [`unique_id`, `ds`] columns.
        level : List[float], optional (default=None)
            Confidence levels between 0 and 100 for prediction intervals.
        finetune_steps : int (default=0)
            Number of steps used to finetune TimeGPT on the new data.
        clean_ex_first : bool (default=True)
            Clean the exogenous signal before forecasting with TimeGPT.
        validate_token : bool (default=False)
            Validate the token before sending requests.
        add_history : bool (default=False)
            Also return the fitted values of the model.
        date_features : bool or list of str or callable, optional (default=False)
            Features computed from the dates: pandas date attributes or
            functions taking the dates as input. True adds the most used
            date features for the frequency of `df`.
        date_features_to_one_hot : bool or list of str (default=True)
            Date features to one-hot encode. With `date_features=True`,
            all date features are one-hot encoded by default.

        Returns
        -------
        fcsts_df : pandas.DataFrame
            TimeGPT point forecasts, plus probabilistic forecasts when
            `level` is not None.

        Raises
        ------
        Exception
            If `validate_token` is True and the token is not valid.
        """
        if validate_token:
            if not self.validate_token(log=False):
                raise Exception(
                    'Token not valid, please email ops@nixtla.io'
                )

        # user-facing column names, needed on the way in and on the way out
        column_names = dict(id_col=id_col, time_col=time_col, target_col=target_col)
        df, X_df, drop_uid, freq = self._validate_inputs(
            df=df,
            X_df=X_df,
            freq=freq,
            **column_names,
        )
        fcst_df = self._multi_series_forecast(
            df=df,
            h=h,
            freq=freq,
            X_df=X_df,
            level=level,
            finetune_steps=finetune_steps,
            clean_ex_first=clean_ex_first,
            add_history=add_history,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
        )
        return self._validate_outputs(
            fcst_df=fcst_df,
            drop_uid=drop_uid,
            **column_names,
        )

    def _detect_anomalies(
            self,
            df: pd.DataFrame,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            level: Union[int, float] = 99,
            clean_ex_first: bool = True,
            validate_token: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
        ):
        """Detect anomalies in your time series using TimeGPT.

        Parameters
        ----------
        df : pandas.DataFrame
            Input data. It must contain:
            - time_col:
                Column with the time index of the series, typically
                datetimes at regular intervals (hourly, daily, monthly...).
            - target_col:
                Column with the variable to analyze.
            Several series can be stacked in the same frame by adding:
            - id_col:
                Column whose unique values identify each series.
        freq : str, optional (default=None)
            Frequency of the data. When None it is inferred automatically.
            See [pandas' available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        id_col : str (default='unique_id')
            Name of the column identifying each series.
        time_col : str (default='ds')
            Name of the column identifying each timestep.
        target_col : str (default='y')
            Name of the column holding the target.
        level : float (default=99)
            Confidence level between 0 and 100 for detecting the anomalies.
        clean_ex_first : bool (default=True)
            Clean the exogenous signal before calling TimeGPT.
        validate_token : bool (default=False)
            Validate the token before sending requests.
        date_features : bool or list of str or callable, optional (default=False)
            Features computed from the dates: pandas date attributes or
            functions taking the dates as input. True adds the most used
            date features for the frequency of `df`.
        date_features_to_one_hot : bool or list of str (default=True)
            Date features to one-hot encode. With `date_features=True`,
            all date features are one-hot encoded by default.

        Returns
        -------
        anomalies_df : pandas.DataFrame
            DataFrame with anomalies flagged with 1 detected by TimeGPT.

        Raises
        ------
        Exception
            If `validate_token` is True and the token is not valid.
        """
        if validate_token:
            if not self.validate_token(log=False):
                raise Exception(
                    'Token not valid, please email ops@nixtla.io'
                )

        # user-facing column names, needed on the way in and on the way out
        column_names = dict(id_col=id_col, time_col=time_col, target_col=target_col)
        df, _, drop_uid, freq = self._validate_inputs(
            df=df,
            X_df=None,
            freq=freq,
            **column_names,
        )
        anomalies_df = self._multi_series_detect_anomalies(
            df=df,
            freq=freq,
            level=level,
            clean_ex_first=clean_ex_first,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
        )
        return self._validate_outputs(
            fcst_df=anomalies_df,
            drop_uid=drop_uid,
            **column_names,
        )

    def plot(
            self,
            df: pd.DataFrame,
            forecasts_df: Optional[pd.DataFrame] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            unique_ids: Union[Optional[List[str]], np.ndarray] = None,
            plot_random: bool = True,
            models: Optional[List[str]] = None,
            level: Optional[List[float]] = None,
            max_insample_length: Optional[int] = None,
            plot_anomalies: bool = False,
            engine: str = 'matplotlib',
            resampler_kwargs: Optional[Dict] = None,
        ):
        """Plot forecasts and insample values.

        The inputs are copied before being modified, a constant id column
        (`'ts_0'`) is added when `id_col` is missing, and `time_col` is parsed
        with `pandas.to_datetime`. The actual drawing is delegated to
        `utilsforecast.plotting.plot_series`.

        If `forecasts_df` contains an `anomaly` column (the output of
        `detect_anomalies`), `models`, `level` and `plot_anomalies` are
        overridden: the level is read from the first `TimeGPT-lo-*` column,
        `models` becomes `['TimeGPT']` and anomalies are plotted.

        Parameters
        ----------
        df : pandas.DataFrame
            The DataFrame on which the function will operate. Expected to contain at least the following columns:
            - time_col:
                Column name in `df` that contains the time indices of the time series. This is typically a datetime
                column with regular intervals, e.g., hourly, daily, monthly data points.
            - target_col:
                Column name in `df` that contains the target variable of the time series, i.e., the variable we
                wish to predict or analyze.
            Additionally, you can pass multiple time series (stacked in the dataframe) considering an additional column:
            - id_col:
                Column name in `df` that identifies unique time series. Each unique value in this column
                corresponds to a unique time series.
        forecasts_df : pandas.DataFrame, optional (default=None)
            DataFrame with columns [`unique_id`, `ds`] and models.
        id_col : str (default='unique_id')
            Column that identifies each serie.
        time_col : str (default='ds')
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str (default='y')
            Column that contains the target.
        unique_ids : List[str], optional (default=None)
            Time Series to plot.
            If None, time series are selected randomly.
        plot_random : bool (default=True)
            Select time series to plot randomly.
        models : List[str], optional (default=None)
            List of models to plot.
        level : List[float], optional (default=None)
            List of prediction intervals to plot if passed.
        max_insample_length : int, optional (default=None)
            Max number of train/insample observations to be plotted.
        plot_anomalies : bool (default=False)
            Plot anomalies for each prediction interval.
        engine : str (default='matplotlib')
            Library used to plot. 'plotly', 'plotly-resampler' or 'matplotlib'.
        resampler_kwargs : dict
            Kwargs to be passed to plotly-resampler constructor.
            For further customization ("show_dash") call the method,
            store the plotting object and add the extra arguments to
            its `show_dash` method.

        Returns
        -------
        The object returned by `utilsforecast.plotting.plot_series`.

        Raises
        ------
        Exception
            If `utilsforecast` (the optional plotting dependency) cannot be imported.
        """
        try:
            from utilsforecast.plotting import plot_series
        except ModuleNotFoundError as err:
            raise Exception(
                'You have to install additional dependencies to use this method, '
                'please install them using `pip install "nixtlats[plotting]"`'
            ) from err
        # never mutate the caller's frames
        df = df.copy()
        if id_col not in df:
            df[id_col] = 'ts_0'
        df[time_col] = pd.to_datetime(df[time_col])
        if forecasts_df is not None:
            forecasts_df = forecasts_df.copy()
            if id_col not in forecasts_df:
                forecasts_df[id_col] = 'ts_0'
            forecasts_df[time_col] = pd.to_datetime(forecasts_df[time_col])
            if 'anomaly' in forecasts_df:
                # special case to plot outputs
                # from detect_anomalies
                forecasts_df = forecasts_df.drop(columns='anomaly')
                cols = forecasts_df.columns
                cols = cols[cols.str.contains('TimeGPT-lo-')]
                # the interval level is encoded in the column name, e.g. `TimeGPT-lo-99`
                level = cols.str.replace('TimeGPT-lo-', '')[0]
                level = float(level) if '.' in level else int(level)
                level = [level]
                plot_anomalies = True
                models = ['TimeGPT']
                forecasts_df = df.merge(forecasts_df, how='left')
                # keep a single row per series, grouping by the caller's id column
                # (not the hard-coded default name); copy so the assignment below
                # operates on an independent frame
                df = df.groupby(id_col).head(1).copy()
                # prevent double plotting
                df.loc[:, target_col] = np.nan
        return plot_series(
            df=df,
            forecasts_df=forecasts_df,
            ids=unique_ids,
            plot_random=plot_random,
            models=models,
            level=level,
            max_insample_length=max_insample_length,
            plot_anomalies=plot_anomalies,
            engine=engine,
            resampler_kwargs=resampler_kwargs,
            palette="tab20b",
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
        )

In [None]:
#| exporti
class TimeGPT(_TimeGPT):
    """Public TimeGPT client that dispatches on the type of `df`.

    pandas DataFrames are handled in-process by the inherited `_forecast` /
    `_detect_anomalies` methods. Any other object is forwarded to
    `nixtlats.distributed.timegpt._DistributedTimeGPT`, which is imported
    lazily inside the non-pandas branch.
    """

    def forecast(
            self,
            df: pd.DataFrame,
            h: int,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            X_df: Optional[pd.DataFrame] = None,
            level: Optional[List[Union[int, float]]] = None,
            finetune_steps: int = 0,
            clean_ex_first: bool = True,
            validate_token: bool = False,
            add_history: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
            num_partitions: Optional[int] = None,
        ):
        """Forecast your time series using TimeGPT.

        Parameters
        ----------
        df : pandas.DataFrame
            The DataFrame on which the function will operate. Expected to contain at least the following columns:
            - time_col:
                Column name in `df` that contains the time indices of the time series. This is typically a datetime
                column with regular intervals, e.g., hourly, daily, monthly data points.
            - target_col:
                Column name in `df` that contains the target variable of the time series, i.e., the variable we
                wish to predict or analyze.
            Additionally, you can pass multiple time series (stacked in the dataframe) considering an additional column:
            - id_col:
                Column name in `df` that identifies unique time series. Each unique value in this column
                corresponds to a unique time series.
            Objects that are not pandas DataFrames are forwarded to the
            distributed implementation.
        h : int
            Forecast horizon.
        freq : str
            Frequency of the data. By default, the freq will be inferred automatically.
            See [pandas' available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        id_col : str (default='unique_id')
            Column that identifies each serie.
        time_col : str (default='ds')
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str (default='y')
            Column that contains the target.
        X_df : pandas.DataFrame, optional (default=None)
            DataFrame with [`unique_id`, `ds`] columns and `df`'s future exogenous.
        level : List[float], optional (default=None)
            Confidence levels between 0 and 100 for prediction intervals.
        finetune_steps : int (default=0)
            Number of steps used to finetune TimeGPT in the
            new data.
        clean_ex_first : bool (default=True)
            Clean exogenous signal before making forecasts
            using TimeGPT.
        validate_token : bool (default=False)
            If True, validates token before
            sending requests.
        add_history : bool (default=False)
            Return fitted values of the model.
        date_features : bool or list of str or callable, optional (default=False)
            Features computed from the dates.
            Can be pandas date attributes or functions that will take the dates as input.
            If True automatically adds most used date features for the
            frequency of `df`.
        date_features_to_one_hot : bool or list of str (default=True)
            Apply one-hot encoding to these date features.
            If `date_features=True`, then all date features are
            one-hot encoded by default.
        num_partitions : int (default=None)
            Number of partitions to use.
            Only used in distributed environments (spark, ray, dask).
            If None, the number of partitions will be equal
            to the available parallel resources.

        Returns
        -------
        fcsts_df : pandas.DataFrame
            DataFrame with TimeGPT forecasts for point predictions and probabilistic
            predictions (if level is not None).
        """
        if isinstance(df, pd.DataFrame):
            # local path: `num_partitions` is intentionally not forwarded
            return self._forecast(
                df=df,
                h=h,
                freq=freq,
                id_col=id_col,
                time_col=time_col,
                target_col=target_col,
                X_df=X_df,
                level=level,
                finetune_steps=finetune_steps,
                clean_ex_first=clean_ex_first,
                validate_token=validate_token,
                add_history=add_history,
                date_features=date_features,
                date_features_to_one_hot=date_features_to_one_hot,
            )
        # imported here so the distributed module is only loaded for non-pandas inputs
        from nixtlats.distributed.timegpt import _DistributedTimeGPT
        return _DistributedTimeGPT().forecast(
            token=self.client._client_wrapper._token,
            environment=self.client._client_wrapper._base_url,
            df=df,
            h=h,
            freq=freq,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            X_df=X_df,
            level=level,
            finetune_steps=finetune_steps,
            clean_ex_first=clean_ex_first,
            validate_token=validate_token,
            add_history=add_history,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
            num_partitions=num_partitions,
        )

    def detect_anomalies(
            self,
            df: pd.DataFrame,
            freq: Optional[str] = None,
            id_col: str = 'unique_id',
            time_col: str = 'ds',
            target_col: str = 'y',
            level: Union[int, float] = 99,
            clean_ex_first: bool = True,
            validate_token: bool = False,
            date_features: Union[bool, List[str]] = False,
            date_features_to_one_hot: Union[bool, List[str]] = True,
            num_partitions: Optional[int] = None,
        ):
        """Detect anomalies in your time series using TimeGPT.

        Parameters
        ----------
        df : pandas.DataFrame
            The DataFrame on which the function will operate. Expected to contain at least the following columns:
            - time_col:
                Column name in `df` that contains the time indices of the time series. This is typically a datetime
                column with regular intervals, e.g., hourly, daily, monthly data points.
            - target_col:
                Column name in `df` that contains the target variable of the time series, i.e., the variable we
                wish to predict or analyze.
            Additionally, you can pass multiple time series (stacked in the dataframe) considering an additional column:
            - id_col:
                Column name in `df` that identifies unique time series. Each unique value in this column
                corresponds to a unique time series.
            Objects that are not pandas DataFrames are forwarded to the
            distributed implementation.
        freq : str
            Frequency of the data. By default, the freq will be inferred automatically.
            See [pandas' available frequencies](https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases).
        id_col : str (default='unique_id')
            Column that identifies each serie.
        time_col : str (default='ds')
            Column that identifies each timestep, its values can be timestamps or integers.
        target_col : str (default='y')
            Column that contains the target.
        level : float (default=99)
            Confidence level between 0 and 100 for detecting the anomalies.
        clean_ex_first : bool (default=True)
            Clean exogenous signal before making forecasts
            using TimeGPT.
        validate_token : bool (default=False)
            If True, validates token before
            sending requests.
        date_features : bool or list of str or callable, optional (default=False)
            Features computed from the dates.
            Can be pandas date attributes or functions that will take the dates as input.
            If True automatically adds most used date features for the
            frequency of `df`.
        date_features_to_one_hot : bool or list of str (default=True)
            Apply one-hot encoding to these date features.
            If `date_features=True`, then all date features are
            one-hot encoded by default.
        num_partitions : int (default=None)
            Number of partitions to use.
            Only used in distributed environments (spark, ray, dask).
            If None, the number of partitions will be equal
            to the available parallel resources.

        Returns
        -------
        anomalies_df : pandas.DataFrame
            DataFrame with anomalies flagged with 1 detected by TimeGPT.
        """
        if isinstance(df, pd.DataFrame):
            # local path: `num_partitions` is intentionally not forwarded
            return self._detect_anomalies(
                df=df,
                freq=freq,
                id_col=id_col,
                time_col=time_col,
                target_col=target_col,
                level=level,
                clean_ex_first=clean_ex_first,
                validate_token=validate_token,
                date_features=date_features,
                date_features_to_one_hot=date_features_to_one_hot,
            )
        # imported here so the distributed module is only loaded for non-pandas inputs
        from nixtlats.distributed.timegpt import _DistributedTimeGPT
        return _DistributedTimeGPT().detect_anomalies(
            token=self.client._client_wrapper._token,
            # same credential/endpoint source as `forecast`
            # NOTE(review): previously read `self.client._environment`; aligned
            # with `forecast` — confirm against the generated client.
            environment=self.client._client_wrapper._base_url,
            df=df,
            freq=freq,
            id_col=id_col,
            time_col=time_col,
            target_col=target_col,
            level=level,
            clean_ex_first=clean_ex_first,
            validate_token=validate_token,
            date_features=date_features,
            date_features_to_one_hot=date_features_to_one_hot,
            num_partitions=num_partitions,
        )

In [None]:
# Documentation cell for the `TimeGPT` constructor, titled 'TimeGPT'.
show_doc(TimeGPT.__init__, title_level=3, name='TimeGPT')

In [None]:
#| hide
# Client reused by the test cells below. The token is read from the
# environment, so a missing TIMEGPT_TOKEN fails here with a KeyError.
timegpt = TimeGPT(token=os.environ['TIMEGPT_TOKEN'])

In [None]:
# Documentation cell for `TimeGPT.validate_token`.
show_doc(TimeGPT.validate_token, title_level=4, name='TimeGPT.validate_token')

In [None]:
#| hide
# Smoke check on the shared client; the result is not asserted.
timegpt.validate_token()

In [None]:
#| hide
# Second client built from positional arguments taken from the environment:
# a token and TIMEGPT_CUSTOM_URL (presumably the API base URL — confirm
# against the constructor signature). Reused by the custom-url test below.
_timegpt = TimeGPT(os.environ['TIMEGPT_CUSTOM_URL_TOKEN'], os.environ['TIMEGPT_CUSTOM_URL'])
_timegpt.validate_token()

In [None]:
#| hide
# With `validate_token=True` a bogus token must make `forecast` raise, and the
# error message is expected to contain 'nixtla'.
test_fail(
    lambda: TimeGPT(token='transphobic').forecast(df=pd.DataFrame(), h=None, validate_token=True),
    contains='nixtla'
)

In [None]:
#| hide
# test input_size
# Queries the model parameters for daily data through the low-level client and
# pins the expected `input_size` / `horizon` values.
test_eq(
    timegpt.client.timegpt_model_params(request=SingleSeriesForecast(freq='D'))['data']['detail'],
    {'input_size': 28, 'horizon': 7},
)

Now you can start to make forecasts! Let's import an example:

In [None]:
#| hide
# Example series downloaded over the network; later cells address its columns
# as `timestamp` and `value`.
df = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/air_passengers.csv')
df.head()

In [None]:
#| hide
# test pass dataframe with index
# The timestamps are moved into a DatetimeIndex named `timestamp`; forecasts and
# anomalies computed from that frame must equal those computed from the frame
# that keeps `timestamp` as a regular column.
df_ds_index = df.set_index('timestamp')
df_ds_index.index = pd.DatetimeIndex(df_ds_index.index, freq='MS')
fcst_inferred_df_index = timegpt.forecast(df_ds_index, h=10, time_col='timestamp', target_col='value')
anom_inferred_df_index = timegpt.detect_anomalies(df_ds_index, time_col='timestamp', target_col='value')
fcst_inferred_df = timegpt.forecast(df, h=10, time_col='timestamp', target_col='value')
anom_inferred_df = timegpt.detect_anomalies(df, time_col='timestamp', target_col='value')
pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df)
pd.testing.assert_frame_equal(anom_inferred_df_index, anom_inferred_df)
# Repeat the forecast comparison after replacing the index with synthetic
# dates at several other frequencies.
for freq in ['Y', 'W-MON', 'Q-DEC', 'H']:
    df_ds_index.index = pd.date_range(end='2023-01-01', periods=len(df), freq=freq)
    fcst_inferred_df_index = timegpt.forecast(df_ds_index, h=10, time_col='timestamp', target_col='value')
    # same data with the index turned back into a column
    df_test = df_ds_index.reset_index()
    fcst_inferred_df = timegpt.forecast(df_test, h=10, time_col='timestamp', target_col='value')
    pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df)

In [None]:
# Documentation cell for `TimeGPT.plot`.
show_doc(TimeGPT.plot, name='TimeGPT.plot', title_level=4)

In [None]:
#| hide
# Smoke check of `plot` with the plotly engine on the example series
# (no forecasts passed, nothing asserted).
timegpt.plot(df, time_col='timestamp', target_col='value', engine='plotly')

In [None]:
# Documentation cell for `TimeGPT.forecast`.
show_doc(TimeGPT.forecast, title_level=4)

In [None]:
#| hide
# test infer freq
# For each pair, ten timestamps are generated at the fully qualified frequency
# and `_infer_freq` is given only the coarse alias; it must return the fully
# qualified frequency.
input_freqs = ['W', 'W', 'W', 'Q', 'Q', 'Q', 'M', 'M', 'Y', 'Y', 'Y']
expected_freqs = ['W-MON', 'W-TUE', 'W-WED', 'Q-DEC', 'QS-OCT', 'QS-DEC', 'MS', 'M', 'AS-JAN', 'A-DEC', 'AS-DEC']
for input_freq, output_freq in zip(input_freqs, expected_freqs):
    df_freq = pd.DataFrame({
        'unique_id': 'test_ts',
        'ds': pd.date_range('2021-01-01', periods=10, freq=output_freq),
    })
    inferred_freq = timegpt._infer_freq(df_freq, input_freq)
    test_eq(output_freq, inferred_freq)

In [None]:
#| hide
# test make future dataframe for one series
# The example series is renamed to the default column names and given an id.
# Its last 12 rows are held out, and `_make_future_dataframe` must rebuild
# their `unique_id`/`ds` pairs from the remaining history.
df_ = df.rename(columns={'timestamp': 'ds', 'value': 'y'})
df_.insert(0, 'unique_id', 'AirPassengers')
df_actual_future = df_.tail(12)[['unique_id', 'ds']]
df_history = df_.drop(df_actual_future.index)
df_future = timegpt._make_future_dataframe(df_history, h=12, freq='MS')
pd.testing.assert_frame_equal(
    df_actual_future.reset_index(drop=True),
    df_future,
)

In [None]:
#| hide
# test add date features
# Without one-hot encoding, each requested feature must appear as a column in
# both returned frames.
date_features = ['year', 'month']
df_date_features, future_df = timegpt._add_date_features(
    df=df_, h=12, X_df=None, 
    freq='MS', 
    date_features=date_features,
    date_features_to_one_hot=None,
)
assert all(col in df_date_features for col in date_features)
assert all(col in future_df for col in date_features)

In [None]:
#| hide
from nixtlats.date_features import SpecialDates

In [None]:
#| hide
# test add callables
# A `SpecialDates` instance is passed as a date feature; each key of its dict
# must show up as a column in both returned frames. The series is re-dated to
# a daily range ending on 2021-01-01, the date listed in both keys.
date_features = [SpecialDates({'first_dates': ['2021-01-1'], 'second_dates': ['2021-01-01']})]
df_daily = df_.copy()
df_daily['ds'] = pd.date_range(end='2021-01-01', periods=len(df_daily))
df_date_features, future_df = timegpt._add_date_features(
    df=df_daily, h=12, X_df=None, 
    freq='D', 
    date_features=date_features,
    date_features_to_one_hot=None,
)
assert all(col in df_date_features for col in ['first_dates', 'second_dates'])
assert all(col in future_df for col in ['first_dates', 'second_dates'])

In [None]:
#| hide
# test add date features one hot encoded
# NOTE(review): this cell only checks that the call does not raise; nothing is
# asserted about the encoded columns.
date_features = ['year', 'month']
date_features_to_one_hot = ['month']
df_date_features, future_df = timegpt._add_date_features(
    df=df_, h=12, X_df=None, 
    freq='MS', 
    date_features=date_features,
    date_features_to_one_hot=date_features_to_one_hot,
)

In [None]:
#| hide
# test future dataframe for multiple series
# Rebinds `df_` and `df_actual_future` to the electricity datasets (downloaded
# over the network); later cells rely on these new bindings. The generated
# 24-step hourly future frame must match the id/time columns of the published
# future file.
df_ = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-with-ex-vars.csv')
df_actual_future = pd.read_csv('https://raw.githubusercontent.com/Nixtla/transfer-learning-time-series/main/datasets/electricity-short-future-ex-vars.csv')
df_future = timegpt._make_future_dataframe(
    df=df_[['unique_id', 'ds', 'y']], 
    h=24, 
    freq='H',
)
pd.testing.assert_frame_equal(
    df_actual_future[['unique_id', 'ds']],
    df_future,
)

In [None]:
# NOTE(review): looks like a leftover inspection cell (no `#| hide` directive,
# nothing asserted); at this point `df_ds_index` still holds the single-series
# frame from the earlier index test — confirm whether this cell should stay.
df_ds_index.info()

In [None]:
# test pass dataframe with index
# Multi-series variant: `ds` is moved into a DatetimeIndex and the default
# column names are used, so no `time_col`/`target_col` arguments are needed.
# Results must match the column-based frame within an absolute tolerance.
df_ds_index = df_.set_index('ds')[['unique_id', 'y']]
df_ds_index.index = pd.DatetimeIndex(df_ds_index.index)
fcst_inferred_df_index = timegpt.forecast(df_ds_index, h=10)
anom_inferred_df_index = timegpt.detect_anomalies(df_ds_index)
fcst_inferred_df = timegpt.forecast(df_[['ds', 'unique_id', 'y']], h=10)
anom_inferred_df = timegpt.detect_anomalies(df_[['ds', 'unique_id', 'y']])
pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-3)
pd.testing.assert_frame_equal(anom_inferred_df_index, anom_inferred_df, atol=1e-3)
# Keep the last 80 rows of every series so each one can receive the same
# synthetic 80-period date range below.
df_ds_index = df_ds_index.groupby('unique_id').tail(80)
for freq in ['Y', 'W-MON', 'Q-DEC', 'H']:
    # one copy of the date range per series, concatenated in series order
    df_ds_index.index = np.concatenate(
        df_ds_index['unique_id'].nunique() * [pd.date_range(end='2023-01-01', periods=80, freq=freq)]
    )
    fcst_inferred_df_index = timegpt.forecast(df_ds_index, h=10)
    df_test = df_ds_index.reset_index()
    fcst_inferred_df = timegpt.forecast(df_test, h=10)
    pd.testing.assert_frame_equal(fcst_inferred_df_index, fcst_inferred_df, atol=1e-3)

In [None]:
#| hide
# test add date features with exogenous variables
# and multiple series
# Besides checking that the date features are present, the original columns of
# both the history and the future exogenous frame must come back unchanged.
date_features = ['year', 'month']
df_date_features, future_df = timegpt._add_date_features(
    df=df_, h=24, X_df=df_actual_future, 
    freq='H', 
    date_features=date_features,
    date_features_to_one_hot=None,
)
assert all(col in df_date_features for col in date_features)
assert all(col in future_df for col in date_features)
pd.testing.assert_frame_equal(
    df_date_features[df_.columns],
    df_,
)
pd.testing.assert_frame_equal(
    future_df[df_actual_future.columns],
    df_actual_future,
)

In [None]:
#| hide
# test add date features one hot with exogenous variables
# and multiple series
# Every requested feature is one-hot encoded here; only the preservation of the
# original columns is asserted, not the names of the encoded columns.
date_features = ['month', 'day']
df_date_features, future_df = timegpt._add_date_features(
    df=df_, h=24, X_df=df_actual_future, 
    freq='H', 
    date_features=date_features, 
    date_features_to_one_hot=date_features,
)
pd.testing.assert_frame_equal(
    df_date_features[df_.columns],
    df_,
)
pd.testing.assert_frame_equal(
    future_df[df_actual_future.columns],
    df_actual_future,
)

In [None]:
#| hide
# test warning horizon too long
# Three observations with h=100 and default options: the call is expected to
# complete. NOTE(review): the warning itself is not asserted here.
timegpt.forecast(df=df.tail(3), h=100, time_col='timestamp', target_col='value')

In [None]:
#| hide 
# test short horizon with add_history
# Three observations plus `add_history=True` must raise, with a message
# containing 'be sure'.
test_fail(
    lambda: timegpt.forecast(df=df.tail(3), h=12, time_col='timestamp', target_col='value', add_history=True),
    contains='be sure'
)

In [None]:
#| hide 
# test short horizon with fine-tuning
# Three observations plus `finetune_steps=10` must raise, with a message
# containing 'be sure'.
test_fail(
    lambda: timegpt.forecast(df=df.tail(3), h=12, time_col='timestamp', target_col='value', finetune_steps=10),
    contains='be sure'
)

In [None]:
#| hide 
# test short horizon with level
# Three observations plus prediction intervals must raise, with a message
# containing 'be sure'.
test_fail(
    lambda: timegpt.forecast(df=df.tail(3), h=12, time_col='timestamp', target_col='value', level=[80, 90]),
    contains='be sure'
)

In [None]:
#| hide
# test custom url
# same results
# The client built from the custom-URL environment variables and the default
# client must return exactly equal forecasts for the same input.
_timegpt_fcst_df = _timegpt.forecast(df=df, h=12, time_col='timestamp', target_col='value')
timegpt_fcst_df = timegpt.forecast(df=df, h=12, time_col='timestamp', target_col='value')
pd.testing.assert_frame_equal(
    _timegpt_fcst_df,
    timegpt_fcst_df,
)

In [None]:
# Documentation cell for `TimeGPT.detect_anomalies`.
show_doc(TimeGPT.detect_anomalies, title_level=4)