# Imports


In [1]:
import inspect
import json
from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from warnings import filterwarnings
from zoneinfo import ZoneInfo

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.graph_objs as go
import plotly.io as pio
import plotly.offline as pyo
import scipy as sc
import seaborn as sns
import statsmodels.api as sm
import talib
import yfinance as yf
from IPython.core.interactiveshell import InteractiveShell
from pandas import DataFrame, Series
from pandas.core.frame import DataFrame
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import chi2_contingency, kendalltau, spearmanr
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold, chi2, mutual_info_regression
from sklearn.inspection import permutation_importance
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    TimeSeriesSplit,
    cross_val_score,
    learning_curve,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from yfinance.ticker import Ticker
from statsmodels.tsa.ar_model import AutoReg
from statsmodels.tsa.stattools import acf, pacf

filterwarnings("ignore")
sns.set(style="darkgrid")
plt.style.use("dark_background")
plt.rcParams.update({"grid.linewidth": 0.5, "grid.alpha": 0.5})
plt.rc("figure", figsize=(16, 10))
plt.rc("lines", markersize=4)
plt.rcParams["figure.autolayout"] = True
sns.set_context("poster")
init_notebook_mode(connected=True)
pio.templates.default = "plotly_dark"
InteractiveShell.ast_node_interactivity = "all"


A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2



# Data Collection

In [2]:
class Stock:
    """
    A class for retrieving historical stock data from Yahoo Finance using the yfinance library.

    Args:
    ----------
        symbol (str): The stock symbol to retrieve data for.
        
        start_date (str, optional): The start date of the date range to retrieve data for, in YYYY-MM-DD format. If not specified, defaults to the earliest available date for the given stock.
        
        end_date (str, optional): The end date of the date range to retrieve data for, in YYYY-MM-DD format. If not specified, defaults to the current date.
        
        interval (str, optional): The time interval between data points. Valid values are '1d' (daily), '1wk' (weekly), '1mo' (monthly), '1m' (1-minute), '2m' (2-minute), '5m' (5-minute), '15m' (15-minute), '30m' (30-minute), '60m' (60-minute), '90m' (90-minute), and '1h' (hourly). Default is '1d' for daily data.

    Attributes:
    ----------
        symbol (str): The stock symbol being retrieved.
        
        start_date (str): The actual start date of the retrieved data, in YYYY-MM-DD format.
        
        end_date (str): The actual end date of the retrieved data, in YYYY-MM-DD format.
        
        interval (str): The time interval between data points.
        
        data (pandas.DataFrame): A DataFrame containing the retrieved stock data.
        
        ticker (yfinance.ticker.Ticker): A Ticker object for the given stock symbol.
        
        info (dict): A dictionary containing information about the stock, such as its name and industry.

    Methods:
    ----------
        startup(symbol: str, start_date: str, end_date: str, interval: str) -> None:
            Constructs a new StockData object and downloads the stock data from Yahoo Finance.

    Usage:
    ----------
        # Instantiate a Stock object
        >>> stock = Stock(symbol="AAPL", start_date="2020-01-01", end_date="2022-04-17", interval="1d")

        # Access stock data
        >>> stock.data

        # Access stock information
        >>> stock.info
    """

    def __init__(self, symbol, start_date=None, end_date=None, interval="1d") -> None:
        self.startup(symbol, start_date, end_date, interval)

    def startup(self, symbol, start_date, end_date, interval) -> None:
        """
        Constructs a new StockData object and downloads the stock data from Yahoo Finance.

        Args:
            symbol (str): The stock symbol to retrieve data for.

            start_date (str): The start date of the date range to retrieve data for, in YYYY-MM-DD format.

            end_date (str): The end date of the date range to retrieve data for, in YYYY-MM-DD format.

            interval (str, optional): The time interval between data points. Default is '1d' for daily data. Valid values are '1d' (daily), '1wk' (weekly), '1mo' (monthly), '1m' (1-minute), '2m' (2-minute), '5m' (5-minute), '15m' (15-minute), '30m' (30-minute), '60m' (60-minute), '90m' (90-minute), and '1h' (hourly).
        """
        self.symbol: str = symbol
        self.start_date: str = start_date
        self.end_date: str = end_date
        self.interval: str = interval
        self.data: DataFrame = yf.download(
            symbol, start=start_date, end=end_date, interval=interval
        )
        if self.start_date is None and self.end_date is None:
            self.start_date = str(self.data.iloc[0].name)
            self.end_date = str(self.data.iloc[-1].name)
        self.ticker: Ticker = yf.Ticker(symbol)
        self.info: dict[Any, Any] = self.ticker.info

    def __repr__(self) -> str:
        """
        Returns a string representation of the StockData object.
        """
        return f"StockData(symbol={self.symbol}, start_date={self.start_date}, end_date={self.end_date}, interval={self.interval})"

    def __str__(self) -> str:
        """
        Returns a string representation of the stock data.
        """
        return str(self.data)

# Data Preparation


## Generating all input variables $X_i$


We will construct the class LagVariableGenerator in order to automate the process of generating lag variables for time series analysis. Lag variables are often used in time series modeling to capture the relationship between a variable and its past values. By incorporating lag variables, it becomes possible to consider the effect of previous observations on the current observation.

This class provides a systematic approach for generating lag variables by leveraging autocorrelation analysis and information criteria (AIC and BIC). Autocorrelation analysis helps identify the lag values with the highest autocorrelation and partial autocorrelation, which indicate the potential influence of past observations. The information criteria (AIC and BIC) are used to select the lag values that minimize the model's complexity while still capturing important temporal dependencies.

By encapsulating the lag variable generation process within a class, it promotes reusability and simplifies the implementation for time series analysis tasks. The class allows users to specify the maximum number of lag variables (max_lags) and provides methods to calculate autocorrelation, partial autocorrelation, AIC, BIC, and choose the lag variables based on the analysis. This abstraction allows users to focus on the analysis itself rather than dealing with the intricacies of lag variable generation.

In [218]:
class LagVariableGenerator:
    """
    LagVariableGenerator is a class that automates the generation of lag variables for time series analysis.

    Lag variables capture the relationship between a variable and its past values, allowing the consideration
    of temporal dependencies in time series modeling.

    Parameters:
        max_lags (int): The maximum number of lag variables to generate.

    Methods:
        autocorrelation_analysis(df, target_col):
            Calculates the autocorrelation and partial autocorrelation values for the target column of the given dataframe.

        aic(y, y_hat, p):
            Calculates the Akaike Information Criterion (AIC) for a set of observed values, predicted values, and the number of parameters.

        bic(y, y_hat, p):
            Calculates the Bayesian Information Criterion (BIC) for a set of observed values, predicted values, and the number of parameters.

        information_criteria(df, target_col):
            Calculates the lag values that minimize the AIC and BIC values for the given dataframe and target column.

        choose_lag_variables(df, target_col):
            Selects the lag variables to be included based on the results of autocorrelation analysis and information criteria.

        generate_lag_variables(df, target_col):
            Generates lag variables for the given dataframe and target column using the chosen lag values.

    Usage Example:
    >>> generator = LagVariableGenerator(max_lags=5)
    >>> lag_variables = generator.generate_lag_variables(df, 'target')
    """    
    def __init__(self, max_lags) -> None:
        self.max_lags = max_lags

    def autocorrelation_analysis(self, df, target_col):
        autocorr_values = acf(df[target_col], nlags=self.max_lags, fft=True)
        partial_autocorr_values = pacf(df[target_col], nlags=self.max_lags)
        max_autocorr_lag = np.argmax(np.abs(autocorr_values[1:])) + 1
        max_partial_autocorr_lag = np.argmax(np.abs(partial_autocorr_values[1:])) + 1
        return max_autocorr_lag, max_partial_autocorr_lag

    def aic(self, y, y_hat, p):
        n = len(y)
        resid = y - y_hat
        sse = np.sum(resid**2)
        aic = n * np.log(sse / n) + 2 * p
        return aic

    def bic(self, y, y_hat, p):
        n = len(y)
        resid = y - y_hat
        sse = np.sum(resid**2)
        bic = n * np.log(sse / n) + p * np.log(n)
        return bic

    def information_criteria(self, df, target_col):
        aic_values = []
        bic_values = []
        for lag in range(1, self.max_lags + 1):
            ar_model = AutoReg(df[target_col].values, lags=lag, old_names=False)
            ar_result = ar_model.fit()
            y_hat = ar_result.predict(start=lag)
            aic_values.append(self.aic(df[target_col].values[lag:], y_hat, lag))
            bic_values.append(self.bic(df[target_col].values[lag:], y_hat, lag))
        min_aic_lag = np.argmin(aic_values) + 1
        min_bic_lag = np.argmin(bic_values) + 1
        return min_aic_lag, min_bic_lag

    def choose_lag_variables(self, df, target_col):
        max_autocorr_lag, max_partial_autocorr_lag = self.autocorrelation_analysis(
            df, target_col
        )
        min_aic_lag, min_bic_lag = self.information_criteria(df, target_col)
        chosen_lags = set(
            [max_autocorr_lag, max_partial_autocorr_lag, min_aic_lag, min_bic_lag]
        )
        chosen_lags.discard(0)  # Remove any zero lags
        return chosen_lags

    def generate_lag_variables(self, df, target_col):
        chosen_lags = self.choose_lag_variables(df, target_col)
        for lag in chosen_lags:
            df[f"{target_col}_lag_{lag}"] = df[target_col].shift(lag)
        return df

We create the class InputVariables in order to generate our input variables.


In [219]:
class InputVariables(LagVariableGenerator):
    def __init__(self, data: DataFrame, col="Close", max_lags: int = 200) -> None:
        super().__init__(max_lags)
        self.data: DataFrame = data.copy()
        self.col: str = col
        self.lag_columns: list[str] = []

    def calculate_log_returns(self) -> None:
        self.data["log_returns"] = np.log(self.data[self.col]) - np.log(
            self.data[self.col].shift(1)
        )

    def add_Indicators(self) -> None:
        inds = pd.DataFrame()
        succeed = []
        failed = []
        for func in talib.get_functions():
            try:
                ind = getattr(talib, func)(self.data.loc[:, self.col]).rename(func)
                inds = pd.concat([inds, ind], axis=1)
                succeed += [func]
            except:
                failed += [func]
        self.data = pd.concat([self.data, inds], axis=1)

    @staticmethod
    def clean_data(df: DataFrame, threshold=0.05) -> DataFrame:
        # calculate the percentage of missing values for each column
        percent_missing = df.isna().sum() / len(df)

        # create a boolean mask of columns that exceed the threshold percentage
        mask = percent_missing > threshold

        # use the boolean mask to select only the columns that don't exceed the threshold
        return df.loc[:, ~mask].dropna()

    def generate_all(self, threshold: float = 0.05, num_lags: int = 100) -> DataFrame:
        self.calculate_log_returns()
        self.add_Indicators()
        self.data = self.generate_lag_variables(self.data, self.col)
        self.data = self.clean_data(self.data, threshold)
        return self.data

In [132]:
stock = Stock("MSFT")

[*********************100%***********************]  1 of 1 completed


In [220]:
# Usage example:
tv = InputVariables(stock.data)
all_representations: DataFrame = tv.generate_all()
all_representations.shape

(8883, 60)

In [221]:
all_representations.columns

Index(['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume', 'Close_lag_1',
       'Close_lag_10', 'Close_lag_100', 'Close_lag_198', 'log_returns',
       'HT_DCPERIOD', 'HT_DCPHASE', 'HT_TRENDMODE', 'MAX', 'MAXINDEX', 'MIN',
       'MININDEX', 'SUM', 'ATAN', 'CEIL', 'COS', 'COSH', 'EXP', 'FLOOR', 'LN',
       'LOG10', 'SIN', 'SINH', 'SQRT', 'TAN', 'TANH', 'APO', 'CMO', 'MOM',
       'PPO', 'ROC', 'ROCP', 'ROCR', 'ROCR100', 'RSI', 'TRIX', 'DEMA', 'EMA',
       'HT_TRENDLINE', 'KAMA', 'MA', 'MIDPOINT', 'SMA', 'T3', 'TEMA', 'TRIMA',
       'WMA', 'LINEARREG', 'LINEARREG_ANGLE', 'LINEARREG_INTERCEPT',
       'LINEARREG_SLOPE', 'STDDEV', 'TSF', 'VAR'],
      dtype='object')

In [223]:
all_representations.head()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_lag_1,Close_lag_10,Close_lag_100,Close_lag_198,...,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
1988-02-29 00:00:00,0.40625,0.413194,0.402778,0.412326,0.256688,39326400,0.404514,0.383681,0.512153,0.400174,...,0.408178,0.381536,0.39254,0.418998,0.186154,0.376761,0.003249,0.004892,0.422247,2.4e-05
1988-03-01 00:00:00,0.414931,0.416667,0.40625,0.409722,0.255067,43300800,0.412326,0.399306,0.506944,0.386285,...,0.409693,0.382671,0.393948,0.417535,0.145927,0.384425,0.002547,0.004613,0.420082,2.1e-05
1988-03-02 00:00:00,0.409722,0.421875,0.407986,0.413194,0.257229,67896000,0.409722,0.399306,0.5,0.380208,...,0.411581,0.384162,0.395553,0.416716,0.11357,0.390948,0.001982,0.003936,0.418698,1.5e-05
1988-03-03 00:00:00,0.416667,0.427083,0.414931,0.423611,0.263714,83664000,0.413194,0.402778,0.498264,0.387153,...,0.415052,0.385992,0.397797,0.418427,0.101327,0.395437,0.001768,0.00625,0.420196,3.9e-05
1988-03-04 00:00:00,0.423611,0.425347,0.414931,0.421875,0.262633,68356800,0.423611,0.407986,0.496528,0.375868,...,0.417702,0.388003,0.399825,0.418948,0.081871,0.400372,0.001429,0.005534,0.420377,3.1e-05


In [222]:
all_representations.tail()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume,Close_lag_1,Close_lag_10,Close_lag_100,Close_lag_198,...,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
2023-05-23 00:00:00,320.029999,322.720001,315.25,315.26001,315.26001,30797200,321.179993,307.0,234.529999,282.299988,...,320.00253,302.203914,307.966664,318.336568,43.08513,306.17771,0.935297,2.551387,319.271865,6.509575
2023-05-24 00:00:00,314.730011,316.5,312.609985,313.850006,313.850006,23384900,315.26001,312.309998,241.009995,289.160004,...,319.96376,303.507539,308.755739,317.768,37.055633,307.951992,0.755078,2.592444,318.523078,6.720766
2023-05-25 00:00:00,323.23999,326.899994,320.0,325.920013,325.920013,43301700,313.850006,310.109985,239.820007,287.019989,...,322.054035,304.809497,310.258235,320.810576,46.542956,307.090848,1.055364,4.325368,321.865939,18.708805
2023-05-26 00:00:00,324.019989,333.399994,323.880005,332.890015,332.890015,36600900,325.920013,308.970001,239.580002,291.910004,...,325.082807,306.139247,312.132817,324.923437,54.878801,306.440847,1.421738,7.018821,326.345174,49.263844
2023-05-30 00:00:00,335.230011,335.73999,330.529999,331.674988,331.674988,19449440,332.890015,309.459991,229.100006,293.470001,...,327.4449,307.46281,313.828472,327.795577,57.872718,307.093706,1.592452,8.012506,329.388028,64.200251


## Preparing data


We create the class data_prep in order to train, split and plot the data.


In [239]:
from sklearn.model_selection import TimeSeriesSplit

In [None]:
def split_time_series_data(X, y, n_splits, initial_train_size=0, step=1):
    tscv = TimeSeriesSplit(n_splits=n_splits)
    folds = tscv.split(X)

    train_index, test_index = next(folds)  # Get the first fold
    train_end = initial_train_size if initial_train_size > 0 else test_index[0]

    X_train, y_train = X[:train_end], y[:train_end]

    for test_start, test_end in folds:
        X_test, y_test = X[test_start:test_end], y[test_start:test_end]

        yield X_train, y_train, X_test, y_test

        if step > 1:
            train_end = test_end - step
        else:
            train_end = test_end

        X_train, y_train = X[:train_end], y[:train_end]


def time_series_cv(
    model, X, y, n_splits, initial_train_size=0, step=1, return_predictions=False
):
    predictions = []
    true_values = []

    for X_train, y_train, X_test, y_test in split_time_series_data(
        X, y, n_splits, initial_train_size, step
    ):
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        if return_predictions:
            predictions.append(y_pred)
            true_values.append(y_test)

    if return_predictions:
        return predictions, true_values

In [224]:
class DataPreparer(InputVariables):
    def __init__(
        self,
        df: pd.DataFrame,
        target_col: str,
        testsize: float = 0.2,
        random_state: int = 101,
        shuffle=False,
        features: Optional[list] = None,
        scale: bool = True,
    ) -> None:
        super().__init__(df, target_col)
        self.target_col: str = target_col
        self.testsize: float = testsize
        self.random_state: int = random_state
        self.shuffle: bool = shuffle
        self.scale: bool = scale
        self.startup(features)

    def startup(self, features) -> None:
        self.data = self.generate_all().dropna()
        if self.scale:
            self.X: DataFrame = self.scale_data(
                self.data.drop(columns=[self.target_col]).loc[:, features]
                if features
                else self.data.drop(columns=[self.target_col])
            )
            self.X = self.clean_data(self.X)
        else:
            self.X: DataFrame = (
                self.data.drop(columns=[self.target_col]).loc[:, features]
                if features
                else self.data.drop(columns=[self.target_col])
            )
        self.y: DataFrame = self.data[[self.target_col]]
        self.train_test_split()

    def scale_data(self, X: pd.DataFrame) -> pd.DataFrame:
        scaler = StandardScaler()
        scaled_X = pd.DataFrame(
            scaler.fit_transform(X), index=X.index, columns=X.columns
        )
        return scaled_X

    def train_test_split(self) -> None:
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.testsize,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        self.X_train = X_train.sort_index()
        self.X_test = X_test.sort_index()
        self.y_train = y_train.sort_index()
        self.y_test = y_test.sort_index()

    def plot(self) -> go.Figure:
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=self.X_train.index,
                y=self.y_train[self.target_col],
                mode="lines",
                name=f"Train Data {1 - self.testsize}%",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=self.X_test.index,
                y=self.y_test[self.target_col],
                mode="lines",
                name=f"Test Data {self.testsize}%",
            )
        )

        fig.update_layout(
            title="Train and Test Data",
            xaxis_title="Date",
            yaxis_title=self.target_col,
        )
        return fig

In [225]:
dp = DataPreparer(stock.data, target_column, testsize=0.1)

# Use the prepared data for training and evaluating models or creating plots
dp.plot()

In [39]:
stock = Stock('TSLA')

# Choose the target variable representation (e.g., 'returns', 'log_returns', 'diff', or 'close')
target_column = "Close"

# Initialize the data_prep class and prepare the data
dp = DataPreparer(stock.data, target_column, testsize=0.1)

# Use the prepared data for training and evaluating models or creating plots
dp.plot()

[*********************100%***********************]  1 of 1 completed


In [48]:
dp.X_train.shape[0]
dp.X_test.shape[0]

8352

928

In [49]:
dp.X_test.head()

Unnamed: 0,Open,High,Low,Adj Close,Volume,log_returns,HT_DCPERIOD,HT_DCPHASE,HT_TRENDMODE,MAX,MAXINDEX,MIN,MININDEX,SUM,ATAN,CEIL,COS,COSH,EXP,FLOOR,LN,LOG10,SIN,SINH,SQRT,TAN,TANH,APO,CMO,MOM,PPO,ROC,ROCP,ROCR,ROCR100,RSI,TRIX,DEMA,EMA,HT_TRENDLINE,KAMA,MA,MIDPOINT,SMA,T3,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
2019-09-06 00:00:00,1.302977,1.284275,1.298722,1.295921,-1.012865,-0.358603,-1.597019,-0.173889,-1.997982,1.228371,1.379509,1.335865,1.383589,1.280606,0.548931,1.294865,0.81923,-0.022544,-0.022544,1.29468,1.166942,1.166942,0.955959,-0.022544,1.502272,0.019598,0.370826,0.115696,0.091621,0.223963,-0.084185,-0.017073,-0.017073,-0.017073,-0.017073,0.091621,0.020794,1.274029,1.278978,1.277624,1.262517,1.280606,1.267226,1.280606,1.2747,1.262346,1.272572,1.272647,1.271856,0.272178,1.270469,0.17529,0.5971,1.271673,0.001964
2019-09-09 00:00:00,1.296713,1.278217,1.273678,1.274244,-0.878202,-0.574492,-1.670141,0.077238,-1.997982,1.224159,1.379882,1.335865,1.383589,1.278775,0.548711,1.266402,0.97306,-0.022544,-0.022544,1.266217,1.160696,1.160696,-1.186122,-0.022544,1.485909,0.002357,0.370826,0.075475,-0.227888,0.860694,-0.104323,0.324146,0.324146,0.324146,0.324146,-0.227888,0.008368,1.27411,1.279496,1.279945,1.262682,1.278775,1.267226,1.278775,1.279523,1.2627,1.272807,1.273064,1.275332,0.442834,1.265135,0.292702,0.614185,1.275819,0.009091
2019-09-10 00:00:00,1.256996,1.237922,1.245612,1.254487,-0.793055,-0.532537,-1.725096,0.399451,-1.997982,1.214922,1.380256,1.335865,1.383589,1.276401,0.548506,1.25217,-0.797667,-0.022544,-0.022544,1.251985,1.154941,1.154941,-1.465904,-0.022544,1.470913,0.022495,0.370826,0.023403,-0.501632,0.067612,-0.130402,-0.095739,-0.095739,-0.095739,-0.095739,-0.501632,-0.005092,1.271595,1.278645,1.279835,1.262605,1.276401,1.267226,1.276401,1.278401,1.259282,1.273214,1.272266,1.271563,0.34298,1.266518,0.223835,0.604398,1.271659,0.004996
2019-09-11 00:00:00,1.244326,1.229187,1.25396,1.255035,-0.90671,-0.026627,-1.780707,0.889231,-1.997982,1.210847,1.389961,1.335865,1.383589,1.274373,0.548512,1.25217,-0.751698,-0.022544,-0.022544,1.251985,1.155102,1.155102,-1.497998,-0.022544,1.471331,0.023694,0.370826,0.059866,-0.493268,0.010959,-0.112217,-0.125292,-0.125292,-0.125292,-0.125292,-0.493268,-0.019303,1.269364,1.277886,1.280077,1.262544,1.274373,1.267226,1.274373,1.273432,1.256387,1.27371,1.271658,1.271531,0.44716,1.261073,0.295698,0.783998,1.272039,0.085655
2019-09-12 00:00:00,1.271943,1.259479,1.279579,1.274244,-0.84457,0.438065,-1.844904,1.147644,-1.997982,1.210847,1.389961,1.335865,1.383589,1.274972,0.548711,1.266402,0.97306,-0.022544,-0.022544,1.266217,1.160696,1.160696,-1.186122,-0.022544,1.485909,0.002357,0.370826,0.089964,-0.199339,0.368983,-0.097226,0.060774,0.060774,0.060774,0.060774,-0.199339,-0.032734,1.269812,1.278475,1.281181,1.262635,1.274972,1.267226,1.274972,1.269376,1.257468,1.274316,1.272476,1.275653,0.61561,1.256379,0.413273,0.387013,1.276831,-0.077032


In [66]:
dp.X_train.tail()

Unnamed: 0,Open,High,Low,Adj Close,Volume,log_returns,HT_DCPERIOD,HT_DCPHASE,HT_TRENDMODE,MAX,...,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
2019-08-23 00:00:00,137.190002,138.350006,132.800003,128.916718,38508600,-0.032381,14.076048,185.539095,0,141.339996,...,136.177,137.31475,136.915634,136.626283,1.50318,136.285144,0.026241,1.94015,136.652525,3.764182
2019-08-26 00:00:00,134.990005,135.559998,133.899994,130.907654,20312600,0.015325,14.042129,177.360054,1,141.339996,...,135.941445,137.18125,136.793139,136.115426,-3.473795,136.904572,-0.060704,1.909392,136.054722,3.645778
2019-08-27 00:00:00,136.389999,136.720001,134.660004,131.187943,23102100,0.002139,14.068112,178.993083,1,141.339996,...,135.78973,137.08,136.696774,135.741713,-7.026429,137.344,-0.123253,1.890511,135.61846,3.574033
2019-08-28 00:00:00,134.880005,135.759995,133.550003,131.013977,17393300,-0.001327,14.132536,231.876638,1,141.339996,...,135.626687,136.989875,136.591677,135.824284,-4.229676,136.785714,-0.073956,1.391325,135.750328,1.935784
2019-08-29 00:00:00,137.25,138.440002,136.910004,133.488144,20168700,0.018709,14.105157,266.793988,0,141.339996,...,135.950996,136.900666,136.653268,136.491713,1.38742,136.176855,0.02422,1.500977,136.515933,2.252932


In [67]:
dp.y_test.tail()

Unnamed: 0,Close
2023-05-02 00:00:00,305.410004
2023-05-03 00:00:00,304.399994
2023-05-04 00:00:00,305.410004
2023-05-05 00:00:00,310.649994
2023-05-08 00:00:00,308.649994


In [68]:
dp.y_train.tail()

Unnamed: 0,Close
2019-08-23 00:00:00,133.389999
2019-08-26 00:00:00,135.449997
2019-08-27 00:00:00,135.740005
2019-08-28 00:00:00,135.559998
2019-08-29 00:00:00,138.119995


# PREDICTORS


When working with time series prediction, it is important to choose models that can handle time-dependent patterns in the data. Here's a list of compatible models that can be used for time series prediction, along with some additional models specifically designed for time series forecasting:

1. Tree-based Models:

    - Random Forest Regressor (sklearn.ensemble.RandomForestRegressor)
    - Extra Trees Regressor (sklearn.ensemble.ExtraTreesRegressor)
    - Gradient Boosting Regressor (sklearn.ensemble.GradientBoostingRegressor)

1. Linear Models:

    - Lasso (sklearn.linear_model.Lasso)
    - Ridge (sklearn.linear_model.Ridge)
    - ElasticNet (sklearn.linear_model.ElasticNet)
    - Lasso LARS (sklearn.linear_model.LassoLars)
    - Lasso LARS IC (sklearn.linear_model.LassoLarsIC)
    - Lars (sklearn.linear_model.Lars)
    - Lars CV (sklearn.linear_model.LarsCV)
    - Lasso CV (sklearn.linear_model.LassoCV)
    - Ridge CV (sklearn.linear_model.RidgeCV)
    - ElasticNet CV (sklearn.linear_model.ElasticNetCV)
    - Orthogonal Matching Pursuit (sklearn.linear_model.OrthogonalMatchingPursuit)
    - Orthogonal Matching Pursuit CV (sklearn.linear_model.OrthogonalMatchingPursuitCV)

1. Ensemble Models:

    - AdaBoost Regressor (sklearn.ensemble.AdaBoostRegressor)
    - Bagging Regressor (sklearn.ensemble.BaggingRegressor)

1. Time Series Models:

    - Autoregression (statsmodels.tsa.ar_model.AutoReg)
    - SARIMAX (statsmodels.tsa.statespace.sarimax.SARIMAX)
    - Exponential Smoothing State Space Model (statsmodels.tsa.statespace.exponential_smoothing.ExponentialSmoothing)

We will pick a few of these and optimize them for the close for our case.


## ModelEvaluator

We introduce a utility class that trains a model using cross validation and calculates all the quantites related to the training of the model. It finally plots the predictions and the learning curve.

In [272]:
class LearningCurve:
    def calculate_learning_curve(self, train_sizes=None, cv=None):
        if not hasattr(self, "model"):
            raise ValueError("There is no model available.")
        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 10)
        if self.cv is None:
            self.cv = cv
        # Compute the learning curve
        (
            self.train_sizes_abs,
            self.train_scores,
            self.val_scores,
            self.fit_times,
            _,
        ) = learning_curve(
            self.model,
            self.data_preparer.X,
            self.data_preparer.y,
            train_sizes=train_sizes,
            cv=self.cv,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1,
            verbose=1,
            return_times=True,
        )
        # Compute the mean and standard deviation of the scores
        self.train_scores_mean = -np.mean(self.train_scores, axis=1)
        self.train_scores_std = -np.std(self.train_scores, axis=1)
        self.val_scores_mean = -np.mean(self.val_scores, axis=1)
        self.val_scores_std = -np.std(self.val_scores, axis=1)

        # Compute the mean fit time
        self.fit_times_mean = np.mean(self.fit_times, axis=1)

    def calculate_optimal_size(self):
        optimal_idx = np.argmin(self.val_scores_mean)
        self.optimal_size = self.train_sizes_abs[optimal_idx]
        self.optimal_size_pct = round(self.optimal_size / dp.X.shape[0], 2)

    def plot_learning_curve(self) -> go.Figure:
        fig = go.Figure()

        # Plot the training scores
        fig.add_trace(
            go.Scatter(
                x=self.train_sizes_abs,
                y=self.train_scores_mean,
                mode="lines+markers",
                name="Training score",
                line=dict(color="blue"),
                error_y=dict(
                    type="data", array=self.train_scores_std, visible=True, color="blue"
                ),
            )
        )

        # Plot the validation scores
        fig.add_trace(
            go.Scatter(
                x=self.train_sizes_abs,
                y=self.val_scores_mean,
                mode="lines+markers",
                name="Validation score",
                line=dict(color="green"),
                error_y=dict(
                    type="data", array=self.val_scores_std, visible=True, color="green"
                ),
            )
        )

        # Add a vertical line for the optimal training set size
        fig.add_shape(
            type="line",
            x0=self.optimal_size,
            x1=self.optimal_size,
            y0=0,
            y1=1,
            yref="paper",
            xref="x",
            line=dict(color="red", dash="dash"),
        )

        # Add a secondary y-axis for the training time
        fig.update_layout(
            yaxis2=dict(
                title="Training Time (s)", overlaying="y", side="right", showgrid=False
            ),
            title="Learning Curve",
            xaxis_title="Training Set Size",
            yaxis_title="Error (RMSE)",
        )

        # Plot the training time
        fig.add_trace(
            go.Scatter(
                x=self.train_sizes_abs,
                y=self.fit_times_mean,
                mode="lines+markers",
                name="Training Time",
                line=dict(color="orange"),
                yaxis="y2",
            )
        )

        # Add a text box with the model performance metric
        fig.add_annotation(
            x=0.05,
            y=0.95,
            xref="paper",
            yref="paper",
            text="Performance Metric: RMSE",
            showarrow=False,
            font=dict(size=12),
        )

        # Add a text box with the optimal training set size
        fig.add_annotation(
            x=self.optimal_size,
            y=0.05,
            xref="x",
            yref="paper",
            text=f"Optimal Size: {self.optimal_size}({self.optimal_size_pct}%)",
            showarrow=True,
            arrowhead=1,
            arrowcolor="red",
            font=dict(size=12),
        )

        return fig

In [284]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import numpy as np
import pandas as pd
import plotly.graph_objects as go


class ModelEvaluator(LearningCurve):
    def __init__(self, model, data_preparer) -> None:
        self.model = model
        self.data_preparer = data_preparer
        self.mae_scores = None
        self.rmse_scores = None
        self.predictions = None
        self.true_values = None

    def split_time_series_data(
        self, n_splits, train_size=None, test_size=1, rolling_origin=True
    ):
        if train_size is not None and train_size <= 0:
            raise ValueError("train_size must be a positive integer")
        if test_size <= 0:
            raise ValueError("test_size must be a positive integer")

        X, y = self.data_preparer.X_train, self.data_preparer.y_train

        self.cv = TimeSeriesSplit(n_splits=n_splits)
        folds = self.cv .split(X)

        for train_index, test_index in folds:
            if rolling_origin:
                train_end = (
                    min(train_index[-1] + 1, len(X))
                    if train_size is None
                    else min(train_index[0] + train_size, len(X))
                )
            else:
                train_end = (
                    min(train_index[0] + train_size, len(X))
                    if train_size is not None
                    else train_index[-1] + 1
                )

            test_start = min(train_end, len(X))
            test_end = min(test_start + test_size, len(X))

            X_train, y_train = (
                X.iloc[train_index[0] : train_end],
                y.iloc[train_index[0] : train_end],
            )
            X_test, y_test = X.iloc[test_start:test_end], y.iloc[test_start:test_end]

            yield X_train, y_train, X_test, y_test

    def time_series_cv_tune(
        self, n_splits, train_size=None, test_size=1, rolling_origin=True
    ):
        self.mae_scores = []
        self.rmse_scores = []
        self.predictions = []
        self.true_values = []

        for X_train, y_train, X_test, y_test in self.split_time_series_data(
            n_splits, train_size, test_size, rolling_origin
        ):
            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)

            self.predictions.append(y_pred)
            self.true_values.append(y_test)

            mae = mean_absolute_error(y_test, y_pred)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            self.mae_scores.append(mae)
            self.rmse_scores.append(rmse)

    def perform_grid_search(self, param_grid=None, scoring="r2", n_splits=5):
        self.cv: int = n_splits
        if param_grid is None:
            param_grid = {
                # default parameter grid, can be updated based on the model
            }
        # Replace cv parameter with TimeSeriesSplit object
        tscv = TimeSeriesSplit(n_splits=n_splits)
        
        # Initialize the best_score to a very low value
        self.best_score = float('-inf')

        for train_index, test_index in tscv.split(self.data_preparer.X_train):
            self.X_train, self.X_test = self.data_preparer.X_train.iloc[train_index], self.data_preparer.X_train.iloc[test_index]
            self.y_train, self.y_test = self.data_preparer.y_train.iloc[train_index], self.data_preparer.y_train.iloc[test_index]
            
            grid_search = GridSearchCV(self.model, param_grid, cv=tscv, scoring=scoring)
            grid_search.fit(self.X_train, self.y_train)

            
            # Check if this is the best model so far
            if grid_search.best_score_ > self.best_score:
                self.model = grid_search.best_estimator_
                self.best_params = grid_search.best_params_
                self.best_score = grid_search.best_score_
                self.grid_search = grid_search  # Store the grid_search object for use in extract_features

        print(f"Best cross-validated score: {self.best_score:.2f}")


    def extract_features(self, threshold=0) -> DataFrame:
        if not hasattr(self, "grid_search"):
            raise ValueError("No grid search object found.")
        filtered_coef = None
        if hasattr(self.model, "coef_"):
            # Extract feature coefficients above the threshold
            self.coef_ = self.model.coef_
            coef = pd.DataFrame(
                self.model.coef_.T,
                index=self.data_preparer.X_train.columns,
                columns=["Coefficients"],
            )
            filtered_coef = coef[abs(coef["Coefficients"]) >= threshold]
        elif hasattr(self.model, "feature_importances_"):
            # Extract feature importances above the threshold
            self.feature_importances = self.model.feature_importances_
            importances = pd.DataFrame(
                self.model.feature_importances_,
                index=self.data_preparer.X_train.columns,
                columns=["Importances"],
            )
            filtered_coef = importances[importances["Importances"] >= threshold]
        else:
            raise ValueError(
                "The specified model does not have feature coefficients or importances."
            )

        return filtered_coef

    def make_predictions(self):
        self.predictions = pd.DataFrame(
            self.model.predict(self.data_preparer.X_test),
            index=self.data_preparer.y_test.index,
            columns=[self.data_preparer.target_col],
        )

    def plot(self) -> go.Figure:
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=self.predictions.index,
                y=self.predictions[self.data_preparer.target_col],
                mode="lines",
                name=f"Predictions",
            )
        )

        fig.add_trace(
            go.Scatter(
                x=self.data_preparer.X_train.index,
                y=self.data_preparer.y_train[self.data_preparer.target_col],
                mode="lines",
                name=f"Train Data",
            )
        )

        fig.add_trace(
            go.Scatter(
                x=self.data_preparer.X_test.index,
                y=self.data_preparer.y_test[self.data_preparer.target_col],
                mode="lines",
                name=f"Test Data",
            )
        )

        fig.update_layout(
            title=f"Train and Test Data",
            xaxis_title="Index",
            yaxis_title=self.data_preparer.target_col,
        )
        return fig

## Linear Models


We will directly use ElasticNet in this section in order to have all the features of LinearRegression and controlable complexity through regularization.

The grid of our choice is the following
```py
    param_grid = {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
        "max_iter": [1000, 2000, 5000],
        "tol": [1e-4, 1e-5, 1e-6],
        "selection": ["cyclic", "random"],
    }
```

- 'alpha': This parameter is a regularization term that is a linear combination of L1 and L2 penalties. Higher values of alpha result in stronger regularization, which may help prevent overfitting. A smaller value of alpha allows the model to be more flexible and capture more complex patterns in the data, but at the risk of overfitting. In this param_grid, we're trying out six different values of alpha: [0.001, 0.01, 0.1, 1, 10, 100].

- 'l1_ratio': This parameter is the mixing parameter between L1 (Lasso) and L2 (Ridge) penalties. It varies between 0 and 1. A value of 0 corresponds to the Ridge penalty, and a value of 1 corresponds to the Lasso penalty. We're trying out five different values of l1_ratio: [0.1, 0.3, 0.5, 0.7, 0.9].

- 'max_iter': This parameter is the maximum number of iterations for the optimization algorithm to converge. If the algorithm does not converge in the specified number of iterations, it will stop early. In this param_grid, we're trying out three different values of max_iter: [1000, 2000, 5000].

- 'tol': This parameter is the tolerance for the optimization algorithm. The algorithm will stop when the update is smaller than tol. Smaller values of tol will result in a more accurate solution, but may take more iterations to converge. In this param_grid, we're trying out three different values of tol: [1e-4, 1e-5, 1e-6].

- 'selection': This parameter determines the method used to update the coefficients during the optimization process. The two possible values are 'cyclic' and 'random'. In the 'cyclic' method, the algorithm iterates through each feature sequentially, while in the 'random' method, the algorithm selects a random feature at each iteration. The param_grid includes both options.


In [287]:
class ElasticNetRegressor(ModelEvaluator):
    def __init__(self, data_preparer) -> None:
        super().__init__(ElasticNet(), data_preparer)

    def evaluate(self):
        mae = mean_absolute_error(self.data_preparer.y_test, self.predictions)
        mse = mean_squared_error(self.data_preparer.y_test, self.predictions)

        print(
            f"""Mean Absolute Error (MAE): {mae:.2f}
        Mean Squared Error (MSE): {mse:.2f}
        Cross-validated R-squared scores: {self.cv_scores}
        Average cross-validated R-squared score: {np.mean(self.cv_scores):.2f}"""
        )
        return (mae, mse, self.cv_scores, np.mean(self.cv_scores))

    def get_summary_dict(self, threshold) -> dict:
        filtered_coef = self.extract_features(threshold)
        summary_dict = {
            "model": type(self.model).__name__,
            "target_variable": self.data_preparer.target_col,
            "num_input_features": self.data_preparer.X_train.shape[1],
            "test_size": self.data_preparer.testsize,
            "cv_folds": self.cv,
            "best_params": self.best_params,
            "best_cv_score": self.best_score,
            "optimal_training_size": self.optimal_size_pct,
            "cv_scores": self.cv_scores,
            "coefficients": self.coef_,
            # "filtered_coef": filtered_coef.sort_values(by="Coefficients").rename(
            #     {"Coefficients": ""}, axis=1
            # ),
            "MAE": mean_absolute_error(self.data_preparer.y_test, self.predictions),
            "MSE": mean_squared_error(self.data_preparer.y_test, self.predictions),
            "Average cross-validated R-squared score": np.mean(self.cv_scores),
        }
        return summary_dict

    def summarize(self, threshold=0) -> str:
        self.threshold = threshold
        summary_dict = self.get_summary_dict(threshold)
        return f"""Model summary:
    Model: {summary_dict['model']}
    Target variable: {summary_dict['target_variable']}
    Number of input features: {summary_dict['num_input_features']}
    Test Size: {summary_dict['test_size']}
    Cross-validation fold: {summary_dict['cv_folds']}
Evaluation:    
    Best params: {summary_dict['best_params']}
    Optimal test size: {round(1-summary_dict['optimal_training_size'],2)}
    Best cross-validated score: {summary_dict['best_cv_score']:.2f}
    Mean Absolute Error (MAE): {summary_dict['MAE']:.2f}
    Mean Squared Error (MSE): {summary_dict['MSE']:.2f}
    Cross-validated R-squared scores: {summary_dict['cv_scores']}
Filtered_coef>{self.threshold}: {summary_dict['filtered_coef']}"""

# Support Vector Machines

Support Vector Machines (SVMs) are a set of supervised learning methods that are often used for classification and regression tasks. They can also be used to predict time series data, and there are several reasons you might want to do so:

- `Efficiency with High Dimensions:` Time series data can often become high dimensional, especially when you start to incorporate lagged features or moving window statistics. SVMs, particularly those with a radial basis function (RBF) kernel, can handle high-dimensional data efficiently, as they are based on the structural risk minimization principle, which minimizes an upper bound of the generalization error.

- `Handling of Non-linearity:` Time series data can have complex non-linear patterns, which linear models might struggle to capture. SVMs, especially when using non-linear kernels like the RBF, can capture these non-linear patterns effectively.

- `Sparsity of Solution:` In SVMs, only the support vectors are used to specify the separating hyperplane and in the construction of the prediction function. This leads to a sparse and computationally efficient representation.

- `Robustness:` SVMs are quite robust to overfitting, especially in high-dimensional spaces. This makes them very useful when we want to avoid overfitting in time series prediction.

- `Outliers Handling:` SVMs have good robustness and generalization performance in the presence of noise, which is a common problem in time series prediction.

- `Versatility:` SVMs can be modified and extended in numerous ways (like the use of custom kernels) to suit the specific task at hand.

However, there are also some potential drawbacks to consider. SVMs can be more computationally intensive to train than some other algorithms, especially on large datasets. They also rely heavily on the selection of an appropriate kernel and tuning of the hyperparameters. It is also more challenging to interpret the model predictions in comparison to some simpler models, which could be a disadvantage in situations where interpretability is important.

In [102]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import numpy as np


class SVMRegressor(ModelEvaluator):
    def __init__(self, data_preparer) -> None:
        super().__init__(SVR(), data_preparer)

    def evaluate(self):
        mae = mean_absolute_error(self.data_preparer.y_test, self.predictions)
        mse = mean_squared_error(self.data_preparer.y_test, self.predictions)

        print(
            f"""Mean Absolute Error (MAE): {mae:.2f}
        Mean Squared Error (MSE): {mse:.2f}
        Cross-validated R-squared scores: {self.cv_scores}
        Average cross-validated R-squared score: {np.mean(self.cv_scores):.2f}"""
        )
        return (mae, mse, self.cv_scores, np.mean(self.cv_scores))

    def get_summary_dict(self, threshold) -> dict:
        summary_dict = {
            "model": type(self.model).__name__,
            "target_variable": self.data_preparer.target_col,
            "num_input_features": self.data_preparer.X_train.shape[1],
            "test_size": self.data_preparer.testsize,
            "cv_folds": self.cv,
            "best_params": self.best_params,
            "best_cv_score": self.best_score,
            "optimal_training_size": self.optimal_size_pct,
            "cv_scores": self.cv_scores,
            "MAE": mean_absolute_error(self.data_preparer.y_test, self.predictions),
            "MSE": mean_squared_error(self.data_preparer.y_test, self.predictions),
            "Average cross-validated R-squared score": np.mean(self.cv_scores),
        }
        return summary_dict

    def summarize(self, threshold=0) -> str:
        self.threshold = threshold
        summary_dict = self.get_summary_dict(threshold)
        return f"""Model summary:
    Model: {summary_dict['model']}
    Target variable: {summary_dict['target_variable']}
    Number of input features: {summary_dict['num_input_features']}
    Test Size: {summary_dict['test_size']}
    Cross-validation fold: {summary_dict['cv_folds']}
Evaluation:    
    Best params: {summary_dict['best_params']}
    Optimal test size: {round(1-summary_dict['optimal_training_size'],2)}
    Best cross-validated score: {summary_dict['best_cv_score']:.2f}
    Mean Absolute Error (MAE): {summary_dict['MAE']:.2f}
    Mean Squared Error (MSE): {summary_dict['MSE']:.2f}
    Cross-validated R-squared scores: {summary_dict['cv_scores']}"""


# Random Forests

There are several reasons why one might consider using Random Forests or other tree/forest based models for time series prediction:

- `Feature Interactions:` Tree-based models naturally capture interactions between different predictors or features. In time series data, it's often the case that the effect of one predictor on the target variable depends on the levels of other predictors. Tree-based models can handle this kind of complex interaction.

- `Non-linearity:` Tree-based models, including Random Forests, are capable of modeling non-linear relationships between features and target variable, which can be particularly useful in time-series data where trends may not always be linear.

- `Robustness to Outliers:` Tree-based models are generally robust to outliers in the data. This can be particularly useful in time-series data, which can often contain outliers.

- `Variable Importance:` Tree-based models, like Random Forests, provide measures of variable importance, which can provide insights into which features are driving the predictions. This can be useful for understanding the underlying factors influencing a time series.

- `Handling of Missing Values:` Random Forests can handle missing values, and thus may require less preprocessing of the data.

Tree-based models may not perform well when the time series data has strong temporal dependencies, as these models do not inherently take into account the temporal ordering of observations. In such cases, other methods like ARIMA, State Space Models, or Recurrent Neural Networks might be more appropriate.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

In [115]:
class RandomForestRegressoR(ModelEvaluator):
    def __init__(self, data_preparer) -> None:
        super().__init__(RandomForestRegressor(), data_preparer)

    def evaluate(self):
        mae = mean_absolute_error(self.data_preparer.y_test, self.predictions)
        mse = mean_squared_error(self.data_preparer.y_test, self.predictions)

        print(
            f"""Mean Absolute Error (MAE): {mae:.2f}
            Mean Squared Error (MSE): {mse:.2f}
            Cross-validated R-squared scores: {self.cv_scores}
            Average cross-validated R-squared score: {np.mean(self.cv_scores):.2f}"""
        )
        return (mae, mse, self.cv_scores, np.mean(self.cv_scores))

    def get_summary_dict(self, threshold) -> dict:
        summary_dict = {
            "model": type(self.model).__name__,
            "target_variable": self.data_preparer.target_col,
            "num_input_features": self.data_preparer.X_train.shape[1],
            "test_size": self.data_preparer.testsize,
            "cv_folds": self.cv,
            "best_params": self.best_params,
            "best_cv_score": self.best_score,
            "optimal_training_size": self.optimal_size_pct,
            "cv_scores": self.cv_scores,
            "feature_importances": self.model.feature_importances_,
            "MAE": mean_absolute_error(self.data_preparer.y_test, self.predictions),
            "MSE": mean_squared_error(self.data_preparer.y_test, self.predictions),
            "Average cross-validated R-squared score": np.mean(self.cv_scores),
        }
        return summary_dict

    def summarize(self, threshold=0) -> str:
        self.threshold = threshold
        summary_dict = self.get_summary_dict(threshold)
        return f"""Model summary:
        Model: {summary_dict['model']}
        Target variable: {summary_dict['target_variable']}
        Number of input features: {summary_dict['num_input_features']}
        Test Size: {summary_dict['test_size']}
        Cross-validation fold: {summary_dict['cv_folds']}
    Evaluation:    
        Best params: {summary_dict['best_params']}
        Optimal test size: {round(1-summary_dict['optimal_training_size'],2)}
        Best cross-validated score: {summary_dict['best_cv_score']:.2f}
        Mean Absolute Error (MAE): {summary_dict['MAE']:.2f}
        Mean Squared Error (MSE): {summary_dict['MSE']:.2f}
        Cross-validated R-squared scores: {summary_dict['cv_scores']}
    Feature importances: {summary_dict['feature_importances']}"""


In [110]:
param_grid_general = {
    'n_estimators': [10, 50, 100, 200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

param_grid_basic = {
    'n_estimators': [100, 200, 500],
    'max_features': ['auto', 'sqrt'],
    'max_depth': [None, 10, 30],
    'min_samples_split': [2, 5],
    'bootstrap': [True]
}

param_grid_light = {
    'n_estimators': [100, 200],
    'max_features': ['auto'],
    'max_depth': [None, 10],
    'min_samples_split': [2],
    'bootstrap': [True]
}

In [116]:
# Initialize the StockPricePredictor class with the prepared data and a RandomForest model
regressor = RandomForestRegressoR(dp_TSLA)

# Perform tuning and evaluation with 5 folds
cv_scores = regressor.gridcv_tune(param_grid_light, cv=5)
regressor.calculate_learning_curve()
regressor.calculate_optimal_size()
# Evaluate the performance of the model
print(regressor.summarize(threshold=0.15))

Best cross-validated score: 0.47
[learning_curve] Training set sizes: [ 253  506  759 1012 1265 1518 1771 2024 2277 2531]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Model summary:
        Model: RandomForestRegressor
        Target variable: Close
        Number of input features: 52
        Test Size: 0.2
        Cross-validation fold: 5
    Evaluation:    
        Best params: {'bootstrap': True, 'max_depth': None, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}
        Optimal test size: 0.2
        Best cross-validated score: 0.47
        Mean Absolute Error (MAE): 84.40
        Mean Squared Error (MSE): 10438.74
        Cross-validated R-squared scores: [-96.36506552   0.97810894   0.99985977   0.96082605  -1.84399071]
    Feature importances: [1.12606940e-02 2.31274051e-02 4.36840690e-02 9.44141946e-02
 1.70754181e-06 6.24138070e-06 3.10101121e-05 2.36205468e-06
 2.03838576e-08 3.92301083e-02 3.30006856e-02 8.04438871e-03
 2.38565986e-05 8.09868086e-03 7.71969643e-02 2.89823572e-02
 1.12643447e-04 5.13821487e-02 6.73663800e-02 5.93720553e-02
 7.19952999e-05 7.16186334e-02 2.37820786e-05 1.66365005e-04
 5.36521456e-06 1.7

[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   57.2s finished


In [117]:
# regressor.calculate_learning_curve()
# regressor.calculate_optimal_size()
regressor.plot_learning_curve()

In [118]:
regressor.plot()

# Gradient Boosting

Gradient Boosting models can be effectively used for time series prediction for several reasons:

- `Handling of Non-linear Relationships:` Gradient Boosting models are able to capture non-linear relationships and complex interactions between features, which might be present in time series data. They can model non-linear trends and seasonal patterns that could be missed by simpler models.

- `Robust to Outliers:` Gradient Boosting models are robust to outliers in the data. This is especially important in time series forecasting, where outliers could be caused by unusual events, such as a surge in sales during a holiday period.

- `Regularization:` The Gradient Boosting algorithm includes parameters for regularization, which can prevent overfitting to the training data. Overfitting is a common problem in time series prediction, where models that fit too closely to the past data may not perform well on future data.

- `Feature Importance:` Gradient Boosting models provide measures of feature importance, which can be useful in understanding the key drivers of the time series patterns.

- `Flexibility:` Gradient Boosting can be applied to both univariate and multivariate time series data. It can also handle missing values and does not require the data to be stationary (though stationarity can still help model performance).

However, it's important to note that Gradient Boosting models do not naturally account for temporal dependencies like autoregressive models (AR, ARIMA, SARIMA etc.) do. Therefore, in time series prediction, it can be necessary to manually create lagged features in order to provide the model with information about temporal dependencies.

Also, Gradient Boosting models can be more computationally expensive and harder to tune than simpler models, so they may not always be the best choice, especially for very large datasets or when computational resources or time are limited.

In [124]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error

class GradientBoostingRegressorModel(ModelEvaluator):
    def __init__(self, data_preparer) -> None:
        super().__init__(GradientBoostingRegressor(), data_preparer)

    def extract_features(self, threshold):
        importances = self.model.feature_importances_
        coef_df = pd.DataFrame(
            importances, index=self.data_preparer.X_train.columns, columns=["Importances"]
        ).sort_values(by="Importances", ascending=False)
        return coef_df[coef_df.Importances.abs() > threshold]

    def evaluate(self):
        mae = mean_absolute_error(self.data_preparer.y_test, self.predictions)
        mse = mean_squared_error(self.data_preparer.y_test, self.predictions)

        print(
            f"""Mean Absolute Error (MAE): {mae:.2f}
        Mean Squared Error (MSE): {mse:.2f}
        Cross-validated R-squared scores: {self.cv_scores}
        Average cross-validated R-squared score: {np.mean(self.cv_scores):.2f}"""
        )
        return (mae, mse, self.cv_scores, np.mean(self.cv_scores))

    def get_summary_dict(self, threshold) -> dict:
        filtered_feature_importances = self.extract_features(threshold)
        summary_dict = {
            "model": type(self.model).__name__,
            "target_variable": self.data_preparer.target_col,
            "num_input_features": self.data_preparer.X_train.shape[1],
            "test_size": self.data_preparer.testsize,
            "cv_folds": self.cv,
            "best_params": self.best_params,
            "best_cv_score": self.best_score,
            "optimal_training_size": self.optimal_size_pct,
            "cv_scores": self.cv_scores,
            "feature_importances": self.model.feature_importances_,
            "filtered_feature_importances": filtered_feature_importances.sort_values(by="Importances").rename(
                {"Importances": ""}, axis=1
            ),
            "MAE": mean_absolute_error(self.data_preparer.y_test, self.predictions),
            "MSE": mean_squared_error(self.data_preparer.y_test, self.predictions),
            "Average cross-validated R-squared score": np.mean(self.cv_scores),
        }
        return summary_dict

    def summarize(self, threshold=0) -> str:
        self.threshold = threshold
        summary_dict = self.get_summary_dict(threshold)
        return f"""Model summary:
    Model: {summary_dict['model']}
    Target variable: {summary_dict['target_variable']}
    Number of input features: {summary_dict['num_input_features']}
    Test Size: {summary_dict['test_size']}
    Cross-validation fold: {summary_dict['cv_folds']}
Evaluation:    
    Best params: {summary_dict['best_params']}
    Optimal test size: {round(1-summary_dict['optimal_training_size'],2)}
    Best cross-validated score: {summary_dict['best_cv_score']:.2f}
    Mean Absolute Error (MAE): {summary_dict['MAE']:.2f}
    Mean Squared Error (MSE): {summary_dict['MSE']:.2f}
    Cross-validated R-squared scores: {summary_dict['cv_scores']}
Filtered_feature_importances>{self.threshold}: {summary_dict['filtered_feature_importances']}"""

In [125]:
param_grid_general = {
    "n_estimators": [100, 200, 500],
    "learning_rate": [0.01, 0.1, 1],
    "subsample": [0.5, 0.8, 1.0],
    "max_depth": [3, 5, 8],
    "max_features": [None, "sqrt", "log2"],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 5],
}

param_grid_basic = {
    "n_estimators": [100, 200],
    "learning_rate": [0.01, 0.1],
    "subsample": [0.5, 1.0],
    "max_depth": [3, 5],
}

param_grid_light = {
    "n_estimators": [100],
    "learning_rate": [0.1],
    "subsample": [1.0],
    "max_depth": [3],
}

In [127]:
# Initialize the StockPricePredictor class with the prepared data and a Ridge model
regressor = GradientBoostingRegressorModel(dp_TSLA)


# Perform tuning and evaluation with 5 folds6
cv_scores = regressor.gridcv_tune(param_grid_light, cv=5)
regressor.calculate_learning_curve()
regressor.calculate_optimal_size()
# Evaluate the performance of the model
print(regressor.summarize(threshold=0.15))

Best cross-validated score: 0.11
[learning_curve] Training set sizes: [ 253  506  759 1012 1265 1518 1771 2024 2277 2531]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Model summary:
    Model: GradientBoostingRegressor
    Target variable: Close
    Number of input features: 52
    Test Size: 0.2
    Cross-validation fold: 5
Evaluation:    
    Best params: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
    Optimal test size: 0.2
    Best cross-validated score: 0.11
    Mean Absolute Error (MAE): 80.94
    Mean Squared Error (MSE): 9846.44
    Cross-validated R-squared scores: [-98.04229294   0.93629265   0.99569303   0.95808183  -1.70171269]
Filtered_feature_importances>0.15:                
LOG10  0.224982


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   28.0s finished


In [128]:
# regressor.calculate_learning_curve()
# regressor.calculate_optimal_size()
regressor.plot_learning_curve()

In [129]:
regressor.plot()

# Testing Predictors

In [228]:
MSFT = Stock('MSFT')
TSLA = Stock('TSLA')
AAPL = Stock('AAPL')
GOOG = Stock('GOOG')

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [229]:
dp_MSFT = DataPreparer(MSFT.data, 'Close')
dp_TSLA = DataPreparer(TSLA.data, 'Close')
dp_AAPL = DataPreparer(AAPL.data, 'Close')
dp_GOOG = DataPreparer(GOOG.data, 'Close')

In [230]:
# check for missing values
for dp_ in [dp_MSFT, dp_TSLA, dp_AAPL, dp_GOOG]:
    if any(dp_.X.isna().sum()):
        dp_.X.shape

## ElasticNet

In [288]:
# Initialize the StockPricePredictor class with the prepared data and a Ridge model
regressor = ElasticNetRegressor(dp_TSLA)

# Define the range of regularization parameters to be tested
param_grid = (
    {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9, 1],
        # "max_iter": [1000, 2000, 5000],
        # "tol": [1e-4, 1e-5, 1e-6],
        # "selection": ["cyclic", "random"],
    },
)

# Perform tuning and evaluation with 5 folds6
regressor.time_series_cv_tune(5)
regressor.calculate_learning_curve()
regressor.calculate_optimal_size()
# Evaluate the performance of the model
print(regressor.summarize(threshold=0.15))

[learning_curve] Training set sizes: [ 52 105 158 211 264 317 370 423 476 529]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.3s finished


ValueError: No grid search object found.

In [232]:
# regressor.calculate_learning_curve()
# regressor.calculate_optimal_size()
regressor.plot_learning_curve()

In [233]:
regressor.plot()

## SVMs

In [237]:
param_grid_general = {
    'C': [0.1, 1, 10, 100, 1000, 10000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 'scale', 'auto'],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'degree': [2, 3, 4, 5],  # only applicable for 'poly' kernel
    'coef0': [0.0, 0.1, 0.5, 1.0],  # applicable for 'poly' and 'sigmoid'
    'shrinking': [True, False]
}

param_grid_basic = {
    'C': [0.1, 1, 10, 100],
    'gamma': [0.1, 0.01, 'scale'],
    'kernel': ['linear', 'rbf']
}

param_grid_light = {
    'C': [0.1, 1],
    'gamma': ['scale'],
    'kernel': ['rbf']
}



In [238]:
# Initialize the SVMRegressor class with the prepared data
regressor = SVMRegressor(dp_TSLA)


# Perform tuning and evaluation with 5 folds
cv_scores = regressor.gridcv_tune(param_grid_light, cv=5)
regressor.calculate_learning_curve()
regressor.calculate_optimal_size()

# Evaluate the performance of the model
print(regressor.summarize(threshold=0.15))

Best cross-validated score: -283.63
[learning_curve] Training set sizes: [ 253  506  759 1012 1265 1518 1771 2024 2277 2531]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Model summary:
    Model: SVR
    Target variable: Close
    Number of input features: 54
    Test Size: 0.2
    Cross-validation fold: 5
Evaluation:    
    Best params: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
    Optimal test size: 0.2
    Best cross-validated score: -283.63
    Mean Absolute Error (MAE): 217.28
    Mean Squared Error (MSE): 51325.51
    Cross-validated R-squared scores: [-1.11268996e+04 -2.22850710e+01 -7.58712997e-01  8.53812586e-01
 -1.30419783e+01]


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   16.1s finished


In [None]:
regressor.plot_learning_curve()

In [None]:
regressor.plot()