# Imports


In [1]:
import inspect
import json
from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
from warnings import filterwarnings
from zoneinfo import ZoneInfo

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.graph_objs as go
import plotly.io as pio
import plotly.offline as pyo
import scipy as sc
import seaborn as sns
import statsmodels.api as sm
import talib
import yfinance as yf
from IPython.core.interactiveshell import InteractiveShell
from pandas import DataFrame, Series
from pandas.core.frame import DataFrame
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots
from scipy import stats
from scipy.stats import chi2_contingency, kendalltau, spearmanr
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import VarianceThreshold, chi2, mutual_info_regression
from sklearn.inspection import permutation_importance
from sklearn.linear_model import ElasticNet, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import (
    GridSearchCV,
    TimeSeriesSplit,
    cross_val_score,
    learning_curve,
    train_test_split,
)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.stattools import durbin_watson
from yfinance.ticker import Ticker


# Notebook-wide display configuration. These statements mutate global state of
# the warnings, matplotlib, seaborn, plotly and IPython modules.

# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings.
filterwarnings("ignore")
# seaborn/matplotlib theme; statements are order-sensitive because later calls
# override settings made by earlier ones.
sns.set(style="darkgrid")
plt.style.use("dark_background")
plt.rcParams.update({"grid.linewidth": 0.5, "grid.alpha": 0.5})
plt.rc("figure", figsize=(16, 10))
plt.rc("lines", markersize=4)
plt.rcParams["figure.autolayout"] = True
sns.set_context("poster")
# Plotly: enable offline notebook rendering and use the dark template by default.
init_notebook_mode(connected=True)
pio.templates.default = "plotly_dark"
# Display the value of every expression statement in a cell, not only the last.
InteractiveShell.ast_node_interactivity = "all"


A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2



# Data Collection

In [2]:
class Stock:
    """
    Retrieve historical price data and metadata for one ticker via yfinance.

    Args:
    ----------
        symbol (str): The stock symbol to retrieve data for.

        start_date (str, optional): The start date of the date range to retrieve data for, in YYYY-MM-DD format. If not specified, no lower bound is passed to yfinance.

        end_date (str, optional): The end date of the date range to retrieve data for, in YYYY-MM-DD format. If not specified, no upper bound is passed to yfinance.

        interval (str, optional): The time interval between data points. Valid values are '1d' (daily), '1wk' (weekly), '1mo' (monthly), '1m' (1-minute), '2m' (2-minute), '5m' (5-minute), '15m' (15-minute), '30m' (30-minute), '60m' (60-minute), '90m' (90-minute), and '1h' (hourly). Default is '1d' for daily data.

    Attributes:
    ----------
        symbol (str): The stock symbol being retrieved.

        start_date (str | None): The requested start date, or, when none was requested, the string form of the first index label of the downloaded data. Stays None when nothing was downloaded.

        end_date (str | None): The requested end date, or, when none was requested, the string form of the last index label of the downloaded data. Stays None when nothing was downloaded.

        interval (str): The time interval between data points.

        data (pandas.DataFrame): A DataFrame containing the retrieved stock data.

        ticker (yfinance.ticker.Ticker): A Ticker object for the given stock symbol.

        info (dict): The ``info`` mapping exposed by the Ticker object.

    Methods:
    ----------
        startup(symbol: str, start_date: str, end_date: str, interval: str) -> None:
            Downloads the data and populates every attribute.

    Usage:
    ----------
        # Instantiate a Stock object
        >>> stock = Stock(symbol="AAPL", start_date="2020-01-01", end_date="2022-04-17", interval="1d")

        # Access stock data
        >>> stock.data

        # Access stock information
        >>> stock.info
    """

    def __init__(self, symbol, start_date=None, end_date=None, interval="1d") -> None:
        self.startup(symbol, start_date, end_date, interval)

    def startup(self, symbol, start_date, end_date, interval) -> None:
        """
        Download the price history from Yahoo Finance and populate the instance.

        Args:
            symbol (str): The stock symbol to retrieve data for.

            start_date (str | None): The start date of the date range, in YYYY-MM-DD format.

            end_date (str | None): The end date of the date range, in YYYY-MM-DD format.

            interval (str): The time interval between data points (see the class docstring for valid values).
        """
        self.symbol: str = symbol
        self.start_date: Optional[str] = start_date
        self.end_date: Optional[str] = end_date
        self.interval: str = interval
        self.data: DataFrame = yf.download(
            symbol, start=start_date, end=end_date, interval=interval
        )
        # Back-fill each missing bound independently from the data actually
        # received; an empty download has no bounds to report.
        if not self.data.empty:
            if self.start_date is None:
                self.start_date = str(self.data.index[0])
            if self.end_date is None:
                self.end_date = str(self.data.index[-1])
        self.ticker: Ticker = yf.Ticker(symbol)
        self.info: dict[Any, Any] = self.ticker.info

    def __repr__(self) -> str:
        """
        Return a one-line summary: class name, symbol, date range and interval.
        """
        return (
            f"{type(self).__name__}(symbol={self.symbol}, "
            f"start_date={self.start_date}, end_date={self.end_date}, "
            f"interval={self.interval})"
        )

    def __str__(self) -> str:
        """
        Return the string rendering of the downloaded DataFrame.
        """
        return str(self.data)

# Data Preparation


## Generating all input variables $X_i$


We create the class InputVariables in order to generate our input variables.


In [45]:
class InputVariables:
    """
    Derive candidate input features from a single price column.

    The instance works on a private copy of the frame it is given, so the
    caller's DataFrame is never modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Source frame; must contain the column named by ``col``.
    col : str
        Column the log returns and the indicators are computed from.

    Attributes
    ----------
    data : pandas.DataFrame
        Working copy, extended in place by the methods below.
    col : str
        Name of the source column.
    indicators_added : list[str]
        TA-Lib function names that produced a column (set by ``add_Indicators``).
    indicators_failed : list[str]
        TA-Lib function names that raised when called with one series
        (set by ``add_Indicators``).
    """

    def __init__(self, data: DataFrame, col="Close") -> None:
        self.data: DataFrame = data.copy()
        self.col: str = col

    def calculate_log_returns(self) -> None:
        """Add a ``log_returns`` column: log(col[t]) - log(col[t-1])."""
        prices = self.data[self.col]
        self.data["log_returns"] = np.log(prices) - np.log(prices.shift(1))

    def add_Indicators(self) -> None:
        """
        Append one column per TA-Lib function that accepts a single series.

        Every name returned by ``talib.get_functions()`` is called with the
        source column only. Calls that raise, or whose result has no
        ``rename`` method, are skipped and recorded in
        ``self.indicators_failed``; successful names go to
        ``self.indicators_added``.
        """
        prices = self.data.loc[:, self.col]
        columns = []
        succeeded = []
        failed = []
        for func in talib.get_functions():
            try:
                columns.append(getattr(talib, func)(prices).rename(func))
            except Exception:
                # Best effort by design, but never swallow KeyboardInterrupt
                # or SystemExit the way a bare ``except:`` would.
                failed.append(func)
            else:
                succeeded.append(func)
        self.indicators_added = succeeded
        self.indicators_failed = failed
        # Single concat instead of one per indicator: avoids re-copying the
        # growing frame on every iteration.
        if columns:
            self.data = pd.concat([self.data, *columns], axis=1)

    @staticmethod
    def clean_data(df: DataFrame, threshold=0.05) -> DataFrame:
        """
        Return ``df`` without the columns whose share of missing values
        exceeds ``threshold`` (a fraction between 0 and 1).
        """
        missing_fraction = df.isna().sum() / len(df)
        too_sparse = missing_fraction > threshold
        return df.loc[:, ~too_sparse]

    def generate_all(self, threshold=0.05) -> DataFrame:
        """
        Add log returns and indicators, drop sparse columns, and return the
        resulting frame (also stored in ``self.data``).
        """
        self.calculate_log_returns()
        self.add_Indicators()
        self.data = self.clean_data(self.data, threshold)
        return self.data

In [30]:
stock = Stock("MSFT")

[*********************100%***********************]  1 of 1 completed


In [31]:
# Usage example: derive log returns and the single-series TA-Lib indicators from
# the downloaded frame, then inspect the shape of the resulting feature table.
tv = InputVariables(stock.data)
all_representations: DataFrame = tv.generate_all()
all_representations.shape

(9379, 56)

## Preparing data


We create the class `DataPreparer` to generate the features, optionally scale them, split them into train and test sets, and plot the split.


In [58]:
class DataPreparer(InputVariables):
    """
    Build the feature matrix and target from a price frame and split them.

    On construction the class generates all input variables (see
    ``InputVariables.generate_all``), drops rows with missing values,
    optionally standardizes the features, and performs a train/test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw price data; must contain ``target_col``.
    target_col : str
        Column to predict. It is also the column the features are derived from.
    testsize : float
        Fraction of rows assigned to the test set.
    random_state : int
        Seed forwarded to ``train_test_split``.
    shuffle : bool
        Whether rows are shuffled before splitting.
    features : list, optional
        Subset of feature columns to keep; all non-target columns when falsy.
    scale : bool
        Whether features are standardized with ``StandardScaler``.

    Attributes
    ----------
    X, y : pandas.DataFrame
        Full feature matrix and one-column target frame.
    X_train, X_test, y_train, y_test : pandas.DataFrame
        The split, each sorted by index.
    """

    def __init__(
        self,
        df: pd.DataFrame,
        target_col: str,
        testsize: float = 0.2,
        random_state: int = 101,
        shuffle=False,
        features: Optional[list] = None,
        scale: bool = True,
    ) -> None:
        super().__init__(df, target_col)
        self.target_col: str = target_col
        self.testsize: float = testsize
        self.random_state: int = random_state
        self.shuffle: bool = shuffle
        self.scale: bool = scale
        self.startup(features)

    def startup(self, features) -> None:
        """Generate features, build ``X``/``y`` and run the train/test split."""
        self.data = self.generate_all().dropna()
        X = self.data.drop(columns=[self.target_col])
        if features:
            X = X.loc[:, features]
        if self.scale:
            # NOTE(review): the scaler is fitted on all rows before the split,
            # so test-set statistics influence the scaled training features.
            X = self.clean_data(self.scale_data(X))
        self.X: DataFrame = X
        self.y: DataFrame = self.data[[self.target_col]]
        self.train_test_split()

    def scale_data(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` standardized column-wise, keeping index and column labels."""
        scaler = StandardScaler()
        scaled_X = pd.DataFrame(
            scaler.fit_transform(X), index=X.index, columns=X.columns
        )
        return scaled_X

    def train_test_split(self) -> None:
        """Split ``X``/``y`` and store the four index-sorted parts on the instance."""
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.testsize,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        self.X_train = X_train.sort_index()
        self.X_test = X_test.sort_index()
        self.y_train = y_train.sort_index()
        self.y_test = y_test.sort_index()

    def plot(self) -> go.Figure:
        """Return a plotly figure with the train and test targets as two lines."""
        fig = go.Figure()

        # ``testsize`` is a fraction; the ``%`` format spec renders it as a
        # percentage (0.1 -> "10%") instead of printing "0.1%".
        fig.add_trace(
            go.Scatter(
                x=self.X_train.index,
                y=self.y_train[self.target_col],
                mode="lines",
                name=f"Train Data {1 - self.testsize:.0%}",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=self.X_test.index,
                y=self.y_test[self.target_col],
                mode="lines",
                name=f"Test Data {self.testsize:.0%}",
            )
        )

        fig.update_layout(
            title="Train and Test Data",
            xaxis_title="Date",
            yaxis_title=self.target_col,
        )
        return fig

In [39]:
stock = Stock('TSLA')

# Name of the column to predict. It must already exist in stock.data, because
# DataPreparer also derives the log returns and indicators from this column.
target_column = "Close"

# Build the DataPreparer: generates features, scales them and holds out 10% as test data
dp = DataPreparer(stock.data, target_column, testsize=0.1)

# Plot the train and test portions of the target
dp.plot()

[*********************100%***********************]  1 of 1 completed


In [48]:
dp.X_train.shape[0]
dp.X_test.shape[0]

8352

928

In [49]:
dp.X_test.head()

Unnamed: 0,Open,High,Low,Adj Close,Volume,log_returns,HT_DCPERIOD,HT_DCPHASE,HT_TRENDMODE,MAX,MAXINDEX,MIN,MININDEX,SUM,ATAN,CEIL,COS,COSH,EXP,FLOOR,LN,LOG10,SIN,SINH,SQRT,TAN,TANH,APO,CMO,MOM,PPO,ROC,ROCP,ROCR,ROCR100,RSI,TRIX,DEMA,EMA,HT_TRENDLINE,KAMA,MA,MIDPOINT,SMA,T3,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
2019-09-06 00:00:00,1.302977,1.284275,1.298722,1.295921,-1.012865,-0.358603,-1.597019,-0.173889,-1.997982,1.228371,1.379509,1.335865,1.383589,1.280606,0.548931,1.294865,0.81923,-0.022544,-0.022544,1.29468,1.166942,1.166942,0.955959,-0.022544,1.502272,0.019598,0.370826,0.115696,0.091621,0.223963,-0.084185,-0.017073,-0.017073,-0.017073,-0.017073,0.091621,0.020794,1.274029,1.278978,1.277624,1.262517,1.280606,1.267226,1.280606,1.2747,1.262346,1.272572,1.272647,1.271856,0.272178,1.270469,0.17529,0.5971,1.271673,0.001964
2019-09-09 00:00:00,1.296713,1.278217,1.273678,1.274244,-0.878202,-0.574492,-1.670141,0.077238,-1.997982,1.224159,1.379882,1.335865,1.383589,1.278775,0.548711,1.266402,0.97306,-0.022544,-0.022544,1.266217,1.160696,1.160696,-1.186122,-0.022544,1.485909,0.002357,0.370826,0.075475,-0.227888,0.860694,-0.104323,0.324146,0.324146,0.324146,0.324146,-0.227888,0.008368,1.27411,1.279496,1.279945,1.262682,1.278775,1.267226,1.278775,1.279523,1.2627,1.272807,1.273064,1.275332,0.442834,1.265135,0.292702,0.614185,1.275819,0.009091
2019-09-10 00:00:00,1.256996,1.237922,1.245612,1.254487,-0.793055,-0.532537,-1.725096,0.399451,-1.997982,1.214922,1.380256,1.335865,1.383589,1.276401,0.548506,1.25217,-0.797667,-0.022544,-0.022544,1.251985,1.154941,1.154941,-1.465904,-0.022544,1.470913,0.022495,0.370826,0.023403,-0.501632,0.067612,-0.130402,-0.095739,-0.095739,-0.095739,-0.095739,-0.501632,-0.005092,1.271595,1.278645,1.279835,1.262605,1.276401,1.267226,1.276401,1.278401,1.259282,1.273214,1.272266,1.271563,0.34298,1.266518,0.223835,0.604398,1.271659,0.004996
2019-09-11 00:00:00,1.244326,1.229187,1.25396,1.255035,-0.90671,-0.026627,-1.780707,0.889231,-1.997982,1.210847,1.389961,1.335865,1.383589,1.274373,0.548512,1.25217,-0.751698,-0.022544,-0.022544,1.251985,1.155102,1.155102,-1.497998,-0.022544,1.471331,0.023694,0.370826,0.059866,-0.493268,0.010959,-0.112217,-0.125292,-0.125292,-0.125292,-0.125292,-0.493268,-0.019303,1.269364,1.277886,1.280077,1.262544,1.274373,1.267226,1.274373,1.273432,1.256387,1.27371,1.271658,1.271531,0.44716,1.261073,0.295698,0.783998,1.272039,0.085655
2019-09-12 00:00:00,1.271943,1.259479,1.279579,1.274244,-0.84457,0.438065,-1.844904,1.147644,-1.997982,1.210847,1.389961,1.335865,1.383589,1.274972,0.548711,1.266402,0.97306,-0.022544,-0.022544,1.266217,1.160696,1.160696,-1.186122,-0.022544,1.485909,0.002357,0.370826,0.089964,-0.199339,0.368983,-0.097226,0.060774,0.060774,0.060774,0.060774,-0.199339,-0.032734,1.269812,1.278475,1.281181,1.262635,1.274972,1.267226,1.274972,1.269376,1.257468,1.274316,1.272476,1.275653,0.61561,1.256379,0.413273,0.387013,1.276831,-0.077032


In [66]:
dp.X_train.tail()

Unnamed: 0,Open,High,Low,Adj Close,Volume,log_returns,HT_DCPERIOD,HT_DCPHASE,HT_TRENDMODE,MAX,...,TEMA,TRIMA,WMA,LINEARREG,LINEARREG_ANGLE,LINEARREG_INTERCEPT,LINEARREG_SLOPE,STDDEV,TSF,VAR
2019-08-23 00:00:00,137.190002,138.350006,132.800003,128.916718,38508600,-0.032381,14.076048,185.539095,0,141.339996,...,136.177,137.31475,136.915634,136.626283,1.50318,136.285144,0.026241,1.94015,136.652525,3.764182
2019-08-26 00:00:00,134.990005,135.559998,133.899994,130.907654,20312600,0.015325,14.042129,177.360054,1,141.339996,...,135.941445,137.18125,136.793139,136.115426,-3.473795,136.904572,-0.060704,1.909392,136.054722,3.645778
2019-08-27 00:00:00,136.389999,136.720001,134.660004,131.187943,23102100,0.002139,14.068112,178.993083,1,141.339996,...,135.78973,137.08,136.696774,135.741713,-7.026429,137.344,-0.123253,1.890511,135.61846,3.574033
2019-08-28 00:00:00,134.880005,135.759995,133.550003,131.013977,17393300,-0.001327,14.132536,231.876638,1,141.339996,...,135.626687,136.989875,136.591677,135.824284,-4.229676,136.785714,-0.073956,1.391325,135.750328,1.935784
2019-08-29 00:00:00,137.25,138.440002,136.910004,133.488144,20168700,0.018709,14.105157,266.793988,0,141.339996,...,135.950996,136.900666,136.653268,136.491713,1.38742,136.176855,0.02422,1.500977,136.515933,2.252932


In [67]:
dp.y_test.tail()

Unnamed: 0,Close
2023-05-02 00:00:00,305.410004
2023-05-03 00:00:00,304.399994
2023-05-04 00:00:00,305.410004
2023-05-05 00:00:00,310.649994
2023-05-08 00:00:00,308.649994


In [68]:
dp.y_train.tail()

Unnamed: 0,Close
2019-08-23 00:00:00,133.389999
2019-08-26 00:00:00,135.449997
2019-08-27 00:00:00,135.740005
2019-08-28 00:00:00,135.559998
2019-08-29 00:00:00,138.119995


# Feature Analysis


### Overview

We want to reduce the features such that:
1.  They are independent of the target variable.
2.  They are independent of each other.
3.  Our selection does not depend on any a priori properties between target and dependent variables.
4.  Our selection relies on the characteristics of the data itself.

To that end we will use **Filter Methods:** These techniques rely on the characteristics of the data itself, such as correlation, mutual information, or statistical tests, to rank features. Specifically we will use:
1. **Pearson Correlation Coefficient:** The threshold typically ranges between 0 and 1. A higher threshold value means that you'll keep only features that have a strong correlation with the target variable, while a lower value allows for weaker correlations. You might need to adjust the threshold based on the problem's requirements and the correlations between the features and the target variable.

2. **Mutual Information**: The threshold can range between 0 and a positive value, with higher values indicating stronger dependence between the feature and the target variable. The choice of threshold depends on the scale of mutual information values in your dataset and the desired level of feature reduction.

Overall, these feature selection methods are important in identifying and selecting the most relevant features for a given problem. By reducing the number of features used in an analysis, these methods can help to reduce overfitting and improve the accuracy and efficiency of machine learning models.

In [124]:
class FeatureAnalyzer:
    """
    A class for feature analysis and selection.

    Parameters:
    -----------
    X : pandas.DataFrame
        The features matrix to analyze.

    y : pandas.DataFrame
        The target, as a one-column frame.

    model : sklearn.base.BaseEstimator, optional
        A fitted supervised model exposing ``feature_importances_`` or
        ``coef_``; used to extract feature importances. Default is None.

    Methods:
    --------
    corr() -> pandas.DataFrame:
        Pearson correlation coefficient between each feature and the target.

    mutual_info() -> pandas.DataFrame:
        Mutual information between each feature and the target.

    corr_matrix() -> pandas.DataFrame:
        Pearson correlation matrix between the features.

    filter_threshold(df, column, threshold, greater=False) -> List[str]:
        Index labels of the rows whose ``column`` is below (or above) a threshold.

    filter_corr(threshold) -> List[str]:
        Features whose absolute correlation with the target is below the threshold.

    filter_mutual_info(threshold) -> List[str]:
        Features whose mutual information is below the ``threshold`` quantile.

    filter_corr_matrix(threshold=0.3) -> List[str]:
        Features weakly correlated with at least one other feature.

    identify_missing(threshold) -> List[str]:
        Features whose fraction of missing values exceeds the threshold.

    identify_zero_variance() -> List[str]:
        Features with zero variance.

    identify_collinear(threshold) -> List[str]:
        Features whose VIF is BELOW the threshold, i.e. the non-collinear ones.

    get_feature_importance() -> pandas.DataFrame:
        Feature importances extracted from the model.

    identify_low_importance(threshold) -> List[str]:
        Features whose importance is below the threshold.

    combine_criteria(collinear_threshold=5, importance_threshold=0.01,
                     correlation_threshold=0.3, mutual_info_threshold=0.2,
                     intersection=True) -> List[str]:
        Intersection (or union) of the individual selections.

    transform_X() -> pandas.DataFrame:
        ``X`` restricted to the features selected by ``combine_criteria()``.

    summary(missing_threshold=0.1, collinear_threshold=5, importance_threshold=0.01,
            correlation_threshold=0.3, mutual_info_threshold=0.2) -> Dict[str, Dict[str, Union[List[str], int]]]:
        Features and counts for every criterion and for their combination.
    """

    def __init__(
        self, X: DataFrame, y: DataFrame, model: Optional["BaseEstimator"] = None
    ) -> None:
        self.X: DataFrame = X
        self.y: DataFrame = y
        self.model: Optional[BaseEstimator] = model

    def corr(self) -> DataFrame:
        """Return a one-column frame (``pearson_corr``) indexed by feature name."""
        stacked = np.column_stack((self.X, self.y))
        # Last row/column of the correlation matrix belongs to the target.
        coefficients = np.corrcoef(stacked.T)[:-1, -1]
        return pd.DataFrame(
            coefficients, columns=["pearson_corr"], index=self.X.columns
        )

    def mutual_info(self) -> DataFrame:
        """
        Return a one-column frame (``mutual_info``) indexed by feature name.

        The underlying estimator is randomized, so two calls may return
        slightly different values.
        """
        # The estimator expects a 1-D target; flatten the one-column frame.
        return pd.DataFrame(
            mutual_info_regression(self.X, np.ravel(self.y)),
            columns=["mutual_info"],
            index=self.X.columns,
        )

    def corr_matrix(self) -> pd.DataFrame:
        """Return the feature-by-feature Pearson correlation matrix."""
        return self.X.corr(method="pearson")

    def filter_threshold(
        self, df: DataFrame, column: str, threshold: float, greater: bool = False
    ) -> List[str]:
        """
        Return the index labels of the rows of ``df`` whose ``column`` value is
        strictly greater than ``threshold`` (``greater=True``) or strictly
        smaller (``greater=False``).
        """
        mask = df[column] > threshold if greater else df[column] < threshold
        return df[mask].index.to_list()

    def filter_corr(self, threshold) -> List[str]:
        """
        Return the features whose absolute Pearson correlation with the target
        is below ``threshold``.

        The magnitude is compared so that a strongly negative correlation is
        not mistaken for a weak one.
        """
        return self.filter_threshold(
            self.corr().abs(), "pearson_corr", threshold, greater=False
        )

    def filter_mutual_info(self, threshold) -> List[str]:
        """
        Return the features whose mutual information with the target lies below
        the ``threshold`` quantile (a value in [0, 1]) of all features' scores.
        """
        # Estimate once: the estimator is randomized, so the cut-off and the
        # filtered values must come from the same draw.
        scores = self.mutual_info()
        cutoff = scores["mutual_info"].quantile(threshold)
        return self.filter_threshold(scores, "mutual_info", cutoff, greater=False)

    def filter_corr_matrix(self, threshold=0.3) -> list:
        """
        Return the features whose absolute correlation with at least one OTHER
        feature is below ``threshold``, in the column order of ``X``.
        """
        strength = self.corr_matrix().abs()
        # Exclude self-pairs explicitly; otherwise every feature would qualify.
        other_feature = ~np.eye(len(strength), dtype=bool)
        weak_pair = (strength.to_numpy() < threshold) & other_feature
        return list(strength.index[weak_pair.any(axis=1)])

    def identify_missing(self, threshold: float) -> List[str]:
        """Return the features whose fraction of missing values exceeds ``threshold``."""
        return self.filter_threshold(
            self.X.isnull().mean().to_frame("missing_fraction"),
            "missing_fraction",
            threshold,
            greater=True,
        )

    def identify_zero_variance(self) -> List[str]:
        """Return the features rejected by ``VarianceThreshold(threshold=0)``."""
        selector = VarianceThreshold(threshold=0)
        selector.fit(self.X)
        return list(self.X.columns[~selector.get_support()])

    def identify_collinear(self, threshold: float) -> List[str]:
        """
        Return the features whose variance inflation factor is BELOW
        ``threshold``.

        Despite the name, the result is the set of features to keep (the ones
        that are not collinear); ``summary`` reports it as ``not_collinear``.
        """
        X_scaled = StandardScaler().fit_transform(self.X)
        vif_data = pd.DataFrame()
        vif_data["feature"] = self.X.columns
        vif_data["VIF"] = [
            variance_inflation_factor(X_scaled, i) for i in range(X_scaled.shape[1])
        ]
        return list(vif_data.loc[vif_data["VIF"] < threshold, "feature"])

    def get_feature_importance(self) -> pd.DataFrame:
        """
        Return a one-column frame (``importance``) sorted in descending order.

        Uses ``model.feature_importances_`` when present, otherwise the
        absolute value of ``model.coef_``. A 2-D ``coef_`` (one row per output,
        as produced when the model was fitted on a 2-D target) is averaged
        over its rows.

        Raises:
            ValueError: if no model was provided or it exposes neither attribute.
        """
        if self.model is None:
            raise ValueError("A model must be provided for this method.")

        if hasattr(self.model, "feature_importances_"):
            importances = np.asarray(self.model.feature_importances_)
        elif hasattr(self.model, "coef_"):
            importances = np.abs(self.model.coef_)
        else:
            raise ValueError(
                "The provided model does not have the 'feature_importances_' or 'coef_' attribute."
            )
        if importances.ndim > 1:
            importances = importances.mean(axis=0)
        feature_importance_df = pd.DataFrame(
            importances, columns=["importance"], index=self.X.columns
        )
        return feature_importance_df.sort_values(by="importance", ascending=False)

    def identify_low_importance(self, threshold: float) -> List[str]:
        """
        Return the features whose importance is below ``threshold``.

        Raises:
            ValueError: if no model was provided.
        """
        if self.model is None:
            raise ValueError("A model must be provided for this method.")

        return self.filter_threshold(
            self.get_feature_importance(), "importance", threshold, greater=False
        )

    def _combine_feature_sets(
        self, criteria, zero_variance, intersection: bool
    ) -> List[str]:
        """Intersect or unite the selections, drop zero-variance features, keep X's column order."""
        feature_sets = [set(names) for names in criteria]
        if intersection:
            chosen = set.intersection(*feature_sets)
        else:
            chosen = set.union(*feature_sets)
        chosen -= set(zero_variance)
        # Sets iterate in hash order; follow X's columns for a stable result.
        return [name for name in self.X.columns if name in chosen]

    def combine_criteria(
        self,
        collinear_threshold=5,
        importance_threshold=0.01,
        correlation_threshold=0.3,
        mutual_info_threshold=0.2,
        intersection: bool = True,
    ) -> List[str]:
        """
        Combine the VIF, correlation and mutual-information selections (plus
        low importance when a model is available).

        With ``intersection=True`` a feature must satisfy every criterion;
        otherwise any one suffices. Zero-variance features are always removed.
        The result follows the column order of ``X``.
        """
        criteria: List[List[str]] = [
            self.identify_collinear(collinear_threshold),
            self.filter_corr(correlation_threshold),
            self.filter_mutual_info(mutual_info_threshold),
        ]
        if self.model is not None:
            criteria.append(self.identify_low_importance(importance_threshold))
        return self._combine_feature_sets(
            criteria, self.identify_zero_variance(), intersection
        )

    def transform_X(self) -> DataFrame:
        """Return ``X`` restricted to the features chosen by ``combine_criteria()``."""
        return self.X.loc[:, self.combine_criteria()]

    def summary(
        self,
        missing_threshold=0.1,
        collinear_threshold=5,
        importance_threshold=0.01,
        correlation_threshold=0.3,
        mutual_info_threshold=0.2,
    ) -> Dict[str, Dict[str, Union[List[str], int]]]:
        """
        Return, per criterion, the matching features and their count.

        Each criterion is evaluated exactly once, so ``features`` and
        ``count`` always agree and ``combined_criteria`` is the intersection of
        the very selections reported alongside it. ``low_importance`` is only
        present when a model was provided.
        """

        def entry(features: List[str]) -> Dict[str, Union[List[str], int]]:
            return {"features": features, "count": len(features)}

        missing = self.identify_missing(missing_threshold)
        zero_variance = self.identify_zero_variance()
        not_collinear = self.identify_collinear(collinear_threshold)
        low_correlation = self.filter_corr(correlation_threshold)
        low_mutual_info = self.filter_mutual_info(mutual_info_threshold)

        criteria = [not_collinear, low_correlation, low_mutual_info]
        low_importance = None
        if self.model is not None:
            low_importance = self.identify_low_importance(importance_threshold)
            criteria.append(low_importance)

        summary_dict = {
            "missing_values": entry(missing),
            "zero_variance": entry(zero_variance),
            "not_collinear": entry(not_collinear),
            "low_correlation": entry(low_correlation),
            "low_mutual_info": entry(low_mutual_info),
            "combined_criteria": entry(
                self._combine_feature_sets(criteria, zero_variance, True)
            ),
        }

        if low_importance is not None:
            summary_dict["low_importance"] = entry(low_importance)

        return summary_dict

In [70]:
feature_analyzer = FeatureAnalyzer(dp.X, dp.y)

In [71]:
feature_analyzer.combine_criteria()

['HT_DCPHASE', 'HT_TRENDMODE']

In [72]:
feature_analyzer.transform_X()

Unnamed: 0,HT_DCPHASE,HT_TRENDMODE
1986-07-18,1.050736,1
1986-07-21,2.163467,1
1986-07-22,8.302453,1
1986-07-23,18.173782,0
1986-07-24,32.701279,1
...,...,...
2023-05-02,145.131262,1
2023-05-03,134.488765,1
2023-05-04,134.184462,1
2023-05-05,137.530455,1


In [73]:
feature_analyzer.summary()

{'missing_values': {'features': [], 'count': 0},
 'zero_variance': {'features': [], 'count': 0},
 'not_collinear': {'features': ['Volume',
   'log_returns',
   'HT_DCPERIOD',
   'HT_DCPHASE',
   'HT_TRENDMODE',
   'COS',
   'SIN',
   'TAN',
   'PPO',
   'TRIX'],
  'count': 10},
 'low_correlation': {'features': ['Volume',
   'log_returns',
   'HT_DCPERIOD',
   'HT_DCPHASE',
   'HT_TRENDMODE',
   'COS',
   'COSH',
   'EXP',
   'SIN',
   'SINH',
   'TAN',
   'TANH',
   'APO',
   'CMO',
   'MOM',
   'PPO',
   'ROC',
   'ROCP',
   'ROCR',
   'ROCR100',
   'RSI',
   'TRIX',
   'LINEARREG_ANGLE',
   'LINEARREG_SLOPE'],
  'count': 24},
 'low_mutual_info': {'features': ['HT_DCPHASE',
   'HT_TRENDMODE',
   'COSH',
   'EXP',
   'SINH',
   'CMO',
   'ROC',
   'ROCP',
   'ROCR',
   'ROCR100',
   'RSI'],
  'count': 11},
 'combined_criteria': {'features': ['HT_DCPHASE', 'HT_TRENDMODE'], 'count': 2}}

# PREDICTORS


When working with time series prediction, it is important to choose models that can handle time-dependent patterns in the data. Here's a list of compatible models that can be used for time series prediction, along with some additional models specifically designed for time series forecasting:

1. Tree-based Models:

    - Random Forest Regressor (sklearn.ensemble.RandomForestRegressor)
    - Extra Trees Regressor (sklearn.ensemble.ExtraTreesRegressor)
    - Gradient Boosting Regressor (sklearn.ensemble.GradientBoostingRegressor)

1. Linear Models:

    - Lasso (sklearn.linear_model.Lasso)
    - Ridge (sklearn.linear_model.Ridge)
    - ElasticNet (sklearn.linear_model.ElasticNet)
    - Lasso LARS (sklearn.linear_model.LassoLars)
    - Lasso LARS IC (sklearn.linear_model.LassoLarsIC)
    - Lars (sklearn.linear_model.Lars)
    - Lars CV (sklearn.linear_model.LarsCV)
    - Lasso CV (sklearn.linear_model.LassoCV)
    - Ridge CV (sklearn.linear_model.RidgeCV)
    - ElasticNet CV (sklearn.linear_model.ElasticNetCV)
    - Orthogonal Matching Pursuit (sklearn.linear_model.OrthogonalMatchingPursuit)
    - Orthogonal Matching Pursuit CV (sklearn.linear_model.OrthogonalMatchingPursuitCV)

1. Ensemble Models:

    - AdaBoost Regressor (sklearn.ensemble.AdaBoostRegressor)
    - Bagging Regressor (sklearn.ensemble.BaggingRegressor)

1. Time Series Models:

    - Autoregression (statsmodels.tsa.ar_model.AutoReg)
    - SARIMAX (statsmodels.tsa.statespace.sarimax.SARIMAX)
    - Exponential Smoothing State Space Model (statsmodels.tsa.statespace.exponential_smoothing.ExponentialSmoothing)

We will pick a few of these and optimize them for predicting the closing price.


## ModelEvaluator

We introduce a utility class that trains a model using cross-validation and calculates all the quantities related to the training of the model. Finally, it plots the predictions and the learning curve.

In [5]:
class LearningCurve:
    """
    Mixin that computes and plots a learning curve for a wrapped estimator.

    The host object must provide ``self.model`` and ``self.data_preparer``
    (read for its ``X`` and ``y`` attributes). If the host has a non-``None``
    ``self.cv``, it takes precedence over the ``cv`` argument of
    ``calculate_learning_curve``.
    """

    def calculate_learning_curve(self, train_sizes=None, cv=None):
        """
        Run scikit-learn's ``learning_curve`` and store the aggregated results.

        Parameters
        ----------
        train_sizes : array-like, optional
            Training-set sizes forwarded to ``learning_curve``. Defaults to ten
            evenly spaced fractions from 0.1 to 1.0.
        cv : optional
            Cross-validation setting, used only when the instance has no
            ``cv`` attribute or that attribute is ``None``.

        Raises
        ------
        ValueError
            If the instance has no ``model`` attribute.
        """
        if not hasattr(self, "model"):
            raise ValueError("There is no model available.")
        if train_sizes is None:
            train_sizes = np.linspace(0.1, 1.0, 10)
        # ``cv`` may never have been assigned on the host; treat that like None
        # instead of failing with AttributeError.
        if getattr(self, "cv", None) is None:
            self.cv = cv

        # Compute the learning curve (the fifth return value, the score
        # times, is not used).
        (
            self.train_sizes_abs,
            self.train_scores,
            self.val_scores,
            self.fit_times,
            _,
        ) = learning_curve(
            self.model,
            self.data_preparer.X,
            self.data_preparer.y,
            train_sizes=train_sizes,
            cv=self.cv,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1,
            verbose=1,
            return_times=True,
        )

        # Scores are negated RMSE, so the means are sign-flipped to get RMSE.
        # A standard deviation is a spread and must stay non-negative, so it
        # is NOT negated.
        self.train_scores_mean = -np.mean(self.train_scores, axis=1)
        self.train_scores_std = np.std(self.train_scores, axis=1)
        self.val_scores_mean = -np.mean(self.val_scores, axis=1)
        self.val_scores_std = np.std(self.val_scores, axis=1)

        # Compute the mean fit time per training-set size
        self.fit_times_mean = np.mean(self.fit_times, axis=1)

    def calculate_optimal_size(self):
        """
        Pick the training-set size with the lowest mean validation RMSE.

        Sets ``optimal_size`` (absolute number of samples) and
        ``optimal_size_pct`` (that size as a fraction of the rows of
        ``data_preparer.X``, rounded to two decimals).
        """
        # nanargmin skips sizes whose score is NaN; plain argmin would select
        # the NaN entry as the "minimum".
        optimal_idx = np.nanargmin(self.val_scores_mean)
        self.optimal_size = self.train_sizes_abs[optimal_idx]
        self.optimal_size_pct = round(
            self.optimal_size / self.data_preparer.X.shape[0], 2
        )

    def plot_learning_curve(self) -> "go.Figure":
        """
        Build a Plotly figure of the learning curve.

        The figure shows training and validation RMSE with error bars, the mean
        fit time on a secondary y-axis, and a dashed vertical line at the
        optimal training-set size. ``calculate_learning_curve`` must have been
        called first; the optimal size is computed here if it is missing.

        Returns
        -------
        go.Figure
            The assembled figure.
        """
        if not hasattr(self, "optimal_size"):
            self.calculate_optimal_size()

        fig = go.Figure()

        # Training and validation scores share the same trace layout
        score_series = (
            ("Training score", self.train_scores_mean, self.train_scores_std, "blue"),
            ("Validation score", self.val_scores_mean, self.val_scores_std, "green"),
        )
        for label, mean, std, color in score_series:
            fig.add_trace(
                go.Scatter(
                    x=self.train_sizes_abs,
                    y=mean,
                    mode="lines+markers",
                    name=label,
                    line=dict(color=color),
                    error_y=dict(type="data", array=std, visible=True, color=color),
                )
            )

        # Add a vertical line for the optimal training set size
        fig.add_shape(
            type="line",
            x0=self.optimal_size,
            x1=self.optimal_size,
            y0=0,
            y1=1,
            yref="paper",
            xref="x",
            line=dict(color="red", dash="dash"),
        )

        # Add a secondary y-axis for the training time
        fig.update_layout(
            yaxis2=dict(
                title="Training Time (s)", overlaying="y", side="right", showgrid=False
            ),
            title="Learning Curve",
            xaxis_title="Training Set Size",
            yaxis_title="Error (RMSE)",
        )

        # Plot the training time
        fig.add_trace(
            go.Scatter(
                x=self.train_sizes_abs,
                y=self.fit_times_mean,
                mode="lines+markers",
                name="Training Time",
                line=dict(color="orange"),
                yaxis="y2",
            )
        )

        # Add a text box with the model performance metric
        fig.add_annotation(
            x=0.05,
            y=0.95,
            xref="paper",
            yref="paper",
            text="Performance Metric: RMSE",
            showarrow=False,
            font=dict(size=12),
        )

        # Add a text box with the optimal training set size. The stored value
        # is a fraction, so format it as a percentage rather than appending
        # "%" to the raw fraction.
        fig.add_annotation(
            x=self.optimal_size,
            y=0.05,
            xref="x",
            yref="paper",
            text=f"Optimal Size: {self.optimal_size} ({self.optimal_size_pct:.0%})",
            showarrow=True,
            arrowhead=1,
            arrowcolor="red",
            font=dict(size=12),
        )

        return fig


In [6]:
class ModelEvaluator(LearningCurve):
    """
    A class for evaluating machine learning models using grid search cross-validation and learning curves.

    Parameters
    ----------
    model : BaseEstimator
        A scikit-learn estimator object to be evaluated.
    data_preparer : DataPreparer
        A DataPreparer object containing the training and testing data.

    Attributes
    ----------
    model : BaseEstimator
        The estimator; replaced by the best-performing one after grid search.
    data_preparer : DataPreparer
        The DataPreparer object used to prepare the training and testing data.
    cv : int or None
        Cross-validation setting of the last ``gridcv_tune`` call, ``None`` before.
    cv_scores : array-like or None
        Cross-validated scores of the model, ``None`` before tuning.
    predictions : DataFrame or None
        Predictions of the model on the testing data, ``None`` before tuning.
    best_score : float
        Best cross-validated score achieved by the model (set by ``gridcv_tune``).
    best_params : dict
        Parameters of the best-performing estimator (set by ``gridcv_tune``).

    Methods
    -------
    gridcv_tune(param_grid=None, scoring='r2', cv=5)
        Tune the hyperparameters of the model using grid search cross-validation.
    extract_features(threshold=0)
        Return the coefficients or feature importances at or above a threshold.
    plot_learning_curve()
        Plot the learning curve of the model.
    plot()
        Plot the predictions of the model on the training and testing data.
    """

    def __init__(self, model, data_preparer) -> None:
        self.model = model
        self.data_preparer = data_preparer
        # Initialised here so that code reading ``self.cv`` before tuning
        # sees None instead of raising AttributeError.
        self.cv = None
        self.cv_scores = None
        self.predictions = None

    def gridcv_tune(
        self,
        param_grid: Optional[dict[str, Any]] = None,
        scoring: str = "r2",
        cv: int = 5,
    ):
        """
        Tune the model with ``GridSearchCV`` and score the best estimator.

        Parameters
        ----------
        param_grid : dict, optional
            Parameter grid forwarded to ``GridSearchCV``. ``None`` means an
            empty grid, i.e. the model is fitted with its current parameters.
        scoring : str
            Scoring name used for both the grid search and ``cross_val_score``.
        cv : int
            Cross-validation setting used for both steps.

        Returns
        -------
        array-like
            The cross-validated scores, also stored in ``self.cv_scores``.
        """
        if param_grid is None:
            # default parameter grid, can be updated based on the model
            param_grid = {}
        self.cv = cv

        # Perform grid search cross-validation on the training split
        self.grid_search = GridSearchCV(
            self.model, param_grid, cv=self.cv, scoring=scoring
        )
        self.grid_search.fit(self.data_preparer.X_train, self.data_preparer.y_train)
        self.model = self.grid_search.best_estimator_
        self.best_params = self.grid_search.best_params_

        # Get the best score
        self.best_score = self.grid_search.best_score_
        print(f"Best cross-validated score: {self.best_score:.2f}")

        # Make predictions on the test set
        self.predictions = pd.DataFrame(
            self.model.predict(self.data_preparer.X_test),
            index=self.data_preparer.y_test.index,
            columns=[self.data_preparer.target_col],
        )

        # Cross-validated scores (metric given by ``scoring``) of the best
        # estimator, computed on the full X / y rather than the training split.
        self.cv_scores = cross_val_score(
            self.model,
            self.data_preparer.X,
            self.data_preparer.y,
            cv=self.cv,
            scoring=scoring,
        )
        return self.cv_scores

    def extract_features(self, threshold=0):
        """
        Return the model's coefficients or feature importances above a threshold.

        Parameters
        ----------
        threshold : float
            Minimum value to keep. Coefficients are compared by absolute value;
            importances are compared as-is.

        Returns
        -------
        DataFrame
            One row per retained feature, in a column named ``"Coefficients"``
            or ``"Importances"``.

        Raises
        ------
        ValueError
            If ``gridcv_tune`` has not been run, or the model exposes neither
            ``coef_`` nor ``feature_importances_``.
        """
        if not hasattr(self, "grid_search"):
            raise ValueError("No grid search object found.")

        feature_names = self.data_preparer.X_train.columns
        if hasattr(self.model, "coef_"):
            # Extract feature coefficients above the threshold
            self.coef_ = self.model.coef_
            coef = pd.DataFrame(
                self.model.coef_.T,
                index=feature_names,
                columns=["Coefficients"],
            )
            return coef[abs(coef["Coefficients"]) >= threshold]

        if hasattr(self.model, "feature_importances_"):
            # Extract feature importances above the threshold
            self.feature_importances = self.model.feature_importances_
            importances = pd.DataFrame(
                self.model.feature_importances_,
                index=feature_names,
                columns=["Importances"],
            )
            return importances[importances["Importances"] >= threshold]

        raise ValueError(
            "The specified model does not have feature coefficients or importances."
        )

    def plot(self) -> go.Figure:
        """
        Plot the test-set predictions together with the train and test targets.

        Returns
        -------
        go.Figure
            The assembled figure.

        Raises
        ------
        ValueError
            If no predictions exist yet (``gridcv_tune`` has not been run).
        """
        if self.predictions is None:
            raise ValueError("No predictions available; run gridcv_tune() first.")

        target = self.data_preparer.target_col
        # NOTE(review): testsize is treated as a fraction (the notebook output
        # prints "Test Size: 0.2"), so it is rendered with a percent format
        # instead of appending "%" to the raw fraction.
        test_frac = self.data_preparer.testsize

        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=self.predictions.index,
                y=self.predictions[target],
                mode="lines",
                name="Predictions",
            )
        )

        fig.add_trace(
            go.Scatter(
                x=self.data_preparer.X_train.index,
                y=self.data_preparer.y_train[target],
                mode="lines",
                name=f"Train Data {1 - test_frac:.0%}",
            )
        )
        fig.add_trace(
            go.Scatter(
                x=self.data_preparer.X_test.index,
                y=self.data_preparer.y_test[target],
                mode="lines",
                name=f"Test Data {test_frac:.0%}",
            )
        )

        fig.update_layout(
            title="Train and Test Data",
            xaxis_title="Index",
            yaxis_title=target,
        )
        return fig

## Linear Models


We will directly use ElasticNet in this section in order to have all the features of LinearRegression and controllable complexity through regularization.

The grid of our choice is the following
```py
    param_grid = {
        "alpha": [0.001, 0.01, 0.1, 1, 10, 100],
        "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9],
        "max_iter": [1000, 2000, 5000],
        "tol": [1e-4, 1e-5, 1e-6],
        "selection": ["cyclic", "random"],
    }
```

- 'alpha': This parameter is the constant that multiplies the penalty term, which is itself a combination of the L1 and L2 penalties weighted by l1_ratio. Higher values of alpha result in stronger regularization, which may help prevent overfitting. A smaller value of alpha allows the model to be more flexible and capture more complex patterns in the data, but at the risk of overfitting. In this param_grid, we're trying out six different values of alpha: [0.001, 0.01, 0.1, 1, 10, 100].

- 'l1_ratio': This parameter is the mixing parameter between L1 (Lasso) and L2 (Ridge) penalties. It varies between 0 and 1. A value of 0 corresponds to the Ridge penalty, and a value of 1 corresponds to the Lasso penalty. We're trying out five different values of l1_ratio: [0.1, 0.3, 0.5, 0.7, 0.9].

- 'max_iter': This parameter is the maximum number of iterations for the optimization algorithm to converge. If the algorithm does not converge in the specified number of iterations, it will stop early. In this param_grid, we're trying out three different values of max_iter: [1000, 2000, 5000].

- 'tol': This parameter is the tolerance for the optimization algorithm. The algorithm will stop when the update is smaller than tol. Smaller values of tol will result in a more accurate solution, but may take more iterations to converge. In this param_grid, we're trying out three different values of tol: [1e-4, 1e-5, 1e-6].

- 'selection': This parameter determines the method used to update the coefficients during the optimization process. The two possible values are 'cyclic' and 'random'. In the 'cyclic' method, the algorithm iterates through each feature sequentially, while in the 'random' method, the algorithm selects a random feature at each iteration. The param_grid includes both options.


In [93]:
class ElasticNetRegressor(ModelEvaluator):
    """
    ModelEvaluator preconfigured with a scikit-learn ``ElasticNet`` model.

    Adds reporting helpers on top of the tuning results: ``evaluate`` prints
    test-set error metrics, and ``summarize`` renders a text summary.
    """

    def __init__(self, data_preparer) -> None:
        super().__init__(ElasticNet(), data_preparer)

    def _test_errors(self):
        """Return ``(MAE, MSE)`` of the stored predictions against ``y_test``."""
        y_true = self.data_preparer.y_test
        return (
            mean_absolute_error(y_true, self.predictions),
            mean_squared_error(y_true, self.predictions),
        )

    def evaluate(self):
        """
        Print and return the test-set errors and cross-validation scores.

        Returns
        -------
        tuple
            ``(mae, mse, cv_scores, mean_cv_score)``.
        """
        mae, mse = self._test_errors()
        mean_cv = np.mean(self.cv_scores)

        # Built line by line so that source indentation does not leak into
        # the printed message.
        print(
            "\n".join(
                [
                    f"Mean Absolute Error (MAE): {mae:.2f}",
                    f"Mean Squared Error (MSE): {mse:.2f}",
                    f"Cross-validated R-squared scores: {self.cv_scores}",
                    f"Average cross-validated R-squared score: {mean_cv:.2f}",
                ]
            )
        )
        return (mae, mse, self.cv_scores, mean_cv)

    def get_summary_dict(self, threshold) -> dict:
        """
        Collect the tuning and evaluation results into a dictionary.

        Parameters
        ----------
        threshold : float
            Forwarded to ``extract_features`` to filter the coefficients.

        Returns
        -------
        dict
            Model name, data description, best parameters and scores, raw and
            filtered coefficients, and test-set errors.
            ``"optimal_training_size"`` is ``None`` when the optimal size has
            not been computed.
        """
        # extract_features also stores the raw coefficients in self.coef_
        filtered_coef = self.extract_features(threshold)
        mae, mse = self._test_errors()
        return {
            "model": type(self.model).__name__,
            "target_variable": self.data_preparer.target_col,
            "num_input_features": self.data_preparer.X_train.shape[1],
            "test_size": self.data_preparer.testsize,
            "cv_folds": self.cv,
            "best_params": self.best_params,
            "best_cv_score": self.best_score,
            "optimal_training_size": getattr(self, "optimal_size_pct", None),
            "cv_scores": self.cv_scores,
            "coefficients": self.coef_,
            # Blank column header keeps the printed table compact
            "filtered_coef": filtered_coef.sort_values(by="Coefficients").rename(
                {"Coefficients": ""}, axis=1
            ),
            "MAE": mae,
            "MSE": mse,
            "Average cross-validated R-squared score": np.mean(self.cv_scores),
        }

    def summarize(self, threshold=0) -> str:
        """
        Render a human-readable summary of the tuned model.

        Parameters
        ----------
        threshold : float
            Coefficient threshold for the filtered table. Also stored in
            ``self.threshold``.

        Returns
        -------
        str
            Multi-line summary text.
        """
        self.threshold = threshold
        summary = self.get_summary_dict(threshold)

        train_frac = summary["optimal_training_size"]
        # Without a learning curve there is no optimal size to report
        optimal_test = "n/a" if train_frac is None else round(1 - train_frac, 2)

        lines = [
            "Model summary:",
            f"    Model: {summary['model']}",
            f"    Target variable: {summary['target_variable']}",
            f"    Number of input features: {summary['num_input_features']}",
            f"    Test Size: {summary['test_size']}",
            f"    Cross-validation fold: {summary['cv_folds']}",
            "Evaluation:    ",
            f"    Best params: {summary['best_params']}",
            f"    Optimal test size: {optimal_test}",
            f"    Best cross-validated score: {summary['best_cv_score']:.2f}",
            f"    Mean Absolute Error (MAE): {summary['MAE']:.2f}",
            f"    Mean Squared Error (MSE): {summary['MSE']:.2f}",
            f"    Cross-validated R-squared scores: {summary['cv_scores']}",
            f"Filtered_coef>{self.threshold}: {summary['filtered_coef']}",
        ]
        return "\n".join(lines)

# Testing Predictors

In [59]:
# Build one Stock object per ticker under study, in a fixed order.
MSFT, TSLA, AAPL, GOOG = (
    Stock(symbol) for symbol in ("MSFT", "TSLA", "AAPL", "GOOG")
)

[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed
[*********************100%***********************]  1 of 1 completed


In [60]:
# Prepare each stock's data with "Close" as the target column.
dp_MSFT, dp_TSLA, dp_AAPL, dp_GOOG = (
    DataPreparer(stock.data, "Close") for stock in (MSFT, TSLA, AAPL, GOOG)
)

In [61]:
# Check for missing values and report every feature matrix that has any.
# A bare expression inside a loop is never displayed by the notebook, so the
# finding is printed explicitly.
for ticker, dp_ in zip(
    ("MSFT", "TSLA", "AAPL", "GOOG"), (dp_MSFT, dp_TSLA, dp_AAPL, dp_GOOG)
):
    nan_mask = dp_.X.isna().to_numpy()
    if nan_mask.any():
        print(f"{ticker}: {int(nan_mask.sum())} missing values, X shape {dp_.X.shape}")

## ElasticNet

In [94]:
# Wrap an ElasticNet model around the prepared TSLA data
regressor = ElasticNetRegressor(dp_TSLA)

# Regularization strengths and L1/L2 mixing ratios to search over.
# This is a plain dict: the original trailing comma wrapped it in a tuple.
param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    "l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9, 1],
    # "max_iter": [1000, 2000, 5000],
    # "tol": [1e-4, 1e-5, 1e-6],
    # "selection": ["cyclic", "random"],
}

# Perform tuning and evaluation with 5 folds; the scores are read from the
# regressor, where gridcv_tune stores them.
regressor.gridcv_tune(param_grid, cv=5)
cv_scores = regressor.cv_scores
regressor.calculate_learning_curve()
regressor.calculate_optimal_size()
# Print the model summary, keeping coefficients with |coef| >= 0.15
print(regressor.summarize(threshold=0.15))

Best cross-validated score: 0.86
[learning_curve] Training set sizes: [ 253  506  759 1012 1265 1518 1771 2024 2277 2531]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.


Model summary:
    Model: ElasticNet
    Target variable: Close
    Number of input features: 52
    Test Size: 0.2
    Cross-validation fold: 5
Evaluation:    
    Best params: {'alpha': 0.001, 'l1_ratio': 0.9}
    Optimal test size: 0.2
    Best cross-validated score: 0.86
    Mean Absolute Error (MAE): 1.55
    Mean Squared Error (MSE): 4.30
    Cross-validated R-squared scores: [-8.43293251  0.94965566  0.98847894  0.99903858  0.99881245]
Filtered_coef>0.15:                           
LINEARREG_SLOPE  -0.828980
APO               0.696108
MOM               1.595577
SUM               2.295362
MIN               2.485989
MAX               5.015056
FLOOR             5.234621
CEIL             11.552760
High             17.217179
Low              22.307547
Adj Close        31.002410


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.6s finished


In [63]:
# The learning curve and the optimal size are already computed in the
# ElasticNet tuning cell, so the two calls below stay disabled and this cell
# only renders the figure.
# regressor.calculate_learning_curve()
# regressor.calculate_optimal_size()
regressor.plot_learning_curve()

[learning_curve] Training set sizes: [ 253  506  759 1012 1265 1518 1771 2024 2277 2531]


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.7s finished


In [95]:
# Show the test-set predictions alongside the train and test target series.
regressor.plot()