In [1]:
from IPython.core.interactiveshell import InteractiveShell

import json
from datetime import datetime as dt
from datetime import timedelta
from pathlib import Path
from typing import Any, Iterable, List, Optional, Tuple
from warnings import filterwarnings
from zoneinfo import ZoneInfo

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pyo
import scipy as sc
import seaborn as sns
import yfinance as yf
from pandas import DataFrame, Series
from pandas.core.frame import DataFrame
from plotly.offline import init_notebook_mode
from plotly.subplots import make_subplots
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, cross_val_score, train_test_split
from statsmodels.stats.stattools import durbin_watson
from yfinance.ticker import Ticker

from analytics import Stock

sns.set(style="darkgrid")
plt.style.use("dark_background")
plt.rcParams.update({"grid.linewidth": 0.5, "grid.alpha": 0.5})
plt.rc("figure", figsize=(16, 10))
plt.rc("lines", markersize=4)
plt.rcParams["figure.autolayout"] = True
sns.set_context("poster")
init_notebook_mode(connected=True)
pio.templates.default = "plotly_dark"
filterwarnings("ignore")
InteractiveShell.ast_node_interactivity = "all"



A NumPy version >=1.16.5 and <1.23.0 is required for this version of SciPy (detected version 1.24.2



In [5]:
stock = Stock('AAPL')

[*********************100%***********************]  1 of 1 completed


In [7]:
class TargetVariable:
    def __init__(self, data, column='Close'):
        self.data = data
        self.column = column

    def calculate_returns(self):
        self.data['returns'] = self.data[self.column].pct_change()

    def calculate_log_returns(self):
        self.data['log_returns'] = np.log(self.data[self.column]) - np.log(self.data[self.column].shift(1))

    def calculate_differencing(self):
        self.data['diff'] = self.data[self.column].diff()

    def create_lag_features(self, n_lags):
        for i in range(1, n_lags + 1):
            self.data[f"lag_{i}"] = self.data[self.column].shift(i)

    def generate_all(self, n_lags=5):
        self.calculate_returns()
        self.calculate_log_returns()
        self.calculate_differencing()
        self.create_lag_features(n_lags)
        return self.data.dropna()

# Usage example:
data = pd.DataFrame({"close": [100, 101, 102, 104, 106, 108]})

tv = TargetVariable(stock.data)
all_representations = tv.generate_all(n_lags=3)
print(all_representations)

                  Open        High         Low       Close   Adj Close  \
Date                                                                     
1980-12-17    0.115513    0.116071    0.115513    0.115513    0.089749   
1980-12-18    0.118862    0.119420    0.118862    0.118862    0.092351   
1980-12-19    0.126116    0.126674    0.126116    0.126116    0.097987   
1980-12-22    0.132254    0.132813    0.132254    0.132254    0.102756   
1980-12-23    0.137835    0.138393    0.137835    0.137835    0.107093   
...                ...         ...         ...         ...         ...   
2023-04-11  162.350006  162.360001  160.509995  160.800003  160.800003   
2023-04-12  161.220001  162.059998  159.779999  160.100006  160.100006   
2023-04-13  161.630005  165.800003  161.419998  165.559998  165.559998   
2023-04-14  164.589996  166.320007  163.820007  165.210007  165.210007   
2023-04-17  165.089996  165.389999  164.029999  165.229996  165.229996   

              Volume   returns  log_r

In [66]:
class data_prep(TargetVariable):
    def __init__(
        self,
        df: pd.DataFrame,
        target_column: str,
        n_lags: int = 3,
        testsize: float = 0.3,
        random_state: int = 101,
        shuffle=False,
    ) -> None:
        super().__init__(df)
        self.target_column = target_column
        self.testsize = testsize
        self.random_state = random_state
        self.shuffle = shuffle
        self.data = self.generate_all(n_lags=n_lags)
        self.X = self.data.drop(columns=[self.target_column])
        self.y = self.data[[self.target_column]]
        self.__train_test_split()

    def __train_test_split(self):
        X_train, X_test, y_train, y_test = train_test_split(
            self.X,
            self.y,
            test_size=self.testsize,
            shuffle=self.shuffle,
            random_state=self.random_state,
        )
        self.X_train = X_train.sort_index()
        self.X_test = X_test.sort_index()
        self.y_train = y_train.sort_index()
        self.y_test = y_test.sort_index()

    def plot_full_data(self):
        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                x=self.data.index,
                y=self.data[self.target_column],
                mode='lines',
                name='Full Data'
            )
        )
        fig.update_layout(
            title='Full Data',
            xaxis_title='Index',
            yaxis_title=self.target_column
        )
        return fig

    def plot_train_test_data(self):
        fig = go.Figure()

        fig.add_trace(
            go.Scatter(
                x=self.X_train.index,
                y=self.y_train[self.target_column],
                mode='lines',
                name='Train Data'
            )
        )
        fig.add_trace(
            go.Scatter(
                x=self.X_test.index,
                y=self.y_test[self.target_column],
                mode='lines',
                name='Test Data'
            )
        )

        fig.update_layout(
            title='Train and Test Data',
            xaxis_title='Index',
            yaxis_title=self.target_column
        )
        return fig

    def plot_all_data(self):
        fig = make_subplots(
            rows=2,
            cols=1,
            shared_xaxes=False,
            shared_yaxes=True,
            vertical_spacing=0.1,
            subplot_titles=("Full Data", "Train Test Data")
        )

        fig.add_trace(
            go.Scatter(
                x=self.data.index,
                y=self.data[self.target_column],
                mode='lines',
                name='Full Data'
            ),
            row=1,
            col=1
        )
        fig.add_trace(
            go.Scatter(
                x=self.X_train.index,
                y=self.y_train[self.target_column],
                mode='lines',
                name='Train Data'
            ),
            row=2,
            col=1
        )
        fig.add_trace(
            go.Scatter(
                x=self.X_test.index,
                y=self.y_test[self.target_column],
                mode='lines',
                name='Test Data'
            ),
            row=2,
            col=1
        )

        fig.update_layout(
            height=900,
            width=900,
            title_text="All Data Plots",
            yaxis_title=self.target_column,
            yaxis2_title=self.target_column
        )
        return fig


# Choose the target variable representation (e.g., 'returns', 'log_returns', 'diff', or 'close')
target_column = 'returns'

# Initialize the data_prep class and prepare the data
data_preparer = data_prep(stock.data, target_column, n_lags=3)

# Use the prepared data for training and evaluating models or creating plots
data_preparer.plot_all_data()

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm
from scipy import stats
from statsmodels.stats.diagnostic import het_breuschpagan

In [97]:
data_preparer.X_test

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,log_returns,diff,lag_1,lag_2,lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2010-07-28,9.416786,9.499643,9.294643,9.320000,7.932422,519985200,-0.011885,-0.111429,9.431429,9.260000,9.283571
2010-07-29,9.311071,9.380357,9.146429,9.218214,7.845790,643806800,-0.010981,-0.101786,9.320000,9.431429,9.260000
2010-07-30,9.138929,9.275000,9.103571,9.187500,7.819650,448210000,-0.003337,-0.030714,9.218214,9.320000,9.431429
2010-08-02,9.301429,9.378214,9.272143,9.351786,7.959474,428055600,0.017723,0.164286,9.187500,9.218214,9.320000
2010-08-03,9.321786,9.402143,9.265000,9.354643,7.961905,417653600,0.000305,0.002857,9.351786,9.187500,9.218214
...,...,...,...,...,...,...,...,...,...,...,...
2023-04-11,162.350006,162.360001,160.509995,160.800003,160.800003,47644200,-0.007620,-1.229996,162.029999,164.660004,163.759995
2023-04-12,161.220001,162.059998,159.779999,160.100006,160.100006,50133100,-0.004363,-0.699997,160.800003,162.029999,164.660004
2023-04-13,161.630005,165.800003,161.419998,165.559998,165.559998,68445600,0.033535,5.459991,160.100006,160.800003,162.029999
2023-04-14,164.589996,166.320007,163.820007,165.210007,165.210007,49337200,-0.002116,-0.349991,165.559998,160.100006,160.800003


In [98]:
data_preparer.X_train

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume,log_returns,diff,lag_1,lag_2,lag_3
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1980-12-17,0.115513,0.116071,0.115513,0.115513,0.089749,86441600,0.024450,0.002790,0.112723,0.121652,0.128348
1980-12-18,0.118862,0.119420,0.118862,0.118862,0.092351,73449600,0.028580,0.003349,0.115513,0.112723,0.121652
1980-12-19,0.126116,0.126674,0.126116,0.126116,0.097987,48630400,0.059239,0.007254,0.118862,0.115513,0.112723
1980-12-22,0.132254,0.132813,0.132254,0.132254,0.102756,37363200,0.047522,0.006138,0.126116,0.118862,0.115513
1980-12-23,0.137835,0.138393,0.137835,0.137835,0.107093,46950400,0.041333,0.005581,0.132254,0.126116,0.118862
...,...,...,...,...,...,...,...,...,...,...,...
2010-07-21,9.467500,9.469643,9.071429,9.080000,7.728157,1185671200,0.009286,0.083929,8.996071,8.770714,8.925000
2010-07-22,9.202857,9.285714,9.118214,9.250714,7.873454,645318800,0.018627,0.170714,9.080000,8.996071,8.770714
2010-07-23,9.181786,9.299286,9.152857,9.283571,7.901416,533388800,0.003546,0.032857,9.250714,9.080000,8.996071
2010-07-26,9.285714,9.289286,9.203929,9.260000,7.881356,420551600,-0.002542,-0.023571,9.283571,9.250714,9.080000


In [99]:
class StockPricePredictor:
    def __init__(self, data_preparer, alpha=1.0, model=None) -> None:
        self.data_preparer = data_preparer
        self.model = model if model is not None else Ridge(alpha=alpha)
        self.cv_scores = None
        self.predictions = None

    def gridcv_tune(self, param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}, cv=5, scoring='r2'):
        # Perform grid search cross-validation
        grid_search = GridSearchCV(self.model, param_grid, cv=cv, scoring=scoring)
        grid_search.fit(self.data_preparer.X_train, self.data_preparer.y_train)
        self.model = grid_search.best_estimator_
        # Get the best score
        self.best_score = grid_search.best_score_
        print(f"Best cross-validated score: {self.best_score:.2f}")        

        # Make predictions on the test set
        self.predictions = pd.DataFrame(self.model.predict(self.data_preparer.X_test), index=self.data_preparer.y_test.index, columns=[self.data_preparer.target_column])


        # Calculate cross-validated R-squared scores
        self.cv_scores = cross_val_score(self.model, self.data_preparer.X, self.data_preparer.y, cv=cv, scoring=scoring)
        return self.cv_scores

    def evaluate(self):
        mae = mean_absolute_error(self.data_preparer.y_test, self.predictions)
        mse = mean_squared_error(self.data_preparer.y_test, self.predictions)

        print(f"Mean Absolute Error (MAE): {mae:.2f}")
        print(f"Mean Squared Error (MSE): {mse:.2f}")
        print(f"Cross-validated R-squared scores: {self.cv_scores}")
        print(f"Average cross-validated R-squared score: {np.mean(self.cv_scores):.2f}")
        return (mae, mse, self.cv_scores, np.mean(self.cv_scores))

Note for the LinearityChecker class:
- Using the mean of the predictions as the constant term in the Breusch-Pagan test is a common practice to account for the heteroscedasticity that can arise due to the variability in the constant term of the model. By using the mean of the predictions instead of the constant term, we can better capture this variability and obtain more accurate test results.

In [106]:
class LinearityChecker:
    def __init__(self, predictions: DataFrame, data_preparer: pd.DataFrame) -> None:
        self.data_preparer: DataFrame = data_preparer
        self.predictions = predictions
        self.y_test = data_preparer.y_test
        
    def check_residuals(self, alpha=0.05) -> bool:
        residuals = self.predictions - self.y_test
        exog = np.column_stack([np.ones_like(self.predictions), self.predictions])
        breusch_pagan_pvalue = het_breuschpagan(residuals, exog)[1]
        print(f"Breusch-Pagan test p-value: {breusch_pagan_pvalue:.4f}")
        if breusch_pagan_pvalue < alpha:
            print("The residuals exhibit heteroscedasticity.")
            return False
        else:
            print("The residuals exhibit homoscedasticity.")
            return True

    def check_normality(self, alpha=0.05) -> bool:
        _, pvalue = normaltest(self.predictions.values)
        print(f"Normal test p-value: {pvalue:.4f}")
        if pvalue < alpha:
            print("The predictions are not normally distributed.")
            return False
        else:
            print("The predictions are normally distributed.")
            return True

    def check_multicollinearity(self, data, threshold=5) -> bool:
        vif = pd.DataFrame()
        vif["features"] = data.columns
        vif["VIF"] = [variance_inflation_factor(data.values, i) for i in range(data.shape[1])]
        print("Variance Inflation Factors (VIF):")
        print(vif)
        if (vif["VIF"] > threshold).any():
            print("The data exhibit multicollinearity.")
            return False
        else:
            print("The data do not exhibit multicollinearity.")
            return True

    def check_all_conditions(self, alpha=0.05, vif_threshold=5):
        conditions = {}
        conditions["Heteroscedasticity"] = self.check_residuals(alpha)
        conditions["Normality"] = self.check_normality(alpha)
        data = pd.DataFrame(self.predictions, columns=['y_pred'])
        conditions["Multicollinearity"] = self.check_multicollinearity(data, vif_threshold)
        return conditions
    
    def plot_residuals(self):
        residuals = self.predictions - self.predictions.mean()
        plt.scatter(self.predictions, residuals)
        plt.xlabel("Predicted values")
        plt.ylabel("Residuals")
        plt.axhline(y=0, color='r', linestyle='--')
        plt.title("Residual Plot")
        plt.show()

    def plot_normality(self):
        residuals = self.predictions - self.predictions.mean()
        sns.histplot(residuals, kde=True)
        plt.xlabel("Residuals")
        plt.title("Residual Distribution")
        plt.show()

    def plot_multicollinearity(self, data):
        corr_matrix = data.corr()
        sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", center=0)
        plt.title("Multicollinearity Heatmap")
        plt.show()

    def plot_all_conditions(self, data):
        self.plot_residuals()
        self.plot_normality()
        self.plot_multicollinearity(data)

In [100]:
# Choose the target variable representation (e.g., 'returns', 'log_returns', 'diff', or 'close')
target_column = 'returns'

# Initialize the data_prep class and prepare the data
data_preparer = data_prep(stock.data, target_column, n_lags=3)

# Initialize the StockPricePredictor class with the prepared data and a Ridge model
regressor = StockPricePredictor(data_preparer)

# Define the range of regularization parameters to be tested
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}

# Perform tuning and evaluation with 5 folds
cv_scores = regressor.gridcv_tune(param_grid, cv=5)
print(f"Cross-validated R-squared scores: {cv_scores}")

# Evaluate the performance of the model
regressor.evaluate()


Best cross-validated score: 0.96
Cross-validated R-squared scores: [0.99665448 0.99593257 0.98515283 0.98699279 0.51441892]
Mean Absolute Error (MAE): 0.16
Mean Squared Error (MSE): 0.06
Cross-validated R-squared scores: [0.99665448 0.99593257 0.98515283 0.98699279 0.51441892]
Average cross-validated R-squared score: 0.90


(0.16179323546310426,
 0.06394793055057876,
 array([0.99665448, 0.99593257, 0.98515283, 0.98699279, 0.51441892]),
 0.8958303201868929)

In [102]:
data_preparer.y_test.index

DatetimeIndex(['2010-07-28', '2010-07-29', '2010-07-30', '2010-08-02',
               '2010-08-03', '2010-08-04', '2010-08-05', '2010-08-06',
               '2010-08-09', '2010-08-10',
               ...
               '2023-04-03', '2023-04-04', '2023-04-05', '2023-04-06',
               '2023-04-10', '2023-04-11', '2023-04-12', '2023-04-13',
               '2023-04-14', '2023-04-17'],
              dtype='datetime64[ns]', name='Date', length=3202, freq=None)

In [103]:
regressor.predictions

Unnamed: 0_level_0,returns
Date,Unnamed: 1_level_1
2010-07-28,-0.014359
2010-07-29,-0.013390
2010-07-30,-0.005007
2010-08-02,0.015782
2010-08-03,-0.001040
...,...
2023-04-11,-0.607497
2023-04-12,-0.600374
2023-04-13,-0.551356
2023-04-14,-0.592188


In [105]:
regressor.predictions

Unnamed: 0_level_0,returns
Date,Unnamed: 1_level_1
2010-07-28,-0.014359
2010-07-29,-0.013390
2010-07-30,-0.005007
2010-08-02,0.015782
2010-08-03,-0.001040
...,...
2023-04-11,-0.607497
2023-04-12,-0.600374
2023-04-13,-0.551356
2023-04-14,-0.592188


In [107]:

# Instantiate the LinearityChecker class with the predictions
checker = LinearityChecker(regressor.predictions, regressor.data_preparer)

# Check all the linearity conditions and plot all the linearity plots
checker.check_all_conditions()

Breusch-Pagan test p-value: 0.0000
The residuals exhibit heteroscedasticity.


NameError: name 'normaltest' is not defined