In this Jupyter notebook, we explore whether the price of a stock at time t can be forecast from linear regressions on previous prices (of both the stock itself and the rest of the market).

In [None]:
import pandas as pd
import numpy as np
import scipy 
import statsmodels.api as sm
import sklearn
from sklearn.linear_model import Ridge, Lasso, LinearRegression
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
import random
from scipy.stats import norm
import pmdarima as pm

In [None]:
# Load the parquet file into `dataset`.
# NOTE(review): this is an absolute, machine-specific path — adjust it before
# running the notebook elsewhere.
dataset = pd.read_parquet(
    '/Users/emanueledurante/Desktop/LGMB/lausanne/epfl/MLfinance/High-Frequency-Trading-with-Deep-Learning/data/high_10m.parquet'
)

In [None]:
# Drop zero returns first, so that the per-symbol counts computed below refer
# to the observations that are actually kept in `dataset`.
dataset = dataset[dataset['RETURN'] != 0]

# Keep only symbols with more than 60 remaining observations: the rolling
# functions in this notebook need at least 61 returns per symbol
# (50 training rows + 10 lags + 1 out-of-sample target).
symbols_with_enough_data = dataset['SYMBOL'].value_counts()
symbols_over_60 = symbols_with_enough_data[symbols_with_enough_data > 60].index
dataset = dataset[dataset['SYMBOL'].isin(symbols_over_60)]

In [None]:
# Preview the first five rows of the filtered dataset (cell output).
dataset.head(n=5)

In [None]:

def rolling_regression_single_symbol_fit_over_50(df: pd.DataFrame, symbol: str, PLOT__: bool = True) -> tuple[float, float, tuple[float, float], str]:
    '''
    Evaluate one-step-ahead rolling OLS forecasts of a symbol's returns.

    The rows of `symbol` are sorted by ['DATE', 'TIME']. Every return is paired
    with its 10 preceding returns as features. A fresh LinearRegression is
    fitted on each window of 50 consecutive observations and used to predict
    only the observation that immediately follows that window, up to and
    including the last available return.

    Inputs:
        df (pd.DataFrame): contains ['SYMBOL', 'DATE', 'TIME', 'RETURN']
        symbol (str): the symbol to process
        PLOT__ (bool): if True, shows scatterplot of actual vs predicted

    Outputs:
        r2 (float): R² score of the out-of-sample predictions
        sign_accuracy (float): fraction of predictions whose sign equals the
            sign of the actual return
        ci (tuple): (low, high) = sign_accuracy ± z * sqrt(0.25 / n) with the
            two-sided 99% normal quantile z; the half-width uses the standard
            error under the null hypothesis p = 0.5
        significant (str): 'YES' if the interval excludes 0.5, otherwise 'NO'

    If the symbol has fewer than 61 observations (50 training rows + 10 lags
    + 1 target), returns (nan, nan, (nan, nan), 'NA').
    '''
    group = df[df['SYMBOL'] == symbol].sort_values(by=['DATE', 'TIME']).reset_index(drop=True)
    returns = group['RETURN'].values

    max_lag = 10
    train_size = 50
    # One prediction needs train_size training rows plus one test row, and
    # every row consumes max_lag earlier returns as features.
    min_required = train_size + max_lag + 1

    if len(returns) < min_required:
        return np.nan, np.nan, (np.nan, np.nan), 'NA'

    # Row j of X_lags holds returns[j : j + max_lag]; its target is
    # y_all[j] = returns[j + max_lag], so features never overlap the target.
    n_rows = len(returns) - max_lag
    X_lags = np.column_stack([returns[i:i + n_rows] for i in range(max_lag)])
    y_all = returns[max_lag:]

    # Window t trains on rows [t, t + train_size) and tests on row
    # t + train_size. The last valid test row is n_rows - 1, hence
    # n_rows - train_size windows (the guard above guarantees at least one).
    n_windows = n_rows - train_size
    y_true = y_all[train_size:]

    y_pred = []
    for t in range(n_windows):
        model = LinearRegression()
        model.fit(X_lags[t:t + train_size], y_all[t:t + train_size])

        x_test = X_lags[t + train_size].reshape(1, -1)
        y_pred.append(model.predict(x_test)[0])
    y_pred = np.array(y_pred)

    r2 = r2_score(y_true, y_pred)
    sign_accuracy = np.mean(np.sign(y_true) == np.sign(y_pred))

    # Interval (99%) centred on the observed accuracy, with the standard
    # error evaluated at p_0 = 0.5. Checking whether it excludes 0.5 is
    # therefore a two-sided z-test of H0: sign accuracy = 0.5.
    n = len(y_true)
    p_0 = 0.5
    z = norm.ppf(1 - 0.01 / 2)  # two-tailed 99% → z ≈ 2.576
    margin = z * np.sqrt(p_0 * (1 - p_0) / n)
    ci_low = sign_accuracy - margin
    ci_high = sign_accuracy + margin
    significant = "YES" if ci_low > 0.5 or ci_high < 0.5 else "NO"

    if PLOT__:
        lo, hi = y_true.min(), y_true.max()
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.6)
        # Dashed diagonal = perfect prediction.
        plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
        plt.xlabel('Actual RETURN')
        plt.ylabel('Predicted RETURN')
        plt.title(f'{symbol}\nR² = {r2:.4f}, Sign Accuracy = {sign_accuracy:.2%}')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return r2, sign_accuracy, (ci_low, ci_high), significant


In [None]:
# Single-symbol check of the rolling OLS model; PLOT__ keeps its default
# (True), and the returned tuple is the cell output.
rolling_regression_single_symbol_fit_over_50(df=dataset, symbol='ADTH')

In [None]:
# Run the rolling OLS evaluation on a random sample of up to 100 symbols.
symbols = dataset['SYMBOL'].dropna().unique()
# random.sample raises ValueError when k exceeds the population size, so cap
# the sample size at the number of available symbols.
random_symbols = random.sample(list(symbols), k=min(100, len(symbols)))
for symbol in random_symbols:
    r2, sign_acc, _, significant = rolling_regression_single_symbol_fit_over_50(dataset, symbol, PLOT__=False)
    print(f'{symbol}: R² = {r2:.4f}, Sign Accuracy = {sign_acc:.2%},Significance={significant}')

In [None]:
def rolling_ridge_single_symbol_fit_over_50(df: pd.DataFrame, symbol: str, alpha: float=10e-1,PLOT__: bool = True) -> tuple[float, float, tuple[float, float], str]:
    '''
    Evaluate one-step-ahead rolling ridge forecasts of a symbol's returns.

    The rows of `symbol` are sorted by ['DATE', 'TIME']. Every return is paired
    with its 10 preceding returns as features. A fresh Ridge model is fitted
    on each window of 50 consecutive observations and used to predict only
    the observation that immediately follows that window, up to and
    including the last available return.

    Inputs:
        df (pd.DataFrame): contains ['SYMBOL', 'DATE', 'TIME', 'RETURN']
        symbol (str): the symbol to process
        alpha (float): regularisation strength passed to Ridge
            (default 10e-1, i.e. 1.0)
        PLOT__ (bool): if True, shows scatterplot of actual vs predicted

    Outputs:
        r2 (float): R² score of the out-of-sample predictions
        sign_accuracy (float): fraction of predictions whose sign equals the
            sign of the actual return
        ci (tuple): (low, high) = sign_accuracy ± z * sqrt(0.25 / n) with the
            two-sided 99% normal quantile z; the half-width uses the standard
            error under the null hypothesis p = 0.5
        significant (str): 'YES' if the interval excludes 0.5, otherwise 'NO'

    If the symbol has fewer than 61 observations (50 training rows + 10 lags
    + 1 target), returns (nan, nan, (nan, nan), 'NA').
    '''
    group = df[df['SYMBOL'] == symbol].sort_values(by=['DATE', 'TIME']).reset_index(drop=True)
    returns = group['RETURN'].values

    max_lag = 10
    train_size = 50
    # One prediction needs train_size training rows plus one test row, and
    # every row consumes max_lag earlier returns as features.
    min_required = train_size + max_lag + 1

    if len(returns) < min_required:
        return np.nan, np.nan, (np.nan, np.nan), 'NA'

    # Row j of X_lags holds returns[j : j + max_lag]; its target is
    # y_all[j] = returns[j + max_lag], so features never overlap the target.
    n_rows = len(returns) - max_lag
    X_lags = np.column_stack([returns[i:i + n_rows] for i in range(max_lag)])
    y_all = returns[max_lag:]

    # Window t trains on rows [t, t + train_size) and tests on row
    # t + train_size. The last valid test row is n_rows - 1, hence
    # n_rows - train_size windows (the guard above guarantees at least one).
    n_windows = n_rows - train_size
    y_true = y_all[train_size:]

    y_pred = []
    for t in range(n_windows):
        model = Ridge(alpha=alpha)
        model.fit(X_lags[t:t + train_size], y_all[t:t + train_size])

        x_test = X_lags[t + train_size].reshape(1, -1)
        y_pred.append(model.predict(x_test)[0])
    y_pred = np.array(y_pred)

    r2 = r2_score(y_true, y_pred)
    sign_accuracy = np.mean(np.sign(y_true) == np.sign(y_pred))

    # Interval (99%) centred on the observed accuracy, with the standard
    # error evaluated at p_0 = 0.5. Checking whether it excludes 0.5 is
    # therefore a two-sided z-test of H0: sign accuracy = 0.5.
    n = len(y_true)
    p_0 = 0.5
    z = norm.ppf(1 - 0.01 / 2)  # two-tailed 99% → z ≈ 2.576
    margin = z * np.sqrt(p_0 * (1 - p_0) / n)
    ci_low = sign_accuracy - margin
    ci_high = sign_accuracy + margin
    significant = "YES" if ci_low > 0.5 or ci_high < 0.5 else "NO"

    if PLOT__:
        lo, hi = y_true.min(), y_true.max()
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.6)
        # Dashed diagonal = perfect prediction.
        plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
        plt.xlabel('Actual RETURN')
        plt.ylabel('Predicted RETURN')
        plt.title(f'{symbol}\nR² = {r2:.4f}, Sign Accuracy = {sign_accuracy:.2%}')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return r2, sign_accuracy, (ci_low, ci_high), significant

In [None]:
# Single-symbol check of the rolling ridge model; alpha and PLOT__ keep their
# defaults, and the returned tuple is the cell output.
rolling_ridge_single_symbol_fit_over_50(df=dataset, symbol='ACTD')

In [None]:
# Run the rolling ridge evaluation on a random sample of up to 100 symbols.
symbols = dataset['SYMBOL'].dropna().unique()
# random.sample raises ValueError when k exceeds the population size, so cap
# the sample size at the number of available symbols.
random_symbols = random.sample(list(symbols), k=min(100, len(symbols)))
for symbol in random_symbols:
    r2, sign_acc, _, significant = rolling_ridge_single_symbol_fit_over_50(dataset, symbol, PLOT__=False)
    print(f'{symbol}: R² = {r2:.4f}, Sign Accuracy = {sign_acc:.2%},Significance={significant}')

It seems that we are doing better than a fair coin toss, but it is important to keep in mind that if we forecast a constant sign (for example, always positive) and the stock does go up most of the time, we will still obtain a good accuracy. Of course, that would not be a good model: accuracy is not the only metric for evaluating a model's performance. We will therefore also look at the Sharpe ratio of the strategy built from the model.

In [None]:
def rolling_lasso_single_symbol_fit_over_50(df: pd.DataFrame, symbol: str, alpha: float=10e-1,PLOT__: bool = True) -> tuple[float, float, tuple[float, float], str]:
    '''
    Evaluate one-step-ahead rolling lasso forecasts of a symbol's returns.

    The rows of `symbol` are sorted by ['DATE', 'TIME']. Every return is paired
    with its 10 preceding returns as features. A fresh Lasso model is fitted
    on each window of 50 consecutive observations and used to predict only
    the observation that immediately follows that window, up to and
    including the last available return.

    Inputs:
        df (pd.DataFrame): contains ['SYMBOL', 'DATE', 'TIME', 'RETURN']
        symbol (str): the symbol to process
        alpha (float): regularisation strength passed to Lasso
            (default 10e-1, i.e. 1.0)
        PLOT__ (bool): if True, shows scatterplot of actual vs predicted

    Outputs:
        r2 (float): R² score of the out-of-sample predictions
        sign_accuracy (float): fraction of predictions whose sign equals the
            sign of the actual return
        ci (tuple): (low, high) = sign_accuracy ± z * sqrt(0.25 / n) with the
            two-sided 99% normal quantile z; the half-width uses the standard
            error under the null hypothesis p = 0.5
        significant (str): 'YES' if the interval excludes 0.5, otherwise 'NO'

    If the symbol has fewer than 61 observations (50 training rows + 10 lags
    + 1 target), returns (nan, nan, (nan, nan), 'NA').
    '''
    group = df[df['SYMBOL'] == symbol].sort_values(by=['DATE', 'TIME']).reset_index(drop=True)
    returns = group['RETURN'].values

    max_lag = 10
    train_size = 50
    # One prediction needs train_size training rows plus one test row, and
    # every row consumes max_lag earlier returns as features.
    min_required = train_size + max_lag + 1

    if len(returns) < min_required:
        return np.nan, np.nan, (np.nan, np.nan), 'NA'

    # Row j of X_lags holds returns[j : j + max_lag]; its target is
    # y_all[j] = returns[j + max_lag], so features never overlap the target.
    n_rows = len(returns) - max_lag
    X_lags = np.column_stack([returns[i:i + n_rows] for i in range(max_lag)])
    y_all = returns[max_lag:]

    # Window t trains on rows [t, t + train_size) and tests on row
    # t + train_size. The last valid test row is n_rows - 1, hence
    # n_rows - train_size windows (the guard above guarantees at least one).
    n_windows = n_rows - train_size
    y_true = y_all[train_size:]

    y_pred = []
    for t in range(n_windows):
        model = Lasso(alpha=alpha)
        model.fit(X_lags[t:t + train_size], y_all[t:t + train_size])

        x_test = X_lags[t + train_size].reshape(1, -1)
        y_pred.append(model.predict(x_test)[0])
    y_pred = np.array(y_pred)

    r2 = r2_score(y_true, y_pred)
    sign_accuracy = np.mean(np.sign(y_true) == np.sign(y_pred))

    # Interval (99%) centred on the observed accuracy, with the standard
    # error evaluated at p_0 = 0.5. Checking whether it excludes 0.5 is
    # therefore a two-sided z-test of H0: sign accuracy = 0.5.
    n = len(y_true)
    p_0 = 0.5
    z = norm.ppf(1 - 0.01 / 2)  # two-tailed 99% → z ≈ 2.576
    margin = z * np.sqrt(p_0 * (1 - p_0) / n)
    ci_low = sign_accuracy - margin
    ci_high = sign_accuracy + margin
    significant = "YES" if ci_low > 0.5 or ci_high < 0.5 else "NO"

    if PLOT__:
        lo, hi = y_true.min(), y_true.max()
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.6)
        # Dashed diagonal = perfect prediction.
        plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
        plt.xlabel('Actual RETURN')
        plt.ylabel('Predicted RETURN')
        plt.title(f'{symbol}\nR² = {r2:.4f}, Sign Accuracy = {sign_accuracy:.2%}')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return r2, sign_accuracy, (ci_low, ci_high), significant

In [None]:
# Single-symbol check of the rolling lasso model; alpha and PLOT__ keep their
# defaults, and the returned tuple is the cell output.
rolling_lasso_single_symbol_fit_over_50(df=dataset, symbol='AFGB')

In [None]:
# Run the rolling lasso evaluation on a random sample of up to 100 symbols.
symbols = dataset['SYMBOL'].dropna().unique()
# random.sample raises ValueError when k exceeds the population size, so cap
# the sample size at the number of available symbols.
random_symbols = random.sample(list(symbols), k=min(100, len(symbols)))
for symbol in random_symbols:
    r2, sign_acc, _, significant = rolling_lasso_single_symbol_fit_over_50(dataset, symbol, PLOT__=False)
    print(f'{symbol}: R² = {r2:.4f}, Sign Accuracy = {sign_acc:.2%},Significance={significant}')

In [None]:
def rolling_arima_single_symbol_fit_over_50(df: pd.DataFrame, symbol: str, PLOT__: bool = True) -> tuple[float, float, tuple[float, float], str]:
    '''
    Evaluate one-step-ahead rolling autoregressive forecasts of a symbol's
    returns using pmdarima's auto_arima.

    The rows of `symbol` are sorted by ['DATE', 'TIME']. For every window of
    50 consecutive raw returns, auto_arima is fitted with d=0 and max_q=0
    (pure AR, order searched up to max_p=10) and used to forecast only the
    return that immediately follows the window, up to and including the last
    available return. Windows where fitting or forecasting raises are skipped;
    if any were skipped, a single RuntimeWarning reports how many.

    Inputs:
        df (pd.DataFrame): must contain ['SYMBOL', 'DATE', 'TIME', 'RETURN']
        symbol (str): asset to analyze
        PLOT__ (bool): if True, shows scatterplot of actual vs predicted

    Outputs:
        r2 (float): R² score of the out-of-sample forecasts
        sign_accuracy (float): fraction of forecasts whose sign equals the
            sign of the actual return
        ci (tuple): (low, high) = sign_accuracy ± z * sqrt(0.25 / n) with the
            two-sided 99% normal quantile z; the half-width uses the standard
            error under the null hypothesis p = 0.5
        significant (str): 'YES' if the interval excludes 0.5, otherwise 'NO'

    If the symbol has fewer than 51 observations, or every window failed,
    returns (nan, nan, (nan, nan), 'NA').
    '''
    import warnings

    group = df[df['SYMBOL'] == symbol].sort_values(by=['DATE', 'TIME']).reset_index(drop=True)
    returns = group['RETURN'].values

    train_size = 50
    if len(returns) < train_size + 1:
        return np.nan, np.nan, (np.nan, np.nan), 'NA'

    y_true = []
    y_pred = []
    n_failed = 0

    # Window t trains on returns[t : t + train_size] and forecasts
    # returns[t + train_size]; the last valid target index is
    # len(returns) - 1, hence len(returns) - train_size windows.
    for t in range(len(returns) - train_size):
        y_train = returns[t:t + train_size]

        try:
            model = pm.auto_arima(
                y_train,
                start_p=1, max_p=10,
                d=0,      # no differencing
                start_q=0, max_q=0,  # pure AR
                seasonal=False,
                stepwise=True,
                suppress_warnings=True,
                error_action='ignore'
            )
            forecast = model.predict(n_periods=1)[0]
        except Exception:
            # Best-effort: a window that cannot be fitted is skipped rather
            # than aborting the whole evaluation, but it is counted so the
            # caller learns that n is smaller than the number of windows.
            n_failed += 1
            continue

        y_pred.append(forecast)
        y_true.append(returns[t + train_size])

    if n_failed:
        warnings.warn(
            f'{symbol}: skipped {n_failed} window(s) where auto_arima failed',
            RuntimeWarning,
            stacklevel=2,
        )

    if not y_true:
        return np.nan, np.nan, (np.nan, np.nan), 'NA'

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    r2 = r2_score(y_true, y_pred)
    sign_accuracy = np.mean(np.sign(y_true) == np.sign(y_pred))

    # Interval (99%) centred on the observed accuracy, with the standard
    # error evaluated at p_0 = 0.5. Checking whether it excludes 0.5 is
    # therefore a two-sided z-test of H0: sign accuracy = 0.5.
    n = len(y_true)
    p_0 = 0.5
    z = norm.ppf(1 - 0.01 / 2)  # two-tailed 99% → z ≈ 2.576
    margin = z * np.sqrt(p_0 * (1 - p_0) / n)
    ci_low = sign_accuracy - margin
    ci_high = sign_accuracy + margin
    significant = "YES" if ci_low > 0.5 or ci_high < 0.5 else "NO"

    if PLOT__:
        lo, hi = y_true.min(), y_true.max()
        plt.figure(figsize=(6, 6))
        plt.scatter(y_true, y_pred, alpha=0.6)
        # Dashed diagonal = perfect prediction.
        plt.plot([lo, hi], [lo, hi], color='red', linestyle='--')
        plt.xlabel('Actual RETURN')
        plt.ylabel('Predicted RETURN')
        plt.title(f'{symbol}\nR² = {r2:.4f}, Sign Accuracy = {sign_accuracy:.2%}')
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return r2, sign_accuracy, (ci_low, ci_high), significant


In [None]:
# Single-symbol check of the rolling auto_arima model; PLOT__ keeps its
# default (True), and the returned tuple is the cell output.
rolling_arima_single_symbol_fit_over_50(df=dataset, symbol='AFGB')

In [None]:
# Run the rolling auto_arima evaluation on a random sample of up to 10 symbols.
symbols = dataset['SYMBOL'].dropna().unique()
# random.sample raises ValueError when k exceeds the population size, so cap
# the sample size at the number of available symbols.
random_symbols = random.sample(list(symbols), k=min(10, len(symbols)))
for symbol in random_symbols:
    r2, sign_acc, _, significant = rolling_arima_single_symbol_fit_over_50(dataset, symbol, PLOT__=False)
    print(f'{symbol}: R² = {r2:.4f}, Sign Accuracy = {sign_acc:.2%},Significance={significant}')