In [83]:
# Day 5.4 – Permutation Sanity Check
# Purpose:
# Verify whether the absence of return predictability
# is structural or an artifact of temporal alignment.
#
# This notebook performs diagnostic analysis only.
# No model, feature, or validation logic is modified.


In [84]:
import sys
from pathlib import Path
import numpy as np
import pandas as pd

# Directory two levels above the current working directory; it is put on
# sys.path so that imports resolved relative to it work from this notebook.
project_root = Path().resolve().parent.parent
# Only add the entry when it is missing, so re-running this cell does not
# keep growing sys.path with duplicates.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))


In [85]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

In [86]:
from return_predictability.src.load_data import load_data



In [87]:
# Read the dataset through the project's loader and keep the "price" column
# as the working series.
df = load_data("tsla.csv")
price = df["price"]
# Show the first five rows (the default for head()).
price.head(n=5)

Date
2010-06-29    1.59267
2010-06-30    1.58867
2010-07-01    1.46400
2010-07-02    1.28000
2010-07-06    1.07400
Name: price, dtype: float64

In [88]:
# Log returns: first difference of the log price. The first entry is NaN
# because diff() has no previous row to subtract.
returns = np.log(price).diff()
# Only the last expression of a cell is displayed, so a bare mid-cell
# `returns.head()` would be evaluated and thrown away; it is omitted here.
type(returns)

pandas.core.series.Series

In [89]:
# Feature matrix: one column per lag k, holding the return series shifted
# down by k rows (so row t carries the return from k rows earlier).
X = pd.DataFrame({f"lag_{k}": returns.shift(k) for k in (1, 5)})
# Target: an independent copy of the unshifted return series.
y = returns.copy()
X.head()

Unnamed: 0_level_0,lag_1,lag_5
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2010-06-29,,
2010-06-30,,
2010-07-01,-0.002515,
2010-07-02,-0.081725,
2010-07-06,-0.134312,


In [90]:
# Keep only rows where every lag feature AND the target are present.
# Checking the features alone would let a missing target through whenever
# its lags happen to be complete, which would then break fitting and scoring.
mask = X.notnull().all(axis=1) & y.notnull()
X_clean = X.loc[mask]
y_clean = y.loc[mask]

In [91]:
# Seeded generator so the shuffle is reproducible across runs.
rng = np.random.default_rng(seed=42)
# Shuffle the target values while leaving the index (dates) in place, which
# breaks any alignment between each row's features and its target.
y_permuted = pd.Series(
    rng.permutation(y_clean.to_numpy()),
    index=y_clean.index,
    name=y_clean.name,
)


In [92]:
# The permuted target must keep exactly the same index as the clean one ...
assert y_clean.index.equals(y_permuted.index), "permutation changed the index"
# ... and must contain exactly the same values, merely in a different order.
assert np.array_equal(
    np.sort(y_permuted.to_numpy()),
    np.sort(y_clean.to_numpy()),
    equal_nan=True,
), "permuted target is not a rearrangement of the original values"

In [93]:
# Shapes of the cleaned feature matrix and of the cleaned target.
(X_clean.shape, y_clean.shape)


((3887, 2), (3887,))

In [94]:
# Containers for the walk-forward results on the true (unpermuted) targets:
# one forecast and one realized value per evaluated step.
predictions, actuals = [], []

In [95]:
# Expanding-window walk-forward evaluation on the true targets: at step t the
# model is fit on rows [0, t) and asked to predict row t only.
#
# The result lists are (re)created in this cell so that re-running it replaces
# the earlier results instead of appending a second copy to them.
predictions = []
actuals = []

# NOTE(review): the earliest steps fit two lag coefficients plus an intercept
# on fewer rows than parameters, so those first predictions come from an
# underdetermined fit. Consider a minimum training window.
for t in range(1, len(y_clean)):
    X_train = X_clean.iloc[:t]
    y_train = y_clean.iloc[:t]

    # Slice (rather than index) so the single test row stays two-dimensional.
    X_test = X_clean.iloc[t:t + 1]
    y_test = y_clean.iloc[t]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    predictions.append(float(y_pred[0]))
    actuals.append(float(y_test))

In [96]:
# Both result lists should hold one entry per evaluated step.
print(len(actuals) == len(predictions))
# First five forecasts, then the first five realized values.
print(predictions[0:5])
print(actuals[0:5])

True
[0.09990577494050412, 0.03770336995625995, -0.05166324439617888, 0.03603266158229525, 0.10773112291384776]
[-0.003442344190972807, -0.0203170699439377, 0.06196355206706522, 0.08958593367123063, 0.002514470851039452]


In [97]:
# Containers for the walk-forward results on the permuted targets:
# one forecast and one (shuffled) realized value per evaluated step.
predictions_permuted, actuals_permuted = [], []

In [98]:
# Same expanding-window walk-forward evaluation as for the true targets, but
# trained and scored against the permuted target. The features are untouched.
#
# The result lists are (re)created in this cell so that re-running it replaces
# the earlier results instead of appending a second copy to them.
predictions_permuted = []
actuals_permuted = []

# NOTE(review): the earliest steps fit two lag coefficients plus an intercept
# on fewer rows than parameters, so those first predictions come from an
# underdetermined fit. Consider a minimum training window.
for t in range(1, len(y_clean)):
    X_train = X_clean.iloc[:t]
    y_train = y_permuted.iloc[:t]

    # Slice (rather than index) so the single test row stays two-dimensional.
    X_test = X_clean.iloc[t:t + 1]
    y_test = y_permuted.iloc[t]

    model = LinearRegression()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    predictions_permuted.append(float(y_pred[0]))
    actuals_permuted.append(float(y_test))

In [99]:
# Out-of-sample R^2 over all walk-forward steps on the true targets.
r2 = r2_score(y_true=actuals, y_pred=predictions)
print(f"Walk-Forward R^2 with Clean Targets: {r2:.6f}")

Walk-Forward R^2 with Clean Targets: -0.014559


In [100]:
# Out-of-sample R^2 over all walk-forward steps on the permuted targets.
# Note that this rebinds `r2`, replacing the value from the clean run.
r2 = r2_score(y_true=actuals_permuted, y_pred=predictions_permuted)
print(f"Walk-Forward R^2 with Permuted Targets: {r2:.6f}")

Walk-Forward R^2 with Permuted Targets: -0.007152


In [101]:
# Under walk-forward validation,
# a linear model trained on lagged returns
# failed to produce positive out-of-sample R².
#
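# When the target variable was permuted,
# performance remained at a similar level
# (R² ≈ -0.007 permuted vs. ≈ -0.015 clean),
# i.e. the model extracts no more signal from the true temporal
# ordering of the targets than from a shuffled one.
# This indicates that the observed lack of predictability
# is not a pipeline artifact,
# but rather a consequence of the underlying problem structure.
#
# Caveat: this comparison rests on a single permutation (seed=42).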
