In [130]:
import sys
import os

sys.path.append(os.path.abspath(".."))  # Move one level up to 'project_root'
from scripts.utilities import *

import bisect
from TracyWidom import TracyWidom
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import statsmodels.api as sm

The file "history.csv" contains historical data for almost all US stocks and ETFs, it can be downloaded from the source below (1.5gb so decided not to push) \
Later I calculated returns (based on Adj close) for each trading day (relatively) and compiled it into 1 df so different stocks could be compared, the result is stored in returns.csv\
Source: [historical data for all US stocks and ETF's](https://www.kaggle.com/datasets/ericstanley/us-stock-market-history-data-csv)

Now i standartize the df by using the entire sample statistics for each stock \
Temporary decision considering that the distributions evolve in time, but i havent figured out a better approach yet

In [122]:
returns = pd.read_csv('../data/returns.csv', index_col=0)
R = (returns - returns.mean()) / returns.std()
R = R[:5000]
R

Unnamed: 0,INCR,EURN,NRP,AUB,ROIV,RBA,SBGI,EBON,JCTCF,EXAS,...,PFC,GM,BOTJ,GROV,DRH,ODD,SITM,CPBI,VRTS,TEL
1,,,,,,,,,,,...,,,,,,,,,,
2,2.711616,0.360892,0.332263,3.464503,-1.067960,1.052357,-0.610278,-1.320146,-0.025771,-0.948870,...,-0.024715,0.076327,-0.022619,0.239008,2.347903,1.229809,8.492683,-0.650456,-5.839329,-0.094508
3,-0.952001,-0.748772,-0.113213,-0.421184,-0.169015,1.945229,0.129280,0.683870,-1.014859,-0.507489,...,0.356745,-0.262743,-0.022619,-0.186093,-0.567944,1.211574,-0.697322,-0.069455,12.543608,-2.169173
4,-0.287062,0.629069,-0.910800,1.980605,0.027784,-1.506999,0.278080,-2.124831,-0.025771,-0.124423,...,-0.402738,-1.149779,-0.022619,0.025558,-0.017975,-1.312264,-1.045595,0.164229,-5.040725,-0.639139
5,-0.032006,-2.085596,0.138817,-0.024685,0.182556,2.532913,-0.613352,1.342251,0.995224,-0.025982,...,0.738220,0.302487,2.731230,0.025558,-0.437815,-0.665729,-0.224650,-0.885547,-1.204462,-1.057872
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4996,,,-0.614268,-0.024685,,0.392320,0.069048,,-0.025771,-0.556782,...,0.592916,,0.859379,,,,,,,
4997,,,0.574776,0.211701,,-0.086490,-0.075082,,-0.025771,0.409084,...,-0.375165,,-1.352522,,,,,,,
4998,,,1.082311,-0.009028,,0.274396,-0.417434,,-0.025771,-0.591007,...,-0.043330,,-0.455364,,,,,,,
4999,,,-0.657759,-0.040335,,-1.064821,0.435590,,-0.025771,0.025330,...,0.496319,,-0.963374,,,,,,,


Checking if the samples for each timestamp are iid using Baik–Deift–Johansson theorem \
Let L_n be the length of the longest increasing subsequence in a random permutation sampled uniformly from S_n, the permutation group on n elements. Then the cumulative distribution function of $\frac {l_{n}-2N^{1/2}}{N^{1/6}}$  converges to F_2

Sources: \
[Baik–Deift–Johansson theorem](https://en.wikipedia.org/wiki/Baik–Deift–Johansson_theorem) \
[Tracy–Widom distribution](https://en.wikipedia.org/wiki/Tracy–Widom_distribution)


In [123]:
def LIS(seq):
    sub = []
    for x in seq:
        i = bisect.bisect_left(sub, x)
        if i == len(sub):
            sub.append(x)
        else:
            sub[i] = x
    return len(sub)

def TWT(data):
    n = len(data)
    L_n = LIS(data)
    z = (L_n - 2 * np.sqrt(n)) / (n ** (1/6))
    p_val = 1 - TracyWidom(beta=2).cdf(z)
    return L_n, z, p_val

\begin{align*}
H_0 &: \text{The data are independent and identically distributed (i.i.d.) from a continuous distribution.} \\
H_a &: \text{The data are not i.i.d.; i.e., they exhibit dependence or come from different distributions.}
\end{align*}

In [124]:
data = []
for i in range(1, len(R) - 1):
    val = R.iloc[i, :].to_list()
    L_n, z, p_val = TWT(val)
    data.append([L_n, z, p_val])

In [125]:
data = np.array(data)
pvals = data[:,2]

len(np.where(pvals < 0.05)[0]) / len(pvals)  # proportion of p-values < 0.05

0.0056022408963585435

Plotting the data

In [126]:
fig = go.Figure()

data_sets = R[1::100].values

# Add histogram traces
for i, data in enumerate(data_sets):
    fig.add_trace(go.Histogram(
        x=data, histnorm='probability density',
        visible=(i == 0)
    ))

# Create slider steps with annotations
steps = []
for i, data in enumerate(data_sets):
    vis = [j == i for j in range(len(data_sets))]
    data_clean = data[~np.isnan(data)]
    mean = np.mean(data_clean)
    std = np.std(data)
    annotation = [dict(
        x=0, y=1.05, xref='paper', yref='paper',
        text=f"Mean: {mean:.2f}, Std: {std:.2f}",
        showarrow=False, font=dict(size=14)
    )]
    steps.append(dict(
        method='update',
        args=[
            {'visible': vis},
            {'annotations': annotation}
        ],
        label=f'Set {i}'
    ))

fig.update_layout(
    xaxis_range=[-10, 10],
    sliders=[{'active': 0, 'steps': steps}],
    annotations=[dict(
        x=0, y=1.05, xref='paper', yref='paper',
        text=f"Mean: {np.mean(data_sets[0]):.2f}, Std: {np.std(data_sets[0]):.2f}",
        showarrow=False, font=dict(size=14)
    )]
)

fig.show()


In [127]:
means = R.mean(axis=1)
stds = R.std(axis=1)

# Create subplot figure
fig = make_subplots(rows=1, cols=2, subplot_titles=("Mean over Time", "Standard Deviation over Time"))

# Mean plot
fig.add_trace(go.Scatter(y=means, mode='markers', name='Mean', marker=dict(size=6, opacity=0.3)), row=1, col=1)

# Std plot
fig.add_trace(go.Scatter(y=stds, mode='markers', name='Std Dev', marker=dict(size=6, opacity=0.3)), row=1, col=2)

fig.update_layout(height=400, width=1200)
fig.show()

Running regressions just to see the trends (thank god they are signigicant)

In [133]:
means = means.dropna()

X = sm.add_constant(means.index)

mod = sm.OLS(means, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.005
Model:                            OLS   Adj. R-squared:                  0.005
Method:                 Least Squares   F-statistic:                     26.75
Date:                Thu, 17 Apr 2025   Prob (F-statistic):           2.41e-07
Time:                        17:27:58   Log-Likelihood:                 9716.6
No. Observations:                4999   AIC:                        -1.943e+04
Df Residuals:                    4997   BIC:                        -1.942e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.0053      0.001      5.375      0.0

In [134]:
stds = stds.dropna()

X = sm.add_constant(stds.index)

mod = sm.OLS(stds, X)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.194
Model:                            OLS   Adj. R-squared:                  0.194
Method:                 Least Squares   F-statistic:                     1205.
Date:                Thu, 17 Apr 2025   Prob (F-statistic):          1.19e-236
Time:                        17:28:08   Log-Likelihood:                 4300.1
No. Observations:                4999   AIC:                            -8596.
Df Residuals:                    4997   BIC:                            -8583.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.0737      0.003    370.527      0.0