In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report


In [2]:
tickers = ["NVDA", "AMD", "TSM", "INTC"]
start_date = "2018-01-01"

data = yf.download(tickers, start=start_date, progress=False)
prices = data["Close"].dropna()
returns = prices.pct_change().dropna()


In [3]:
RV_WINDOW = 10

realized_vol = (
    returns
    .rolling(RV_WINDOW)
    .std()
    * np.sqrt(252)
)

realized_vol.tail()


Ticker,AMD,INTC,NVDA,TSM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2025-12-19,0.518257,0.325619,0.378771,0.391736
2025-12-22,0.513215,0.314448,0.375925,0.377202
2025-12-23,0.512728,0.308245,0.405771,0.383095
2025-12-24,0.512932,0.29468,0.404072,0.363202
2025-12-26,0.512896,0.279467,0.391711,0.366148


In [4]:
FWD_WINDOW = 5

future_vol = realized_vol.shift(-FWD_WINDOW)

vol_ratio = future_vol / realized_vol

VOL_EXPANSION_THRESHOLD = 1.25

vol_regime = (vol_ratio > VOL_EXPANSION_THRESHOLD).astype(int)
vol_regime = vol_regime.dropna()


In [5]:
WINDOW = 20
Z_THRESHOLD = 2.5

z_scores = (
    returns
    .rolling(WINDOW)
    .apply(lambda x: (x.iloc[-1] - x.mean()) / x.std(), raw=False)
)

shock_strength = z_scores.abs()


In [8]:
features = pd.concat([
    shock_strength.mean(axis=1),     # sector shock
    realized_vol.mean(axis=1),       # current vol
    returns.mean(axis=1)             # sector drift
], axis=1)

features.columns = [
    "sector_shock",
    "current_vol",
    "sector_return"
]

sector_vol_regime = (vol_regime.max(axis=1) > 0).astype(int)

dataset = features.join(
    sector_vol_regime.rename("target")
).dropna()

dataset.head()


Unnamed: 0_level_0,sector_shock,current_vol,sector_return,target
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-31,0.688171,0.406804,0.019137,1
2018-02-01,0.947142,0.419998,-0.016078,1
2018-02-02,1.610573,0.465203,-0.03539,1
2018-02-05,2.061933,0.533146,-0.056481,1
2018-02-06,0.837893,0.555742,0.023253,1


In [9]:
X = dataset.drop("target", axis=1)
y = dataset["target"]

model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression())
])

model.fit(X, y)


0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [10]:
preds = model.predict(X)
print(classification_report(y, preds))


              precision    recall  f1-score   support

           0       0.64      0.52      0.58       958
           1       0.62      0.73      0.67      1030

    accuracy                           0.63      1988
   macro avg       0.63      0.63      0.62      1988
weighted avg       0.63      0.63      0.63      1988



In [11]:
dataset["p_vol_expansion"] = model.predict_proba(X)[:, 1]
dataset[["p_vol_expansion"]].tail()


Unnamed: 0_level_0,p_vol_expansion
Date,Unnamed: 1_level_1
2025-12-19,0.53905
2025-12-22,0.500979
2025-12-23,0.491491
2025-12-24,0.483477
2025-12-26,0.490645


In [12]:
bt = dataset.copy()

bt["future_vol"] = realized_vol.mean(axis=1).shift(-5)
bt["vol_change"] = bt["future_vol"] - bt["current_vol"]

bt = bt.dropna()
bt.tail()


Unnamed: 0_level_0,sector_shock,current_vol,sector_return,target,p_vol_expansion,future_vol,vol_change
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-12-12,1.584818,0.395444,-0.041452,1,0.682206,0.403596,0.008152
2025-12-15,0.47683,0.391301,-0.007646,1,0.528739,0.395197,0.003896
2025-12-16,0.36606,0.330253,0.001854,1,0.582636,0.40246,0.072207
2025-12-17,1.466317,0.370643,-0.039835,1,0.696764,0.393721,0.023078
2025-12-18,0.798545,0.356378,0.016966,1,0.576793,0.387555,0.031177


In [13]:
ENTRY_THRESHOLD = 0.65

bt["trade"] = (bt["p_vol_expansion"] > ENTRY_THRESHOLD).astype(int)


In [14]:
bt.groupby("trade")["vol_change"].mean()


trade
0   -0.011311
1    0.037867
Name: vol_change, dtype: float64

In [15]:
bt[bt["trade"] == 1]["vol_change"].describe()


count    448.000000
mean       0.037867
std        0.069360
min       -0.109102
25%       -0.009641
50%        0.028485
75%        0.076963
max        0.443243
Name: vol_change, dtype: float64

In [16]:
bt["trade"].mean()


np.float64(0.2259203227433182)

In [17]:
bt.groupby(bt.index.year)["vol_change"].mean()


Date
2018    0.002652
2019   -0.006731
2020    0.000654
2021    0.002501
2022    0.000230
2023   -0.001371
2024    0.000445
2025    0.000257
Name: vol_change, dtype: float64

In [18]:
for t in [0.65, 0.7, 0.75]:
    mean_vol = bt[bt["p_vol_expansion"] > t]["vol_change"].mean()
    freq = (bt["p_vol_expansion"] > t).mean()
    print(t, mean_vol, freq)


0.65 0.037867214926402895 0.2259203227433182
0.7 0.04791388237621878 0.11901159858799798
0.75 0.058129315993059684 0.040342914775592535


In [20]:
def walk_forward_predictions(data, model, train_years=3):
    preds = []
    
    years = sorted(data.index.year.unique())
    
    for i in range(train_years, len(years)):
        train_years_range = years[i-train_years:i]
        test_year = years[i]

        train = data[data.index.year.isin(train_years_range)]
        test  = data[data.index.year == test_year]

        X_train = train.drop("target", axis=1)
        y_train = train["target"]

        X_test = test.drop("target", axis=1)

        model.fit(X_train, y_train)
        p = model.predict_proba(X_test)[:, 1]

        preds.append(
            pd.Series(p, index=X_test.index)
        )

    return pd.concat(preds)



In [21]:
LogisticRegression(
    class_weight="balanced",
    max_iter=1000
)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [22]:
thresholds = np.arange(0.55, 0.85, 0.05)

for t in thresholds:
    preds = (p_walk > t).astype(int)
    freq = preds.mean()
    recall = (preds[y_true == 1].sum() / (y_true == 1).sum())
    print(t, round(freq,2), round(recall,2))


NameError: name 'p_walk' is not defined

In [26]:
dataset.head()
dataset.index


DatetimeIndex(['2018-01-31', '2018-02-01', '2018-02-02', '2018-02-05',
               '2018-02-06', '2018-02-07', '2018-02-08', '2018-02-09',
               '2018-02-12', '2018-02-13',
               ...
               '2025-12-12', '2025-12-15', '2025-12-16', '2025-12-17',
               '2025-12-18', '2025-12-19', '2025-12-22', '2025-12-23',
               '2025-12-24', '2025-12-26'],
              dtype='datetime64[ns]', name='Date', length=1988, freq=None)

In [27]:
features["vol_compression"] = (
    realized_vol.mean(axis=1) <
    realized_vol.mean(axis=1).rolling(60).quantile(0.3)
)


In [28]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

model = Pipeline([
    ("scaler", StandardScaler()),
    ("clf", LogisticRegression(
        class_weight="balanced",
        max_iter=1000
    ))
])


In [29]:
p_walk = walk_forward_predictions(
    data=dataset,
    model=model,
    train_years=3
)


In [30]:
p_walk.head()
p_walk.tail()


Date
2025-12-19    0.571196
2025-12-22    0.539221
2025-12-23    0.526833
2025-12-24    0.524129
2025-12-26    0.536906
dtype: float64

In [31]:
y_true = dataset.loc[p_walk.index, "target"]


In [32]:
len(p_walk), len(y_true)


(1252, 1252)

In [33]:
thresholds = np.arange(0.55, 0.85, 0.05)

for t in thresholds:
    preds = (p_walk > t).astype(int)
    freq = preds.mean()
    recall = (preds[y_true == 1].sum() / (y_true == 1).sum())
    print(f"t={t:.2f} | freq={freq:.2f} | recall={recall:.2f}")


t=0.55 | freq=0.37 | recall=0.50
t=0.60 | freq=0.24 | recall=0.33
t=0.65 | freq=0.16 | recall=0.22
t=0.70 | freq=0.09 | recall=0.13
t=0.75 | freq=0.03 | recall=0.04
t=0.80 | freq=0.01 | recall=0.01


In [34]:
dataset        # features + target, datetime index
walk_forward_predictions
model


0,1,2
,steps,"[('scaler', ...), ('clf', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [35]:
p_walk = walk_forward_predictions(
    data=dataset,
    model=model,
    train_years=3
)


In [37]:
p_walk.head()
p_walk.tail()
p_walk.isna().sum()


np.int64(0)

In [38]:
y_true = dataset.loc[p_walk.index, "target"]

len(p_walk), len(y_true)


(1252, 1252)

In [39]:
import numpy as np

thresholds = np.arange(0.55, 0.85, 0.05)

results = []

for t in thresholds:
    preds = (p_walk > t).astype(int)
    freq = preds.mean()
    recall = (preds[y_true == 1].sum() / (y_true == 1).sum())
    
    results.append((t, freq, recall))
    print(f"t={t:.2f} | freq={freq:.2f} | recall={recall:.2f}")


t=0.55 | freq=0.37 | recall=0.50
t=0.60 | freq=0.24 | recall=0.33
t=0.65 | freq=0.16 | recall=0.22
t=0.70 | freq=0.09 | recall=0.13
t=0.75 | freq=0.03 | recall=0.04
t=0.80 | freq=0.01 | recall=0.01


In [40]:
eval_df = pd.DataFrame({
    "p": p_walk,
    "target": y_true
})

eval_df.groupby(eval_df.index.year)["p"].mean()


Date
2021    0.545471
2022    0.353166
2023    0.558349
2024    0.440376
2025    0.451097
Name: p, dtype: float64