## Given today’s market conditions, how likely is it that the market will experience a risky drawdown tomorrow?

In [27]:
import pandas as pd
import numpy as np
import yfinance as yf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


In [28]:
df = yf.download("^GSPC", start="2005-01-01", end="2024-01-01")
df.head()


  df = yf.download("^GSPC", start="2005-01-01", end="2024-01-01")
[*********************100%***********************]  1 of 1 completed


Price,Close,High,Low,Open,Volume
Ticker,^GSPC,^GSPC,^GSPC,^GSPC,^GSPC
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2005-01-03,1202.079956,1217.800049,1200.319946,1211.920044,1510800000
2005-01-04,1188.050049,1205.839966,1185.390015,1202.079956,1721000000
2005-01-05,1183.73999,1192.72998,1183.719971,1188.050049,1738900000
2005-01-06,1187.890015,1191.630005,1183.27002,1183.73999,1569100000
2005-01-07,1186.189941,1192.199951,1182.160034,1187.890015,1477900000


In [29]:
df.shape

(4781, 5)

In [30]:
df.columns

MultiIndex([( 'Close', '^GSPC'),
            (  'High', '^GSPC'),
            (   'Low', '^GSPC'),
            (  'Open', '^GSPC'),
            ('Volume', '^GSPC')],
           names=['Price', 'Ticker'])

In [31]:
df.columns = df.columns.get_level_values(0)


In [32]:
print(df.columns)


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')


In [33]:
df.columns.name = None


In [34]:
print(df.columns)


Index(['Close', 'High', 'Low', 'Open', 'Volume'], dtype='object')


In [35]:
df = df[['Close']]
df.dropna(inplace=True)

In [36]:
df.shape

(4781, 1)

## Feature Engineering: Explainable Features

In [37]:
# Daily Returns: How much did the market move today?

df['return'] = df['Close'].pct_change()


In [38]:
# Volatility (Risk): How violently prices move (risk intensity)

df['volatility_20'] = df['return'].rolling(20).std()


In [39]:
# Trend Strength (MA Slope):Is the market trending up or down?

df['ma_20'] = df['Close'].rolling(20).mean()
df['trend_strength'] = df['ma_20'].pct_change()


In [40]:
# Momentum: Short-term directional pressure

df['momentum_5'] = df['return'].rolling(5).sum()

In [41]:
# Drawdown: How deep are losses from the peak?

rolling_max = df['Close'].cummax()
df['drawdown'] = (df['Close'] - rolling_max) / rolling_max


In [42]:
df.shape

(4781, 7)

In [43]:
df.head()

Unnamed: 0_level_0,Close,return,volatility_20,ma_20,trend_strength,momentum_5,drawdown
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2005-01-03,1202.079956,,,,,,0.0
2005-01-04,1188.050049,-0.011671,,,,,-0.011671
2005-01-05,1183.73999,-0.003628,,,,,-0.015257
2005-01-06,1187.890015,0.003506,,,,,-0.011804
2005-01-07,1186.189941,-0.001431,,,,,-0.013219


In [44]:
df.dropna(inplace=True)


In [45]:
df.shape

(4761, 7)

# Regime Based

In [46]:
regimes = pd.read_csv("./outputs/regimes.csv")


In [47]:
df.columns

Index(['Close', 'return', 'volatility_20', 'ma_20', 'trend_strength',
       'momentum_5', 'drawdown'],
      dtype='object')

In [48]:
regimes.columns

Index(['Date', 'return', 'volatility', 'drawdown', 'regime'], dtype='object')

In [49]:
df = df.reset_index()


In [50]:
df.columns

Index(['Date', 'Close', 'return', 'volatility_20', 'ma_20', 'trend_strength',
       'momentum_5', 'drawdown'],
      dtype='object')

In [54]:
print(df['Date'].dtype)
print(regimes['Date'].dtype)


datetime64[ns]
object


In [55]:
df['Date'] = pd.to_datetime(df['Date'])
regimes['Date'] = pd.to_datetime(regimes['Date'])

In [56]:
print(df['Date'].dtype)
print(regimes['Date'].dtype)


datetime64[ns]
datetime64[ns]


In [57]:
df = df.merge(
    regimes[['Date', 'regime']],
    on='Date',
    how='inner'
)


In [58]:
df[['Date', 'regime']].head()

Unnamed: 0,Date,regime
0,2005-02-01,0
1,2005-02-02,0
2,2005-02-03,0
3,2005-02-04,1
4,2005-02-07,0


In [59]:
df['regime'].value_counts()

Unnamed: 0_level_0,count
regime,Unnamed: 1_level_1
1,3301
0,1169
2,291


## Target


In [60]:
df['future_drawdown'] = df['drawdown'].shift(-1)

In [61]:
df['risk_target'] = (df['future_drawdown'] < -0.02).astype(int)

In [65]:
df.sample(5)

Unnamed: 0,Date,Close,return,volatility_20,ma_20,trend_strength,momentum_5,drawdown,regime,future_drawdown,risk_target
4433,2022-09-12,4110.410156,0.010584,0.012703,4099.388062,-0.002066,0.046712,-0.14305,1,-0.180102,1
917,2008-09-23,1188.219971,-0.015633,0.02619,1240.652997,-0.003158,-0.017412,-0.240827,0,-0.242328,1
165,2005-09-27,1215.660034,2.5e-05,0.006011,1224.978503,0.000138,-0.004613,-0.023598,0,-0.02261,1
351,2006-06-23,1244.5,-0.000883,0.00979,1255.0495,-0.001129,-0.005538,-0.061293,0,-0.056722,1
2696,2015-10-16,2033.109985,0.00457,0.010634,1966.829993,0.001912,0.009157,-0.045856,1,-0.045597,1


# Next-day drawdown risk prediction (binary classification)

- 1 → High risk tomorrow (≥2% drawdown)

- 0 → Normal conditions

- Binary
 & Intuitive: Risk-focused

In [66]:
features = [
    'regime',
    'volatility_20',
    'trend_strength',
    'momentum_5'
]

X = df[features]
y = df['risk_target']


- regime → Market environment (calm / volatile / crisis)
- volatility_20 → Current risk intensity
- trend_strength → Is the market trending or weakening?
- momentum_5 → Short-term directional pressure

**Condition risk prediction on both current regime and market dynamics**

In [67]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    shuffle=False
)


In [68]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [69]:
model = LogisticRegression(max_iter=1000)

model.fit(X_train_scaled, y_train)


In [70]:
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

print(classification_report(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_prob))


              precision    recall  f1-score   support

           0       0.74      0.60      0.67       445
           1       0.79      0.88      0.83       746

    accuracy                           0.77      1191
   macro avg       0.77      0.74      0.75      1191
weighted avg       0.77      0.77      0.77      1191

ROC AUC: 0.8629273729553875


In [71]:
coefficients = pd.DataFrame({
    "Feature": features,
    "Coefficient": model.coef_[0]
}).sort_values(by="Coefficient", ascending=False)

coefficients


Unnamed: 0,Feature,Coefficient
1,volatility_20,3.793446
2,trend_strength,-0.083148
0,regime,-0.823399
3,momentum_5,-0.878395


- volatility_20   +3.79   → strongest risk driver
- regime          -0.82   → regimes matter a lot
- momentum_5      -0.87   → negative momentum increases risk
- trend_strength  -0.08   → weaker but still informative


**Volatility is the strongest predictor of drawdown risk, but regime information significantly improves prediction, confirming that risk behaves differently across market states.**

## Market regime is one of the strongest predictors of future drawdown risk.

In [72]:
X_no_regime = df[
    ['volatility_20', 'trend_strength', 'momentum_5']
]

X_train_nr, X_test_nr, y_train, y_test = train_test_split(
    X_no_regime, y,
    test_size=0.25,
    shuffle=False
)

scaler_nr = StandardScaler()
X_train_nr = scaler_nr.fit_transform(X_train_nr)
X_test_nr = scaler_nr.transform(X_test_nr)

model_nr = LogisticRegression(max_iter=1000)
model_nr.fit(X_train_nr, y_train)

roc_nr = roc_auc_score(y_test, model_nr.predict_proba(X_test_nr)[:, 1])


In [73]:
print("With regime:", roc_auc_score(y_test, y_prob))
print("Without regime:", roc_nr)


With regime: 0.8629273729553875
Without regime: 0.8564026869897883


Including market regime information improves predictive performance, validating the two-stage pipeline design.