# In [None]:
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout, Bidirectional, LayerNormalization
from tensorflow.keras.losses import Huber
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from ta.momentum import RSIIndicator
from ta.trend import MACD
from ta.volatility import BollingerBands
from statsmodels.tsa.stattools import grangercausalitytests
import warnings
warnings.filterwarnings("ignore")

# DETERMINISM
# Pin every random source used by this script to the same seed so that
# repeated runs start from the same state.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up;
# confirm that setting it here has the intended effect.
os.environ['PYTHONHASHSEED'] = '42'
for _apply_seed in (random.seed, np.random.seed, tf.random.set_seed):
    _apply_seed(42)
# Silence NumPy floating-point warnings for invalid operations (e.g. 0/0).
np.seterr(invalid='ignore')

# Root directory of the market-data CSVs; each category below is a sub-folder of it.
BASE_PATH = '/kaggle/input/marketdata-ohlcv/market_data'
# Sub-folders scanned for per-instrument CSV files (folders that do not exist are skipped).
category_folders = ['mag7', 'sectors', 'crypto', 'forex', 'commodities', 'indices', 'macro']
# One 2-D array of percentage-change features per successfully loaded instrument.
all_data = []
# Tuples of (file, correlation, weight, weighted correlation), one per instrument.
correlation_report = []
# Tuples of (file, smallest Granger p-value or NaN), one per instrument.
granger_report = []

# TARGET: SPY
# Load the SPY daily file; the first two lines of the CSV are skipped and the
# columns are renamed explicitly.
spy_df = pd.read_csv(f'{BASE_PATH}/target/SPY_1d.csv', skiprows=2)
spy_df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
# Take an explicit copy of the filtered rows: the column assignments below
# would otherwise write into a slice of the original frame (chained
# assignment), which pandas does not guarantee to propagate.
spy_df = spy_df[spy_df['Date'] >= '2022-07-01'].copy()
spy_df['Close'] = spy_df['Close'].astype(float)
# Daily simple returns; the undefined first return is back-filled with the second.
spy_df['Returns'] = spy_df['Close'].pct_change().bfill()
# Column vector (n_rows, 1) of SPY returns.
spy_returns = spy_df['Returns'].values.reshape(-1, 1)

# TA INDICATORS
# Technical indicators are computed on a copy so spy_df itself keeps only the
# raw columns plus 'Returns'.
temp_df = spy_df.copy()
_spy_close_series = temp_df['Close']
_macd_indicator = MACD(close=_spy_close_series)
_bollinger_bands = BollingerBands(close=_spy_close_series)
temp_df = temp_df.assign(
    RSI=RSIIndicator(close=_spy_close_series, window=14).rsi(),
    MACD=_macd_indicator.macd(),
    MACD_signal=_macd_indicator.macd_signal(),
    BB_upper=_bollinger_bands.bollinger_hband(),
    BB_lower=_bollinger_bands.bollinger_lband(),
)
_ta_columns = ['RSI', 'MACD', 'MACD_signal', 'BB_upper', 'BB_lower']
# (n_rows, 5) indicator matrix; indicator warm-up rows may still hold NaN here.
spy_ta = temp_df[_ta_columns].values
# (n_rows, 1) closing prices and the matching array of datetimes.
spy_close_full = spy_df[['Close']].to_numpy()
spy_dates = pd.to_datetime(spy_df['Date']).to_numpy()

# LOAD CATEGORY DATA
# Column order of every per-instrument feature block appended to all_data.
_OHLCV_COLUMNS = ['Open', 'High', 'Low', 'Close', 'Volume']


def _correlation_weight(corr):
    """Return the emphasis multiplier for a correlation with SPY returns.

    Negative correlations are up-weighted in bands (2, 3, 4, 5 as the
    correlation becomes more negative); everything else gets weight 1.
    """
    if -0.025 <= corr < 0:
        return 2
    if -0.05 <= corr < -0.025:
        return 3
    if -0.75 <= corr < -0.05:
        return 4
    if -1 <= corr < -0.75:
        return 5
    return 1


# SPY's calendar as 'YYYY-MM-DD' keys. Instruments are aligned to SPY by date,
# not by row position, so calendars that differ from SPY's (extra weekend rows,
# different holidays) cannot shift features onto the wrong day.
spy_date_keys = spy_df['Date'].astype(str).str[:10].to_numpy()
spy_returns_1d = spy_df['Returns'].to_numpy(dtype=float)

for folder in category_folders:
    folder_path = os.path.join(BASE_PATH, folder)
    if not os.path.isdir(folder_path):
        continue
    # Sorted so the feature column order does not depend on the filesystem's
    # directory listing order.
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".csv"):
            continue
        try:
            df = pd.read_csv(os.path.join(folder_path, file), skiprows=2)
            df.columns = ['Date', 'Close', 'High', 'Low', 'Open', 'Volume']
            # Explicit copy: the assignments below must not target a slice.
            df = df[df['Date'] >= '2022-07-01'].copy()
            for col in _OHLCV_COLUMNS:
                df[col] = pd.to_numeric(df[col], errors='coerce')
            df = df.dropna(subset=_OHLCV_COLUMNS)

            # Re-express the instrument on SPY's dates. Dates the instrument
            # did not trade carry the last known values forward (zero change).
            df['Date'] = df['Date'].astype(str).str[:10]
            df = df.drop_duplicates(subset='Date', keep='last').set_index('Date')
            prices = df[_OHLCV_COLUMNS].reindex(spy_date_keys).ffill()
            if prices['Close'].notna().sum() < 2:
                raise ValueError('fewer than two dates overlap the SPY calendar')

            df_pct = prices.pct_change()
            # Division by zero (e.g. zero volume) yields +/-inf; treat as missing.
            df_pct = df_pct.replace([np.inf, -np.inf], np.nan).bfill().ffill()

            # Correlate on plain arrays of equal length: Series.corr aligns on
            # index labels, so both sides get a fresh positional index here.
            corr = pd.Series(df_pct['Close'].to_numpy()).corr(pd.Series(spy_returns_1d))
            if np.isfinite(corr):
                weight = _correlation_weight(corr)
                weighted_corr = corr * weight
                correlation_report.append((file, corr, weight, weighted_corr))
                # Flip negatively correlated closes, then scale every column
                # by the magnitude of the weighted correlation.
                if corr < 0:
                    df_pct['Close'] *= -1
                df_pct *= abs(weighted_corr)

            try:
                granger_df = pd.DataFrame({
                    'shock_flag': spy_returns_1d,
                    'instrument': df_pct['Close'].to_numpy()
                })
                granger_result = grangercausalitytests(granger_df[['shock_flag', 'instrument']], maxlag=5, verbose=False)
                # Keep the smallest F-test p-value across lags 1..5.
                best_pval = float('inf')
                for lag, res in granger_result.items():
                    pval = res[0]['ssr_ftest'][1]
                    if pval < best_pval:
                        best_pval = pval
                granger_report.append((file, best_pval))
            except Exception:
                # Best effort: a failed test is recorded as NaN, not fatal.
                granger_report.append((file, np.nan))

            all_data.append(df_pct.values)
        except Exception as e:
            print(f"[SKIP] {file} due to error: {e}")

# OUTPUT CORRELATION AND GRANGER REPORTS
# Highest weighted correlation first.
correlation_report = sorted(correlation_report, key=lambda x: x[3], reverse=True)
print("\n--- Ranked Pearson Correlation of Close vs SPY Returns (Weighted) ---")
for name, corr, mult, w_corr in correlation_report:
    print(f"{name:30s} | Corr: {corr: .4f} | Mult: {mult:>3} | Weighted: {w_corr: .4f}")

# Failed tests are stored as NaN. NaN compares False against everything, so
# sorting on the raw p-value gives an undefined order; sort NaN entries last.
granger_report = sorted(granger_report, key=lambda x: (np.isnan(x[1]), x[1]))
print("\n--- Ranked Granger Causality p-values (lower is better) ---")
for name, pval in granger_report:
    print(f"{name:30s} | p-value: {pval:.4f}")

# ALIGN LENGTH
# Every array is cut to the shortest length found, keeping the leading rows.
_candidate_lengths = [len(block) for block in all_data]
_candidate_lengths += [len(spy_close_full), len(spy_returns), len(spy_ta)]
min_length = min(_candidate_lengths)
all_data = [block[:min_length] for block in all_data]
spy_close, spy_returns, spy_dates, spy_ta = (
    arr[:min_length] for arr in (spy_close_full, spy_returns, spy_dates, spy_ta)
)

# STACK FEATURES
# Instrument blocks side by side, SPY indicators last; NaN becomes 0.
data_combined = np.nan_to_num(np.hstack([*all_data, spy_ta]))
spy_close = np.nan_to_num(spy_close)

# SCALE
# Features and the SPY close each get their own standardiser so predictions
# can later be mapped back to prices with spy_scaler alone.
# NOTE(review): both scalers are fitted on every row of the data set.
feature_scaler = StandardScaler().fit(data_combined)
scaled_data = feature_scaler.transform(data_combined)
spy_scaler = StandardScaler().fit(spy_close)
spy_scaled = spy_scaler.transform(spy_close)

# SUPERVISED LEARNING SETUP
n_lookback = 252
n_forecast = 30
# Each sample ends just before row `_end`: the input is the previous
# n_lookback feature rows, the target is the change in scaled SPY close from
# the window's last row to the row n_forecast steps after it.
_window_ends = range(n_lookback, len(scaled_data) - n_forecast + 1)
X = np.array([scaled_data[_end - n_lookback:_end] for _end in _window_ends])
y = np.array([
    spy_scaled[_end + n_forecast - 1, 0] - spy_scaled[_end - 1, 0]
    for _end in _window_ends
])

# Chronological split: the final 20% of the windows form the test set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# MODEL
# Two stacked recurrent layers (the first bidirectional and sequence-returning),
# each followed by layer normalisation and 50% dropout, ending in a single
# linear unit that outputs the predicted change in scaled SPY close.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    Bidirectional(LSTM(128, return_sequences=True)),
    LayerNormalization(),
    Dropout(0.5),
    LSTM(128),
    LayerNormalization(),
    Dropout(0.5),
    Dense(1)
])
model.compile(optimizer='adam', loss=Huber(delta=1.0))

# Stop after 50 epochs without val_loss improvement (restoring the best
# weights) and halve the learning rate after 5 such epochs.
callbacks = [
    EarlyStopping(monitor='val_loss', patience=50, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5)
]
# NOTE(review): validation_data is (X_test, y_test), so both callbacks are
# driven by the held-out split rather than by a separate validation set.
history = model.fit(X_train, y_train, epochs=300, batch_size=32, validation_data=(X_test, y_test), callbacks=callbacks)

# PLOT LOSS
# One curve per recorded metric, in the order train then validation.
for _history_key, _curve_label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(history.history[_history_key], label=_curve_label)
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)
plt.show()

# BACKTEST
# Test sample j was built from window end i = n_lookback + len(X_train) + j.
# Its prediction is a change added to the scaled close at row i - 1 (the base)
# and refers to row i + n_forecast - 1 (the target). Predictions must therefore
# be compared with, and plotted at, the target rows, not the rows right after
# the base.
pred_test = model.predict(X_test).flatten()
_test_start = n_lookback + len(X_train)
_base_rows = slice(_test_start - 1, _test_start - 1 + len(pred_test))
_target_rows = slice(_test_start + n_forecast - 1, _test_start + n_forecast - 1 + len(pred_test))

# Predicted scaled price = base price + predicted change (not a cumulative sum).
pred_test_cumsum = spy_scaled[_base_rows, 0] + pred_test
pred_test_inv = spy_scaler.inverse_transform(pred_test_cumsum.reshape(-1, 1))

actual_test = spy_scaled[_target_rows, 0]
y_test_inv = spy_scaler.inverse_transform(actual_test.reshape(-1, 1))

corr_test = np.corrcoef(pred_test_inv[:, 0], y_test_inv[:, 0])[0, 1]
print(f"[INFO] Correlation between actual SPY and predicted SPY: {corr_test:.4f}")

test_dates = pd.Series(spy_dates[_target_rows]).reset_index(drop=True)
fig = go.Figure()
# Trace labels state the actual forecast horizon used to build the targets.
fig.add_trace(go.Scatter(x=test_dates, y=y_test_inv[:, 0], name=f'Actual SPY (Day {n_forecast})', line=dict(color='green')))
fig.add_trace(go.Scatter(x=test_dates, y=pred_test_inv[:, 0], name=f'Predicted SPY (Day {n_forecast})', line=dict(color='blue', dash='dash')))

# FUTURE FORECAST
# Re-run the model on the windows ending at each of the last `forecast_range`
# rows (excluding the very latest window) and plot every forecast at the date
# it refers to, i.e. n_forecast rows after the window's last row.
forecast_range = 90
rolling_preds = []
rolling_dates = []

_n_rows = len(scaled_data)
_last_known_date = pd.Timestamp(spy_dates[-1])
_windows = []
_base_rows = []
for i in range(-forecast_range, 0):
    base_row = _n_rows + i - 1           # last row contained in the window
    first_row = base_row - n_lookback + 1
    if first_row < 0:
        # Not enough history for a full look-back window.
        continue
    _windows.append(scaled_data[first_row:base_row + 1])
    _base_rows.append(base_row)

if _windows:
    # One batched call instead of one model.predict per window.
    _deltas = model.predict(np.stack(_windows)).flatten()
    _scaled_prices = spy_scaled[_base_rows, 0] + _deltas
    rolling_preds = spy_scaler.inverse_transform(_scaled_prices.reshape(-1, 1))[:, 0].tolist()
    for base_row in _base_rows:
        target_row = base_row + n_forecast
        if target_row < _n_rows:
            rolling_dates.append(pd.Timestamp(spy_dates[target_row]))
        else:
            # Target lies beyond the data: step forward in business days.
            # This ignores exchange holidays, so dates are approximate.
            rolling_dates.append(_last_known_date + pd.offsets.BDay(target_row - (_n_rows - 1)))

fig.add_trace(go.Scatter(x=rolling_dates, y=rolling_preds, name='Rolling 90d Forecast', line=dict(color='orange', dash='dot')))

# Static future prediction from latest window
# The model outputs the change in scaled close between the window's last row
# and the row n_forecast steps later, so the point is plotted n_forecast
# business days after the last known date, not on the next calendar day.
last_window = scaled_data[-n_lookback:].reshape(1, n_lookback, X.shape[2])
future_delta = model.predict(last_window).flatten()[0]
last_price_scaled = spy_scaled[-1, 0] + future_delta
future_price = spy_scaler.inverse_transform([[last_price_scaled]])[0, 0]
# Business-day stepping ignores exchange holidays, so the date is approximate.
future_date = pd.Timestamp(spy_dates[-1]) + pd.offsets.BDay(n_forecast)
fig.add_trace(go.Scatter(x=[future_date], y=[future_price], name='Future Predicted SPY', mode='markers+text', marker=dict(color='orange', size=10), text=['Prediction'], textposition='top center'))

fig.update_layout(title='LSTM SPY Forecast (Weighted Pearson + Granger)', xaxis_title='Date', yaxis_title='SPY Close Price', template='plotly_dark')
fig.show()

