In [None]:
"""
LSTM.py - LSTM model training using SPY technical indicators + sentiment + cyclic date features.
"""

from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

# === File Paths ===
PROJECT_ROOT = Path(__file__).resolve().parents[2]
PROCESSED_DATA = PROJECT_ROOT / "data" / "processed" / "spy_with_sentiment.csv"

# Fraction of the chronologically ordered rows used to fit the scaler.
# Mirrors the 80/20 chronological train/test split applied to the sequences
# later in this script, so scaling statistics never see test-period rows.
TRAIN_FRACTION = 0.8

# Features list (defined up front so incomplete rows can be dropped below)
features = [
    "close", "volume", "EMA_50", "EMA_200",
    "RSI_14", "MACD", "MACD_Signal", "MACD_Hist",
    "Close_Lag_1", "Volume_Lag_1", "RSI_Lag_1",
    "MA_5", "MA_10", "Volatility_5",
    "positive", "neutral", "negative", "compound", "headline_count",
    "dow_sin", "dow_cos", "month_sin", "month_cos"
]

# Load dataset
df = pd.read_csv(PROCESSED_DATA)

# Ensure datetime format (fall back to "timestamp" when "date" is absent)
date_source = "date" if "date" in df.columns else "timestamp"
df["date"] = pd.to_datetime(df[date_source])

# shift(-1) below only means "next period" when rows are in chronological
# order; a stable sort leaves already-ordered data untouched.
df = df.sort_values("date", kind="stable").reset_index(drop=True)

# Create target: 1 when the next row's close is above this row's close.
# A comparison against NaN is simply False, which would silently label the
# final row (it has no next close) as a "down" move. Mark such rows as NaN
# instead so the dropna below actually removes them.
next_close = df["close"].shift(-1)
has_label = df["close"].notna() & next_close.notna()
df["Target"] = (next_close > df["close"]).astype(float).where(has_label)

# Fill missing sentiment values BEFORE dropping, so days without headlines
# are kept with neutral (zero) sentiment rather than discarded.
sentiment_cols = ["positive", "neutral", "negative", "compound", "headline_count"]
df[sentiment_cols] = df[sentiment_cols].fillna(0)

# Cyclic encoding for day of week and month
df["day_of_week"] = df["date"].dt.dayofweek
df["month"] = df["date"].dt.month
dow_angle = 2 * np.pi * df["day_of_week"] / 7
month_angle = 2 * np.pi * df["month"] / 12
df["dow_sin"] = np.sin(dow_angle)
df["dow_cos"] = np.cos(dow_angle)
df["month_sin"] = np.sin(month_angle)
df["month_cos"] = np.cos(month_angle)

# Drop rows without a label or with any missing model input. MinMaxScaler
# passes NaN through unchanged, and a single NaN reaching the network turns
# the loss into NaN.
df = df.dropna(subset=["close", "Target"] + features).reset_index(drop=True)
df["Target"] = df["Target"].astype(int)

X = df[features]
y = df["Target"]

# Scale features. The scaler is fitted on the leading (training-period) rows
# only and then applied to every row; fitting on the full frame would leak
# the test period's min/max into the training features. Later rows may
# therefore scale slightly outside [0, 1], which is expected.
scaler = MinMaxScaler()
n_fit_rows = int(TRAIN_FRACTION * len(X))
scaler.fit(X.iloc[:n_fit_rows])
X_scaled = scaler.transform(X)

# Sequence builder
def create_sequences(X, y, time_steps=10):
    """Build overlapping fixed-length windows and their aligned labels.

    Window ``i`` contains rows ``i`` .. ``i + time_steps - 1`` of ``X`` and is
    paired with ``y[i + time_steps]``, i.e. the label of the row immediately
    after the window.

    Args:
        X: Array-like of shape ``(n_rows, ...)`` holding the per-row features.
        y: Array-like with at least ``n_rows`` entries, indexed positionally.
        time_steps: Number of consecutive rows in each window.

    Returns:
        A tuple ``(Xs, ys)`` where ``Xs`` has shape
        ``(n_rows - time_steps, time_steps, ...)`` and ``ys`` has shape
        ``(n_rows - time_steps,)``. When there are not enough rows to form a
        single window, both arrays are empty but keep their full rank, so
        callers can still read ``Xs.shape[1]`` / ``Xs.shape[2]``.

    Raises:
        IndexError: If ``y`` is too short to provide a label for every window.
    """
    # Coerce to arrays so indexing is always positional (a pandas Series with
    # a non-default index would otherwise be looked up by label).
    X = np.asarray(X)
    y = np.asarray(y)

    n_windows = len(X) - time_steps
    if n_windows <= 0:
        empty_X = np.empty((0, time_steps) + X.shape[1:], dtype=X.dtype)
        empty_y = np.empty((0,) + y.shape[1:], dtype=y.dtype)
        return empty_X, empty_y

    # Row i of `window_rows` lists the X indices that make up window i.
    window_rows = np.arange(n_windows)[:, None] + np.arange(time_steps)
    Xs = X[window_rows]

    # Integer-array indexing (unlike slicing) raises IndexError when `y` is
    # shorter than required, instead of silently returning fewer labels.
    ys = y[np.arange(time_steps, time_steps + n_windows)]
    return Xs, ys

X_seq, y_seq = create_sequences(X_scaled, y.values)

# Train/test split (chronological, no shuffling across the boundary)
split = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:split], X_seq[split:]
y_train, y_test = y_seq[:split], y_seq[split:]

# Hold out the most recent 10% of the training windows for early stopping.
# Monitoring val_loss on the test set (and restoring the best weights from
# it) would select the model on the very data used for the final report,
# making the reported metrics optimistic.
val_start = int(0.9 * len(X_train))
X_fit, X_val = X_train[:val_start], X_train[val_start:]
y_fit, y_val = y_train[:val_start], y_train[val_start:]

# Build LSTM model. The input shape is declared with an explicit Input layer;
# passing `input_shape=` to the first layer triggers a warning in current
# Keras versions.
model = Sequential([
    Input(shape=(X_train.shape[1], X_train.shape[2])),
    LSTM(128, return_sequences=True),
    Dropout(0.3),
    LSTM(64),
    Dropout(0.3),
    Dense(1, activation="sigmoid")
])
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

# Train model
early_stop = EarlyStopping(monitor="val_loss", patience=3, restore_best_weights=True)
model.fit(X_fit, y_fit, epochs=30, batch_size=32,
          validation_data=(X_val, y_val), callbacks=[early_stop], verbose=1)

# Evaluate on the untouched test windows. predict() returns one column of
# probabilities; flatten it so predictions are 1-D like y_test.
y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()
# zero_division=0 reports 0.0 for a class that is never predicted, without
# emitting an undefined-metric warning per metric.
print("LSTM Classification Report:\n",
      classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

2025-08-04 18:04:23.337319: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(**kwargs)


Epoch 1/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 49ms/step - accuracy: 0.5641 - loss: 0.6892 - val_accuracy: 0.5872 - val_loss: 0.6805
Epoch 2/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5137 - loss: 0.6878 - val_accuracy: 0.5872 - val_loss: 0.6823
Epoch 3/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5395 - loss: 0.6926 - val_accuracy: 0.5872 - val_loss: 0.6806
Epoch 4/30
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5825 - loss: 0.6812 - val_accuracy: 0.5872 - val_loss: 0.6852
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 96ms/step
LSTM Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        45
           1       0.59      1.00      0.74        64

    accuracy                           0.59       109
   macro avg       0.29      0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
