# NYSE LSTM Forecasting (Kaggle Ready)
Professional LSTM workflow for NYSE ticker modeling with deployment artifacts.


## 1) Setup


In [None]:
import os
import json
import pickle
import random
from pathlib import Path

import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Input, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Seed every random number generator used in this notebook (Python, NumPy,
# TensorFlow) with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    _seed_fn(SEED)

# One Plotly theme for the whole notebook; also installed as the
# plotly.express default.
PLOTLY_TEMPLATE = "plotly_dark"
px.defaults.template = PLOTLY_TEMPLATE


## 2) Configuration


In [None]:
# Input location: the price CSV is expected inside DATA_DIR.
DATA_DIR = Path("/kaggle/input/datasets/dgawlik/nyse")
DATA_FILE = DATA_DIR / "prices-split-adjusted.csv"

# Symbol to model and the columns selected for it when the data is loaded.
TICKER = "EQIX"
FEATURE_COLUMNS = ["open", "high", "low", "close", "volume"]
# Written to the exported config.
TARGET_COLUMN = "close"
# Rows per sliding window; the last row of each window is the prediction
# target, so the model sees SEQUENCE_LENGTH - 1 input steps.
SEQUENCE_LENGTH = 30

# Chronological split fractions. VALID_RATIO and TEST_RATIO are passed to the
# sequence splitter as percentages; the training share is whatever remains.
# NOTE(review): TRAIN_RATIO is not referenced anywhere else in the visible code.
TRAIN_RATIO = 0.80
VALID_RATIO = 0.10
TEST_RATIO = 0.10

# Network and training hyperparameters.
LSTM_UNITS = 96
LSTM_LAYERS = 2
DROPOUT_RATE = 0.15
BATCH_SIZE = 64
EPOCHS = 40

# Output directory for the checkpoint, saved model, and metadata files;
# created up front if it does not exist yet.
ARTIFACT_DIR = Path("/kaggle/working/nyse_lstm_artifacts")
ARTIFACT_DIR.mkdir(parents=True, exist_ok=True)

# Fail immediately with the missing path instead of later inside read_csv.
if not DATA_FILE.exists():
    raise FileNotFoundError(f"Missing dataset: {DATA_FILE}")

print("Dataset:", DATA_FILE)
print("Artifacts:", ARTIFACT_DIR)


## 3) Data Load and Validation


In [None]:
# Read the full multi-symbol price table and parse its date column.
market_df = pd.read_csv(DATA_FILE)
market_df["date"] = pd.to_datetime(market_df["date"])

symbol_series = market_df["symbol"]
print("Rows:", len(market_df))
print("Unique symbols:", symbol_series.nunique())

# Stop early, listing a few valid symbols, when the configured ticker is absent.
available_symbols = sorted(symbol_series.unique())
if TICKER not in available_symbols:
    raise ValueError(f"Ticker '{TICKER}' not found. Example symbols: {available_symbols[:20]}")

# Keep only the chosen ticker's rows and columns, ordered by date with a
# fresh 0..n-1 index.
selected_columns = ["date", "symbol"] + FEATURE_COLUMNS
ticker_rows = market_df.loc[symbol_series == TICKER, selected_columns]
ticker_df = ticker_rows.sort_values("date").reset_index(drop=True)

ticker_df.head()


## 4) Visualization Flow


In [None]:
# 4.1 Candlestick + volume overview
# Two stacked panels on a shared date axis: OHLC candles above, volume bars below.
fig = make_subplots(
    rows=2,
    cols=1,
    shared_xaxes=True,
    vertical_spacing=0.06,
    subplot_titles=(f"{TICKER} Candlestick", f"{TICKER} Volume"),
    row_heights=[0.72, 0.28],
)

trade_dates = ticker_df["date"]
ohlc_trace = go.Candlestick(
    x=trade_dates,
    open=ticker_df["open"],
    high=ticker_df["high"],
    low=ticker_df["low"],
    close=ticker_df["close"],
    name="OHLC",
)
volume_trace = go.Bar(x=trade_dates, y=ticker_df["volume"], name="Volume", marker_color="#5DADE2")

fig.add_trace(ohlc_trace, row=1, col=1)
fig.add_trace(volume_trace, row=2, col=1)

fig.update_layout(height=780, width=1200, template=PLOTLY_TEMPLATE, xaxis_rangeslider_visible=False)
for panel_row, axis_title in enumerate(("Price", "Volume"), start=1):
    fig.update_yaxes(title_text=axis_title, row=panel_row, col=1)
fig.show()


In [None]:
# 4.2 Trend and return structure
# Work on a copy so the derived columns never reach ticker_df.
viz_df = ticker_df.copy()
close_prices = viz_df["close"]
for ma_window in (20, 60):
    viz_df[f"ma_{ma_window}"] = close_prices.rolling(ma_window).mean()
viz_df["daily_return"] = close_prices.pct_change()

fig = make_subplots(rows=1, cols=2, subplot_titles=("Close + Moving Averages", "Daily Return Distribution"))

# Left panel: close price with its 20- and 60-row moving averages.
line_specs = (
    ("close", "Close", "#58D68D"),
    ("ma_20", "MA 20", "#F5B041"),
    ("ma_60", "MA 60", "#EC7063"),
)
for series_name, label, line_color in line_specs:
    fig.add_trace(
        go.Scatter(x=viz_df["date"], y=viz_df[series_name], mode="lines", name=label, line=dict(color=line_color)),
        row=1,
        col=1,
    )

# Right panel: histogram of daily returns with a reference line at zero.
returns_hist = go.Histogram(x=viz_df["daily_return"].dropna(), nbinsx=70, name="Returns", marker_color="#A569BD")
fig.add_trace(returns_hist, row=1, col=2)
fig.add_vline(x=0, line_dash="dash", line_color="white", row=1, col=2)

fig.update_layout(height=460, width=1250, template=PLOTLY_TEMPLATE)
for panel_col, x_title, y_title in ((1, "Date", "Price"), (2, "Return", "Count")):
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)
fig.show()


In [None]:
# 4.3 Correlation heatmap
# Pairwise correlation of the selected feature columns, annotated with the
# coefficients rounded to two decimals.
corr = ticker_df[FEATURE_COLUMNS].corr()
corr_matrix = corr.values
heatmap_trace = go.Heatmap(
    z=corr_matrix,
    x=corr.columns,
    y=corr.index,
    colorscale="Viridis",
    text=np.round(corr_matrix, 2),
    texttemplate="%{text}",
)
fig = go.Figure(data=heatmap_trace)
fig.update_layout(title=f"{TICKER} Feature Correlation", height=520, width=700, template=PLOTLY_TEMPLATE)
fig.show()


## 5) Preprocessing and Sequence Construction


In [None]:
# Original-style OHLC normalization (column-wise min-max)
def normalize_ohlc(df_stock):
    """Return a copy of ``df_stock`` with its OHLC columns min-max scaled.

    The ``open``, ``high``, ``low`` and ``close`` columns are scaled
    independently of each other, each fitted over all rows of ``df_stock``.
    Any other columns are carried over unchanged and the input frame itself
    is not modified.
    """
    scaler = MinMaxScaler()
    scaled = df_stock.copy()
    # The one scaler object is refit for each column in turn, so it only
    # holds the parameters of the last column ("close") afterwards.
    for price_col in ("open", "high", "low", "close"):
        scaled[price_col] = scaler.fit_transform(df_stock[[price_col]])
    return scaled


def create_sequences(stock_frame, seq_len, valid_pct, test_pct):
    """Cut a feature table into sliding windows and split them chronologically.

    Every window holds ``seq_len`` consecutive rows. The first ``seq_len - 1``
    rows of a window form the model input and its last row is the target.
    Windows are kept in row order and divided into train / validation / test
    without shuffling: validation and test take ``valid_pct`` and ``test_pct``
    percent of the windows (rounded), training takes the rest.

    Args:
        stock_frame: 2-D table (rows x features) exposing ``.values``, e.g. a
            pandas DataFrame of numeric columns.
        seq_len: Window length in rows; must be at least 2.
        valid_pct: Share of windows used for validation, in percent (0-100).
        test_pct: Share of windows used for testing, in percent (0-100).

    Returns:
        ``(x_train, y_train, x_valid, y_valid, x_test, y_test)`` as float32
        arrays. Each ``x_*`` has shape ``(n, seq_len - 1, n_features)`` and
        each ``y_*`` has shape ``(n, n_features)``.

    Raises:
        ValueError: If the input is not 2-D, ``seq_len`` is below 2, there are
            not more rows than ``seq_len``, or the validation and test shares
            together exceed the number of windows.
    """
    data_raw = stock_frame.values.astype(np.float32)
    if data_raw.ndim != 2:
        raise ValueError(f"stock_frame must be 2-D (rows x features), got {data_raw.ndim}-D data")
    if seq_len < 2:
        raise ValueError(f"seq_len must be at least 2 (one input step plus the target), got {seq_len}")

    # One window per start row 0 .. len - seq_len - 1. This matches the
    # original loop bound, which leaves out the window starting at
    # len - seq_len.
    n_windows = len(data_raw) - seq_len
    if n_windows <= 0:
        raise ValueError(
            f"Need more than seq_len={seq_len} rows to build a window, got {len(data_raw)}"
        )

    # Build all windows at once: row index = window start + offset in window.
    window_rows = np.arange(n_windows)[:, None] + np.arange(seq_len)[None, :]
    seq = data_raw[window_rows]

    valid_size = int(np.round(valid_pct / 100 * n_windows))
    test_size = int(np.round(test_pct / 100 * n_windows))
    train_size = n_windows - (valid_size + test_size)
    if train_size < 0:
        raise ValueError(
            f"valid_pct + test_pct = {valid_pct + test_pct} leaves no windows for training"
        )

    valid_end = train_size + valid_size

    x_train = seq[:train_size, :-1, :]
    y_train = seq[:train_size, -1, :]
    x_valid = seq[train_size:valid_end, :-1, :]
    y_valid = seq[train_size:valid_end, -1, :]
    x_test = seq[valid_end:, :-1, :]
    y_test = seq[valid_end:, -1, :]

    return x_train, y_train, x_valid, y_valid, x_test, y_test


# The model is fed the four price columns only: the date/symbol label columns
# and the volume column are left out.
# Keep deterministic feature order for indexing consistency
MODEL_FEATURES = ["open", "high", "low", "close"]
model_df = ticker_df.loc[ticker_df["symbol"] == TICKER, MODEL_FEATURES].copy()

model_df_norm = normalize_ohlc(model_df)

# create_sequences takes the split shares as percentages, not fractions.
split_percentages = {"valid_pct": VALID_RATIO * 100, "test_pct": TEST_RATIO * 100}
x_train, y_train, x_valid, y_valid, x_test, y_test = create_sequences(
    model_df_norm, SEQUENCE_LENGTH, **split_percentages
)

# Sanity check of column dtypes and the resulting array shapes.
print("model_df dtypes:\n", model_df.dtypes)
print("x_train:", x_train.shape, "| y_train:", y_train.shape, "| dtype:", x_train.dtype)
print("x_valid:", x_valid.shape, "| y_valid:", y_valid.shape, "| dtype:", x_valid.dtype)
print("x_test:", x_test.shape, "| y_test:", y_test.shape, "| dtype:", x_test.dtype)



In [None]:
# Preprocessing effect visualization: raw OHLC vs normalized OHLC
fig = make_subplots(
    rows=1,
    cols=2,
    subplot_titles=("Before Normalization (Raw OHLC)", "After Normalization (MinMax OHLC)"),
)

# The same colour per feature in both panels.
feature_palette = ["#E74C3C", "#F1C40F", "#3498DB", "#2ECC71"]

# Left panel: raw prices, solid lines.
raw_days = np.arange(len(model_df))
for col, color in zip(MODEL_FEATURES, feature_palette):
    raw_trace = go.Scatter(
        x=raw_days,
        y=model_df[col].values,
        mode="lines",
        name=f"raw_{col}",
        line=dict(color=color),
        legendgroup=col,
    )
    fig.add_trace(raw_trace, row=1, col=1)

# Right panel: scaled values, dotted lines. They share the raw trace's legend
# group and are hidden from the legend themselves.
norm_days = np.arange(len(model_df_norm))
for col, color in zip(MODEL_FEATURES, feature_palette):
    norm_trace = go.Scatter(
        x=norm_days,
        y=model_df_norm[col].values,
        mode="lines",
        name=f"norm_{col}",
        line=dict(color=color, dash="dot"),
        legendgroup=col,
        showlegend=False,
    )
    fig.add_trace(norm_trace, row=1, col=2)

for panel_col, y_title in ((1, "Raw price"), (2, "Scaled value (0-1)")):
    fig.update_xaxes(title_text="Time [days]", row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)
fig.update_layout(height=460, width=1300, template=PLOTLY_TEMPLATE)
fig.show()



## 6) LSTM Modeling


In [None]:
# Read the network dimensions from the arrays it is trained on rather than
# repeating them as literals, so the model cannot drift out of sync with the
# sequence builder (x is (samples, steps, features), y is (samples, features)).
n_steps, n_inputs = x_train.shape[1:]
n_outputs = y_train.shape[1]

lstm_model = Sequential(name="nyse_lstm_ohlc")
lstm_model.add(Input(shape=(n_steps, n_inputs), name="sequence_input"))

# Stacked LSTM layers, each followed by dropout. Every layer except the last
# returns its full output sequence so the next LSTM gets one vector per step.
for idx in range(LSTM_LAYERS):
    ret_seq = idx < (LSTM_LAYERS - 1)
    lstm_model.add(LSTM(LSTM_UNITS, return_sequences=ret_seq))
    lstm_model.add(Dropout(DROPOUT_RATE))

# Linear head: one output per feature of the next row.
lstm_model.add(Dense(n_outputs))
lstm_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="mse")

# Stop after 7 epochs without validation improvement and roll back to the best
# weights; the best model seen so far is also checkpointed to disk.
best_path = ARTIFACT_DIR / "best_lstm.keras"
callbacks = [
    EarlyStopping(monitor="val_loss", patience=7, restore_best_weights=True),
    ModelCheckpoint(filepath=str(best_path), monitor="val_loss", save_best_only=True),
]

history = lstm_model.fit(
    x_train,
    y_train,
    validation_data=(x_valid, y_valid),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=callbacks,
)



In [None]:
# Training and validation loss per epoch, taken from the Keras fit history.
loss_curves = (
    ("loss", "train_loss", "#58D68D"),
    ("val_loss", "val_loss", "#EC7063"),
)
fig = go.Figure()
for history_key, curve_name, curve_color in loss_curves:
    fig.add_trace(
        go.Scatter(y=history.history[history_key], mode="lines", name=curve_name, line=dict(color=curve_color))
    )
fig.update_layout(title="Training vs Validation Loss", xaxis_title="Epoch", yaxis_title="MSE", height=450, width=1000, template=PLOTLY_TEMPLATE)
fig.show()


## 7) Forecast Evaluation


In [None]:
# Predict every split with the trained network. Inputs are normalized, so the
# predictions and the metrics below are on the normalized scale too.
y_train_pred, y_valid_pred, y_test_pred = (
    lstm_model.predict(split_inputs, verbose=0) for split_inputs in (x_train, x_valid, x_test)
)

# Column position of each feature inside the target/prediction arrays.
feature_index_map = dict(zip(MODEL_FEATURES, range(len(MODEL_FEATURES))))
idx_open = feature_index_map["open"]
idx_close = feature_index_map["close"]
ft = idx_open  # plotted feature index

# Error metrics on the test split for the plotted feature.
test_actual_ft = y_test[:, ft]
test_predicted_ft = y_test_pred[:, ft]
mae = mean_absolute_error(test_actual_ft, test_predicted_ft)
rmse = np.sqrt(mean_squared_error(test_actual_ft, test_predicted_ft))


def _direction_hit_rate(actual, predicted):
    """Share of rows where predicted and actual close-open have the same sign."""
    actual_move = np.sign(actual[:, idx_close] - actual[:, idx_open])
    predicted_move = np.sign(predicted[:, idx_close] - predicted[:, idx_open])
    return np.mean(actual_move == predicted_move)


corr_price_development_train = _direction_hit_rate(y_train, y_train_pred)
corr_price_development_valid = _direction_hit_rate(y_valid, y_valid_pred)
corr_price_development_test = _direction_hit_rate(y_test, y_test_pred)

print(f"MAE: {mae:.6f}")
print(f"RMSE: {rmse:.6f}")
print('correct sign prediction for close-open train/valid/test: %.2f/%.2f/%.2f' % (
    corr_price_development_train,
    corr_price_development_valid,
    corr_price_development_test,
))



In [None]:
fig = make_subplots(rows=1, cols=2, subplot_titles=("Past and Future Stock Prices", "Future Stock Prices"))


def _consecutive_day_axes(train_part, valid_part, test_part):
    """Return x positions that lay the three splits end to end on one axis."""
    n_train, n_valid, n_test = train_part.shape[0], valid_part.shape[0], test_part.shape[0]
    valid_start = n_train
    test_start = n_train + n_valid
    return (
        np.arange(n_train),
        np.arange(valid_start, test_start),
        np.arange(test_start, test_start + n_test),
    )


target_days = _consecutive_day_axes(y_train, y_valid, y_test)
prediction_days = _consecutive_day_axes(y_train_pred, y_valid_pred, y_test_pred)

# Left panel: targets first, then predictions, for all three splits.
overview_traces = (
    (target_days[0], y_train, "train target", "#5DADE2"),
    (target_days[1], y_valid, "valid target", "#AAB7B8"),
    (target_days[2], y_test, "test target", "#F8C471"),
    (prediction_days[0], y_train_pred, "train prediction", "#2ECC71"),
    (prediction_days[1], y_valid_pred, "valid prediction", "#F39C12"),
    (prediction_days[2], y_test_pred, "test prediction", "#E74C3C"),
)
for days, values, label, line_color in overview_traces:
    fig.add_trace(
        go.Scatter(x=days, y=values[:, ft], mode="lines", name=label, line=dict(color=line_color)),
        row=1,
        col=1,
    )

# Right panel: the test split alone, re-indexed from zero.
zoom_traces = (
    (y_test, "test target (zoom)", "#F8C471"),
    (y_test_pred, "test prediction (zoom)", "#E74C3C"),
)
for values, label, line_color in zoom_traces:
    fig.add_trace(
        go.Scatter(x=np.arange(values.shape[0]), y=values[:, ft], mode="lines", name=label, line=dict(color=line_color)),
        row=1,
        col=2,
    )

for panel_col in (1, 2):
    fig.update_xaxes(title_text="Time [days]", row=1, col=panel_col)
    fig.update_yaxes(title_text="Normalized price", row=1, col=panel_col)
fig.update_layout(height=470, width=1300, template=PLOTLY_TEMPLATE)
fig.show()


In [None]:
# Test-split residuals (actual minus predicted) for the plotted feature.
test_predictions_ft = y_test_pred[:, ft]
residuals = y_test[:, ft] - test_predictions_ft

fig = make_subplots(rows=1, cols=2, subplot_titles=("Residual Distribution", "Residuals vs Predicted"))

# Left panel: residual histogram with a zero reference line.
residual_hist = go.Histogram(x=residuals, nbinsx=40, marker_color="#AF7AC5", name="residuals")
fig.add_trace(residual_hist, row=1, col=1)
fig.add_vline(x=0, line_dash="dash", line_color="white", row=1, col=1)

# Right panel: residual against predicted value with a zero reference line.
scatter_marker = dict(size=6, opacity=0.6, color="#5DADE2")
residual_scatter = go.Scatter(
    x=test_predictions_ft,
    y=residuals,
    mode="markers",
    marker=scatter_marker,
    name="residual scatter",
)
fig.add_trace(residual_scatter, row=1, col=2)
fig.add_hline(y=0, line_dash="dash", line_color="white", row=1, col=2)

for panel_col, x_title, y_title in ((1, "Residual", "Count"), (2, "Predicted", "Residual")):
    fig.update_xaxes(title_text=x_title, row=1, col=panel_col)
    fig.update_yaxes(title_text=y_title, row=1, col=panel_col)
fig.update_layout(height=460, width=1250, template=PLOTLY_TEMPLATE)
fig.show()


## 8) Export for Deployment


In [None]:
# Persist the trained network.
model_path = ARTIFACT_DIR / "nyse_lstm_ohlc.keras"
lstm_model.save(model_path)

# The network is trained on MODEL_FEATURES (the four OHLC columns) and
# predicts all of them, so the config records those columns rather than the
# wider FEATURE_COLUMNS list, which also contains volume.
config = {
    "ticker": TICKER,
    "feature_columns": list(MODEL_FEATURES),
    "output_columns": list(MODEL_FEATURES),
    "target_column": TARGET_COLUMN,
    "sequence_length": SEQUENCE_LENGTH,
    "lstm_units": LSTM_UNITS,
    "lstm_layers": LSTM_LAYERS,
    "dropout_rate": DROPOUT_RATE,
}

(ARTIFACT_DIR / "config.json").write_text(json.dumps(config, indent=2), encoding="utf-8")

# Store the per-column minimum and maximum of the frame the scaling was fitted
# on. Without them a consumer can neither scale new raw prices nor map
# predictions back: scaled = (raw - min) / (max - min).
preprocess_meta = {
    "normalization": "column-wise MinMax on OHLC",
    "columns": ["open", "high", "low", "close"],
    "data_min": {name: float(model_df[name].min()) for name in MODEL_FEATURES},
    "data_max": {name: float(model_df[name].max()) for name in MODEL_FEATURES},
}
with open(ARTIFACT_DIR / "preprocess_meta.pkl", "wb") as f:
    pickle.dump(preprocess_meta, f)

print("Saved artifacts:")
for fpath in sorted(ARTIFACT_DIR.glob("*")):
    print("-", fpath)


## 9) Inference Utility


In [None]:
def predict_next_ohlc(model, normalized_window, seq_len=SEQUENCE_LENGTH):
    """Predict the row following one input window.

    Args:
        model: Object with a Keras-style ``predict(batch, verbose=0)`` method.
        normalized_window: Array-like of shape ``(seq_len - 1, 4)``; it is
            converted to float32 before prediction.
        seq_len: Full window length used in training; the model input has
            one step fewer.

    Returns:
        The model output for the single window, flattened to 1-D.

    Raises:
        ValueError: If ``normalized_window`` does not have the expected shape.
    """
    window = np.asarray(normalized_window, dtype=np.float32)
    n_steps = seq_len - 1
    expected = (n_steps, 4)
    if window.shape != expected:
        raise ValueError(f"Expected {expected}, got {window.shape}")

    # Add the leading batch axis the model expects, then drop it again.
    single_batch = window.reshape(1, n_steps, 4)
    return model.predict(single_batch, verbose=0).reshape(-1)

# Example:
# next_pred = predict_next_ohlc(lstm_model, x_test[0])
# print(next_pred)


## 10) Final Notes
- Change `TICKER` and rerun from top to retrain for another stock.
- Artifacts are written to `/kaggle/working/nyse_lstm_artifacts`.
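- Keep sequence formatting identical during training and inference: the model expects a window of `SEQUENCE_LENGTH - 1` rows of min-max-normalized `open`, `high`, `low`, `close` values (in that order) and returns its prediction on the same normalized scale.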
