In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import ta 

# LSTM Data Preprocessing

## Purpose
Transform our engineered features from notebook 02 into the proper format for LSTM training:
- Scale data to 0-1 range for neural networks
- Create sequences (60 days → predict day 61)
- Split data chronologically for time series
- Save processed data for model training

In [2]:
# Load and clean data
# Strip stray whitespace so an entry such as "aapl " still resolves to
# ../data/aapl.csv (TICKER is also reused later to name the saved files).
TICKER = input("Enter stock ticker (e.g. AAPL, TSLA, MSFT): ").strip().upper()
if not TICKER:
    raise ValueError("No ticker entered; expected a symbol such as AAPL")

# NOTE(review): the recorded cell output echoes this read_csv line, which looks
# like a pandas warning raised by this call — presumably about parsing the index
# as dates. Confirm the index dtype is datetime before relying on it.
df = pd.read_csv(f'../data/{TICKER.lower()}.csv', index_col=0, parse_dates=True)

# Coerce the price/volume columns to numbers; anything that cannot be parsed
# becomes NaN and is removed by the dropna() below.
numeric_columns = ['Open', 'High', 'Low', 'Close', 'Volume']
for col in numeric_columns:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')

# Drop every row that has at least one missing value.
df = df.dropna()
print(f"Data loaded: {df.shape}")

Data loaded: (1760, 5)


  df = pd.read_csv(f'../data/{TICKER.lower()}.csv', index_col=0, parse_dates=True)


In [3]:
# Technical indicators, all derived from the closing price
close_prices = df['Close']

# 20-period simple moving average
sma_20 = ta.trend.SMAIndicator(close_prices, window=20)
df['SMA_20'] = sma_20.sma_indicator()

# 14-period relative strength index
rsi_14 = ta.momentum.RSIIndicator(close_prices, window=14)
df['RSI'] = rsi_14.rsi()

# MACD line using the library's default windows
macd = ta.trend.MACD(close_prices)
df['MACD'] = macd.macd()

In [4]:
# Lag features: the closing price shifted back by 1, 2 and 3 rows
for lag in (1, 2, 3):
    df[f'Close_lag{lag}'] = df['Close'].shift(lag)

In [5]:
# Columns fed to the LSTM (the order here fixes the feature order downstream)
features = [
    'Close', 'SMA_20', 'RSI', 'MACD',
    'Close_lag1', 'Close_lag2', 'Close_lag3',
]

# Keep only those columns, then drop rows where any of them is missing
# (the indicator and shift() columns are undefined for the earliest rows).
df = df.loc[:, features].dropna()
print(f"Final dataset shape: {df.shape}")

Final dataset shape: (1735, 7)


In [6]:
# Scale data for neural networks.
# The scaler is fit on the training portion ONLY, so the min/max of the held-out
# test period never leaks into training. All rows are then transformed with
# those training statistics: training values land in [0, 1], while later rows
# may fall slightly outside that range if they exceed the training extremes.
split_ratio = 0.8  # must stay equal to the ratio used by the chronological split
train_rows = int(len(df) * split_ratio)

scaler = MinMaxScaler()
scaler.fit(df.iloc[:train_rows])
scaled_data = scaler.transform(df)
scaled_df = pd.DataFrame(scaled_data, columns=features, index=df.index)

In [7]:
# Chronological train/test split: the first 80% of rows train the model,
# the remaining rows are held out. No shuffling, so row order is preserved.
split_ratio = 0.8
split_index = int(len(scaled_df) * split_ratio)

# Positional slicing at the cut point
train_df, test_df = scaled_df.iloc[:split_index], scaled_df.iloc[split_index:]

print(f"Train: {len(train_df)} rows, Test: {len(test_df)} rows")

Train: 1388 rows, Test: 347 rows


In [8]:
def create_sequences(data, lookback=60, target_col='Close'):
    """Build sliding-window samples for LSTM training.

    Each sample is `lookback` consecutive rows of `data` (all columns) and its
    label is the `target_col` value of the row immediately after the window:

        X[0] = rows 0 .. lookback-1   ->  y[0] = row lookback
        X[1] = rows 1 .. lookback     ->  y[1] = row lookback+1
        ...

    Parameters
    ----------
    data : pandas.DataFrame
        Feature table, one row per time step, already in chronological order.
    lookback : int, default 60
        Number of past rows in each input window. Must be at least 1.
    target_col : str, default 'Close'
        Column whose next-step value is used as the label.

    Returns
    -------
    X : numpy.ndarray, shape (n_samples, lookback, n_features)
    y : numpy.ndarray, shape (n_samples,)
        n_samples is max(len(data) - lookback, 0). When `data` is too short for
        a single window, X is still rank 3 with zero samples.

    Raises
    ------
    ValueError
        If `lookback` is smaller than 1.
    KeyError
        If `target_col` is not a column of `data`.
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")

    values = data.to_numpy()
    target_idx = data.columns.get_loc(target_col)

    # Row indices of every window, built in one shot:
    # window_idx[k] == [k, k+1, ..., k+lookback-1]
    n_samples = max(len(values) - lookback, 0)
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]

    # Integer-array indexing copies, so X does not alias the DataFrame's buffer.
    X = values[window_idx]
    # The label for window k is the target value at row k + lookback.
    y = values[lookback:, target_idx].copy()

    return X, y

# Turn each split into (window, next-step target) pairs.
# Windows are built per split, so no window straddles the train/test boundary.
lookback = 60
X_train, y_train = create_sequences(train_df, lookback=lookback)
X_test, y_test = create_sequences(test_df, lookback=lookback)

In [9]:
# Sanity check: report the shape of every array produced above, one per line
print("\n".join(
    f"{label} shape: {array.shape}"
    for label, array in (
        ("X_train", X_train),
        ("y_train", y_train),
        ("X_test", X_test),
        ("y_test", y_test),
    )
))

X_train shape: (1328, 60, 7)
y_train shape: (1328,)
X_test shape: (287, 60, 7)
y_test shape: (287,)


In [10]:
# Persist the LSTM-ready arrays and the fitted scaler under ../data
import os
import joblib

# Make sure the output folder exists before writing anything
os.makedirs('../data', exist_ok=True)

# Each array is written to ../data/<name>_<ticker>.npy
for array_name, array in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    np.save(f'../data/{array_name}_{TICKER.lower()}.npy', array)

# Keep the fitted scaler so predictions can be mapped back to price units later.
# NOTE(review): the extension is '.pk1' (digit one), possibly meant to be '.pkl'.
# Left unchanged because other notebooks may load this exact filename — confirm.
joblib.dump(scaler, f'../data/scaler_{TICKER.lower()}.pk1')

print("Data saved successfully!")

Data saved successfully!
