# Tesla Stock Price Prediction - Feature Engineering

## Objective
- Create sequences for 1-day, 5-day, and 10-day predictions
- Apply MinMaxScaler for normalization
- Prepare train/test splits maintaining temporal order

In [11]:
%pip install scikit-learn

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from pathlib import Path

# Project layout, resolved relative to the notebook's working directory.
PROJECT_ROOT = Path('..').resolve()
OUTPUT_PATH = PROJECT_ROOT / 'data' / 'processed'
DATA_PATH = OUTPUT_PATH / 'tsla_cleaned.csv'
MODELS_PATH = PROJECT_ROOT / 'models'

# Prefer the cleaned CSV; when it is missing, rebuild an equivalent frame
# from the raw CSV (date index, chronological order, gaps filled).
if not DATA_PATH.exists():
    df = (
        pd.read_csv(PROJECT_ROOT / 'data' / 'raw' / 'TSLA.csv')
        .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
        .set_index('Date')
        .sort_index()
        .ffill()
        .bfill()
    )
else:
    df = pd.read_csv(DATA_PATH, index_col='Date', parse_dates=True)

print("Data shape:", df.shape)
df.head()


[notice] A new release of pip is available: 24.0 -> 26.0.1
[notice] To update, run: C:\Users\LENOVO\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting scikit-learn
  Downloading scikit_learn-1.8.0-cp311-cp311-win_amd64.whl.metadata (11 kB)
Collecting scipy>=1.10.0 (from scikit-learn)
  Downloading scipy-1.17.0-cp311-cp311-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/61.0 kB ? eta -:--:--
     ------ --------------------------------- 10.2/61.0 kB ? eta -:--:--
     ---------------------------------------- 61.0/61.0 kB 1.1 MB/s eta 0:00:00
Collecting joblib>=1.3.0 (from scikit-learn)
  Using cached joblib-1.5.3-py3-none-any.whl.metadata (5.5 kB)
Collecting threadpoolctl>=3.2.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.8.0-cp311-cp311-win_amd64.whl (8.1 MB)
   ---------------------------------------- 0.0/8.1 MB ? eta -:--:--
   - -------------------------------------- 0.2/8.1 MB 6.1 MB/s eta 0:00:02
   --- ------------------------------------ 0.6/8.1 MB 8.0 MB/s eta 0:00:01
   ----- ---------------------------------- 

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-06-29,19.0,25.0,17.540001,23.889999,23.889999,18766300
2010-06-30,25.790001,30.42,23.299999,23.83,23.83,17187100
2010-07-01,25.0,25.92,20.27,21.959999,21.959999,8218800
2010-07-02,23.0,23.1,18.709999,19.200001,19.200001,5139800
2010-07-06,20.0,20.0,15.83,16.110001,16.110001,6866900


## Create Sequences for LSTM/RNN

In [12]:
def create_sequences(data, lookback=60, forecast_horizon=1):
    """
    Create input-output sequences for time-series prediction.

    Each sample uses `lookback` consecutive rows as input and the row that
    lies `forecast_horizon` steps after the last input row as its target.

    Parameters:
    - data: array-like of observations ordered in time, first axis = time
      (e.g. a scaled closing-price array of shape (n, 1))
    - lookback: number of past steps to use as input (must be >= 1)
    - forecast_horizon: how many steps ahead to predict (must be >= 1)

    Returns:
    - X: input sequences, shape (samples, lookback, *data.shape[1:])
    - y: target values, shape (samples, *data.shape[1:])
      For (n, 1) input this is X: (samples, lookback, 1), y: (samples, 1).
      When the series is too short to build a single sample, both arrays
      are empty but keep the trailing dimensions described above.

    Raises:
    - ValueError: if lookback or forecast_horizon is smaller than 1
    """
    if lookback < 1:
        raise ValueError(f"lookback must be >= 1, got {lookback}")
    if forecast_horizon < 1:
        # A horizon of 0 would make the target the last value of the input window.
        raise ValueError(f"forecast_horizon must be >= 1, got {forecast_horizon}")

    data = np.asarray(data)
    n_samples = len(data) - lookback - forecast_horizon + 1

    if n_samples <= 0:
        # Not enough history: return empty arrays that still carry the sample layout.
        empty_X = np.empty((0, lookback) + data.shape[1:], dtype=data.dtype)
        empty_y = np.empty((0,) + data.shape[1:], dtype=data.dtype)
        return empty_X, empty_y

    # Row k of window_idx holds the time indices k .. k + lookback - 1.
    window_idx = np.arange(n_samples)[:, None] + np.arange(lookback)[None, :]
    X = data[window_idx]

    # Target for window k sits at index k + lookback + forecast_horizon - 1.
    y = data[lookback + forecast_horizon - 1:].copy()
    return X, y

In [13]:
LOOKBACK = 60  # Use 60 days of history
TEST_RATIO = 0.2  # 80% train, 20% test

# Model the 'Close' column; reshape to (n_rows, 1) because the scaler expects 2-D input
close_prices = df['Close'].values.reshape(-1, 1)

# Fit the scaler on the leading (1 - TEST_RATIO) share of rows only, so the
# min/max of the held-out period never leak into the training features.
# The whole series is then transformed with those training statistics, which
# means later values may fall outside [0, 1].
n_fit_rows = int(len(close_prices) * (1 - TEST_RATIO))
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(close_prices[:n_fit_rows])
scaled_data = scaler.transform(close_prices)

# Save scaler for inference
import joblib
MODELS_PATH.mkdir(parents=True, exist_ok=True)
joblib.dump(scaler, MODELS_PATH / 'scaler.pkl')
print("Scaler saved.")

Scaler saved.


In [14]:
horizons = [1, 5, 10]

# The output directory may not exist yet (e.g. when the data was loaded from
# the raw CSV fallback), and np.save does not create missing directories.
OUTPUT_PATH.mkdir(parents=True, exist_ok=True)

for horizon in horizons:
    X, y = create_sequences(scaled_data, lookback=LOOKBACK, forecast_horizon=horizon)

    # Train/test split - maintain temporal order (no shuffling: earliest samples train)
    split_idx = int(len(X) * (1 - TEST_RATIO))
    X_train, X_test = X[:split_idx], X[split_idx:]
    y_train, y_test = y[:split_idx], y[split_idx:]
    assert len(X_train) + len(X_test) == len(X), "split lost samples"

    # Arrays are stored exactly as produced; y is not flattened here
    np.save(OUTPUT_PATH / f'X_train_{horizon}d.npy', X_train)
    np.save(OUTPUT_PATH / f'X_test_{horizon}d.npy', X_test)
    np.save(OUTPUT_PATH / f'y_train_{horizon}d.npy', y_train)
    np.save(OUTPUT_PATH / f'y_test_{horizon}d.npy', y_test)

    print(f"{horizon}-day: X_train {X_train.shape}, X_test {X_test.shape}")

1-day: X_train (1884, 60, 1), X_test (472, 60, 1)
5-day: X_train (1881, 60, 1), X_test (471, 60, 1)
10-day: X_train (1877, 60, 1), X_test (470, 60, 1)
