# Cross Validation

In [None]:
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import TimeSeriesSplit
from tensorflow.keras.metrics import MeanSquaredError, RootMeanSquaredError, MeanAbsoluteError
from src.LongShortTermMemory import LSTMModel
from sklearn.preprocessing import MinMaxScaler

from src.StockDataProcessor import StockDataProcessor
from src.LongShortTermMemory import LSTMModel

In [None]:
class CrossValidator:
    """
    Performs time series cross-validation for LSTM models.

    Folds come from ``TimeSeriesSplit``. For every fold the closing prices
    are scaled with a fresh ``MinMaxScaler`` fitted on the training part
    only, turned into sliding windows, and used to train and evaluate a
    newly created model.
    """

    def __init__(self, n_splits=5):
        """
        Initializes the cross-validator.

        Args:
            n_splits: The number of splits for cross-validation.
        """
        self.n_splits = n_splits
        self.tscv = TimeSeriesSplit(n_splits=self.n_splits)

    def cross_val_score(self, model_creator, data, time_steps, epochs=50, batch_size=32):
        """
        Performs cross-validation and returns the evaluation scores.

        Args:
            model_creator: A zero-argument callable that returns a compiled
                Keras model. It is called once per fold so every fold starts
                from a fresh model.
            data: The raw closing prices: a DataFrame with a 'Close' column,
                a Series, or a 1-D (or single-column 2-D) array-like.
                The input is never modified.
            time_steps: The number of time steps for creating sequences.
            epochs: The number of epochs for training.
            batch_size: The batch size for training.

        Returns:
            list: A list of dictionaries containing evaluation metrics
            ('mse', 'rmse', 'mae') for each fold, computed on scaled prices.

        Raises:
            KeyError: If ``data`` is a DataFrame without a 'Close' column.
            ValueError: If ``data`` is not one-dimensional price data, or a
                fold holds too few observations to build a single window.
        """
        close = self._extract_close(data)
        all_scores = []

        for fold, (train_index, test_index) in enumerate(self.tscv.split(close)):
            print(f"Fold: {fold + 1}")

            # A window needs `time_steps` inputs plus one target value.
            if len(train_index) <= time_steps or len(test_index) <= time_steps:
                raise ValueError(
                    f"Fold {fold + 1} has {len(train_index)} training and "
                    f"{len(test_index)} test observations; each part needs more "
                    f"than time_steps={time_steps} to build a sequence."
                )

            # --- Scale This Fold ---
            # Fit on the training part only so no test information leaks
            # into the scaling; work on copies, never on the caller's data.
            scaler = MinMaxScaler()
            train_scaled = scaler.fit_transform(close[train_index].reshape(-1, 1)).ravel()
            test_scaled = scaler.transform(close[test_index].reshape(-1, 1)).ravel()

            # --- Create Sequences for This Fold ---
            X_train, y_train = self._create_sequences(train_scaled, time_steps)
            X_test, y_test = self._create_sequences(test_scaled, time_steps)

            # Keras recurrent layers take (samples, time_steps, features);
            # the single feature here is the scaled closing price.
            X_train = X_train[..., np.newaxis]
            X_test = X_test[..., np.newaxis]

            # --- Model Training and Evaluation ---
            model = model_creator()
            model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, verbose=0)
            all_scores.append(self.evaluate_model(model, X_test, y_test))

        return all_scores

    @staticmethod
    def _extract_close(data):
        """
        Returns the closing prices in ``data`` as a new 1-D float array.
        """
        if isinstance(data, pd.DataFrame):
            values = data['Close'].to_numpy()
        else:
            values = np.asarray(data)
            # Accept a single-column 2-D array as a plain price vector.
            if values.ndim == 2 and values.shape[1] == 1:
                values = values[:, 0]

        if values.ndim != 1:
            raise ValueError(
                "data must be a DataFrame with a 'Close' column, a Series, "
                f"or a 1-D array of prices; got shape {values.shape}."
            )
        # astype always copies here, which keeps the caller's data untouched.
        return values.astype(float)

    def _create_sequences(self, data, time_steps):
        """
        Helper function to create time series sequences.

        Args:
            data: Array-like or pandas Series, ordered in time along its
                first axis.
            time_steps: Length of every input window; must be at least 1.

        Returns:
            tuple: ``(X, y)`` where ``X[k]`` holds the ``time_steps``
            consecutive values starting at position ``k`` and ``y[k]`` is the
            value right after that window. Both are empty when ``data`` has
            no more than ``time_steps`` entries.

        Raises:
            ValueError: If ``time_steps`` is smaller than 1.
        """
        if time_steps < 1:
            raise ValueError(f"time_steps must be at least 1, got {time_steps}.")

        # Converting to an array makes every access positional, so a pandas
        # index (dates, or integers that do not start at 0) cannot interfere.
        values = np.asarray(data)
        n_windows = max(len(values) - time_steps, 0)

        # Row k of `positions` lists the positions k .. k + time_steps - 1.
        positions = np.arange(n_windows)[:, np.newaxis] + np.arange(time_steps)[np.newaxis, :]
        return values[positions], values[time_steps:]

    def evaluate_model(self, model, X_test, y_test):
        """
        Evaluates the model and returns a dictionary of metrics.

        Args:
            model: A fitted model exposing ``predict``.
            X_test: Inputs passed to ``model.predict``.
            y_test: Target values matching ``X_test``.

        Returns:
            dict: Python floats under the keys 'mse', 'rmse' and 'mae'.
        """
        y_pred = np.asarray(model.predict(X_test))
        y_true = np.asarray(y_test)

        # Predictions usually come back as (n, 1) while targets are (n,);
        # give both the same shape so the metrics compare element-wise.
        if y_true.shape != y_pred.shape and y_true.size == y_pred.size:
            y_true = y_true.reshape(y_pred.shape)

        mse = float(MeanSquaredError()(y_true, y_pred).numpy())
        rmse = float(RootMeanSquaredError()(y_true, y_pred).numpy())
        mae = float(MeanAbsoluteError()(y_true, y_pred).numpy())

        return {'mse': mse, 'rmse': rmse, 'mae': mae}

In [None]:
# --- Training settings ---
EPOCHS = 100
BATCH_SIZE = 10
TIME_STEPS = 60  # window length handed to the data loader and the cross-validator

# --- Data location and date range ---
FOLDER_PREFIX = "data/min/"
STOCK_START_DATE = pd.to_datetime("2017-06-07 15:59:00")
STOCK_VALIDATION_DATE = pd.to_datetime("2024-06-09 09:30:00")
STOCK_END_DATE = pd.to_datetime("2024-06-12 15:59:00")
TOKEN = "GOOG"
RUN_FOLDER = f"{FOLDER_PREFIX}{TOKEN}/"
WORK_DIR = os.path.join(os.getcwd(), RUN_FOLDER)
CSV_FILE = os.path.join(WORK_DIR, "data.csv")

# The project folder is the working directory itself; both names are kept
# because later cells may refer to either.
PROJECT_FOLDER = WORK_DIR
# exist_ok avoids the check-then-create race of testing for the path first.
os.makedirs(PROJECT_FOLDER, exist_ok=True)

In [None]:
# Load CSV_FILE through the project's data processor. The call returns three
# pairs, unpacked here as (x_train, y_train), (x_test, y_test) and
# (training_data, test_data).
# NOTE(review): presumably the train/test split is made at
# STOCK_VALIDATION_DATE; confirm in StockDataProcessor.
(x_train, y_train), (x_test, y_test), (training_data, test_data) = (
    StockDataProcessor.load_csv_transform_to_numpy(
        TIME_STEPS,
        CSV_FILE,
        STOCK_VALIDATION_DATE,
    )
)

In [None]:
# Create a CrossValidator instance
cross_validator = CrossValidator(n_splits=5)

# Perform cross-validation (pass raw data and time_steps).
# cross_val_score expects the raw price data with a 'Close' column and builds
# the training windows itself, so it receives training_data rather than
# x_train.
# NOTE(review): assumes training_data is a DataFrame with a 'Close' column and
# that LSTMModel.create can be called without arguments; confirm both against
# StockDataProcessor and LSTMModel.
all_scores = cross_validator.cross_val_score(
    LSTMModel.create,
    training_data,
    time_steps=TIME_STEPS,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

# Print the results for each fold
for fold, scores in enumerate(all_scores):
    print(f"Fold {fold + 1}: MSE={scores['mse']:.4f}, RMSE={scores['rmse']:.4f}, MAE={scores['mae']:.4f}")

# Calculate and print the average scores across all folds
average_scores = {
    metric: np.mean([fold_scores[metric] for fold_scores in all_scores])
    for metric in all_scores[0]
}
print(f"Average: MSE={average_scores['mse']:.4f}, RMSE={average_scores['rmse']:.4f}, MAE={average_scores['mae']:.4f}")