# Import Libraries

In [1]:
import yfinance as yf
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

from data_prep import DataPreparation

ModuleNotFoundError: No module named 'yfinance'

# Fetch Data

In [None]:
data_prep = DataPreparation()
ticker = "AAPL"
start_date = "2022-01-01"
end_date = "2023-01-01"

data = data_prep.fetch_financial_data(ticker, start_date, end_date)
processed_data = data_prep.preprocess_data(data)

print(processed_data.head())

# Prepare Sequences

# Define the Autoencoder Model

In [None]:
class Autoencoder(nn.Module):
    def __init__(self, sequence_length=10, num_features=10):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(sequence_length * num_features, 128),
            nn.ReLU(True),
            nn.Linear(128, 64),
            nn.ReLU(True),
            nn.Linear(64, 12),
            nn.ReLU(True),
            nn.Linear(12, 3)  # Final output of encoder is of size 3
        )
        self.decoder = nn.Sequential(
            nn.Linear(3, 12),  # First layer of decoder must match the final output size of encoder
            nn.ReLU(True),
            nn.Linear(12, 64),
            nn.ReLU(True),
            nn.Linear(64, 128),
            nn.ReLU(True),
            nn.Linear(128, sequence_length * num_features),
            nn.Tanh()
        )

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x

    def encode(self, x):
        return self.encoder(x)

# Convert Sequences to Tensor and Create DataLoader

In [None]:
sequences = data_prep.create_sequences(processed_data)
sequences.shape

# Ensure sequences is a PyTorch tensor; this is necessary if sequences is initially a numpy array or a list
if not isinstance(sequences, torch.Tensor):
    sequences = torch.tensor(sequences, dtype=torch.float)

# Split the sequences into training and validation sets before flattening
sequences_train, sequences_val = train_test_split(sequences, test_size=0.2, random_state=42)

# Convert sequences_train and sequences_val to PyTorch tensors if not already
sequences_train = sequences_train.clone().detach()
sequences_val = sequences_val.clone().detach()

# Flatten the sequences
sequences_train_flat = sequences_train.view(sequences_train.shape[0], -1)
sequences_val_flat = sequences_val.view(sequences_val.shape[0], -1)

# Create TensorDatasets for both training and validation sets using the flattened sequences
train_dataset = TensorDataset(sequences_train_flat, sequences_train_flat)
val_dataset = TensorDataset(sequences_val_flat, sequences_val_flat)

# Create DataLoaders for both sets
train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=32)

# Output the shape of the input and target tensors in train_dataset, and the total number of items
print("Shape of the input tensor in train_dataset:", train_dataset.tensors[0].shape)
print("Shape of the target tensor in train_dataset:", train_dataset.tensors[1].shape)
print("Total number of items in train_dataset:", len(train_dataset))

# Similarly, for the validation dataset if needed
print("Shape of the input tensor in val_dataset:", val_dataset.tensors[0].shape)
print("Shape of the target tensor in val_dataset:", val_dataset.tensors[1].shape)
print("Total number of items in val_dataset:", len(val_dataset))

# Training Loop

In [2]:
# Early stopping criteria
patience = 5  # Number of epochs to wait for improvement before stopping
min_delta = 0.001  # Minimum change to qualify as an improvement
best_val_loss = float('inf')  # Initialize best validation loss to infinity
counter = 0  # Initialize counter for epochs without improvement

# Initialize the model
model = Autoencoder(sequence_length=10, num_features=data_scaled.shape[1])
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 50
for epoch in range(epochs):
    model.train()
    train_loss = 0.0
    for inputs, _ in train_dataloader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * inputs.size(0)
    train_loss /= len(train_dataloader.dataset)
    
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, _ in val_dataloader:
            outputs = model(inputs)
            loss = criterion(outputs, inputs)
            val_loss += loss.item() * inputs.size(0)
    val_loss /= len(val_dataloader.dataset)
    
    print(f'Epoch {epoch+1}, Training Loss: {train_loss:.4f}, Validation Loss: {val_loss:.4f}')
    
    # Early stopping logic
    if best_val_loss - val_loss > min_delta:
        best_val_loss = val_loss
        counter = 0  # Reset counter if there's an improvement
    else:
        counter += 1  # Increment counter if no improvement
    if counter >= patience:
        print("Early stopping triggered")
        break  # Exit the training loop

NameError: name 'Autoencoder' is not defined

# Save the Model

In [None]:
# Define your model and optimizer
model = Autoencoder(sequence_length=10, num_features=data_scaled.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Assuming your model is already trained and ready to be saved

# Specify the path to save the model
model_save_path = 'models/autoencoder_model.pth'
optimizer_save_path = 'models/autoencoder_optimizer.pth'

# Save the model state dictionary
torch.save(model.state_dict(), model_save_path)

# Optionally, save the optimizer state dictionary
torch.save(optimizer.state_dict(), optimizer_save_path)

print(f"Model saved to {model_save_path}")
print(f"Optimizer state saved to {optimizer_save_path}")

# Load and Test the Saved Model

In [None]:
# Initialize model and optimizer with the same architecture and parameters as when you saved them
loaded_model = Autoencoder(sequence_length=10, num_features=data_scaled.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Load the model and optimizer state dictionaries
loaded_model.load_state_dict(torch.load(model_save_path))
optimizer.load_state_dict(torch.load(optimizer_save_path))

print("Model and optimizer state loaded successfully.")

In [None]:
loaded_model.eval()
criterion = nn.MSELoss()  # Assuming MSE loss was used during training
val_loss = 0.0

with torch.no_grad():  # No gradients needed for evaluation
    for inputs, _ in val_dataloader:
        outputs = model(inputs)
        loss = criterion(outputs, inputs)  # Compare model output vs input
        val_loss += loss.item() * inputs.size(0)
val_loss /= len(val_dataloader.dataset)

print(f'Validation Loss: {val_loss:.4f}')

In [None]:
# Assuming the model is already loaded and in evaluation mode
model.eval()

# Create a single input example with random data
# This example assumes num_features=10 (from the defined model) and sequence_length=10
single_input = torch.randn(1, 10 * 10)  # Batch size of 1

# Use the encode method to get the encoded representation
encoded_features = model.encode(single_input)

print("Encoded Features:", single_input)
print("---------------------")
print("Shape of features:", single_input.shape)
print("---------------------")
print("Encoded features:", encoded_features)
print("---------------------")
print("Shape of encoded features:", encoded_features.shape)