In [1]:
import yfinance as yf
import numpy as np
import torch

class TimeSeriesProcessor:
    def __init__(self, ticker, start, end, column="Close"):
        self.ticker = ticker
        self.start = start
        self.end = end
        self.column = column
        self.data = self._load_data()

    def _load_data(self):
        """Fetch stock price data from Yahoo Finance."""
        df = yf.download(self.ticker, start=self.start, end=self.end)
        return df[[self.column]].dropna().values  # Extract chosen column as numpy array

    def instance_normalize(self):
        """Normalize time series data (zero mean, unit variance)."""
        mean = np.mean(self.data, axis=0)
        std = np.std(self.data, axis=0)
        self.data = (self.data - mean) / (std + 1e-8)  # Prevent division by zero
        return self

    def create_patches(self, window_size=30, stride=1):
        """Create overlapping patches (time series windows)."""
        patches = [
            self.data[i : i + window_size] 
            for i in range(0, len(self.data) - window_size + 1, stride)
        ]
        return np.array(patches)

# Usage Example
if __name__ == "__main__":
    processor = TimeSeriesProcessor(ticker="AAPL", start="2023-01-01", end="2024-01-01")
    processor.instance_normalize()
    patches = processor.create_patches(window_size=30, stride=1)

    # Convert to PyTorch tensor
    patches_tensor = torch.tensor(patches, dtype=torch.float32)
    print("Patched Data Shape:", patches_tensor.shape)  # (num_patches, window_size, 1)


YF.download() has changed argument auto_adjust default to True


[*********************100%***********************]  1 of 1 completed

Patched Data Shape: torch.Size([221, 30, 1])





In [14]:
import torch
import torch.nn as nn
import yfinance as yf
import numpy as np
from transformers import AutoModel, AutoTokenizer

# -------------------- Step 1: Data Preprocessing -------------------- #
class TimeSeriesProcessor:
    def __init__(self, ticker, start, end, column="Close"):
        self.ticker = ticker
        self.start = start
        self.end = end
        self.column = column
        self.data = self._load_data()

    def _load_data(self):
        df = yf.download(self.ticker, start=self.start, end=self.end)
        return df[[self.column]].dropna().values

    def instance_normalize(self):
        mean = np.mean(self.data, axis=0)
        std = np.std(self.data, axis=0)
        self.data = (self.data - mean) / (std + 1e-8)
        return self

    def create_patches(self, window_size=30, stride=1):
        patches = [
            self.data[i : i + window_size] 
            for i in range(0, len(self.data) - window_size + 1, stride)
        ]
        return np.array(patches)

# -------------------- Step 2: Patch Embedder -------------------- #
class PatchEmbedder(nn.Module):
    def __init__(self, window_size, embed_dim):
        super().__init__()
        self.projection = nn.Linear(window_size, embed_dim)  

    def forward(self, x):
        x = x.squeeze(-1)  # Remove the last dimension -> (num_patches, window_size)
        return self.projection(x)  # Output shape: (num_patches, embed_dim)



# -------------------- Step 3: Phi-3 Word Embeddings -------------------- #
class Phi3Embedder:
    def __init__(self, model_name="microsoft/phi-2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_word_embeddings(self, words):
        self.tokenizer.pad_token = self.tokenizer.eos_token
        inputs = self.tokenizer(words, return_tensors="pt", padding=True, truncation=True)
        print("Hidden size of model:", self.model.config.hidden_size)
        
        with torch.no_grad():
            outputs = self.model(**inputs)  # Output is (batch_size, seq_len, hidden_dim)
        
        # Use CLS token (first token) OR mean pooling
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (batch_size, embedding_dim)

        print("Fixed Word Embeddings Shape:", cls_embeddings.shape)  # Debugging
        return cls_embeddings  # Should now be (num_words, 256)


# -------------------- Step 4: Multi-Head Attention Fusion -------------------- #
class TimeSeriesToLanguage(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads=4):
        super().__init__()
        self.patch_embedder = PatchEmbedder(input_dim, embed_dim)
        self.text_linear = nn.Linear(embed_dim, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, time_series_patches, text_embeddings):
        time_series_embeddings = self.patch_embedder(time_series_patches)  
        # Shape: (num_patches, embed_dim)

        time_series_embeddings = time_series_embeddings.unsqueeze(1)  
        # Shape: (num_patches, 1, embed_dim)

        text_prototypes = self.text_linear(text_embeddings)  
        # Shape: (num_words, embed_dim)

        text_prototypes = text_prototypes.unsqueeze(1)  
        # Shape: (num_words, 1, embed_dim)

        # Ensure correct shape for MultiheadAttention
        time_series_embeddings = time_series_embeddings.permute(1, 0, 2)  
        # Shape: (1, num_patches, embed_dim)

        text_prototypes = text_prototypes.permute(1, 0, 2)  
        # Shape: (1, num_words, embed_dim)

        # Ensure `seq_len` is the same for both
        if time_series_embeddings.shape[1] != text_prototypes.shape[1]:
            min_seq_len = min(time_series_embeddings.shape[1], text_prototypes.shape[1])
            time_series_embeddings = time_series_embeddings[:, :min_seq_len, :]
            text_prototypes = text_prototypes[:, :min_seq_len, :]

        attn_output, _ = self.attention(time_series_embeddings, text_prototypes, text_prototypes)

        return self.output_linear(attn_output.squeeze(0))  # (num_patches, embed_dim)


# -------------------- Step 5: Execution -------------------- #
if __name__ == "__main__":
    # Load and process time series data
    processor = TimeSeriesProcessor(ticker="AAPL", start="2023-01-01", end="2024-01-01")
    processor.instance_normalize()
    patches = processor.create_patches(window_size=30, stride=1)
    
    # Convert to tensor (num_patches, window_size)
    patches_tensor = torch.tensor(patches, dtype=torch.float32).squeeze(-1)
    print("Patches Tensor Shape:", patches_tensor.shape)
    print("Patches Tensor Shape:", patches_tensor.shape)  # Should be [221, 30]

    # Get Phi-3 word embeddings
    phi3 = Phi3Embedder()
    words = ["growth", "volatility", "trend", "seasonality", "market"]
    word_embeddings = phi3.get_word_embeddings(words)
    print("Patches Tensor Shape:", patches_tensor.shape)  # Should be [221, 30]
    print("Word Embeddings Shape:", word_embeddings.shape)  # Should be [5, 256]

    # Transform time series into language representation
    model = TimeSeriesToLanguage(input_dim=30, embed_dim=2560)
    language_representation = model(patches_tensor, word_embeddings)

    print("Final Language Representation Shape:", language_representation.shape)


[*********************100%***********************]  1 of 1 completed


Patches Tensor Shape: torch.Size([221, 30])
Patches Tensor Shape: torch.Size([221, 30])


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.38it/s]


Hidden size of model: 2560
Fixed Word Embeddings Shape: torch.Size([5, 2560])
Patches Tensor Shape: torch.Size([221, 30])
Word Embeddings Shape: torch.Size([5, 2560])


NotImplementedError: Module [TimeSeriesToLanguage] is missing the required "forward" function