In [17]:
import torch
import torch.nn as nn
import yfinance as yf
import numpy as np
from transformers import AutoModel, AutoTokenizer

In [18]:
class TimeSeriesProcessor:
    def __init__(self, ticker, start, end, column="Close"):
        self.ticker = ticker
        self.start = start
        self.end = end
        self.column = column
        self.data = self._load_data()

    def _load_data(self):
        df = yf.download(self.ticker, start=self.start, end=self.end)
        return df[[self.column]].dropna().values

    def instance_normalize(self):
        mean = np.mean(self.data, axis=0)
        std = np.std(self.data, axis=0)
        self.data = (self.data - mean) / (std + 1e-8)
        return self

    def create_patches(self, window_size=30, stride=1):
        patches = [
            self.data[i : i + window_size] 
            for i in range(0, len(self.data) - window_size + 1, stride)
        ]
        print(f"Shape of the patches : {np.array(patches).shape}")
        return np.array(patches)
    
processor = TimeSeriesProcessor(ticker="AAPL", start="2023-01-01", end="2024-01-01")
processor.instance_normalize()
patches = processor.create_patches(window_size=30, stride=1)
patches_tensor = torch.tensor(patches, dtype=torch.float32).squeeze(-1)
print("Patches Tensor Shape:", patches_tensor.shape)
print(f"First five patches : {patches_tensor[:5]}")

[*********************100%***********************]  1 of 1 completed

Shape of the patches : (221, 30, 1)
Patches Tensor Shape: torch.Size([221, 30])
First five patches : tensor([[-2.7332, -2.6598, -2.7360, -2.4742, -2.4440, -2.4110, -2.2539, -2.2584,
         -2.1816, -2.1144, -2.1560, -2.1525, -2.0045, -1.8201, -1.7393, -1.7774,
         -1.6579, -1.5457, -1.7125, -1.6391, -1.5742, -1.2674, -1.0579, -1.2155,
         -1.0493, -1.2047, -1.2645, -1.2434, -1.0815, -1.1186],
        [-2.6598, -2.7360, -2.4742, -2.4440, -2.4110, -2.2539, -2.2584, -2.1816,
         -2.1144, -2.1560, -2.1525, -2.0045, -1.8201, -1.7393, -1.7774, -1.6579,
         -1.5457, -1.7125, -1.6391, -1.5742, -1.2674, -1.0579, -1.2155, -1.0493,
         -1.2047, -1.2645, -1.2434, -1.0815, -1.1186, -0.9971],
        [-2.7360, -2.4742, -2.4440, -2.4110, -2.2539, -2.2584, -2.1816, -2.1144,
         -2.1560, -2.1525, -2.0045, -1.8201, -1.7393, -1.7774, -1.6579, -1.5457,
         -1.7125, -1.6391, -1.5742, -1.2674, -1.0579, -1.2155, -1.0493, -1.2047,
         -1.2645, -1.2434, -1.0815, -1.118




In [19]:
class PatchEmbedder(nn.Module):
    def __init__(self, window_size, embed_dim):
        super().__init__()
        self.projection = nn.Linear(window_size, embed_dim)  

    def forward(self, x):
        x = x.squeeze(-1)  # Remove the last dimension -> (num_patches, window_size)
        return self.projection(x)  # Output shape: (num_patches, embed_dim)

In [20]:
class Phi3Embedder:
    def __init__(self, model_name="microsoft/phi-2"):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name)

    def get_word_embeddings(self, words):
        self.tokenizer.pad_token = self.tokenizer.eos_token
        inputs = self.tokenizer(words, return_tensors="pt", padding=True, truncation=True)
        print("Hidden size of model:", self.model.config.hidden_size)
        
        with torch.no_grad():
            outputs = self.model(**inputs)  # Output is (batch_size, seq_len, hidden_dim)
        
        # Use CLS token (first token) OR mean pooling
        cls_embeddings = outputs.last_hidden_state[:, 0, :]  # (batch_size, embedding_dim)

        print("Fixed Word Embeddings Shape:", cls_embeddings.shape)  # Debugging
        return cls_embeddings  # Should now be (num_words, 256)
phi3 = Phi3Embedder()
words = ["growth", "volatility", "trend", "seasonality", "market"]
word_embeddings = phi3.get_word_embeddings(words)
print("Patches Tensor Shape:", patches_tensor.shape)  # Should be [221, 30]
print("Word Embeddings Shape:", word_embeddings.shape)  # Should be [5, 256]

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]


Hidden size of model: 2560
Fixed Word Embeddings Shape: torch.Size([5, 2560])
Patches Tensor Shape: torch.Size([221, 30])
Word Embeddings Shape: torch.Size([5, 2560])


In [21]:
class TimeSeriesToLanguage(nn.Module):
    def __init__(self, input_dim, embed_dim, num_heads=4):
        super().__init__()
        self.patch_embedder = PatchEmbedder(input_dim, embed_dim)
        self.text_linear = nn.Linear(embed_dim, embed_dim)
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.output_linear = nn.Linear(embed_dim, embed_dim)

    def forward(self, time_series_patches, text_embeddings):
        time_series_embeddings = self.patch_embedder(time_series_patches)  
        # Shape: (num_patches, embed_dim)
        time_series_embeddings = time_series_embeddings.unsqueeze(1)  
        # Shape: (num_patches, 1, embed_dim)
        text_prototypes = self.text_linear(text_embeddings)  
        # Shape: (num_words, embed_dim)
        text_prototypes = text_prototypes.unsqueeze(1)  
        # Shape: (num_words, 1, embed_dim)
        # Ensure correct shape for MultiheadAttention
        time_series_embeddings = time_series_embeddings.permute(1, 0, 2)  
        # Shape: (1, num_patches, embed_dim)
        text_prototypes = text_prototypes.permute(1, 0, 2)  
        # Shape: (1, num_words, embed_dim)
        # Ensure `seq_len` is the same for both
        if time_series_embeddings.shape[1] != text_prototypes.shape[1]:
            min_seq_len = min(time_series_embeddings.shape[1], text_prototypes.shape[1])
            time_series_embeddings = time_series_embeddings[:, :min_seq_len, :]
            text_prototypes = text_prototypes[:, :min_seq_len, :]

        attn_output, _ = self.attention(time_series_embeddings, text_prototypes, text_prototypes)

        return self.output_linear(attn_output.squeeze(0))  # (num_patches, embed_dim)
model = TimeSeriesToLanguage(input_dim=30, embed_dim=2560)
language_representation = model(patches_tensor, word_embeddings)

print("Final Language Representation Shape:", language_representation)


Final Language Representation Shape: tensor([[ 0.0256, -0.1249,  0.0323,  ...,  0.0732,  0.4386, -0.0759],
        [-0.1686,  0.1043,  0.0582,  ..., -0.1155, -0.0989, -0.1598],
        [-0.0334,  0.1332, -0.1042,  ...,  0.0635, -0.1143,  0.2024],
        [ 0.0538,  0.0452,  0.1771,  ...,  0.0335,  0.0884,  0.4106],
        [ 0.1886, -0.1491,  0.1947,  ...,  0.1327,  0.1777,  0.2488]],
       grad_fn=<AddmmBackward0>)


In [22]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel

# Load Phi-2 tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
phi2_model = AutoModel.from_pretrained("microsoft/phi-2")

def decode_embeddings_to_words(embeddings, phi2_model, tokenizer):
    """
    Convert model-generated embeddings into words by finding the closest tokens in Phi-2 vocabulary.
    """
    with torch.no_grad():
        # Get the token embeddings from Phi-2
        token_embeddings = phi2_model.get_input_embeddings().weight  # (vocab_size, embed_dim)

        # Compute cosine similarity
        similarities = F.cosine_similarity(embeddings.unsqueeze(1), token_embeddings.unsqueeze(0), dim=-1)  
        closest_tokens = similarities.argmax(dim=-1)  # Get index of the closest token

        # Decode token IDs into words
        decoded_words = tokenizer.convert_ids_to_tokens(closest_tokens.tolist())
        
    return " ".join(decoded_words)

# Convert the final language representation
decoded_text = decode_embeddings_to_words(language_representation, phi2_model, tokenizer)

print("Generated Text:", decoded_text)


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.15it/s]


Generated Text: Ġenc cern ĠClaw Ġdisaster Ġcommission
