# Embedding Document Store Demo

This notebook demonstrates the `EmbeddingDocumentStore` class for semantic similarity retrieval of financial documents.

In [None]:
from entropy.contexts.retrieval import EmbeddingDocumentStore, YFinanceFetcher
from entropy.utils.Seans_helpers import print_obj_map
import numpy as np

## Setup

Fetch news articles from yfinance for a few example tickers.

In [None]:
# Fetch real news from yfinance
# NOTE(review): this presumably performs live network requests, so the fetched
# articles (and every search result below) may differ between runs — confirm
# against YFinanceFetcher.
tickers = ["AAPL", "TSLA", "MSFT", "NVDA"]
fetcher = YFinanceFetcher()
# `texts` and `metadata` are passed together to `store.add_documents` when the
# store is populated; presumably parallel sequences with one metadata entry per
# article — verify against `fetch_news`.
texts, metadata = fetcher.fetch_news(tickers)

print(f"Fetched {len(texts)} articles across {len(tickers)} tickers")

## Create and Populate Store

Initialize the embedding document store and index the fetched articles.

In [None]:
# Create the store with its default settings and add the fetched article texts
# together with their metadata.
store = EmbeddingDocumentStore()
store.add_documents(texts, metadata)

# Report what the store now holds. The embedding dimension is read from the
# store object itself, while document count and tickers come from `get_stats()`.
stats = store.get_stats()
print(f"Documents: {stats['num_documents']}")
print(f"Embedding dimension: {store.dimension}")
print(f"Tickers: {', '.join(stats['tickers'])}")

## Search with Embeddings

Search for documents using semantic similarity. Unlike BM25, embeddings capture meaning and concepts, not just keywords. The score reported with each result is a distance, so lower values indicate closer matches.

In [None]:
def print_search_result_summary(results, query=None):
    """Print a numbered, human-readable summary of search results.

    Args:
        results: Iterable of result dicts. Each entry must provide
            ``result["document"]["metadata"]`` with ``"ticker"`` and
            ``"title"`` keys, plus a numeric ``result["score"]``.
        query: Optional query text. When given, it is echoed as a header
            above the results.

    The score is printed as a distance, so smaller values are better matches.
    When ``results`` is empty, a "No results found." line is printed instead
    of the ranking and its legend.

    Raises:
        KeyError: If a result is missing one of the keys listed above.
    """
    if query is not None:
        print(f"\nQuery: '{query}'\n")

    # Materialize once so generators are supported and emptiness can be checked.
    results = list(results)
    if not results:
        # Without this, only the legend below would print, which reads as if
        # results had been shown.
        print("No results found.")
        return

    for rank, result in enumerate(results, start=1):
        doc = result["document"]
        score = result["score"]
        print(f"{rank}. [{doc['metadata']['ticker']}] {doc['metadata']['title']}")
        print(f"   Distance: {score:.4f}")  # lower = better

    print("Note: lower distances are better matches")

In [None]:
# A conceptual query phrased as a description rather than as specific keywords.
query = "companies with strong cloud business growth"

# Ask the store for k=5 results; each result is expected to carry the
# "document" and "score" entries that the summary printer reads.
results = store.search(query, k=5)

print_search_result_summary(results, query=query)

## Semantic Understanding

Embeddings excel at understanding concepts without exact keyword matches. Try different conceptual queries:

In [None]:
# Test semantic understanding with queries that are not expected to have
# exact keyword matches in the article text.
semantic_queries = [
    "electric vehicle manufacturers",
    "artificial intelligence chip makers",
    "quarterly financial performance",
]

# Use dedicated loop names so the notebook-level `query` / `results` set by
# other cells are not silently overwritten by the last loop iteration.
for semantic_query in semantic_queries:
    semantic_results = store.search(semantic_query, k=2)
    print_search_result_summary(semantic_results, query=semantic_query)
    print()

## Filter by Ticker

Search within a specific ticker symbol to narrow results.

In [None]:
# Deliberately generic query: the ticker filter, not the wording, narrows the results.
# Note that `query` assigned here is also the text embedded by the
# embedding-inspection cell further down.
query = "latest developments"
# `filter_ticker` presumably restricts candidates to documents whose metadata
# ticker is "TSLA" — confirm against EmbeddingDocumentStore.search.
results = store.search(query, k=5, filter_ticker="TSLA")

print_search_result_summary(results, query=query)

## Inspect Embeddings

Look at the dense vector representation the embedding model produces for a piece of text — here, the most recent query string.

In [None]:
# Get embedding for query
# `query` is whatever the most recently executed cell assigned; on a
# top-to-bottom run that is the ticker-filter query.
# NOTE(review): this reaches into `store.model` directly and assumes `encode`
# accepts a list of strings and returns one vector per input (hence the `[0]`)
# — confirm against the store's embedding model.
sample_embedding = store.model.encode([query])[0]

print(f"Embedded text: \"{query}\"")
# `.shape` is read below, so the embedding is expected to be array-like.
print(f"Embedding type: {type(sample_embedding)} | Embedding shape: {sample_embedding.shape}")
print(f"\nFirst 10 dimensions: {sample_embedding[:10]}")