# BM25 Document Store Demo

This notebook demonstrates the `BM25DocumentStore` class for keyword-based retrieval of financial documents.

In [None]:
from entropy.contexts.retrieval import BM25DocumentStore, YFinanceFetcher
from entropy.utils.Seans_helpers import print_obj_map
import matplotlib.pyplot as plt
import numpy as np
import os
from dotenv import load_dotenv
from pathlib import Path

# Load variables from a local .env file into the environment (python-dotenv);
# the setup cell below reads DATA_PROCESSED_PATH via os.getenv.
# The trailing `;` keeps the call's return value out of the cell output.
load_dotenv();

## Setup

In [None]:
refresh = True  # Set to True to force rebuild the store

# The processed-data directory is taken from the environment.
data_processed_path = os.getenv("DATA_PROCESSED_PATH")
if data_processed_path is None:
    # os.getenv returns None for an unset variable, and Path(None) would then
    # fail with an opaque TypeError; report the actual problem instead.
    raise RuntimeError(
        "Environment variable DATA_PROCESSED_PATH is not set; "
        "define it (e.g. in your .env file) before running this notebook."
    )
store_path = Path(data_processed_path) / "bm25_store_demo.pkl"

# Either load an existing store holding previously fetched data, or fetch new
# data and save it into a newly created store (always the case when `refresh`
# is set).
fetch_new_data = refresh or not store_path.exists()

if fetch_new_data:
    print("New data will be fetched.")
else:
    print("Existing store found, no need to fetch new data.")

Fetch recent news articles from yfinance for a few example stocks.

In [None]:
if not fetch_new_data:
    print("This step unnecessary because existing store found.")
else:
    # Pull news articles from yfinance for a handful of example tickers.
    tickers = ["AAPL", "TSLA", "MSFT", "NVDA"]
    fetcher = YFinanceFetcher()
    texts, metadata = fetcher.fetch_news(tickers)

    n_articles, n_tickers = len(texts), len(tickers)
    print(f"Fetched {n_articles} articles across {n_tickers} tickers")

## Create and Populate Store

Initialize the BM25 document store and index the fetched articles.

In [None]:
if fetch_new_data:
    # New articles were requested: build a fresh store, index them, persist it.
    print("Creating new store and indexing documents...")
    store = BM25DocumentStore()
    store.add_documents(texts, metadata)

    # Save for future use
    store.save(store_path)
    print(f"Saved store to {store_path}")
else:
    # A saved store exists on disk and no refresh was requested: just load it.
    print(f"Loading existing store from {store_path}")
    store = BM25DocumentStore.load(store_path)

# Quick sanity summary of whatever store we ended up with.
stats = store.get_stats()
num_documents = stats['num_documents']
ticker_list = ', '.join(stats['tickers'])
print(f"Documents: {num_documents}")
print(f"Tickers: {ticker_list}")

Hierarchical structure of the `BM25DocumentStore` object, followed by the IDF values held by its BM25 index:

In [None]:
# Map the store's attribute hierarchy (depth capped at 3, include_dicts off).
print_obj_map(store, header="store", include_dicts=False, max_depth=3);

# Dump the term -> IDF mapping kept by the underlying BM25 index.
idf_by_term = store.bm25_index.idf
print(idf_by_term)

## Search with BM25

Search for documents using BM25 ranking, which excels at exact term matching (ticker symbols, keywords).

In [None]:
def print_search_result_summary(results, query=None):
    """Print a numbered, human-readable summary of search results.

    Args:
        results: Iterable of result dicts. Each must provide ``result["score"]``
            (a number, shown with two decimals) and
            ``result["document"]["metadata"]`` containing a ``"tickers"``
            iterable of strings and a ``"title"``.
        query: Optional query string; when given, it is echoed above the
            results.

    Prints ``No results found.`` when ``results`` is empty, so that an empty
    search (e.g. a ticker filter that matches nothing) is not mistaken for
    missing output.
    """
    if query is not None:
        print(f"\nQuery: '{query}'\n")

    rank = 0
    for rank, result in enumerate(results, start=1):
        metadata = result["document"]["metadata"]
        tickers = ", ".join(metadata["tickers"])
        print(f"{rank}. [{tickers}] {metadata['title']}")
        print(f"   Score: {result['score']:.2f}")

    # rank stays 0 only if the loop body never ran, i.e. there were no results.
    if rank == 0:
        print("No results found.")

In [None]:
# Free-text query, ranked against the indexed articles with k=5.
query = "AI chips revenue"
results = store.search(query, k=5)

print_search_result_summary(results, query)

Uncomment the cell below to see the hierarchical data structure of a single search result.

In [None]:
# print_obj_map(results[0], header="result", mode="value");

## Filter by Ticker

Search within a specific ticker symbol to narrow results.

In [None]:
# Re-run the same query, this time passing filter_ticker="TSLA" to the search.
results = store.search(query, k=5, filter_ticker="TSLA")
print_search_result_summary(results, query)

## Inspect Tokenization

In [None]:
print("Example document tokenized:")

# Tokens of the first document in the store's tokenized corpus.
example_tokens = store.tokenized_corpus[0]

max_line_len = 120

# Greedy wrapping: keep adding "<token>" pieces to the last line until the
# running width budget would be exceeded, then start a new line.
wrapped_lines = [[]]
used_width = 0
for tok in example_tokens:
    piece = f"<{tok}>"
    piece_width = len(piece) + 1  # +1 for the separating space

    # Never break on an empty line, so an over-long token still gets printed.
    if wrapped_lines[-1] and used_width + piece_width > max_line_len:
        wrapped_lines.append([])
        used_width = 0

    wrapped_lines[-1].append(piece)
    used_width += piece_width

for pieces in wrapped_lines:
    if pieces:
        print(" ".join(pieces))

print(f"\nLength: {len(example_tokens)} tokens")


In [None]:
idf_dict = store.bm25_index.idf

# Terms ordered by ascending IDF value (lowest-IDF terms first).
labels = sorted(idf_dict, key=idf_dict.get)

n_most_extreme = 10
# Only trim to the two extremes when they do not overlap; with a small
# vocabulary the head and tail slices would share terms, and the duplicated
# category labels would be drawn on top of each other in the bar chart.
if len(labels) > 2 * n_most_extreme:
    labels = labels[:n_most_extreme] + labels[-n_most_extreme:]
values = [idf_dict[k] for k in labels]

# Explicit figure/axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(labels, values, color='skyblue')
ax.set_xlabel('Terms')
ax.set_ylabel('Inverse Document Frequency')
ax.set_title('BM25 IDF values for most and least common terms')
ax.tick_params(axis='x', labelrotation=45)
