In [21]:
# Some minor setup for the notebook
import warnings
import nest_asyncio

# Ignore all warnings
warnings.filterwarnings("ignore")

# Allows for running async code in Jupyter notebooks
nest_asyncio.apply()

In [12]:
from dotenv import load_dotenv

load_dotenv(override=True)

True

In [7]:
from datasets import load_dataset

ms_marco = load_dataset("microsoft/ms_marco", "v2.1")

In [36]:
def mean_reciprocal_rank(retrieved_list, relevant_list):
    ranks = []
    for relevant in relevant_list:
        try:
            rank = retrieved_list.index(relevant) + 1
        except ValueError:
            rank = len(retrieved_list) + 1  # Not found
        ranks.append(rank)
    return 1.0 / min(ranks) if ranks else 0.0

# Use subset for demonstration
subset = ms_marco['train'].select(range(1000))

In [8]:
print(subset[2]['query'])

why did stalin want control of eastern europe


In [9]:
from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document



In [11]:
# Prepare the documents for BM25
bm25_docs = [Document(page_content=passage) for data in subset for passage in data['passages']['passage_text']]

# Initialize BM25 Retriever
bm25_retriever = BM25Retriever.from_documents(bm25_docs)

In [33]:
import os
from ragstack_colbert import CassandraDatabase, ColbertEmbeddingModel

keyspace = "benchmarksmarco1000parallel"
database_id = os.getenv("ASTRA_DATABASE_ID")
astra_token = os.getenv("ASTRA_TOKEN")

database = CassandraDatabase.from_astra(
    astra_token=astra_token,
    database_id=database_id,
    keyspace=keyspace
)

embedding_model = ColbertEmbeddingModel()



In [34]:
from ragstack_langchain.colbert import ColbertVectorStore as LangchainColbertVectorStore

lc_vector_store = LangchainColbertVectorStore(
    database=database,
    embedding_model=embedding_model,
)

In [15]:
all_texts = []
all_metadatas = []
i = 0
for row in subset:
    all_texts.extend(row['passages']['passage_text'])
    all_metadatas.extend([{'row_id': i} for _ in row['passages']['is_selected']])
    i += 1


In [None]:
print("Number of texts:", len(all_texts))

In [None]:
lc_vector_store.add_texts(all_texts, metadatas=all_metadatas)

In [None]:
from tqdm import tqdm
colbert_mrrs = []
for row in tqdm(subset):
    query = row['query']
    n_results = len(row['passages']['is_selected'])
    raw_results = lc_vector_store.similarity_search(query, k=n_results)
    retrieved_list = [result.page_content for result in raw_results]
    relevant_list = row['passages']['passage_text']
    
    mrr = mean_reciprocal_rank(retrieved_list, relevant_list)
    colbert_mrrs.append(mrr)

 27%|██▋       | 266/1000 [02:57<09:18,  1.32it/s]

In [38]:
print(f"Mean Reciprocal Rank for ColBERT: {sum(colbert_mrrs) / len(colbert_mrrs):.4f}")

Mean Reciprocal Rank for ColBERT: 0.9825


In [40]:
# Now test BM25
bm25_mrrs = []
for row in subset:
    query = row['query']
    n_results = len(row['passages']['is_selected'])
    raw_results = bm25_retriever.invoke(query, k=n_results)
    retrieved_list = [result.page_content for result in raw_results]
    relevant_list = row['passages']['passage_text']
    
    mrr = mean_reciprocal_rank(retrieved_list, relevant_list)
    bm25_mrrs.append(mrr)

In [41]:
print(f"Mean Reciprocal Rank for BM25: {sum(bm25_mrrs) / len(bm25_mrrs):.4f}")

Mean Reciprocal Rank for BM25: 0.7442


In [43]:
# Plot the MRR distributions of the two models
import matplotlib.pyplot as plt

plt.hist(colbert_mrrs, bins=20, alpha=0.5, label='ColBERT')
plt.hist(bm25_mrrs, bins=20, alpha=0.5, label='BM25')
plt.legend(loc='upper right')
plt.xlabel('Mean Reciprocal Rank')
plt.ylabel('Frequency')
plt.show()

NameError: name 'colbert_mrrs' is not defined