In [1]:
import json
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.config.constants import Split
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
from dexter.retriever.dense.Contriever import Contriever
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
from dexter.llms.llm_engine_orchestrator import LLMEngineOrchestrator
import os

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# test_data_loading.py
"""
Simple script to test if data loading works correctly
"""

from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.config.constants import Split

print("=" * 80)
print("Testing Data Loading")
print("=" * 80)

print("\n1. Loading dataset...")
print("   - Dataset: musiqueqa")
print("   - Corpus: wiki-musiqueqa-corpus")
print("   - Split: DEV")

try:
    loader = RetrieverDataset(
        "wikimultihopqa",
        "wiki-musiqueqa-corpus",
        "evaluation/config.ini",
        Split.DEV,
        tokenizer=None  # None = faster loading, just get raw data
    )
    
    print("   ✓ Loader initialized successfully!")
    
    print("\n2. Extracting queries, qrels, and corpus...")
    queries, qrels, corpus = loader.qrels()
    
    print(f"   ✓ Data loaded successfully!")
    print(f"\n3. Data Statistics:")
    print(f"   - Total queries: {len(queries)}")
    print(f"   - Total qrels: {len(qrels)}")
    print(f"   - Corpus size: {len(corpus)} documents")
    
    print(f"\n4. First 1200 queries (your test set):")
    test_queries = queries[:1200]
    print(f"   - Test set size: {len(test_queries)} queries")
    
    print(f"\n5. Sample query (first one):")
    first_query = queries[0]
    print(f"   - ID: {first_query.id()}")
    print(f"   - Question: {first_query.text()[:100]}...")
    
    print(f"\n6. Sample corpus document (first one):")
    first_doc = corpus[0]
    print(f"   - ID: {first_doc.id()}")
    print(f"   - Title: {first_doc.title()}")
    print(f"   - Text preview: {first_doc.text()[:100]}...")
    
    print("\n" + "=" * 80)
    print("✓ SUCCESS! Your data is loaded correctly.")
    print("=" * 80)
    print("\nYou're ready to proceed to Step 3: Running retrieval experiments!")
    
except FileNotFoundError as e:
    print(f"\n✗ ERROR: File not found - {e}")
    print("\nCheck that:")
    print("  1. config.ini line 6 is: musiqueqa = data")
    print("  2. config.ini line 9 is: wiki-musiqueqa-corpus = data/wiki_musique_corpus.json")
    print("  3. Files exist: data/dev.json and data/wiki_musique_corpus.json")
    
except Exception as e:
    print(f"\n✗ ERROR: {type(e).__name__}: {e}")
    import traceback
    traceback.print_exc()

Testing Data Loading

1. Loading dataset...
   - Dataset: musiqueqa
   - Corpus: wiki-musiqueqa-corpus
   - Split: DEV


Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 775038.02it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:01<00:00, 502835.41it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [02:05<00:00,  9.59it/s]


   ✓ Loader initialized successfully!

2. Extracting queries, qrels, and corpus...
   ✓ Data loaded successfully!

3. Data Statistics:
   - Total queries: 1200
   - Total qrels: 1200
   - Corpus size: 563424 documents

4. First 1200 queries (your test set):
   - Test set size: 1200 queries

5. Sample query (first one):
   - ID: 8813f87c0bdd11eba7f7acde48001122
   - Question: Who is the mother of the director of film Polish-Russian War (Film)?...

6. Sample corpus document (first one):
   - ID: 0
   - Title: Ishberda
   - Text preview: Ishberda is a rural locality( a selo) and the administrative center of Ishberdinsky Selsoviet, Bayma...

✓ SUCCESS! Your data is loaded correctly.

You're ready to proceed to Step 3: Running retrieval experiments!


In [3]:
from dexter.data.loaders.RetrieverDataset import RetrieverDataset
from dexter.config.constants import Split
from dexter.data.datastructures.hyperparameters.dpr import DenseHyperParams
from dexter.retriever.dense.Contriever import Contriever
from dexter.utils.metrics.SimilarityMatch import CosineSimilarity
from dexter.utils.metrics.retrieval.RetrievalMetrics import RetrievalMetrics
import json

print("Loading data...")
loader = RetrieverDataset("wikimultihopqa", "wiki-musiqueqa-corpus", 
                          "evaluation/config.ini", Split.DEV, tokenizer=None)
queries, qrels, corpus = loader.qrels()

print(f"Loaded {len(queries)} queries, {len(corpus)} documents")


Loading data...


Loading passages: 100%|██████████| 563424/563424 [00:00<00:00, 1199990.02it/s]
Transforming passage dataset: 100%|██████████| 563424/563424 [00:01<00:00, 519926.40it/s]


Harley-Davidson Harley-Davidson
KeysView(<Section: Data-Path>)
12576


100%|██████████| 1200/1200 [02:20<00:00,  8.52it/s]


Loaded 1200 queries, 563424 documents


In [4]:
import torch
print(f"MPS available: {torch.backends.mps.is_available()}")
print(f"MPS built: {torch.backends.mps.is_built()}")

print(torch.__version__)

MPS available: True
MPS built: True
2.9.1


In [5]:
import torch
# Initialize Contriever 
config = DenseHyperParams(
    query_encoder_path="facebook/contriever",
    document_encoder_path="facebook/contriever",
    batch_size=64,  # Increased from 32 for faster processing
    show_progress_bar=True
)
retriever = Contriever(config)
similarity = CosineSimilarity()

# CHANGE 1: Use smaller corpus for faster testing (or full corpus if you have time)
# corpus_subset = corpus[:100]  
corpus_subset = corpus  # Uncomment this line to use full corpus later

# Encode once, reuse for all k values
print("Encoding queries once...")
query_embeddings = retriever.encode_queries(queries, batch_size=64)

print(f"Encoding corpus once ({len(corpus_subset)} documents)...")
corpus_embeddings = retriever.encode_corpus(corpus_subset)

print("Query embeddings shape:", query_embeddings.shape)
print("Corpus embeddings shape:", corpus_embeddings.shape)

# Now retrieve for different k values using pre-computed embeddings
results = {}
for k in [1, 3, 5]:
    print(f"\n{'='*80}")
    print(f"Computing top-{k} from embeddings...")
    print(f"{'='*80}")
    
    # Compute similarity scores
    cos_scores = similarity.evaluate(query_embeddings, corpus_embeddings)
    print(f"Similarity scores shape: {cos_scores.shape}")
    
    # Get top-k indices for each query
    response = {}
    for query_idx in range(len(queries)):
        query_id = queries[query_idx].id()
        # Get top-k document indices and scores
        top_k_scores, top_k_indices = torch.topk(cos_scores[query_idx], k)
        
        # Store results
        response[query_id] = {
            str(corpus_subset[idx.item()].id()): float(score.item())  # Use corpus ID, not index
            for idx, score in zip(top_k_indices, top_k_scores)
        }
    
    # Evaluate
    metrics = RetrievalMetrics(k_values=[k])
    eval_results = metrics.evaluate_retrieval(qrels=qrels, results=response)
    
    results[k] = {'retrieval': response, 'metrics': eval_results}
    print(f"\nResults for k={k}:")
    print(eval_results)
    
    # Save retrieval results
    with open(f'retrieval_k{k}.json', 'w') as f:
        json.dump(response, f)

print(f"\n{'='*80}")
print("DONE! Retrieval results saved.")
print(f"{'='*80}")

Encoding queries once...
token_emb torch.Size([1200, 35, 768])
sentence_emb torch.Size([1200, 768])
Encoding corpus once (563424 documents)...


  0%|          | 0/563424 [00:00<?, ?it/s]

Starting encoding of contexts....


563456it [19:07:55,  8.18it/s]                                


context_embeddings torch.Size([563424, 768])
Query embeddings shape: torch.Size([1200, 768])
Corpus embeddings shape: torch.Size([563424, 768])

Computing top-1 from embeddings...
Similarity scores shape: torch.Size([1200, 563424])

Results for k=1:
({'NDCG@1': 0.58083}, {'MAP@1': 0.0584}, {'Recall@1': 0.0584}, {'P@1': 0.58083})

Computing top-3 from embeddings...
Similarity scores shape: torch.Size([1200, 563424])

Results for k=3:
({'NDCG@3': 0.34511}, {'MAP@3': 0.07699}, {'Recall@3': 0.09721}, {'P@3': 0.32222})

Computing top-5 from embeddings...
Similarity scores shape: torch.Size([1200, 563424])

Results for k=5:
({'NDCG@5': 0.28171}, {'MAP@5': 0.08854}, {'Recall@5': 0.12153}, {'P@5': 0.24167})

DONE! Retrieval results saved.
