In [1]:
import os
import sys
sys.path.insert(0, '../')

from colbert.infra import Run, RunConfig, ColBERTConfig
from colbert.data import Queries, Collection
from colbert import Indexer, Searcher

In [2]:
# Directory layout of the data on disk: <dataroot>/<dataset>/<datasplit>/
dataroot = 'downloads/lotte'
dataset = 'lifestyle'
datasplit = 'dev'
split_dir = os.path.join(dataroot, dataset, datasplit)

# Load the search queries and the passage collection from their TSV files.
# The path strings are passed straight in, so `queries` and `collection`
# only ever refer to the loaded objects.
queries = Queries(path=os.path.join(split_dir, 'questions.search.tsv'))
collection = Collection(path=os.path.join(split_dir, 'collection.tsv'))

# Final expression of the cell: shown as the cell's output.
f'Loaded {len(queries)} queries and {len(collection):,} passages'

[Feb 23, 04:43:19] #> Loading the queries from downloads/lotte/lifestyle/dev/questions.search.tsv ...
[Feb 23, 04:43:19] #> Got 417 queries. All QIDs are unique.

[Feb 23, 04:43:19] #> Loading collection...
0M 1M 2M 


'Loaded 417 queries and 2,360,655 passages'

In [7]:
nbits = 2          # number of bits used to encode each dimension
doc_maxlen = 300   # passages are truncated at this many tokens

checkpoint = 'downloads/colbertv2.0'

# Build the index name from the dataset, split and bit-width chosen above so
# that it cannot drift out of sync with them. With the current values this
# evaluates to 'lifestyle.dev.2bits'.
index_name = f'{dataset}.{datasplit}.{nbits}bits'

In [8]:
# The index is referred to by its relative name (not a full path), so the
# RunConfig must carry the same experiment value that was used for indexing.
experiment = 'notebook'
with Run().context(RunConfig(experiment=experiment)):
    searcher = Searcher(index=index_name)


# To customize the search latency--quality tradeoff, you can also pass a
# config=ColBERTConfig(ncells=.., centroid_score_threshold=.., ndocs=..) argument.
# The default settings for k <= 10 (1, 0.5, 256) give the fastest search,
# but you can get a more extensive search by using larger values of k or by
# manually specifying more conservative ColBERTConfig settings (e.g. (4, 0.4, 4096)).

[Feb 23, 04:47:30] #> Loading collection...
0M 1M 2M 
[Feb 23, 04:47:38] #> Loading codec...
[Feb 23, 04:47:38] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Feb 23, 04:47:39] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...
[Feb 23, 04:47:39] #> Loading IVF...
[Feb 23, 04:47:40] #> Loading doclens...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:00<00:00, 1322.00it/s]

[Feb 23, 04:47:40] #> Loading codes and residuals...



100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 95/95 [00:06<00:00, 15.22it/s]


In [9]:
# Free-text query to search for. Any string can be supplied here; an entry
# from the loaded query set can be used instead, e.g. queries[37].
# NOTE(review): "recepit" looks like a typo for "recipe"; it is left as-is
# because the output recorded below was produced with this exact string.
query = "Chinese cooking recepit"

print(f"#> {query}")

# Find the top-k passages for this query.
k = 10
results = searcher.search(query, k=k)

# Print out the rank, score and text of each retrieved passage.
for passage_id, passage_rank, passage_score in zip(*results):
    print(f"\t [{passage_rank}] \t\t {passage_score:.1f} \t\t {searcher.collection[passage_id]}")

#> Chinese cooking recepit

#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==
#> Input: . Chinese cooking recepit, 		 True, 		 None
#> Output IDs: torch.Size([32]), tensor([  101,     1,  2822,  8434, 28667, 13699,  4183,   102,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103,   103,   103,   103,   103,   103,   103,   103,   103,
          103,   103])
#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0])

	 [1] 		 18.6 		 Chinese Cooking. Cookbook.
	 [2] 		 18.6 		 Chinese Cooking. .
	 [3] 		 18.5 		 Dim Sum: Dumplings, Parcels and Other Delectable Chinese Snacks in 25 Authentic Recipes. Dim sum is a traditional style of eating, where bite-sized tidbits are served for shared dining. This book makes authentic Chinese dim sum accessible to the home cook. It opens with a practical introduction to the cuisin