# Getting started with whoosh

In [1]:
import csv
import os

import whoosh
import whoosh.index
from whoosh import fields, scoring
from whoosh.qparser import QueryParser

We first define a schema for the data. In this case, each document has a name (which is its ID) and content.

In [2]:
# `name` is a stored ID field, so it is what search hits expose;
# `content` is a TEXT field declared without stored=True.
schema = fields.Schema(
    name=fields.ID(stored=True),
    content=fields.TEXT,
)

Next, we create an index according to the schema in a new directory.

In [3]:
# Make sure the index directory exists; exist_ok=True keeps re-runs from failing.
os.makedirs('index', exist_ok=True)
# NOTE(review): per the Whoosh docs, create_in() starts a new index and clears
# any index already present in the directory — confirm before reusing a path.
idx = whoosh.index.create_in('index', schema)
# The writer is left open here; it is committed after documents are added.
writer = idx.writer()

Now we can add some sample documents to our index. Note how the keyword arguments correspond to the schema.

In [4]:
# Keyword names mirror the schema fields (`name`, `content`).
writer.add_document(
    name='d1',
    content="This is the first document we've added!",
)
writer.add_document(
    name='d2',
    content='The second one is even more interesting!',
)
# Commit the writer so the added documents are saved to the index.
writer.commit()

We can now query our index and print the results.

In [5]:
# Parse the query string against the `content` field and print every hit.
query = 'first'
with idx.searcher() as searcher:
    # Use the QueryParser name imported at the top of the notebook, as the
    # other cells do, instead of reaching through the `whoosh` package.
    q = QueryParser('content', idx.schema).parse(query)
    results = searcher.search(q)
    for r in results:
        print(r)

<Hit {'name': 'd1'}>


# Custom ranking functions

We can change the ranking function by passing it to the searcher:

In [6]:
# Same query as before, but hits are scored with a TF_IDF weighting instance
# handed to the searcher.
with idx.searcher(weighting=scoring.TF_IDF()) as searcher:
    # `query` is the notebook global set in the previous query cell.
    q = QueryParser('content', idx.schema).parse(query)
    results = searcher.search(q)
    for r in results:
        print(r)

<Hit {'name': 'd1'}>


You can define a custom ranking function too. For example, `pos_score_fn` computes a score for a given document using only one field. Here the score is based on the position of the first occurrence of the query term: the earlier the term appears, the higher the score.

In [7]:
def pos_score_fn(searcher, fieldname, text, matcher):
    """Score a match by the position of the term's first occurrence.

    Only ``matcher`` is used; the other parameters are accepted but ignored.

    Returns:
        ``1 / (p + 1)``, where ``p`` is the first entry of the matcher's
        ``'positions'`` value, so smaller positions give larger scores.
    """
    first_position = matcher.value_as('positions')[0]
    return 1 / (first_position + 1)

# Wrap the scoring function so it can be passed as the searcher's weighting.
pos_weighting = scoring.FunctionWeighting(pos_score_fn)
with idx.searcher(weighting=pos_weighting) as searcher:
    # The query text is given literally here rather than via `query`.
    q = QueryParser('content', idx.schema).parse('first')
    results = searcher.search(q)
    # Print each hit found with the custom weighting.
    for r in results:
        print(r)

<Hit {'name': 'd1'}>


# Indexing a collection and computing metrics

We now index a collection of sample documents. All documents are stored in the file `clueweb_clean_docs_sample.tsv`, where the first column contains the name and the third column contains the content of a document. First, we read the file.

In [8]:
def read_file(file_path, delimiter='\t'):
    """Load ``(name, content)`` pairs from a three-column delimited file.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        delimiter: Column separator; defaults to a tab.

    Returns:
        A list of ``(name, content)`` tuples, one per row. The middle column
        is discarded and newlines inside the content are replaced by spaces.

    Raises:
        ValueError: If a row does not have exactly three columns.
    """
    with open(file_path, encoding='utf-8') as handle:
        rows = csv.reader(
            handle,
            delimiter=delimiter,
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        )
        # Flatten multi-line content so each document is a single line.
        return [
            (name, content.replace('\n', ' '))
            for name, _, content in rows
        ]

# Load the sample collection as (name, content) pairs and report its size.
doc_list = read_file('clueweb_clean_docs_sample.tsv')
print('number of docs: {}'.format(len(doc_list)))

number of docs: 58


Now we can build the index using the same schema as before.

In [9]:
# Build an index for the sample collection in its own directory, reusing the
# schema defined earlier. `idx` is rebound to this new index.
os.makedirs('cw_index', exist_ok=True)
idx = whoosh.index.create_in('cw_index', schema)

# Use the writer as a context manager: it commits when the block completes and
# cancels (releasing the index write lock) if adding a document raises, rather
# than leaving an uncommitted writer open.
with idx.writer() as writer:
    for name, content in doc_list:
        writer.add_document(name=name, content=content)

We can also define a function that returns all results for a given query, using some ranking function.

In [10]:
def search_index(idx, query, ranking_fn):
    """Return the names of all documents in ``idx`` that match ``query``.

    Args:
        idx: Index to search; its schema is used to parse the query.
        query: Query string, parsed against the ``content`` field.
        ranking_fn: Weighting passed to the searcher to score the hits.

    Returns:
        A list with the stored ``name`` of every hit, in the order the
        searcher returns them.
    """
    with idx.searcher(weighting=ranking_fn) as searcher:
        parsed = QueryParser('content', idx.schema).parse(query)
        # limit=None requests every match instead of a truncated result page.
        hits = searcher.search(parsed, limit=None)
        # Collect the names while the searcher is still open.
        return [hit['name'] for hit in hits]

We search for the query `403b`.

In [11]:
query = '403b'
# Pass a weighting instance, matching how TF_IDF() is handed to the searcher
# earlier in this notebook, rather than the bare class.
result_list_bm25 = search_index(idx, query, ranking_fn=scoring.BM25F())
print('found {} results'.format(len(result_list_bm25)))

found 30 results


In order to evaluate a scoring function, we can use some metrics like precision, recall etc. Additionally, we need the query relevance scores, i.e. a list of integers that indicate how relevant a document is for a query. For our query `403b` we can read these relevance scores from the file `403b-qrels.csv`.

In [12]:
def read_qrels(file_path, delimiter=' '):
    """Read query relevance scores from a two-column delimited file.

    Args:
        file_path: Path of the UTF-8 encoded file to read.
        delimiter: Column separator; defaults to a single space.

    Returns:
        A dict mapping each document name to its relevance score as an int.
        If a name appears more than once, the last row wins.

    Raises:
        ValueError: If a row does not have exactly two columns or the score
            is not a valid integer.
    """
    with open(file_path, encoding='utf-8') as handle:
        rows = csv.reader(
            handle,
            delimiter=delimiter,
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL,
        )
        return {name: int(score) for name, score in rows}

# Relevance scores for the query `403b`, keyed by document name.
qrels = read_qrels('403b-qrels.csv')

Now we can implement our metrics and evaluate the ranking function using the results and the query relevance scores.

In [13]:
def precision(doc_list, qrels, k):
    """Compute precision@k for a ranked list of document names.

    A document counts as relevant when its relevance score in ``qrels`` is
    greater than 0; names missing from ``qrels`` are treated as not relevant.

    Args:
        doc_list: Document names in ranked order.
        qrels: Mapping from document name to integer relevance score.
        k: Cut-off rank; must be positive.

    Returns:
        The number of relevant documents among the first ``k`` entries,
        divided by ``k``. If fewer than ``k`` documents are given, the missing
        ranks count as non-relevant.

    Raises:
        ValueError: If ``k`` is not positive. A negative ``k`` would otherwise
            slice from the end of the list and yield a negative result.
    """
    if k <= 0:
        raise ValueError('k must be a positive integer, got {!r}'.format(k))
    relevant = sum(1 for name in doc_list[:k] if qrels.get(name, 0) > 0)
    return relevant / k

# Precision at rank 15 for the BM25 results of query `403b`.
precision(result_list_bm25, qrels, 15)

0.6666666666666666

We can also compare the BM25 ranking with the tf-idf ranking.

In [14]:
# Pass a weighting instance, matching how TF_IDF() is handed to the searcher
# earlier in this notebook, rather than the bare class.
result_list_tfidf = search_index(idx, query, ranking_fn=scoring.TF_IDF())
# Precision at rank 15 for the TF-IDF results, for comparison with BM25.
precision(result_list_tfidf, qrels, 15)

0.5333333333333333