# Indexing

In [32]:
import os
import time
import json
import pyterrier as pt
from time import time

# Root directory under which every index of this notebook is stored.
INDEX_BASE_PATH = "./indexes"

# One sub-directory per preprocessing configuration.
INDEX_PATH_NO_PREPROCESSING = INDEX_BASE_PATH + "/no_preprocessing"
INDEX_PATH_DEFAULT = INDEX_BASE_PATH + "/default"
INDEX_PATH_DEFAULT_POSITIONS = INDEX_BASE_PATH + "/default_positions"
INDEX_PATH_STOPWORDS = INDEX_BASE_PATH + "/stopwords"
INDEX_PATH_STEMMING = INDEX_BASE_PATH + "/stemming"

# Corpus file (read line by line as JSON) and the thread count handed to the indexer.
CORPUS_PATH = "./data/corpus.jsonl"
THREADS = 6

In [33]:
# Initialise PyTerrier only when it has not been started yet, so that
# re-running this cell in the same kernel does not call init() a second time.
if not pt.started():
    pt.init()

In [34]:
def get_corpus(path: str):
    """
    Lazily loads the corpus from the given path.

    The file is read as JSON Lines: every non-blank line is parsed as one JSON
    object. The ``_id`` key of each object is renamed to ``docno``; all other
    keys are passed through unchanged.

    :param path: Path to the corpus file.
    :return: Generator of documents (one dict per non-blank line of the file).
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    :raises KeyError: If a document has no ``_id`` key.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Blank lines (e.g. extra newlines at the end of the file) carry no
            # document and would make json.loads fail, so skip them.
            if not line.strip():
                continue
            line_dict = json.loads(line)
            line_dict['docno'] = line_dict.pop('_id')
            yield line_dict

By default, PyTerrier indexing removes stopwords and applies stemming (“Stopwords,PorterStemmer”).
The preprocessing pipeline can be changed by calling `.setProperty()` on the `IterDictIndexer` object.
The cells below build five indices: the default pipeline with and without positions, no preprocessing, stopword removal only, and stemming only.

In [35]:
def generate_index(index_path, termpipelines=None, overwrite=False, blocks=False):
    """
    Indexes the corpus at ``CORPUS_PATH`` into ``index_path`` and opens the result.

    :param index_path: Location the index is written to and later loaded from.
    :param termpipelines: Value for the indexer's ``termpipelines`` property,
        e.g. ``"Stopwords,PorterStemmer"``. ``None`` leaves the property
        untouched; an empty string is passed through as-is.
    :param overwrite: Forwarded to ``pt.IterDictIndexer``.
    :param blocks: Forwarded to ``pt.IterDictIndexer``.
    :return: The index obtained from ``pt.IndexFactory.of(index_path)``.
    """
    indexer_options = dict(
        overwrite=overwrite,
        meta=["docno", "title", "text"],
        meta_lengths=[20, 256, 4096],
        threads=THREADS,
        blocks=blocks,
    )
    indexer = pt.IterDictIndexer(index_path, **indexer_options)

    # Compare against None rather than truthiness: "" is a value callers pass
    # on purpose and must still reach setProperty().
    if termpipelines is not None:
        indexer.setProperty("termpipelines", termpipelines)

    documents = get_corpus(CORPUS_PATH)
    indexer.index(documents, fields=["title", "text"])
    return pt.IndexFactory.of(index_path)

# Timing the indexing process

Default indexing ("Stopwords, PorterStemmer"), with positions (blocks)

In [None]:
# Wall-clock timing of the index build: default pipeline, with positions (blocks).
t = time()
index = generate_index(INDEX_PATH_DEFAULT_POSITIONS, termpipelines="Stopwords,PorterStemmer", blocks=True)
# divmod splits the elapsed seconds into whole minutes and leftover seconds,
# so the minutes are printed without decimals.
m, s = divmod(time() - t, 60)
print(f"Indexing with blocks/positions and both stemming and stopword removal took {m:.0f} minutes and {s:.02f} seconds")

Default indexing ("Stopwords, PorterStemmer")

In [None]:
# Wall-clock timing of the index build: default pipeline (stopword removal + stemming).
t = time()
index = generate_index(INDEX_PATH_DEFAULT, termpipelines="Stopwords,PorterStemmer")
# divmod splits the elapsed seconds into whole minutes and leftover seconds,
# so the minutes are printed without decimals.
m, s = divmod(time() - t, 60)
print(f"Indexing with both stemming and stopword removal took {m:.0f} minutes and {s:.02f} seconds")

No preprocessing pipeline

In [None]:
# Wall-clock timing of the index build: empty term pipeline (no preprocessing).
t = time()
index = generate_index(INDEX_PATH_NO_PREPROCESSING, termpipelines="")
# divmod splits the elapsed seconds into whole minutes and leftover seconds,
# so the minutes are printed without decimals.
m, s = divmod(time() - t, 60)
print(f"Indexing with no preprocessing took {m:.0f} minutes and {s:.02f} seconds")

Only remove stopwords

In [None]:
# Wall-clock timing of the index build: stopword removal only.
t = time()
index = generate_index(INDEX_PATH_STOPWORDS, termpipelines="Stopwords")
# divmod splits the elapsed seconds into whole minutes and leftover seconds,
# so the minutes are printed without decimals.
m, s = divmod(time() - t, 60)
print(f"Indexing with only stopwords took {m:.0f} minutes and {s:.02f} seconds")

Only do stemming

In [None]:
# Wall-clock timing of the index build: Porter stemming only.
t = time()
index = generate_index(INDEX_PATH_STEMMING, termpipelines="PorterStemmer")
# divmod splits the elapsed seconds into whole minutes and leftover seconds,
# so the minutes are printed without decimals.
m, s = divmod(time() - t, 60)
print(f"Indexing with only PorterStemmer took {m:.0f} minutes and {s:.02f} seconds")

# Looking into index statistics

In [44]:
def get_index(index_path: str):
    """
    Opens the index stored at the given path.
    :param index_path: Location of the index, passed unchanged to ``pt.IndexFactory.of``.
    :return: The object returned by ``pt.IndexFactory.of`` for that location.
    """
    return pt.IndexFactory.of(index_path)

Default pipeline ("Stopwords, PorterStemmer")

In [45]:
# Load the index built with the default pipeline and print its collection statistics.
# NOTE(review): the saved output of this cell reports 1 document and 0 fields,
# unlike the stopwords index (528155 documents, 2 fields) — possibly a stale
# or partial index; re-run to confirm.
index = get_index(INDEX_PATH_DEFAULT)
print(index.getCollectionStatistics().toString())


Number of documents: 1
Number of terms: 1
Number of postings: 1
Number of fields: 0
Number of tokens: 1
Field names: []
Positions:   false



Default pipeline ("Stopwords, PorterStemmer") with positions/blocks

In [None]:
# Load the default-pipeline index built with positions/blocks and print its collection statistics.
index = get_index(INDEX_PATH_DEFAULT_POSITIONS)
print(index.getCollectionStatistics().toString())

Only remove stopwords

In [46]:
# Load the stopwords-only index and print its collection statistics.
index = get_index(INDEX_PATH_STOPWORDS)
print(index.getCollectionStatistics().toString())

Number of documents: 528155
Number of terms: 621458
Number of postings: 90480384
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   false



Only PorterStemmer

In [None]:
# Load the stemming-only index and print its collection statistics.
index = get_index(INDEX_PATH_STEMMING)
print(index.getCollectionStatistics().toString())

No preprocessing pipeline

In [None]:
# Load the index built without any term pipeline and print its collection statistics.
index = get_index(INDEX_PATH_NO_PREPROCESSING)
print(index.getCollectionStatistics().toString())



Only remove stopwords

In [None]:
# Load the stopwords-only index and print its collection statistics.
# print() is used, as in the other statistics cells, so the multi-line report
# is shown as plain text instead of relying on the bare-expression cell output.
index = get_index(INDEX_PATH_STOPWORDS)
print(index.getCollectionStatistics().toString())
