# 1. Setup and helper functions

In [8]:
import os
import time
import json
import pyterrier as pt
from time import time

# Root directory under which every index variant gets its own subdirectory.
INDEX_BASE_PATH = "./indexes"
# One index directory per configuration passed to generate_index().
INDEX_PATH_NO_PREPROCESSING = f"{INDEX_BASE_PATH}/no_preprocessing"
INDEX_PATH_DEFAULT = f"{INDEX_BASE_PATH}/default"
INDEX_PATH_DEFAULT_POSITIONS = f"{INDEX_BASE_PATH}/default_positions"
INDEX_PATH_STOPWORDS = f"{INDEX_BASE_PATH}/stopwords"
INDEX_PATH_STEMMING = f"{INDEX_BASE_PATH}/stemming"

# JSON-lines corpus file that generate_index() feeds to get_corpus().
CORPUS_PATH = "./data/corpus.jsonl"
# Value passed as the `threads` argument of pt.IterDictIndexer.
THREADS = 6

In [2]:
# Initialise PyTerrier only if it has not been started yet, so re-running this cell is harmless.
if not pt.started():
    pt.init()

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.


In [3]:
def get_corpus(path: str):
    """
    Lazily loads the corpus from a JSON-lines file.

    Every non-blank line is parsed as one JSON object, and its ``_id`` key is
    renamed to ``docno`` before the document is yielded.

    :param path: Path to the corpus (one JSON document per line).
    :return: Generator of document dicts, one per non-blank line.
    :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
    :raises KeyError: If a document has no ``_id`` key.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. empty lines at the end of the file)
            # instead of failing inside json.loads.
            if not line.strip():
                continue
            line_dict = json.loads(line)
            line_dict['docno'] = line_dict.pop('_id')
            yield line_dict

PyTerrier indexing runs with stemming and stopwords removal by default (“Stopwords,PorterStemmer”).
Modifying the preprocessing step is done by calling `.setProperty()` on the `IterDictIndexer` object.
The cells below will generate 4 different indices with different preprocessing steps.

In [4]:
def generate_index(index_path, termpipelines=None, overwrite=False, blocks=False):
    """
    Builds an index at ``index_path``, or loads the one that is already there.

    :param index_path: Directory in which the index lives.
    :param termpipelines: Value for the indexer's "termpipelines" property
        (e.g. "Stopwords,PorterStemmer", or "" for no preprocessing).
        ``None`` leaves the indexer's own setting untouched.
    :param overwrite: If True, rebuild the index even when one already exists
        at ``index_path``; also forwarded to ``pt.IterDictIndexer``.
    :param blocks: Forwarded to ``pt.IterDictIndexer`` as ``blocks``.
    :return: The index opened through ``pt.IndexFactory.of``.
    """
    properties_path = index_path + '/data.properties'
    # Rebuild when explicitly requested; otherwise reuse an existing index so
    # the indexing step only runs when no index is present.
    if overwrite or not os.path.exists(properties_path):
        indexer = pt.IterDictIndexer(
            index_path,
            overwrite=overwrite,
            meta=["docno", "title", "text"],
            meta_lengths=[20, 256, 4096],
            threads=THREADS,
            blocks=blocks,
        )
        if termpipelines is not None:
            indexer.setProperty("termpipelines", termpipelines)

        index_ref = indexer.index(get_corpus(CORPUS_PATH), fields=["title", "text"])
    else:
        index_ref = pt.IndexRef.of(properties_path)
    return pt.IndexFactory.of(index_ref)

In [5]:
def get_index(index_path: str):
    """
    Opens an index through ``pt.IndexFactory``.

    :param index_path: Location handed to ``pt.IndexFactory.of``.
    :return: Whatever ``pt.IndexFactory.of`` returns for that location.
    """
    return pt.IndexFactory.of(index_path)

# 2. Timing the indexing process

Default indexing ("Stopwords, PorterStemmer"), with positions (blocks)

Default indexing ("Stopwords, PorterStemmer")

In [13]:
# Time the call. With the default overwrite=False, generate_index() loads an index that
# already exists at the path instead of building it, so a near-zero time means nothing was built.
# `time` is the function brought in by `from time import time`.
t = time()
index = generate_index(INDEX_PATH_DEFAULT, termpipelines="Stopwords,PorterStemmer")
# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(time() - t, 60)
print(f"Indexing with both stemming and stopword removal took {m:.02f} minutes and {s:.02f} seconds")

Indexing with both stemming and stopword removal took 0.00 minutes and 0.00 seconds


No preprocessing pipeline

In [10]:
# Time the call with an empty "termpipelines" value (no preprocessing pipeline).
# generate_index() loads an already existing index instead of rebuilding it.
t = time()
index = generate_index(INDEX_PATH_NO_PREPROCESSING, termpipelines="")
# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(time() - t, 60)
print(f"Indexing with no preprocessing took {m:.02f} minutes and {s:.02f} seconds")

Indexing with no preprocessing took 0.00 minutes and 0.54 seconds


Only remove stopwords

In [11]:
# Time the call with "termpipelines" set to "Stopwords" only.
# generate_index() loads an already existing index instead of rebuilding it.
t = time()
index = generate_index(INDEX_PATH_STOPWORDS, termpipelines="Stopwords")
# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(time() - t, 60)
print(f"Indexing with only stopwords took {m:.02f} minutes and {s:.02f} seconds")

Indexing with only stopwords took 0.00 minutes and 0.51 seconds


Only do stemming

In [12]:
# Time the call with "termpipelines" set to "PorterStemmer" only.
# generate_index() loads an already existing index instead of rebuilding it.
t = time()
index = generate_index(INDEX_PATH_STEMMING, termpipelines="PorterStemmer")
# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(time() - t, 60)
print(f"Indexing with only PorterStemmer took {m:.02f} minutes and {s:.02f} seconds")

Indexing with only PorterStemmer took 0.00 minutes and 0.54 seconds


## 2.1 With positions (blocks)

In [None]:
# Default indexing ("Stopwords, PorterStemmer"), with positions (blocks)
# blocks=True is forwarded by generate_index() to pt.IterDictIndexer.
t = time()
index = generate_index(INDEX_PATH_DEFAULT_POSITIONS, termpipelines="Stopwords,PorterStemmer", blocks=True)
# Split the elapsed seconds into whole minutes and remaining seconds.
m, s = divmod(time() - t, 60)
print(f"Indexing with blocks/positions and both stemming and stopword removal took {m:.02f} minutes and {s:.02f} seconds")

# 3. Looking into index statistics

Default pipeline ("Stopwords, PorterStemmer")

In [9]:
# Open the default index and print its collection statistics.
index = get_index(INDEX_PATH_DEFAULT)
print(index.getCollectionStatistics().toString())
# Size of the data.inverted.bf file: bytes divided by 1,000,000.
print("Default index size: ", os.path.getsize(INDEX_PATH_DEFAULT + '/data.inverted.bf')/1000000, "mb")

Number of documents: 2453
Number of terms: 30772
Number of postings: 292376
Number of fields: 2
Number of tokens: 485085
Field names: [title, text]
Positions:   false

Default index size:  0.455468 mb


In [28]:
# Only remove stopwords
# Open the stopwords-only index and print its collection statistics.
index = get_index(INDEX_PATH_STOPWORDS)
print(index.getCollectionStatistics().toString())
# Size of the data.inverted.bf file: bytes divided by 1,000,000.
print("Stopworded index size: ", os.path.getsize(INDEX_PATH_STOPWORDS + '/data.inverted.bf')/1000000, "mb")

Number of documents: 528155
Number of terms: 621458
Number of postings: 90480384
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   false

Stopworded index size:  167.401919 mb


In [27]:
# Only PorterStemmer
# Open the stemming-only index and print its collection statistics.
index = get_index(INDEX_PATH_STEMMING)
print(index.getCollectionStatistics().toString())
# Size of the data.inverted.bf file: bytes divided by 1,000,000.
print("Stemmed index size: ", os.path.getsize(INDEX_PATH_STEMMING + '/data.inverted.bf')/1000000, "mb")

Number of documents: 528155
Number of terms: 520725
Number of postings: 110057603
Number of fields: 2
Number of tokens: 257028522
Field names: [title, text]
Positions:   false

Stemmed index size:  189.370016 mb


In [26]:
# No preprocessing pipeline
# Open the index built without a term pipeline and print its collection statistics.
index = get_index(INDEX_PATH_NO_PREPROCESSING)
print(index.getCollectionStatistics().toString())
# Size of the data.inverted.bf file: bytes divided by 1,000,000.
print("No preprocessing index size: ", os.path.getsize(INDEX_PATH_NO_PREPROCESSING + '/data.inverted.bf')/1000000, "mb")

Number of documents: 528155
Number of terms: 622029
Number of postings: 117766368
Number of fields: 2
Number of tokens: 257028522
Field names: [title, text]
Positions:   false

No preprocessing index size:  213.383976 mb


In [42]:
# Default pipeline ("Stopwords, PorterStemmer") with positions/blocks
# Open the index built with blocks=True and print its collection statistics.
index = get_index(INDEX_PATH_DEFAULT_POSITIONS)
print(index.getCollectionStatistics().toString())
# Size of the data.inverted.bf file: bytes divided by 1,000,000.
print("Default index with positions size: ", os.path.getsize(INDEX_PATH_DEFAULT_POSITIONS + '/data.inverted.bf')/1000000, "mb")

Number of documents: 528155
Number of terms: 520520
Number of postings: 83716130
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true

Default index with positions size:  394.240574 mb
