# Ranking Models

## 1. Setup

In [30]:
import os
import time
import json
import pyterrier as pt
import pandas as pd
import numpy as np
from collections import defaultdict
from time import time

INDEX_BASE_PATH = "./indexes"
INDEX_PATH_NO_PREPROCESSING = f"{INDEX_BASE_PATH}/no_preprocessing"
INDEX_PATH_DEFAULT = f"{INDEX_BASE_PATH}/default"
INDEX_PATH_DEFAULT_POSITIONS = f"{INDEX_BASE_PATH}/default_positions"
INDEX_PATH_STOPWORDS = f"{INDEX_BASE_PATH}/stopwords"
INDEX_PATH_STEMMING = f"{INDEX_BASE_PATH}/stemming"

CORPUS_PATH = "./data/corpus.jsonl"
THREADS = 6

In [8]:
if not pt.started():
    pt.init()

In [28]:
def get_index(index_path: str):
    index = pt.IndexFactory.of(index_path)
    return index

In [32]:
# topics
query_df = pd.read_csv('data/train_query.csv', dtype=str)
#query_df = pd.read_csv('data/lab_topics.csv', dtype=str)

# Load qrels
qrels_df = pd.read_csv('data/train_qrel.csv', dtype=str).astype({'label': 'int32'})
#qrels_df = pd.read_csv('data/lab_qrels.csv', dtype=str).astype({'label': 'int32'})

## 2. Creating folds

In [11]:
# Create 3 folds
def create_folds(num_folds: int, df):
    df_size = len(df)
    fold_size = df_size // num_folds
    for n in range(num_folds):
        if n == num_folds:
            end = df_size
        else:
            end = (n + 1) * fold_size

        start = n * fold_size
        yield df.iloc[start:end]
        
# Finding optimal controls from a tuning run
def get_optimal_controls(tuning_run):
    control_values = defaultdict(list)
    for fold in tuning_run:
        for _, control_name, value in fold:
            control_values[control_name].append(value)
    return {control: np.mean(values) for control, values in control_values.items()}

## 3. Tuning Probabilistic Models

### 3.1  BM25

In [19]:
num_folds = 3
query_folds = list(create_folds(num_folds, query_df))

index = get_index(INDEX_PATH_DEFAULT)
bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"bm25.b": 0.3, "bm25.k_1": 0.5, "bm25.k_3": 0.75})
tuned_bm25, fold_controls = pt.KFoldGridSearch(
    bm25,
    {bm25: {
        "bm25.b": np.linspace(0.1, 1, 2),
        "bm25.k_1": np.linspace(0.3, 1, 2),
        "bm25.k_3": np.linspace(0.3, 1, 2)
    }},
    query_folds,
    qrels_df,
    "map",
    jobs=6,
    verbose=True
)

Fold 1


  warn("Cannot provide progress on parallel job")
PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

PyTerrier 0.8.1 has loaded Terrier 5.6 (built by craigmacdonald on 2021-09-17 13:27)

No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap configuration.
No etc/terrier.properties, using terrier.default.properties for bootstrap conf

Best map is 0.321467
Best setting is ['BR(BM25) bm25.b=1.0', 'BR(BM25) bm25.k_1=1.0', 'BR(BM25) bm25.k_3=0.3']
Fold 2
Best map is 0.372062
Best setting is ['BR(BM25) bm25.b=1.0', 'BR(BM25) bm25.k_1=1.0', 'BR(BM25) bm25.k_3=0.3']
Fold 3


  warn("Cannot provide progress on parallel job")
  warn("Cannot provide progress on parallel job")


Best map is 0.432605
Best setting is ['BR(BM25) bm25.b=1.0', 'BR(BM25) bm25.k_1=1.0', 'BR(BM25) bm25.k_3=0.3']


In [20]:
print("Optimal controls:", get_optimal_controls(fold_controls), "\n")
print(pt.Experiment([bm25, tuned_bm25], query_df, qrels_df, ["map", "ndcg"]))


Optimal controls: {'bm25.b': 1.0, 'bm25.k_1': 1.0, 'bm25.k_3': 0.3} 

                                                name       map      ndcg
0                                           BR(BM25)  0.559474  0.724525
1           qid  docid    docno  rank     score  ...  0.559474  0.724525


### 3.2 Tuning BM25F

In [18]:
num_folds = 3
query_folds = list(create_folds(num_folds, query_df))

index = get_index(INDEX_PATH_DEFAULT)
bm25f = pt.BatchRetrieve(index, wmodel="BM25F", controls={"c.0" : 0.75, 
                                                          "c.1" : 0.75, 
                                                          "w.0": 0.75, 
                                                          "w.1" : 0.75, 
                                                          "bm25f.k_1": 0.75, 
                                                          "bm25f.k_3": 0.75,
                                                         })
tuned_bm25f, fold_controls_bm25f = pt.KFoldGridSearch(
    bm25f,
    {bm25f: {
        "c.0": np.linspace(0, 1, 5),
        "c.1": np.linspace(0, 1, 5),
        "w.0": np.linspace(0, 1, 5),
        "w.1": np.linspace(0, 1, 5),
        "bm25f.k_1": np.linspace(0, 1, 5),
        "bm25f.k_3": np.linspace(0, 1, 5)
    }},
    query_folds,
    qrels_df,
    "map",
    jobs=6,
    verbose=True
)

Fold 1


  warn("Cannot provide progress on parallel job")

KeyboardInterrupt



In [None]:
print(pt.Experiment([bm25f, tuned_bm25f], query_df, qrels_df, ["map", "ndcg"]))

## 4. Tuning Language Models

In [25]:
num_folds = 3
query_folds = list(create_folds(num_folds, query_df))

index = get_index(INDEX_PATH_DEFAULT)
dir_lm = pt.BatchRetrieve(index, wmodel="DirichletLM", controls={"dirichletlm.mu" : 2500})
tuned_dir_lm, fold_controls_dir_lm = pt.KFoldGridSearch(
    dir_lm,
    {dir_lm: {
        "dirichletlm.mu": np.linspace(0, 10000, 5)
    }},
    query_folds,
    qrels_df,
    "map",
    jobs=6,
    verbose=True
)

Fold 1
Best map is 0.319564
Best setting is ['BR(DirichletLM) dirichletlm.mu=5000.0']
Fold 2


  warn("Cannot provide progress on parallel job")
  warn("Cannot provide progress on parallel job")


Best map is 0.370269
Best setting is ['BR(DirichletLM) dirichletlm.mu=2500.0']
Fold 3
Best map is 0.436811
Best setting is ['BR(DirichletLM) dirichletlm.mu=5000.0']


  warn("Cannot provide progress on parallel job")


In [26]:
print(pt.Experiment([bm25, tuned_bm25, dir_lm2, tuned_dir_lm], query_df, qrels_df, ["map", "ndcg"]))

Optimal controls: {'dirichletlm.mu': 4166.666666666667} 

                                                name       map      ndcg
0                                           BR(BM25)  0.559474  0.724525
1           qid  docid    docno  rank     score  ...  0.559474  0.724525
2                                    BR(DirichletLM)  0.562166  0.731454
3           qid  docid    docno  rank     score  ...  0.562166  0.731454


# 5. Comparing Models/Indices

In [33]:
for index_name in [INDEX_PATH_NO_PREPROCESSING,INDEX_PATH_DEFAULT,INDEX_PATH_DEFAULT_POSITIONS, 
                   INDEX_PATH_STOPWORDS,INDEX_PATH_STEMMING]:
    index = get_index(index_name)
    bm25 = pt.BatchRetrieve(index, wmodel="BM25", controls={"bm25.b": 0.7, "bm25.k_1": 0.75, "bm25.k_3": 0.75})
    dir_lm = pt.BatchRetrieve(index, wmodel="DirichletLM", controls={"dirichletlm.mu": 2500})
    print(f"\n\n{index_name}: {index.getCollectionStatistics().toString()}")
    print(pt.Experiment([bm25, dir_lm], query_df, qrels_df, ["map", "ndcg"]))



./indexes/no_preprocessing: Number of documents: 528155
Number of terms: 622029
Number of postings: 117766368
Number of fields: 2
Number of tokens: 257028522
Field names: [title, text]
Positions:   false

              name       map      ndcg
0         BR(BM25)  0.218007  0.474417
1  BR(DirichletLM)  0.223315  0.485798


./indexes/default: Number of documents: 528155
Number of terms: 520520
Number of postings: 83716130
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   false

              name       map      ndcg
0         BR(BM25)  0.249030  0.518623
1  BR(DirichletLM)  0.238461  0.510835


./indexes/default_positions: Number of documents: 528155
Number of terms: 520520
Number of postings: 83716130
Number of fields: 2
Number of tokens: 145322007
Field names: [title, text]
Positions:   true

              name       map      ndcg
0         BR(BM25)  0.249030  0.518623
1  BR(DirichletLM)  0.238461  0.510835


./indexes/stopwords: Number of docume