In [1]:
from pathlib import Path
from elasticsearch import Elasticsearch
from functions import baseline_retrieval, one_by_one_index, bulk_index, get_queries, get_relevance_scores



INDEX_NAME = "msmarcopassages"

DATA_FILE = str(Path("../data/collection.tsv"))

# Query documents. Contains query id and query text 
QUERIES_TRAIN = str(Path("../data/queries.train.tsv"))
QUERIES_EVAL = str(Path("../data/queries.eval.tsv"))
QUERIES_DEV = str(Path("../data/queries.dev.tsv"))
QUERY_FILES = [QUERIES_DEV, QUERIES_TRAIN, QUERIES_EVAL]

# Evaluation scores
RELEVANCE_SCORES = str(Path("../data/2019qrels-pass.txt"))

# OUTPUT FILE NAMES (for use by trec_eval)
ADVANCED_METHOD_RESULTS = "advanced_method_results"
QRELS_BINARY = "qrels_binary"


INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            "body": {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "english",
            },
        }
    }
}

INDEX_NAME = "msmarcopassages"

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
es = Elasticsearch()    
es.info()

{'name': 'DESKTOP-M9B3O86',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': '2v5YrawFSg6zh-XVReEnLw',
 'version': {'number': '7.17.6',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': 'f65e9d338dc1d07b642e14a27f338990148ee5b6',
  'build_date': '2022-08-23T11:08:48.893373482Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [3]:
# Index some documents
n = 100  # number of documents to index

one_by_one_index(
    es, data_file=DATA_FILE, index='obo', index_settings=INDEX_SETTINGS, cutoff=n
)

In [4]:
# Index some documents
n = 100_000  # number of documents to index

bulk_index(
    es, data_file=DATA_FILE, index=INDEX_NAME, index_settings=INDEX_SETTINGS, cutoff=n, reindex_if_exist=True
)

Indexed 100000 passages
Indexed 100001 passages


In [5]:
queries = get_queries()
rel_scores = get_relevance_scores()

In [8]:
id_query = [id_and_query for id_and_query in queries.items()]

query_id, query = id_query[910884]
# query has / in it
query = query.replace('/', '')

print(query_id, query, sep='|')

527433|types of dysarthria from cerebral palsy


In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [10]:
baseline = baseline_retrieval(es, INDEX_NAME, query, 10)
print(baseline)

advanced_method(INDEX_NAME, baseline, model, tokenizer)

[]


NameError: name 'advanced_method' is not defined

In [None]:
from typing import List


def advanced_method(index_name: str, baseline: List[str], model, tokenizer):
    docs = [es.get(index=index_name, id=_id)['_source']['body'] for _id in baseline]

    features = tokenizer([query] * len(baseline), docs,  padding=True, truncation=True, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        scores = model(**features).logits
        sorted_indexes = list(reversed(np.argsort(list(scores))))
    
    return [baseline[i] for i in sorted_indexes]



In [None]:
import numpy as np

docs = [es.get(index=INDEX_NAME, id=_id)['_source']['body'] for _id in baseline]
docs

features = tokenizer([query] * len(baseline), docs,  padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print(list(reversed(np.argsort(list(scores)))))

print(scores)

In [None]:
reranking = advanced_method(es)