In [4]:
from elasticsearch import Elasticsearch
from retrievals import baseline_retrieval 
from baseline_retrieval import bulk_index, get_queries, get_relevance_scores

DATA_FILE = "../data/collection.tsv"

# Query documents. Contains query id and query text apears
QUERIES_TRAIN = "../data/queries.train.tsv"
QUERIES_EVAL = "../data/queries.eval.tsv"
QUERIES_DEV = "../data/queries.dev.tsv"
QUERY_FILES = [QUERIES_DEV, QUERIES_TRAIN, QUERIES_EVAL]

INDEX_SETTINGS = {
    "mappings": {
        "properties": {
            "body": {
                "type": "text",
                "term_vector": "with_positions",
                "analyzer": "english",
            },
        }
    }
}

# Evaluation scores
RELEVANCE_SCORES = "../data/2019qrels-pass.txt"

INDEX_NAME = "dev_index"

In [5]:
# Index some documents
n = 100  # number of documents to index
es = Elasticsearch()    
es.info()

bulk_index(
    es, data_file=DATA_FILE, index=INDEX_NAME, index_settings=INDEX_SETTINGS, cutoff=n
)



In [6]:
queries = get_queries()
rel_scores = get_relevance_scores()

In [7]:
id_query = [id_and_query for id_and_query in queries.items()]

query_id, query = id_query[910884]
# query has / in it
query = query.replace('/', '')

print(query_id, query, sep='|')

527433|types of dysarthria from cerebral palsy


In [8]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/ms-marco-MiniLM-L-6-v2')


In [33]:
baseline = baseline_retrieval(es, INDEX_NAME, query, 10)
print(baseline)

advanced_method(INDEX_NAME, baseline, model, tokenizer)



['15', '59', '27', '20', '48', '23', '9', '26', '87', '80']


  result = getattr(asarray(obj), method)(*args, **kwds)
  result = getattr(asarray(obj), method)(*args, **kwds)


['23', '59', '20', '26', '87', '48', '27', '9', '80', '15']

In [32]:
from typing import List


def advanced_method(index_name: str, baseline: List[str], model, tokenizer):
    docs = [es.get(index=index_name, id=_id)['_source']['body'] for _id in baseline]

    features = tokenizer([query] * len(baseline), docs,  padding=True, truncation=True, return_tensors="pt")

    model.eval()
    with torch.no_grad():
        scores = model(**features).logits
        sorted_indexes = list(reversed(np.argsort(list(scores))))
    
    return [baseline[i] for i in sorted_indexes]



In [24]:
import numpy as np

docs = [es.get(index=INDEX_NAME, id=_id)['_source']['body'] for _id in baseline]
docs

features = tokenizer([query] * len(baseline), docs,  padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    scores = model(**features).logits
    print(list(reversed(np.argsort(list(scores)))))

print(scores)



[5, 1, 3, 7, 8, 4, 2, 6, 9, 0]
tensor([[-11.3357],
        [-11.2387],
        [-11.3024],
        [-11.2458],
        [-11.2975],
        [-11.2146],
        [-11.3053],
        [-11.2489],
        [-11.2668],
        [-11.3131]])


  result = getattr(asarray(obj), method)(*args, **kwds)
  result = getattr(asarray(obj), method)(*args, **kwds)


In [11]:
reranking = advanced_method(es)

TypeError: advanced_method() takes 0 positional arguments but 1 was given