In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import time
import csv
import json
import requests
import ir_measures
from ir_measures import *
from sentence_transformers import SentenceTransformer, util
import torch
requests.packages.urllib3.disable_warnings() 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
es = Elasticsearch([{'host' : 'localhost', 'port' : 9200, 'scheme' : 'https'}], basic_auth=("elastic","X8w8*Kabqp+5d5ROVoYM"), verify_certs=False)

  _transport = transport_class(


# Create Index

In [4]:
es.indices.create(index='en1kindex')

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'en1kindex'})

# Configure Settings

In [3]:
mappings = {
    'properties' : {
        'text_right' : {
            'type' : 'text'
        }
    }
}

In [4]:
settingsNoStemmer = {
    'analysis' : {
        'analyzer' : {
            'default' : {
                'tokenizer' : 'whitespace'
            },
            "default_search": {
                'tokenizer' : 'whitespace'
            }
        }
    }
}

# Index modification

In [5]:
def recreate_index():
    es.indices.delete(index='en1kindex')
    es.indices.create(index='en1kindex', mappings=mappings, settings=settingsNoStemmer)

In [6]:
recreate_index()

# Check Analyzer

In [7]:
def check_analyzer(analyzer, text):
    body = analyzer
    body['text'] = text
    
    tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']
    tokens = [token_info['token'] for token_info in tokens]
    return tokens

In [8]:
text = 'it was used in landing craft during world war ii and is used today in private boats'

In [9]:
analyzer = {
    'analyzer': 'default'
}

check_analyzer(analyzer, text)

  tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']


['it',
 'was',
 'used',
 'in',
 'landing',
 'craft',
 'during',
 'world',
 'war',
 'ii',
 'and',
 'is',
 'used',
 'today',
 'in',
 'private',
 'boats']

In [10]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

# Index documents

In [11]:
def es_actions_generator():
    with open('documents.csv', 'r') as en1k:
        documentReader = csv.DictReader(en1k)
        for document in documentReader:
            text_right = document['text_right']
            docID = document["id_right"]
            doc = json.dumps({'text_right' : text_right})
            yield create_es_action('en1kindex', docID, doc)

In [12]:
start_time = time.time()
for ok, result in parallel_bulk(es, es_actions_generator()):
    if not ok:
        print(result)
print("--- %s seconds ---" % (time.time() - start_time))

--- 16.379539012908936 seconds ---


# Perform Search

In [13]:
def search(query):
    res = es.search(index='en1kindex', query=query, size=20, min_score=0)['hits']
    finalRes = []
    docInfo = {}
    for hit in res['hits']:
        finalRes.append(hit["_source"]["text_right"])
        docInfo[hit["_source"]["text_right"]] = hit["_id"]
    return finalRes, docInfo

# Create query format

In [14]:
query = {
    'bool': {
        'should': [
            {
                'match': {
                    'text_right': ''
                }
            },
            {
                "match_all": {}
            }
        ]
    }
}


# Copy queries from test folder and generate triples and query execution time data

In [15]:
queryToDocs = {}
docInfo = {}
queryInfo = {}

with open('test/queries.csv', 'r') as qw:
    queryReader = csv.DictReader(qw)
    for q in queryReader:
        text_left = q['text_left']
        queryInfo[text_left] = q["id_left"]
        query['bool']['should'][0]['match']['text_right'] = text_left
        queryToDocs[text_left], partialDocInfo = search(query)
        docInfo.update(partialDocInfo)

In [16]:
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5', device='cuda')

with open("ST.res", "w") as run:
    for qry, docs in queryToDocs.items():
        
        query_embedding = model.encode(qry, convert_to_tensor=True)
        doc_embeddings = model.encode(docs, convert_to_tensor=True)

        cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]
        topResults = torch.topk(cos_scores, k=20)
        
        rank = 0
        for score, idx in zip(topResults[0], topResults[1]):
            run.write(str(queryInfo[qry]) + " Q0 " + str(docInfo[docs[idx]]) + " " + str(rank) + " " + str(float(score)) + " ST\n")
            rank += 1
    

# Performance Evaluation

In [17]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('ST.res')
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run)

{AP@20: 0.16783148568225636,
 P@20: 0.14800000000000002,
 P@10: 0.21599999999999994}