In [1]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import time
import csv
import json
import requests
import ir_measures
from ir_measures import *
from sentence_transformers import SentenceTransformer, util
import torch
import random
from sklearn.preprocessing import MinMaxScaler
import numpy as np
requests.packages.urllib3.disable_warnings() 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
es = Elasticsearch([{'host' : 'localhost', 'port' : 9200, 'scheme' : 'https'}], basic_auth=("elastic","X8w8*Kabqp+5d5ROVoYM"), verify_certs=False)

  _transport = transport_class(


# Create Index

In [3]:
es.indices.create(index='en1kindex')

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [en1kindex/qS1Vyc4pSgma5f_AEPL2Vw] already exists')

# Configure Settings

In [4]:
mappings = {
    'properties' : {
        'text_right' : {
            'type' : 'text'
        }
    }
}

In [5]:
settingsNoStemmer = {
    'analysis' : {
        'analyzer' : {
            'default' : {
                'tokenizer' : 'whitespace'
            },
            "default_search": {
                'tokenizer' : 'whitespace'
            }
        }
    }
}

# Index modification

In [6]:
def recreate_index():
    es.indices.delete(index='en1kindex')
    es.indices.create(index='en1kindex', mappings=mappings, settings=settingsNoStemmer)

In [7]:
recreate_index()

# Check Analyzer

In [8]:
def check_analyzer(analyzer, text):
    body = analyzer
    body['text'] = text
    
    tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']
    tokens = [token_info['token'] for token_info in tokens]
    return tokens

In [9]:
text = 'it was used in landing craft during world war ii and is used today in private boats'

In [10]:
analyzer = {
    'analyzer': 'default'
}

check_analyzer(analyzer, text)

  tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']


['it',
 'was',
 'used',
 'in',
 'landing',
 'craft',
 'during',
 'world',
 'war',
 'ii',
 'and',
 'is',
 'used',
 'today',
 'in',
 'private',
 'boats']

In [11]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

# Index documents

In [12]:
def es_actions_generator():
    with open('documents.csv', 'r') as en1k:
        documentReader = csv.DictReader(en1k)
        for document in documentReader:
            text_right = document['text_right']
            docID = document["id_right"]
            doc = json.dumps({'text_right' : text_right})
            yield create_es_action('en1kindex', docID, doc)

In [13]:
start_time = time.time()
for ok, result in parallel_bulk(es, es_actions_generator()):
    if not ok:
        print(result)
print("--- %s seconds ---" % (time.time() - start_time))

--- 19.133228302001953 seconds ---


# Perform Search and normalize Elasticsearch scores

In [14]:
def search(query):
    res = es.search(index='en1kindex', query=query, size=50)['hits']
    doc_content = []
    scores = []
    docInfo = {}
    
    for hit in res['hits']:
        doc_content.append(hit["_source"]["text_right"])
        scores.append(hit["_score"])
        docInfo[hit["_source"]["text_right"]] = hit["_id"]
        
    scaler = MinMaxScaler()
    scores = np.array(scores).reshape(-1, 1)
    normalized_scores = scaler.fit_transform(scores).tolist()
    
    for i in range(0, 50):
        normalized_scores[i].append(doc_content[i])
        
    return normalized_scores, docInfo

# Create query format

In [15]:
query = {
    'bool': {
        'should': [
            {
                'match': {
                    'text_right': ''
                }
            },
            {
                "match_all": {}
            }
        ]
    }
}


# Sampling queries from training dataset and running search on them

In [18]:
queryToDocs = {}
docInfo = {}
queryInfo = {}

with open('training/queries.csv', 'r') as qw:
    queryReader = csv.DictReader(qw)
    max_size = 500
    current_size = 0
    
    for q in queryReader:
        if current_size >= max_size:
            break
        
        if bool(random.getrandbits(1)):
            current_size += 1
            text_left = q['text_left']
            queryInfo[text_left] = q["id_left"]
            query['bool']['should'][0]['match']['text_right'] = text_left
            queryToDocs[text_left], partialDocInfo = search(query)
            docInfo.update(partialDocInfo)


# Generating runs corresponding to alpha values from 0 to 1 with step = 0.05

In [24]:
model = SentenceTransformer('sentence-transformers/msmarco-MiniLM-L6-cos-v5', device='cuda')

alpha = 0

while alpha <= 1:
    with open("alpha_training/ST_" + str(alpha) + ".res", "w") as run:
        for qry, doc_pairs in queryToDocs.items():
            docs = []
            es_scores = []

            for doc in doc_pairs:
                docs.append(doc[1])
                es_scores.append(doc[0])
                
            query_embedding = model.encode(qry, convert_to_tensor=True)
            doc_embeddings = model.encode(docs, convert_to_tensor=True)

            cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

            result_scores = []
            
            for i in range(0, 50):
                result_score = alpha * es_scores[i] + (1 - alpha) * cos_scores[i]
                result_scores.append(result_score)
                
            top_results = torch.topk(torch.tensor(result_scores), k=50)
            
            rank = 0
            for score, idx in zip(top_results[0], top_results[1]):
                run.write(str(queryInfo[qry]) + " Q0 " + str(docInfo[docs[idx]]) + " " + str(rank) + " " + str(float(score)) + " ST\n")
                rank += 1
                
    print("Finished generating run with alpha=" + str(alpha))
    alpha = round(alpha + 0.05, 2)
    

Finished generating run with alpha=0
Finished generating run with alpha=0.05
Finished generating run with alpha=0.1
Finished generating run with alpha=0.15
Finished generating run with alpha=0.2
Finished generating run with alpha=0.25
Finished generating run with alpha=0.3
Finished generating run with alpha=0.35
Finished generating run with alpha=0.4
Finished generating run with alpha=0.45
Finished generating run with alpha=0.5
Finished generating run with alpha=0.55
Finished generating run with alpha=0.6
Finished generating run with alpha=0.65
Finished generating run with alpha=0.7
Finished generating run with alpha=0.75
Finished generating run with alpha=0.8
Finished generating run with alpha=0.85
Finished generating run with alpha=0.9
Finished generating run with alpha=0.95
Finished generating run with alpha=1.0


# Finding best alpha

## Creators training run evaluation

In [36]:
qrels = ir_measures.read_trec_qrels('training/qrels')
run = ir_measures.read_trec_run('training/BM25'  + '.res')
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run)

{P@20: 0.10813711911357335,
 P@10: 0.14584487534625967,
 AP@20: 0.10369050070071936}

## Searching for the best alpha value

In [46]:
map20_max = 0
alpha_best = -1
alpha = 0
res_best = {}
all_res = []
while alpha <=1:
    qrels = ir_measures.read_trec_qrels('training/qrels')
    run = ir_measures.read_trec_run('alpha_training/ST_' + str(alpha) + '.res')
    res = ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run)
    all_res.append((alpha, res))
    
    if res[MAP@20] > map20_max:
        map20_max = res[MAP@20]
        alpha_best = alpha
        res_best = res
        
    alpha = round(alpha + 0.05, 2)
    
print("The best alpha value: " + str(alpha_best))
print("With following precision metrics: " + str(res_best))
print("Other alpha values and their corresponding precision metrics:")
for result in all_res:
    print(result)

The best alpha value: 0.25
With following precision metrics: {P@20: 0.058344875346260316, P@10: 0.08483379501385031, AP@20: 0.06224365087344461}
Other alpha values and their corresponding precision metrics:
(0, {P@20: 0.05519390581717443, P@10: 0.07873961218836552, AP@20: 0.0570357183876199})
(0.05, {P@20: 0.05599030470914118, P@10: 0.08005540166204977, AP@20: 0.058944251653649214})
(0.1, {P@20: 0.0569598337950138, P@10: 0.08199445983379491, AP@20: 0.06024241231715039})
(0.15, {P@20: 0.057479224376731246, P@10: 0.0829639889196675, AP@20: 0.06102436520136578})
(0.2, {P@20: 0.058240997229916845, P@10: 0.08379501385041545, AP@20: 0.06171702093026178})
(0.25, {P@20: 0.058344875346260316, P@10: 0.08483379501385031, AP@20: 0.06224365087344461})
(0.3, {P@20: 0.05865650969529078, P@10: 0.08462603878116332, AP@20: 0.06210492415624602})
(0.35, {P@20: 0.058448753462603815, P@10: 0.08504155124653731, AP@20: 0.062033002131624375})
(0.4, {P@20: 0.05858725761772843, P@10: 0.08531855955678666, AP@20: 

# Applying the formula to the test data

## Performing search

In [47]:
queryToDocs = {}
docInfo = {}
queryInfo = {}

with open('test/queries.csv', 'r') as qw:
    queryReader = csv.DictReader(qw)

    for q in queryReader:
        text_left = q['text_left']
        queryInfo[text_left] = q["id_left"]
        query['bool']['should'][0]['match']['text_right'] = text_left
        queryToDocs[text_left], partialDocInfo = search(query)
        docInfo.update(partialDocInfo)

## ranking

In [49]:
with open("alpha_test/ST_" + str(alpha_best) + ".res", "w") as run:
    for qry, doc_pairs in queryToDocs.items():
        docs = []
        es_scores = []

        for doc in doc_pairs:
            docs.append(doc[1])
            es_scores.append(doc[0])

        query_embedding = model.encode(qry, convert_to_tensor=True)
        doc_embeddings = model.encode(docs, convert_to_tensor=True)

        cos_scores = util.cos_sim(query_embedding, doc_embeddings)[0]

        result_scores = []

        for i in range(0, 50):
            result_score = alpha_best * es_scores[i] + (1 - alpha_best) * cos_scores[i]
            result_scores.append(result_score)

        top_results = torch.topk(torch.tensor(result_scores), k=50)

        rank = 0
        for score, idx in zip(top_results[0], top_results[1]):
            run.write(str(queryInfo[qry]) + " Q0 " + str(docInfo[docs[idx]]) + " " + str(rank) + " " + str(float(score)) + " ST\n")
            rank += 1

# Precision evaluation

In [50]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('alpha_test/ST_0.25'  + '.res')
ir_measures.calc_aggregate([P@10, P@20, MAP@20], qrels, run)

{P@20: 0.154, P@10: 0.22599999999999995, AP@20: 0.17337246524327976}