In [3]:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import parallel_bulk
import time
import csv
import json
import requests
import ir_measures
from ir_measures import *
requests.packages.urllib3.disable_warnings() 

In [8]:
es = Elasticsearch([{'host' : 'localhost', 'port' : 9200, 'scheme' : 'https'}], basic_auth=("elastic","=sz+xFIY+6ZDsVb_HaLX"), verify_certs=False)

# Create Index

In [10]:
es.indices.create(index='en1kindex')

BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [en1kindex/WUDqdbuOQ4GE7ZTxI4gbKg] already exists')

# Configure Settings

In [11]:
mappings = {
    'properties' : {
        'text_right' : {
            'type' : 'text'
        }
    }
}

In [12]:
settingsWithStemmer = {
    'analysis' : {
        'analyzer' : {
            "default": {
                'tokenizer' : 'whitespace',
                'filter' : [
                    'stemmer'
                ]
            },
            "default_search": {
                'tokenizer' : 'whitespace',
                'filter' : [
                    'stemmer'
                ]
            }
        }
    }
    
}

In [13]:
settingsNoStemmer = {
    'analysis' : {
        'analyzer' : {
            'default' : {
                'tokenizer' : 'whitespace'
            },
            "default_search": {
                'tokenizer' : 'whitespace'
            }
        }
    }
}

# Index modification

In [14]:
def recreate_index():
    es.indices.delete(index='en1kindex')
    es.indices.create(index='en1kindex', mappings=mappings, settings=settingsWithStemmer)

In [15]:
recreate_index()

# Check Analyzer

In [16]:
def check_analyzer(analyzer, text):
    body = analyzer
    body['text'] = text
    
    tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']
    tokens = [token_info['token'] for token_info in tokens]
    return tokens

In [17]:
text = 'it was used in landing craft during world war ii and is used today in private boats and training facilities the 6 71 is an inline six cylinder diesel engine the 71 refers to the displacement in cubic inches of each cylinder the firing order of the engine is 1 5 3 6 2 4 the engine s compression ratio is 18 7 1 with a 4 250 inch bore and a 5 00 inch stroke the engine weighs and is 54 inches long 29 inches wide and 41 inches tall at 2 100 revolutions per minute the engine is capable of producing 230 horse power 172 kilowatts v type versions of the 71 series were developed in 1957 the 6 71 is a two stroke engine as the engine will not naturally aspirate air is provided via a roots type blower however on the 6 71t models a turbocharger and a supercharger are utilized fuel is provided by unit injectors one per cylinder the amount of fuel injected into the engine is controlled by the engine s governor the engine cooling is via liquid in a water jacket in a boat cool external water is pumped into the engine'

In [18]:
analyzer = {
    'analyzer': 'default'
}

check_analyzer(analyzer, text)

  tokens = es.indices.analyze(index='en1kindex', body=body)['tokens']


['it',
 'wa',
 'us',
 'in',
 'land',
 'craft',
 'dure',
 'world',
 'war',
 'ii',
 'and',
 'is',
 'us',
 'todai',
 'in',
 'privat',
 'boat',
 'and',
 'train',
 'facil',
 'the',
 '6',
 '71',
 'is',
 'an',
 'inlin',
 'six',
 'cylind',
 'diesel',
 'engin',
 'the',
 '71',
 'refer',
 'to',
 'the',
 'displac',
 'in',
 'cubic',
 'inch',
 'of',
 'each',
 'cylind',
 'the',
 'fire',
 'order',
 'of',
 'the',
 'engin',
 'is',
 '1',
 '5',
 '3',
 '6',
 '2',
 '4',
 'the',
 'engin',
 's',
 'compress',
 'ratio',
 'is',
 '18',
 '7',
 '1',
 'with',
 'a',
 '4',
 '250',
 'inch',
 'bore',
 'and',
 'a',
 '5',
 '00',
 'inch',
 'stroke',
 'the',
 'engin',
 'weigh',
 'and',
 'is',
 '54',
 'inch',
 'long',
 '29',
 'inch',
 'wide',
 'and',
 '41',
 'inch',
 'tall',
 'at',
 '2',
 '100',
 'revolut',
 'per',
 'minut',
 'the',
 'engin',
 'is',
 'capabl',
 'of',
 'produc',
 '230',
 'hors',
 'power',
 '172',
 'kilowatt',
 'v',
 'type',
 'version',
 'of',
 'the',
 '71',
 'seri',
 'were',
 'develop',
 'in',
 '1957',
 'the'

In [19]:
def create_es_action(index, doc_id, document):
    return {
        '_index': index,
        '_id': doc_id,
        '_source': document
    }

# Index documents

In [20]:
def es_actions_generator():
    with open('documents.csv', 'r') as en1k:
        documentReader = csv.DictReader(en1k)
        for document in documentReader:
            text_right = document['text_right']
            docID = document["id_right"]
            doc = json.dumps({'text_right' : text_right})
            yield create_es_action('en1kindex', docID, doc)

In [21]:
start_time = time.time()
for ok, result in parallel_bulk(es, es_actions_generator()):
    if not ok:
        print(result)
print("--- %s seconds ---" % (time.time() - start_time))

--- 18.71962332725525 seconds ---


# Perform Search

In [22]:
def search(queryID, query, *args):
#     pretty_print_result(es.search(index='en1kindex', body=query, size=20), args)
    res = es.search(index='en1kindex', query=query, size=20)['hits']
#     fields = es.search(index='en1kindex', body=query, size=20)['fields']
    finalRes = []
    for hit in res['hits']:
#         print(fields['text_right'])
        finalRes.append((queryID, hit["_id"], hit["_score"]))
    return finalRes
    # note that size set to 20 just because default value is 10 and we know that we have 12 docs and 10 < 12 < 20
                        

# Create query format

In [23]:
query = {
    'bool': {
        'should': [
            {
                'match': {
                    'text_right': ''
                }
            }
        ]
    }
}


# Copy queries from test folder and generate triples and query execution time data

In [24]:
triples = []
processingTimes = []
sumOfTimes = 0

with open('test/queries.csv', 'r') as qw:
        queryReader = csv.DictReader(qw)
        for q in queryReader:
            text_left = q['text_left']
            qID = q["id_left"]
            query['bool']['should'][0]['match']['text_right'] = text_left
            start_time = time.time()
            triples.append(search(qID, query, 'text_right'))
            processingTimes.append((qID, time.time() - start_time))
            
with open("triplesWithStemming.txt", "w") as trp:
    trp.write("q_id\t" + "doc_id\t" + "score\n")
    for queryRes in triples:
        for triple in queryRes:
            trp.write(str(triple[0]) + "\t" + str(triple[1]) + "\t" + str(triple[2]) + "\n")

with open("queryTimeStemmerEnabled.txt", "w") as prt:
    prt.write("q_id\t" + "time (s)\n")
    for rec in processingTimes:
        sumOfTimes += rec[1]
        prt.write(str(rec[0]) + "\t" + str(rec[1]) + "\n")
        
print('Average query processing time: ' + str(sumOfTimes / len(processingTimes)))

Average query processing time: 0.04237892866134643


# Generate qrels and runs according to TREC format

In [25]:
with open("qrelsWithStemmer", "w") as qrl:
    for queryRes in triples:
        rel = "1"
        for triple in queryRes:
            if triple[0] == triple[1]:
                rel = "2"
            else:
                rel = "1"
            qrl.write(str(triple[0]) + "\t0\t" + str(triple[1]) + "\t" + rel + "\n")

In [26]:
with open("ESWithStemmer.res", "w") as run:
    for queryRes in triples:
        rank = 0
        for triple in queryRes:
            run.write(str(triple[0]) + " Q0 " + str(triple[1]) + " " + str(rank) + " " + str(triple[2]) + " ES\n")
            rank += 1

In [405]:
with open("qrelsNoStemmer", "w") as qrl:
    for queryRes in triples:
        rel = ""
        for triple in queryRes:
            if triple[0] == triple[1]:
                rel = "1"
            else:
                rel = "0"
            qrl.write(str(triple[0]) + "\t0\t" + str(triple[1]) + "\t" + rel + "\n")

In [406]:
with open("ESNoStemmer.res", "w") as run:
    for queryRes in triples:
        rank = 0
        for triple in queryRes:
            run.write(str(triple[0]) + " Q0 " + str(triple[1]) + " " + str(rank) + " " + str(triple[2]) + " ES\n")
            rank += 1

# Performance Evaluation

In [32]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('test/BM25.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{P@10: 0.1319999999999999, AP: 0.11196168401599797, P@20: 0.09499999999999999}

In [36]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('ESNoStemmer.res')
ir_measures.calc_aggregate([P(rel=2)@10, P(rel=2)@20, MAP(rel=2)], qrels, run)

{P@10: 0.20699999999999988, AP: 0.14816807138533647, P@20: 0.14800000000000008}

In [34]:
qrels = ir_measures.read_trec_qrels('test/qrels')
run = ir_measures.read_trec_run('ESWithStemmer.res')
ir_measures.calc_aggregate([P@10, P@20, MAP], qrels, run)

{P@10: 0.20999999999999994, AP: 0.1460174936389072, P@20: 0.142}