In [None]:
import os
import json
from json import JSONDecodeError
import sys
import elasticsearch
import warnings
warnings.filterwarnings("ignore", message="ElasticsearchWarning")
sys.path.insert(0, '/home/guinzburg/NLP/Data')
from data_parser import parse_queries, parse_documents, parse_rel

In [None]:
# rel = parse_rel(path='/home/guinzburg/NLP/Data/CISI.REL')

In [None]:
# print(rel)

## ElasticSearch
Full-text search queries and performs linguistic searches against documents. It includes single or multiple words or phrases and returns documents that match search condition.

ElasticSearch is a search engine based on Apache Lucene, a free and open-source information retrieval software library. It provides a distributed, full-text search engine with an HTTP web interface and schema-free JSON documents.

Documents are represented as JSON objects. JSON serialization is supported by most programming languages and has become the standard format used by the NoSQL movement. It is simple, concise, and easy to read.

https://www.baeldung.com/elasticsearch-full-text-search-rest-api

## Documents and Queries
Load the files containing queries and documents using 'parse_queries' and 'parse_documents', then create json objects for each 
for further processing

In [6]:
# Get all queries
queries = json.loads(json.dumps(parse_queries(path='/home/guinzburg/NLP/Data/CISI.QRY'), indent = 4))
# Get all documents
documents = json.loads(json.dumps(parse_documents(path='/home/guinzburg/NLP/Data/CISI.ALL'), indent = 4))

## Validate ElasticSearch service is up and running
By instantiate a client instance and using an API callcall' in this example `info()` we can validate that the service is running properly <br><br>
<i>Plain vanilla curl command: curl -XGET 'http://localhost:9200/</i>

In [7]:
try:
    es = elasticsearch.Elasticsearch("http://localhost:9200")
    print("ElasticSearch service is up and running.","\nVersion:", es.info()['version']['number'])
except elasticsearch.ElasticsearchException as e:
    print("ElasticSearch service is not running.\n")

ElasticSearch service is up and running. 
Version: 7.15.0




## Updating ElasticSearch with Documents
ElasticSearch is document oriented. It stores and indexes documents. Indexing creates or updates documents. After indexing, you can search, sort, and filter complete documents. Using ElasticSearch.index<br><br>
<i>Plain vanilla curl command: 
curl -XPUT 'localhost:9200/text/article/1?pretty'
-H 'Content-Type: application/json' -d '
{
  "title": "He went",
  "random_text": 
    "He went such dare good fact. The small own seven saved man age."
}</i>

In [8]:
for id in documents:
    
    # Create document 
    document = {
        "title": documents[id]['title'],
        "text": documents[id]['body']
    }

    # Use ElasticSearch client to insert a document
    res = es.index(index="document", id=id, document=document)
    
    # Verify update was sucessful
    try:
        _ = res['result']

    except elasticsearch.ElasticsearchException as e:
        print("Update error", "Document ID:", id, "Actual Error:", e)

## Count ElasticSearch Documents
Get the number of documents within ElasticSearch using ElasticSearch.count<br><br>
<i>plain vanilla curl command:"curl -XGET 'localhost:9200/document/_count?pretty'</i>

In [9]:
es_doc_count = es.count(index="document", body = dict())['count']

print(f"{es_doc_count} / {len(documents)} documents were UPDATED")

1460 / 1460 documents were UPDATED


## Search ElasticSearch for Documents
Get all documents hits that match'es the query defined in the request using ElasticSearch.search<br><br>
<i>plain vanilla curl command: 
curl -XGET 'localhost:9200/text/article/_search?pretty' 
-H 'Content-Type: application/json' -d '
{
  "query": {
    "match": {
      "random_text": "him departure"
    }
  }
}'</i>

In [11]:
# create a Python dictionary for the search query:

queries_response = {}

for q_id in queries:
    
    # Prepare query 
    search_param = {
      "query": {
          "simple_query_string" : {
              "query": queries[q_id] 
        }
      }
    }
    
    # Get a response from the cluster
    response = es.search(index="document", body=search_param)
    
    # Save all results (documents ID's) for each query
    queries_response[q_id] = []
    for item in response['hits']['hits']:
        queries_response[q_id].append((item['_id'], item['_score']))

  response = es.search(index="document", body=search_param)


In [12]:
print(queries_response)

{'1': [('589', 49.35979), ('429', 49.302505), ('722', 45.253937), ('1281', 40.84774), ('60', 40.149933), ('236', 39.68808), ('1195', 38.193356), ('510', 38.16082), ('813', 35.37086), ('650', 33.80413)], '2': [('790', 23.856686), ('526', 19.638227), ('605', 19.61727), ('1096', 19.512949), ('806', 19.384413), ('1156', 18.924664), ('78', 18.778719), ('1158', 18.359062), ('768', 18.351942), ('1399', 17.68248)], '3': [('60', 26.347462), ('469', 17.288284), ('236', 16.019255), ('147', 14.676644), ('1235', 14.065663), ('599', 13.330776), ('640', 13.208814), ('1181', 12.854783), ('1169', 11.900635), ('592', 11.424519)], '4': [('790', 18.415758), ('565', 17.927382), ('320', 17.051666), ('746', 16.915049), ('636', 15.959225), ('1252', 15.78834), ('332', 15.3770075), ('739', 15.117075), ('608', 14.859041), ('677', 14.727039)], '5': [('648', 32.25313), ('60', 31.497679), ('656', 29.402664), ('388', 27.448078), ('236', 26.454494), ('1105', 26.257797), ('445', 25.43165), ('630', 24.8775), ('471', 24