# ElasticSearch DataStore Initialization
This notebook initializes the Elasticsearch DocumentStore and writes to it all the saved passages produced by <i>PDF_Reader</i> and <i>Web_Scraper</i>.

---

In [1]:
from haystack.document_stores import ElasticsearchDocumentStore
from elasticsearch import Elasticsearch
import os
import numpy as np
import re
import requests

In [2]:
#Path variables
# file_path:      current working directory of the kernel
# processed_dir:  name of the sub-folder that holds the processed .txt files
# processed_path: full path to that sub-folder

file_path = os.getcwd()
processed_dir = 'ProcessedData'

processed_path = os.path.join(file_path, processed_dir)

In [3]:
# Elasticsearch connection settings. The host can be overridden through the
# ELASTICSEARCH_HOST environment variable and falls back to "localhost".
host = os.environ.get("ELASTICSEARCH_HOST", "localhost")
es_port = 9200
es_index = "ds_astronomy"

#ds_astronomy: document store with data from both papers and web pages
ds_astronomy = ElasticsearchDocumentStore(
    host=host,
    port=es_port,
    username="",
    password="",
    index=es_index
)

curr_store = ds_astronomy   # store that the delete/write cells operate on
curr_dir = processed_path   # folder the .txt passage files are read from

# _count endpoint of the same index on the same host as curr_store, so the
# count always describes the store that was written to. The variable name
# `localhost` is kept because a later cell queries it; with the default host
# the URL is unchanged: http://localhost:9200/ds_astronomy/_count
localhost = f'http://{host}:{es_port}/{es_index}/_count'

In [4]:
###  Run this cell to get the processed information, both from papers and webpages  ###

# Names (not full paths) of all the .txt files in the processed-data folder.
# Sorted because os.listdir returns entries in arbitrary order, and the
# per-document arrays built later are indexed in this order.
files_txt = sorted(
    name for name in os.listdir(processed_path)
    if name.endswith('.txt')   # check only txt files
)

In [18]:
# Display the collected .txt file names as a quick sanity check
files_txt

['Dark Energy by Robert Caldwell.txt',
 'Dark Matter A Primer.txt',
 'Geophysical Classification of Planets, Dwarf Planets, and Moons.txt',
 'Stars Science Mission Directorate.txt',
 'SuperNova Stages.txt',
 'Visible Light Science Mission Directorate.txt',
 'What Is a Black Hole NASA.txt',
 'What makes stars shine.txt']

<h2>Start ElasticSearch</h2>
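Elasticsearch must be running before the cells below are executed. On Windows, open the installation's `bin` folder, e.g. `C:\Program Files\Elastic\Elasticsearch\7.11.2\bin` (adjust the path to your installed version), and run `elasticsearch.exe` as Administrator.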

In [5]:
#Run this cell to ensure that ElasticSearch is running
# Queries the same host the document store was configured with, and uses a
# timeout so the cell fails quickly instead of hanging when the node is down.
requests.get(f'http://{host}:9200/_cluster/health', timeout=10).json()

{'cluster_name': 'elasticsearch',
 'status': 'yellow',
 'timed_out': False,
 'number_of_nodes': 1,
 'number_of_data_nodes': 1,
 'active_primary_shards': 5,
 'active_shards': 5,
 'relocating_shards': 0,
 'initializing_shards': 0,
 'unassigned_shards': 5,
 'delayed_unassigned_shards': 0,
 'number_of_pending_tasks': 0,
 'number_of_in_flight_fetch': 0,
 'task_max_waiting_in_queue_millis': 0,
 'active_shards_percent_as_number': 50.0}

In [6]:
# List the indices on the node (plain text, one line per index). Uses the
# configured host and a timeout so an unreachable node raises instead of hanging.
requests.get(f'http://{host}:9200/_cat/indices', timeout=10).text

'yellow open scrap_astronomy m2wGWiloRf2WBv580_DBig 1 1  12 0 203.7kb 203.7kb\nyellow open pdf_astronomy   q5fonswfTv-B32EH89kaGw 1 1 263 0   3.9mb   3.9mb\nyellow open label           QpdbKOnESnanAYyOa1pwKQ 1 1   0 0    208b    208b\nyellow open astronomy       6ytjQkX2RbGTDV0cp-VGuA 1 1   0 0    208b    208b\nyellow open ds_astronomy    kl-xrSTTT2OCr8IMEO75MA 1 1 168 0   2.6mb   2.6mb\n'

In [7]:
def get_tot_passages(data_list, file_names):
    """Print the passage count and average words per passage of each document.

    Parameters
    ----------
    data_list : sequence of sequences of str
        One entry per document; each entry holds that document's passages.
    file_names : sequence of str
        Name of each document, aligned by position with ``data_list``.

    Returns
    -------
    None
        The statistics are printed, one line per document.
    """
    for idx, inst in enumerate(data_list):
        n_passages = len(inst)

        # A document without passages has no meaningful average; report it
        # instead of dividing by zero.
        if n_passages == 0:
            print(f'{file_names[idx]} has 0 passages!')
            continue

        # Words are whitespace-separated tokens
        words = sum(len(passage.split()) for passage in inst)
        print(f'{file_names[idx]} has {n_passages} passages, with an average of {words/n_passages} words per passage!')

In [8]:
data = np.empty(len(files_txt), dtype=object) #data[document][paragraph]

for file_idx, file_name in enumerate(files_txt):
    # The with-block closes the file on exit; undecodable bytes are dropped
    # (errors='ignore') rather than raising.
    with open(os.path.join(curr_dir, file_name), 'r', encoding='utf-8', errors='ignore') as file:
        text = file.read()

    # One passage per line. If the file has no real newline at all, fall back
    # to splitting on the literal four-character sequence \r\n
    # (presumably written by the scraper as escaped text - TODO confirm).
    passages = text.split('\n')
    if len(passages) == 1:
        passages = text.split('\\r\\n')

    # A trailing separator leaves one empty string at the end; drop it
    if passages[-1] == '':
        passages = passages[:-1]

    data[file_idx] = passages


In [9]:
# Print, for every loaded document, its passage count and average words per passage
get_tot_passages(data , files_txt)

Dark Energy by Robert Caldwell.txt has 12 passages, with an average of 489.9166666666667 words per passage!
Dark Matter A Primer.txt has 111 passages, with an average of 102.44144144144144 words per passage!
Geophysical Classification of Planets, Dwarf Planets, and Moons.txt has 18 passages, with an average of 496.3888888888889 words per passage!
Stars Science Mission Directorate.txt has 10 passages, with an average of 80.1 words per passage!
SuperNova Stages.txt has 1 passages, with an average of 175.0 words per passage!
Visible Light Science Mission Directorate.txt has 17 passages, with an average of 39.05882352941177 words per passage!
What Is a Black Hole NASA.txt has 21 passages, with an average of 29.952380952380953 words per passage!
What makes stars shine.txt has 14 passages, with an average of 62.214285714285715 words per passage!


In [10]:
data_json = np.empty(len(data), dtype=object) #data[document][data]

for file_idx, file_name in enumerate(files_txt):
    # Title = file name without its extension. os.path.splitext strips only the
    # final ".txt", so a name that contains ".txt" elsewhere is not truncated.
    title = os.path.splitext(file_name)[0]

    # One dict per passage: the passage text plus the document it came from
    data_json[file_idx] = [
        {
            'content': paragraph,
            'meta': {
                'source': title
            }
        } for paragraph in data[file_idx]
    ]

Write passages to ElasticSearch DocumentStore

In [16]:
#Run to empty document store
# NOTE: destructive step. It is called without arguments, so presumably every
# document in curr_store's index is removed - confirm before running against
# a store whose contents you want to keep.
curr_store.delete_documents()

In [18]:
# Send each document's list of passage dicts to the current store, one call per document
for passage_dicts in data_json:
    curr_store.write_documents(passage_dicts)

Run the cell below to get the number of passages currently stored in the DocumentStore.

In [19]:
#'count': shows how many passages are present in the DataStore
# `localhost` holds the URL of the ds_astronomy index's _count endpoint
requests.get(localhost).json()

{'count': 200,
 '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}}