# Data Preparation


## OpenSearch Connection Settings

## Opening an Index

In [3]:
import os
import pprint as pp

import requests
from opensearchpy import OpenSearch
from opensearchpy import helpers

# Connection parameters for the OpenSearch service.
host = 'api.novasearch.org'
port = 443

# Credentials can be overridden through the OPENSEARCH_USER and
# OPENSEARCH_PASSWORD environment variables, so they do not have to be edited
# (or committed) in this file.
# NOTE(review): the literal fallbacks keep the notebook running exactly as
# before; consider removing them once the environment variables are in place.
user = os.environ.get('OPENSEARCH_USER', 'user13')
password = os.environ.get('OPENSEARCH_PASSWORD', 'rumoao+20')
index_name = user  # We can only have an index with the same name as our user name.

# Create the client with SSL/TLS enabled, but with certificate and hostname
# verification disabled.
client = OpenSearch(
    hosts=[{'host': host, 'port': port}],
    http_compress=True,  # enables gzip compression for request bodies
    http_auth=(user, password),
    use_ssl=True,
    url_prefix='opensearch_v2',
    verify_certs=False,
    ssl_assert_hostname=False,
    ssl_show_warn=False,
)

# Open the index and print its settings, mappings and document count.
# The existence check guards the calls below so that a missing index produces
# the message in the `else` branch instead of an error from `open`.
# `index` is passed by keyword, like every other call in this file.
if client.indices.exists(index=index_name):

    resp = client.indices.open(index=index_name)
    print(resp)

    print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
    settings = client.indices.get_settings(index=index_name)
    pp.pprint(settings)

    print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
    mappings = client.indices.get_mapping(index=index_name)
    pp.pprint(mappings)

    print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
    print(client.count(index=index_name))
else:
    print("Index does not exist.")

ModuleNotFoundError: No module named 'requests'

In [None]:
## Closing an Index
# Send the close request for our index and print whatever the client returns.
resp = client.indices.close(index=index_name)
print(resp)

## Creating an Index with our Own Settings

In [None]:

# Request body used when creating the index: index-level settings plus a
# strict mapping that lists every field a document may contain.
index_body = {
    "settings": {
        "index": {
            "number_of_replicas": 0,
            "number_of_shards": 4,
            "refresh_interval": "-1",
            "knn": "true",  ## Support for Knn vector data types
        },
    },
    "mappings": {
        "dynamic": "strict",
        "properties": {
            "doc_id": {"type": "keyword"},
            "tags": {"type": "keyword"},
            "json": {"type": "flat_object"},
            "contents": {
                "type": "text",
                "analyzer": "standard",
                "similarity": "BM25",
            },
        },
    },
}

# Create the index from `index_body` unless an index with that name is
# already present.
if client.indices.exists(index=index_name):
    print("Index already existed. Nothing to be done.")
else:
    # `index` is passed by keyword, matching the `exists` call above and the
    # other client calls in this file.
    # NOTE(review): newer opensearch-py releases appear to declare these
    # parameters keyword-only - confirm against the installed version.
    response = client.indices.create(index=index_name, body=index_body)
    print('\nCreating index:')
    print(response)


## Index Creation Check (verify that the created index matches the settings we configured)

In [None]:
print('\n----------------------------------------------------------------------------------- INDEX SETTINGS')
# Settings update that sets the index refresh interval to one second.
index_settings = {
    "settings": {
        "index": {
            "refresh_interval": "1s",
        },
    },
}
# List the aliases of all indices; the "*" pattern is passed by keyword so the
# call does not depend on the positional order of `get_alias` parameters.
pp.pprint(client.indices.get_alias(index="*"))

# Apply the update, then read the settings back to confirm what is in effect.
client.indices.put_settings(index=index_name, body=index_settings)
settings = client.indices.get_settings(index=index_name)
pp.pprint(settings)

# Show the field mappings currently stored for the index.
print('\n----------------------------------------------------------------------------------- INDEX MAPPINGS')
mappings = client.indices.get_mapping(index=index_name)
pp.pprint(mappings)

# Show the response of the count API for the index.
print('\n----------------------------------------------------------------------------------- INDEX #DOCs')
print(client.count(index=index_name))

## Text Based Search

OpenSearch is one of the best solutions for searching text. The text-based search documentation is available here:

https://opensearch.org/docs/latest/opensearch/query-dsl/full-text/

In the example below, the 'query' parameter holds the search query, the 'size' parameter sets the number of documents to be returned, the '_source' parameter selects which fields are returned in the search results, and the 'fields' parameter lists the fields to be searched.

In [None]:
# TODO: Change this to use our own stuff?

# Free-text question used as the search query.
qtxt = "How many people live in London?"

# Full-text query: search `qtxt` in the `contents` field and return the top 5
# hits, including only the `tags` field of each hit's source.
# The mapping defined in this file declares the field as `tags` (there is no
# `_tags`) and is `dynamic: strict`, so the previous `_tags` filter could not
# match any field.
# NOTE(review): assumes the live index uses the mapping from this file - confirm.
query_bm25 = {
    'size': 5,
    '_source': ['tags'],
    # Alternatives:
    #  '_source': ['doc_id'],
    #  '_source': '',
    'query': {
        'multi_match': {
            'query': qtxt,
            'fields': ['contents'],
        },
    },
}

# Run the query against our index and pretty-print the raw response.
response = client.search(index=index_name, body=query_bm25)

print('\nSearch results:')
pp.pprint(response)
