In [1]:
from opensearchpy import OpenSearch
from elasticsearch.helpers import bulk
import json
from opensearchpy import OpenSearch, RequestsHttpConnection, AWSV4SignerAuth
import pandas as pd
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')

In [2]:

host = 'localhost'
port = 9200
auth = ('admin', 'admin') # For testing only. Don't store credentials in code.
#ca_certs_path = '/full/path/to/root-ca.pem' # Provide a CA bundle if you use intermediate CAs with your root CA.

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

In [3]:
with open("Retail_DP_Sample.json") as f:
    doc = json.load(f)

In [4]:
doc.keys()

dict_keys(['title', 'owner', 'tier', 'version', 'version_date', 'description', 'tags', 'Health', 'governance', 'stacks', 'document', 'data_sources'])

In [5]:
doc

{'title': 'Customer_360',
 'owner': 'Ishmin Singh',
 'tier': 'Gold',
 'version': 'v1.1',
 'version_date': 'July 17, 2023 13:00',
 'description': 'Customer data from various sources, enabling a holistic view of customers. ',
 'tags': ['marketing', 'customer.360'],
 'Health': {'quality_score': '70%', 'operational_health': '70%'},
 'governance': {'access_policies': 3, 'data_policies': 3},
 'stacks': ['flare', 'benthos', 'alpha'],
 'document': 'What is it?\nThe Customer 360 is a tool designed to create a comprehensive view of customers by combining data from multiple sources. It includes information centered around a customer and further associated details of stores, products, and transaction details. \n\nWhat does it solve? \nCustomer 360 addresses the challenge of aggregating and analyzing customer-related data from disparate sources. By consolidating data into a unified format, it enables users to gain valuable insights into customer behavior, purchase patterns, and preferences. This co

In [8]:
client.indices.create(index = "dp_test_1")

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_test_1'}

In [7]:
client.indices.delete(index = "dp_test_1")

{'acknowledged': True}

In [9]:
client.index(index = "dp_test_1", body=doc)

{'_index': 'dp_test_1',
 '_id': 'hxyiQI4B8m6BE82W_4Uy',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 0,
 '_primary_term': 1}

In [8]:
with open("combined_finance.json") as f:
    doc_fin = json.load(f)

In [9]:
client.index(index = "dp_test_1", body=doc_fin)

{'_index': 'dp_test_1',
 '_id': 'oETnm40Bt1213c7wTl7x',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 1,
 '_primary_term': 1}

In [10]:
with open("combined_health.json") as f:
    doc_hel = json.load(f)

In [11]:
client.index(index = "dp_test_1", body=doc_hel)

{'_index': 'dp_test_1',
 '_id': 'oUTnm40Bt1213c7wU14r',
 '_version': 1,
 'result': 'created',
 '_shards': {'total': 2, 'successful': 2, 'failed': 0},
 '_seq_no': 2,
 '_primary_term': 1}

In [12]:
qr = {
  "query": {
    "match_all": {}
  },
  "size":50
}    

In [13]:
client.search(body = qr, index = "dp_test_1")["hits"]["hits"]

[{'_index': 'dp_test_1',
  '_id': 'n0Tnm40Bt1213c7wSl7s',
  '_score': 1.0,
  '_source': {'title': 'Customer_360',
   'owner': 'Ishmin Singh',
   'tier': 'Gold',
   'version': 'v1.1',
   'version_date': 'July 17, 2023 13:00',
   'description': 'Customer data from various sources, enabling a holistic view of customers. ',
   'tags': ['marketing', 'customer.360'],
   'Health': {'quality_score': '70%', 'operational_health': '70%'},
   'governance': {'access_policies': 3, 'data_policies': 3},
   'stacks': ['flare', 'benthos', 'alpha'],
   'document': 'What is it?\nThe Customer 360 is a tool designed to create a comprehensive view of customers by combining data from multiple sources. It includes information centered around a customer and further associated details of stores, products, and transaction details. \n\nWhat does it solve? \nCustomer 360 addresses the challenge of aggregating and analyzing customer-related data from disparate sources. By consolidating data into a unified format, it

In [10]:
qr ={ 
    "query":{
        "match":{
            "document":"customer",
        }
}}

In [11]:
client.search(index = "dp_test_1", body = qr)["hits"]["hits"]

[{'_index': 'dp_test_1',
  '_id': 'hxyiQI4B8m6BE82W_4Uy',
  '_score': 0.5772112,
  '_source': {'title': 'Customer_360',
   'owner': 'Ishmin Singh',
   'tier': 'Gold',
   'version': 'v1.1',
   'version_date': 'July 17, 2023 13:00',
   'description': 'Customer data from various sources, enabling a holistic view of customers. ',
   'tags': ['marketing', 'customer.360'],
   'Health': {'quality_score': '70%', 'operational_health': '70%'},
   'governance': {'access_policies': 3, 'data_policies': 3},
   'stacks': ['flare', 'benthos', 'alpha'],
   'document': 'What is it?\nThe Customer 360 is a tool designed to create a comprehensive view of customers by combining data from multiple sources. It includes information centered around a customer and further associated details of stores, products, and transaction details. \n\nWhat does it solve? \nCustomer 360 addresses the challenge of aggregating and analyzing customer-related data from disparate sources. By consolidating data into a unified form

In [11]:
# semantic_qr= {
#   "_source": {
#     "excludes": ["title_vec", "description_vec"]
#   },
#   "query": {
#     "neural": {
#       "title_vec": {
#         "query_text":"banking",
#         "k": 10
#       }
#     }
#   }
# }

In [12]:
client.search(index="dp_test_1", body=semantic_qr)["hits"]["hits"]
# client.search(index = "dp_test_1", body = qr)["hits"]["hits"]

TransportError: TransportError(500, 'null_pointer_exception', 'modelId is marked non-null but is null')

In [13]:
search_query = {
    "query": {
        "match": {
            "title": input("Enter your search text: ")
        }
    },
    "size": 10  
}

Enter your search text: health


In [55]:
response = client.search(index="dp_test_1", body=search_query)

In [56]:
total_hits = response["hits"]["total"]["value"]
print("Total hits:", total_hits)

Total hits: 0


In [79]:
query = {
    "query": {
        "match": {
            "title": "customer"
        }
    }
}

In [80]:
response = client.search(index="dp_test_1", body=query)


In [81]:
hits = response["hits"]["hits"]

# Print the results
for hit in hits:
    print(hit["_source"])

In [82]:
total_hits = response["hits"]["total"]["value"]
print("Total hits:", total_hits)

Total hits: 0


In [36]:
import numpy as np

In [43]:
title_vec = np.array([doc['title_vec'], doc_fin['title_vec'], doc_hel['title_vec']])
description_vec = np.array([doc['description_vec'],doc_fin['description_vec'],doc_hel['description_vec']])

In [44]:
query = {
    "query": {
        "script_score": {
            "query": {"match_all": {}},
            "script": {
                "source": "cosineSimilarity(params.query_vector, doc['title_vec']) + cosineSimilarity(params.query_vector, doc['description_vec'])",
                "params": {"query_vector": title_vec + description_vec}  # combining title and description vectors
            }
        }
    }
}

In [45]:
result = client.search(index="dp_test_1", body=query)

RequestError: RequestError(400, 'search_phase_execution_exception', "class_cast_exception: class org.opensearch.index.fielddata.ScriptDocValues$Doubles cannot be cast to class org.opensearch.knn.index.KNNVectorScriptDocValues (org.opensearch.index.fielddata.ScriptDocValues$Doubles is in unnamed module of loader 'app'; org.opensearch.knn.index.KNNVectorScriptDocValues is in unnamed module of loader java.net.FactoryURLClassLoader @1235151c)")

In [12]:
search_query = {
    "_source": {
        "excludes": [
            "passage_embedding"
        ]
    },
    "query": {
        "bool": {
            "filter": {
                "wildcard": {
                }
            },
            "should": [
                {
                    "script_score": {
                        "query": {
                            "neural": {
                                "passage_embedding": {
                                    "query_text": input("Enter your search text: "),
                                    "model_id": "Your model ID",
                                    "k": 100
                                }
                            }
                        },
                        "script": {
                            "source": "_score * 1.5"
                        }
                    }
                },
                {
                    "script_score": {
                        "query": {
                            "match": {
                            }
                        },
                        "script": {
                            "source": "_score * 1.7"
                        }
                    }
                }
            ]
        }
    }
}


Enter your search text: customer


In [13]:
result = client.search(index="dp_test_1", body=search_query)

RequestError: RequestError(400, 'x_content_parse_exception', '[1:86] [bool] failed to parse field [filter]')