In [47]:
from elasticsearch import Elasticsearch
import json
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
import warnings
warnings.filterwarnings("ignore")

In [48]:
es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "Data_is_Lyfe"), verify_certs=False)

# Indexing in Elasticsearch

# Retail DP

In [49]:
with open("combined_retail.json") as f:
    doc = json.load(f)

In [50]:
doc.keys()

dict_keys(['title', 'owner', 'tier', 'version', 'version_date', 'description', 'tags', 'Health', 'governance', 'stacks', 'document', 'data_sources', 'title_vec', 'description_vec'])

In [51]:
len(doc["title_vec"])

384

In [52]:
mapping_out = {'properties': {'description': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'description_vec': {'type': 'dense_vector', 'dims':384},
   'title': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'title_vec': {'type': 'dense_vector', 'dims':384}}}

In [53]:
mapping_out["properties"]["title_vec"]

{'type': 'dense_vector', 'dims': 384}

In [54]:
mapping_out["properties"]["description_vec"]

{'type': 'dense_vector', 'dims': 384}

In [55]:
es.indices.create(index = "dp_test_1", mappings=mapping_out)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_test_1'})

In [56]:
es.index(index = "dp_test_1", document=doc)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '6nM5ZosB_R56sph9g_wI', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Finance DP

In [57]:
with open("combined_finance.json") as f:
    fin_doc = json.load(f)

In [58]:
fin_out = {}
fin_out["title"] = fin_doc["title"]
fin_out["description"] = fin_doc["description"]
fin_out["title_vec"] = fin_doc["title_vec"]
fin_out["description_vec"] = fin_doc["description_vec"]

In [59]:
es.index(index = "dp_test_1", document=fin_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '63M5ZosB_R56sph9pfz3', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

# Healthcare DP

In [60]:
with open("combined_health.json") as f:
    health_doc = json.load(f)

In [61]:
health_out = {}
health_out["title"] = health_doc["title"]
health_out["description"] = health_doc["description"]
health_out["title_vec"] = health_doc["title_vec"]
health_out["description_vec"] = health_doc["description_vec"]

In [62]:
es.index(index = "dp_test_1", document=health_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '7HM5ZosB_R56sph9svyf', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1})

# -

In [63]:
query_phrase = "retail"

In [64]:
qr_vec = model.encode(query_phrase)

In [65]:
qr = {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": qr_vec}
}
}
}

In [66]:
search_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [67]:
search_res

[{'title': 'Customer_360', 'score': 3.930916},
 {'title': 'Banking_360', 'score': 3.7013264},
 {'title': 'Healthcare_360', 'score': 3.484863}]

In [68]:
test_phrase = "ecommerce"

In [69]:
query_vec = model.encode(test_phrase)

In [70]:
len(query_vec)

384

In [71]:
qr = {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": query_vec}
}
}
}

In [72]:
word_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [73]:
word_res

[{'title': 'Banking_360', 'score': 4.0541496},
 {'title': 'Customer_360', 'score': 4.023537},
 {'title': 'Healthcare_360', 'score': 3.4357421}]