In [1]:
from elasticsearch import Elasticsearch
import json
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
import warnings
warnings.filterwarnings("ignore")

ModuleNotFoundError: No module named 'elasticsearch'

In [2]:
es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "Data_is_Lyfe"), verify_certs=False)

# Indexing in Elasticsearch

# Retail DP

In [4]:
with open("C360_updated.json") as f:
    doc = json.load(f)

In [5]:
doc.keys()

dict_keys(['title', 'description', 'owners', 'version', 'version_date', 'tags', 'health', 'data_sources', 'lens'])

In [6]:
len(doc["title_vec"])

KeyError: 'title_vec'

In [37]:
mapping_out = {'properties': {'description': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'description_vec': {'type': 'dense_vector', 'dims':384, "index": True,"similarity": "l2_norm"},
   'title': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'title_vec': {'type': 'dense_vector', 'dims':384, "index": True,"similarity": "l2_norm"}}}

In [36]:
es.indices.delete(index = "dp_test_1")

ObjectApiResponse({'acknowledged': True})

In [38]:
es.indices.create(index = "dp_test_1", mappings=mapping_out)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_test_1'})

In [40]:
es.index(index = "dp_test_1", document=doc)

ObjectApiResponse({'_index': 'dp_test_1', '_id': 'vyBrhIsB_BrIaB8k4kGX', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Finance DP

In [41]:
with open("combined_finance.json") as f:
    fin_doc = json.load(f)

In [42]:
fin_out = {}
fin_out["title"] = fin_doc["title"]
fin_out["description"] = fin_doc["description"]
fin_out["title_vec"] = fin_doc["title_vec"]
fin_out["description_vec"] = fin_doc["description_vec"]

In [43]:
es.index(index = "dp_test_1", document=fin_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': 'wCBrhIsB_BrIaB8k60E9', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

# Healthcare DP

In [44]:
with open("combined_health.json") as f:
    health_doc = json.load(f)

In [45]:
health_out = {}
health_out["title"] = health_doc["title"]
health_out["description"] = health_doc["description"]
health_out["title_vec"] = health_doc["title_vec"]
health_out["description_vec"] = health_doc["description_vec"]

In [46]:
es.index(index = "dp_test_1", document=health_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': 'wSBrhIsB_BrIaB8k9EFd', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1})

# Semantic Search

In [152]:
query_phrase = "medicines"

In [153]:
qr_vec = model.encode(query_phrase)

In [154]:
qr = {
"script_score": {
# "min_score": 4.0,
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": qr_vec}
}
}
}

In [112]:
qr_fuzz = {
"script_score": {
"query": {
    "fuzzy": {
      "title.keyword": {
        "value": "custumer"
      }
    }
},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": qr_vec}
}
}
}

In [155]:
search_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [156]:
search_res

[{'title': 'Healthcare_360', 'score': 3.716805},
 {'title': 'Customer_360', 'score': 3.2733693},
 {'title': 'Banking_360', 'score': 3.116454}]

In [52]:
search_1 = es.search(index="dp_test_1", body={"query": qr})

In [53]:
test_phrase = "ecommerce"

In [54]:
query_vec = model.encode(test_phrase)

In [55]:
len(query_vec)

384

In [56]:
qr = {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": query_vec}
}
}
}

In [57]:
word_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [58]:
word_res

[{'title': 'Banking_360', 'score': 4.0541496},
 {'title': 'Customer_360', 'score': 4.0235376},
 {'title': 'Healthcare_360', 'score': 3.4357424}]

# KNN search

In [162]:
test_phrase = "blood test"

In [163]:
query_vec = model.encode(test_phrase)

In [164]:
qr ={
    "field": "title_vec",
    "query_vector": query_vec,
    "k": 3,
    "num_candidates": 10
  }

In [165]:
word_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.knn_search(knn = qr, index = "dp_test_1")["hits"]["hits"]]

In [166]:
word_res

[{'title': 'Healthcare_360', 'score': 0.38176438},
 {'title': 'Banking_360', 'score': 0.35068586},
 {'title': 'Customer_360', 'score': 0.34415022}]