In [1]:
from elasticsearch import Elasticsearch
import json
from sentence_transformers import SentenceTransformer
model = SentenceTransformer('all-MiniLM-L6-v2')
import warnings
warnings.filterwarnings("ignore")

In [2]:
es = Elasticsearch("https://localhost:9200", basic_auth=("elastic", "Data_is_Lyfe"), verify_certs=False)

# Indexing in Elasticsearch

# Retail DP

In [3]:
with open("combined_retail.json") as f:
    doc = json.load(f)

In [4]:
doc.keys()

dict_keys(['title', 'owner', 'tier', 'version', 'version_date', 'description', 'tags', 'Health', 'governance', 'stacks', 'document', 'data_sources', 'title_vec', 'description_vec'])

In [5]:
len(doc["title_vec"])

384

In [6]:
mapping_out = {'properties': {'description': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'description_vec': {'type': 'dense_vector', 'dims':384, "index": True,"similarity": "l2_norm"},
   'title': {'type': 'text',
    'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}},
   'title_vec': {'type': 'dense_vector', 'dims':384, "index": True,"similarity": "l2_norm"}}}

In [7]:
# es.indices.delete(index = "dp_test_1")

NotFoundError: NotFoundError(404, 'index_not_found_exception', 'no such index [dp_test_1]', dp_test_1, index_or_alias)

In [8]:
es.indices.create(index = "dp_test_1", mappings=mapping_out)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_test_1'})

In [9]:
es.index(index = "dp_test_1", document=doc)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '8-gDNY0BS2ZlpSQzZqO8', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

# Finance DP

In [10]:
with open("combined_finance.json") as f:
    fin_doc = json.load(f)

In [11]:
fin_out = {}
fin_out["title"] = fin_doc["title"]
fin_out["description"] = fin_doc["description"]
fin_out["title_vec"] = fin_doc["title_vec"]
fin_out["description_vec"] = fin_doc["description_vec"]

In [12]:
es.index(index = "dp_test_1", document=fin_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '9OgDNY0BS2ZlpSQzb6O5', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 1, '_primary_term': 1})

# Healthcare DP

In [13]:
with open("combined_health.json") as f:
    health_doc = json.load(f)

In [14]:
health_out = {}
health_out["title"] = health_doc["title"]
health_out["description"] = health_doc["description"]
health_out["title_vec"] = health_doc["title_vec"]
health_out["description_vec"] = health_doc["description_vec"]

In [15]:
es.index(index = "dp_test_1", document=health_out)

ObjectApiResponse({'_index': 'dp_test_1', '_id': '9egDNY0BS2ZlpSQzdKNf', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 2, '_primary_term': 1})

# Semantic Search

In [38]:
query_phrase = "medicines"

In [39]:
qr_vec = model.encode(query_phrase)

2 x Cosine(search_vec, title_vec) + 1 x Cosine(search_vec, desc_vec)

In [21]:
# SELECT tittle FROM DP_index SORT BY COSINE () 

SyntaxError: invalid syntax (1449335069.py, line 1)

In [40]:
qr = {
"script_score": {
# "min_score": 4.0,
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": qr_vec}
}
}
}

In [41]:
qr_fuzz = {
"script_score": {
"query": {
    "fuzzy": {
      "title.keyword": {
        "value": "custumer"
      }
    }
},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": qr_vec}
}
}
}

In [42]:
search_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [43]:
search_res

[{'title': 'Healthcare_360', 'score': 3.7168052},
 {'title': 'Customer_360', 'score': 3.2733698},
 {'title': 'Banking_360', 'score': 3.116454}]

In [44]:
search_1 = es.search(index="dp_test_1", body={"query": qr})

In [45]:
test_phrase = "credit card"

In [46]:
query_vec = model.encode(test_phrase)

In [47]:
len(query_vec)

384

In [48]:
qr = {
"script_score": {
"query": {"match_all": {}},
"script": {
"source": "2*(cosineSimilarity(params.query_vector, 'title_vec') + 1.0) + (cosineSimilarity(params.query_vector, 'description_vec') + 1.0)",
"params": {"query_vector": query_vec}
}
}
}

In [49]:
word_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.search(query=qr, index = "dp_test_1")["hits"]["hits"]]

In [50]:
word_res

[{'title': 'Banking_360', 'score': 4.276055},
 {'title': 'Customer_360', 'score': 3.884015},
 {'title': 'Healthcare_360', 'score': 3.734417}]

# KNN search

In [51]:
test_phrase = "blood test"

In [52]:
query_vec = model.encode(test_phrase)

In [53]:
qr ={
    "field": "title_vec",
    "query_vector": query_vec,
    "k": 3,
    "num_candidates": 10
  }

In [54]:
word_res = [{"title":t["_source"]["title"], "score":t["_score"]} for t in es.knn_search(knn = qr, index = "dp_test_1")["hits"]["hits"]]

In [55]:
word_res

[{'title': 'Healthcare_360', 'score': 0.38176438},
 {'title': 'Banking_360', 'score': 0.35068583},
 {'title': 'Customer_360', 'score': 0.3441502}]