In [1]:
import torch
import re
import time
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertModel
import json
import numpy as np
from tqdm import tqdm
from opensearchpy import OpenSearch, RequestsHttpConnection

model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


def sentence_to_vector(raw_inputs):
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertModel.from_pretrained(model_name)
    inputs_tokens = tokenizer(raw_inputs, padding=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs_tokens)

    sentence_embeddings = mean_pooling(outputs, inputs_tokens['attention_mask'])
    return sentence_embeddings

In [2]:
host = 'search-costplus1-lammv6fxll6v6kj3cguiq5jy3u.us-east-1.es.amazonaws.com' # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com
region = 'us-east-1'
service = 'es'
auth = ("pankaj", "Stack@123")

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

In [6]:
client.indices.delete(index = "dp_vec_test2")

{'acknowledged': True}

In [3]:
def convert_to_vectors(data):
    vectors = {}
    if isinstance(data, dict):
        for key, value in tqdm(data.items()):
            vectors[key] = value
            if isinstance(value, dict):
                vectors[key+"_vector"] = convert_to_vectors(value)
            elif isinstance(value, list):
                vectors[key+"_vector"] = [convert_to_vectors(item) for item in value]
            else:
                vectors[key+"_vector"] = list(np.array(sentence_to_vector(str(value)))[0])
    else:
        vectors = list(np.array(sentence_to_vector(str(data)))[0])
    return vectors

document_files = ["H360.json", "F360.json", "C360.json"]
document_vectors = []

for filename in tqdm(document_files):
    with open(filename) as f:
        doc = json.load(f)
    document_vectors.append(convert_to_vectors(doc))

  0%|                                                                                                                                                  | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|███████████████████▋                                                                                                                      | 1/7 [00:02<00:14,  2.46s/it][A
 29%|███████████████████████████████████████▍                                                                                                  | 2/7 [00:03<00:08,  1.62s/it][A
 43%|███████████████████████████████████████████████████████████▏                                                                              | 3/7 [00:05<00:06,  1.73s/it][A
 57%|██████████████████████████████████████████████████████████████████████████████▊                                  

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:33<00:00, 33.49s/it][A[A

 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 6/7 [00:41<00:12, 12.38s/it][A

  0%|                                                                                                                                                  | 0/8 [00:00<?, ?it/s][A[A

 12%|█████████████████▎                                                                                                                        | 1/8 [00:00<00:06,  1.03it/s][A[A

 25%|██████████████████████████████████▌                                                                                                       | 2/8 [00:02<00:06,  1.05s/it][A[A

 38%|███████████████████████████████████████████████████▊                                         

 38%|███████████████████████████████████████████████████▊                                                                                      | 3/8 [00:03<00:05,  1.11s/it][A[A[A


 50%|█████████████████████████████████████████████████████████████████████                                                                     | 4/8 [00:04<00:04,  1.12s/it][A[A[A


 62%|██████████████████████████████████████████████████████████████████████████████████████▎                                                   | 5/8 [00:05<00:03,  1.08s/it][A[A[A


 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 6/8 [00:06<00:02,  1.16s/it][A[A[A


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 7/8 [00:07<00:01,  1.05s/it][A[A[A


100%|██████████████████████████████████████████████████████████████████████

 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 6/8 [00:07<00:02,  1.16s/it][A[A[A


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 7/8 [00:08<00:01,  1.10s/it][A[A[A


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:09<00:00,  1.15s/it][A[A[A



  0%|                                                                                                                                                  | 0/8 [00:00<?, ?it/s][A[A[A


 12%|█████████████████▎                                                                                                                        | 1/8 [00:01<00:07,  1.01s/it][A[A[A


 25%|██████████████████████████████████▌                                  

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:52<00:00,  7.46s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [02:38<00:00, 52.80s/it]


In [4]:
knn_index = {
  "settings": {
    "index.knn": True,
    "index.knn.space_type": "cosinesimil"
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text"
      },
      "title_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "description": {
        "type": "text"
      },
      "description_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "tags": {
        "type": "text"
      },
      "tags_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "purpose_and_objective": {
        "type": "text"
      },
      "purpose_and_objective_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "target_audience": {
        "type": "text"
      },
      "target_audience_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "datasets": {
        "type": "nested",
        "properties": {
          "table": {
            "type": "nested",
            "properties": {
              "title": {
                "type": "text"
              },
              "title_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "description": {
                "type": "text"
              },
              "description_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "version": {
                "type": "text"
              },
              "version_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "version_date": {
                "type": "text"
              },
              "version_date_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "quality_score": {
                "type": "text"
              },
              "quality_score_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "completeness": {
                "type": "text"
              },
              "completeness_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "recency": {
                "type": "text"
              },
              "recency_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "frequency": {
                "type": "text"
              },
              "frequency_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              }
            }
          }
        }
      },
      "lens": {
        "type": "nested",
        "properties": {
          "name": {
            "type": "text"
          },
          "name_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "description": {
            "type": "text"
          },
          "description_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "stack": {
            "type": "text"
          },
          "stack_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "published": {
            "type": "text"
          },
          "published_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "updated_at": {
            "type": "text"
          },
          "updated_at_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "version": {
            "type": "text"
          },
          "version_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "entities": {
            "type": "nested",
            "properties": {
              "entity_1": {
                "type": "text"
              },
              "entity_1_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "entity_2": {
                "type": "text"
              },
              "entity_2_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              }
            }
          }
        }
      }
    }
  }
}


In [7]:
client.indices.create(index="dp_vec_test2", body=knn_index, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_vec_test2'}

In [8]:
client.indices.get(index="dp_vec_test2")

{'dp_vec_test2': {'aliases': {},
  'mappings': {'properties': {'datasets': {'type': 'nested',
     'properties': {'table': {'type': 'nested',
       'properties': {'completeness': {'type': 'text'},
        'completeness_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'description': {'type': 'text'},
        'description_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'frequency': {'type': 'text'},
        'frequency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'quality_score': {'type': 'text'},
        'quality_score_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'recency': {'type': 'text'},
        'recency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'title': {'type': 'text'},
        'title_vector': {'type': 'knn_vector',
         'store': True,
         'dime

In [9]:
for document_data in document_vectors:
    document_data.pop("tags")
    document_data.pop("tags_vector")
    client.index(index='dp_vec_test2', body=document_data)

In [28]:
query_raw_sentences = ['revenue']
search_vector = sentence_to_vector(query_raw_sentences)[0].tolist()

In [29]:
query = {
    "size": 30,
    "query": {
        "knn": {
            "title_vector": {
                "vector": search_vector,
                "k": 30
            }
        }
    }
}

res = client.search(index="dp_vec_test2", body=query)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title'])
    print()

Score: 0.59896296
Title: Finance_360

Score: 0.56256574
Title: Customer_360

Score: 0.5331324
Title: Healthcare_360

