In [1]:
import torch
import re
import time
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertModel
import json
import numpy as np
from tqdm import tqdm
from opensearchpy import OpenSearch, RequestsHttpConnection

model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask


def sentence_to_vector(raw_inputs):
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertModel.from_pretrained(model_name)
    inputs_tokens = tokenizer(raw_inputs, padding=True, return_tensors="pt")
    
    with torch.no_grad():
        outputs = model(**inputs_tokens)

    sentence_embeddings = mean_pooling(outputs, inputs_tokens['attention_mask'])
    return sentence_embeddings

In [2]:
# import torch
# from transformers import AutoModel, AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-l')
# model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-l', add_pooling_layer=False)

# # Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output.last_hidden_state  
#     masked_embeddings = token_embeddings * attention_mask.unsqueeze(-1).float()
#     mean_embeddings = torch.mean(masked_embeddings, dim=1)
#     return mean_embeddings

# def sentence_to_vector(raw_inputs):
#     inputs_tokens = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
#     with torch.no_grad():
#         outputs = model(**inputs_tokens)  # Directly access the embeddings and apply mean pooling

#     sentence_embeddings = mean_pooling(outputs, inputs_tokens['attention_mask'])
#     return sentence_embeddings


In [3]:
host = 'search-costplus1-lammv6fxll6v6kj3cguiq5jy3u.us-east-1.es.amazonaws.com' # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com
region = 'us-east-1'
service = 'es'
auth = ("pankaj", "Stack@123")

# Create the client with SSL/TLS enabled, but hostname verification disabled.
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

In [4]:
client.indices.delete(index = "dp_vec_test2")

{'acknowledged': True}

In [5]:
def convert_to_vectors(data):
    vectors = {}
    if isinstance(data, dict):
        for key, value in tqdm(data.items()):
            vectors[key] = value
            if isinstance(value, dict):
                vectors[key+"_vector"] = convert_to_vectors(value)
            elif isinstance(value, list):
                vectors[key+"_vector"] = [convert_to_vectors(item) for item in value]
            else:
                vectors[key+"_vector"] = list(np.array(sentence_to_vector(str(value)))[0])
    else:
        vectors = list(np.array(sentence_to_vector(str(data)))[0])
    return vectors

document_files = ["H360.json", "F360.json", "C360.json"]
document_vectors = []

for filename in tqdm(document_files):
    with open(filename) as f:
        doc = json.load(f)
    document_vectors.append(convert_to_vectors(doc))

# def convert_to_vectors(data):
#     vectors = {}
#     if isinstance(data, dict):
#         for key, value in data.items():
#             if isinstance(value, dict):
#                 vectors[key+"_vector"] = convert_to_vectors(value)
#             elif isinstance(value, list):
#                 vectors[key+"_vector"] = [convert_to_vectors(item) for item in value]
#             else:
#                 vectors[key] = value
#                 vectors[key+"_vector"] = list(np.array(sentence_to_vector(str(value)))[0])
#     else:
#         vectors = list(np.array(sentence_to_vector(str(data)))[0])
#     return vectors

# document_files = ["H360.json", "F360.json", "C360.json"]
# document_vectors = []
# for filename in tqdm(document_files):
#     with open(filename) as f:
#         doc = json.load(f)
#     document_vectors.append(convert_to_vectors(doc))

  0%|                                                                                                                                                  | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                                                  | 0/7 [00:00<?, ?it/s][A
 14%|███████████████████▋                                                                                                                      | 1/7 [00:03<00:21,  3.55s/it][A
 29%|███████████████████████████████████████▍                                                                                                  | 2/7 [00:05<00:12,  2.46s/it][A
 43%|███████████████████████████████████████████████████████████▏                                                                              | 3/7 [00:07<00:09,  2.35s/it][A
 57%|██████████████████████████████████████████████████████████████████████████████▊                                  

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:41<00:00, 41.26s/it][A[A

 86%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎                   | 6/7 [00:51<00:15, 15.17s/it][A

  0%|                                                                                                                                                  | 0/8 [00:00<?, ?it/s][A[A

 12%|█████████████████▎                                                                                                                        | 1/8 [00:01<00:08,  1.18s/it][A[A

 25%|██████████████████████████████████▌                                                                                                       | 2/8 [00:02<00:07,  1.24s/it][A[A

 38%|███████████████████████████████████████████████████▊                                         

 38%|███████████████████████████████████████████████████▊                                                                                      | 3/8 [00:04<00:06,  1.37s/it][A[A[A


 50%|█████████████████████████████████████████████████████████████████████                                                                     | 4/8 [00:05<00:05,  1.37s/it][A[A[A


 62%|██████████████████████████████████████████████████████████████████████████████████████▎                                                   | 5/8 [00:06<00:03,  1.28s/it][A[A[A


 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 6/8 [00:07<00:02,  1.27s/it][A[A[A


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 7/8 [00:09<00:01,  1.27s/it][A[A[A


100%|██████████████████████████████████████████████████████████████████████

 75%|███████████████████████████████████████████████████████████████████████████████████████████████████████▌                                  | 6/8 [00:08<00:02,  1.42s/it][A[A[A


 88%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊                 | 7/8 [00:09<00:01,  1.36s/it][A[A[A


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:10<00:00,  1.37s/it][A[A[A



  0%|                                                                                                                                                  | 0/8 [00:00<?, ?it/s][A[A[A


 12%|█████████████████▎                                                                                                                        | 1/8 [00:01<00:08,  1.22s/it][A[A[A


 25%|██████████████████████████████████▌                                  

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 7/7 [01:13<00:00, 10.49s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [03:26<00:00, 68.96s/it]


In [6]:
knn_index = {
  "settings": {
    "index.knn": True,
    "index.knn.space_type": "cosinesimil"
  },
  "mappings": {
    "properties": {
      "title": {
        "type": "text"
      },
      "title_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "description": {
        "type": "text"
      },
      "description_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "purpose_and_objective": {
        "type": "text"
      },
      "purpose_and_objective_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "target_audience": {
        "type": "text"
      },
      "target_audience_vector": {
        "type": "knn_vector",
        "dimension": 768,
        "store": True
      },
      "datasets": {
        "type": "nested",
        "properties": {
          "table": {
            "type": "nested",
            "properties": {
              "title": {
                "type": "text"
              },
              "title_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "description": {
                "type": "text"
              },
              "description_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "version": {
                "type": "text"
              },
              "version_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "version_date": {
                "type": "text"
              },
              "version_date_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "quality_score": {
                "type": "text"
              },
              "quality_score_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "completeness": {
                "type": "text"
              },
              "completeness_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "recency": {
                "type": "text"
              },
              "recency_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "frequency": {
                "type": "text"
              },
              "frequency_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              }
            }
          }
        }
      },
      "lens": {
        "type": "nested",
        "properties": {
          "name": {
            "type": "text"
          },
          "name_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "description": {
            "type": "text"
          },
          "description_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "stack": {
            "type": "text"
          },
          "stack_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "published": {
            "type": "text"
          },
          "published_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "updated_at": {
            "type": "text"
          },
          "updated_at_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "version": {
            "type": "text"
          },
          "version_vector": {
            "type": "knn_vector",
            "dimension": 768,
            "store": True
          },
          "entities": {
            "type": "nested",
            "properties": {
              "entity_1": {
                "type": "text"
              },
              "entity_1_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              },
              "entity_2": {
                "type": "text"
              },
              "entity_2_vector": {
                "type": "knn_vector",
                "dimension": 768,
                "store": True
              }
            }
          }
        }
      }
    }
  }
}


In [7]:
client.indices.create(index="dp_vec_test2", body=knn_index, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_vec_test2'}

In [8]:
client.indices.get(index="dp_vec_test2")

{'dp_vec_test2': {'aliases': {},
  'mappings': {'properties': {'datasets': {'type': 'nested',
     'properties': {'table': {'type': 'nested',
       'properties': {'completeness': {'type': 'text'},
        'completeness_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'description': {'type': 'text'},
        'description_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'frequency': {'type': 'text'},
        'frequency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'quality_score': {'type': 'text'},
        'quality_score_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'recency': {'type': 'text'},
        'recency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'title': {'type': 'text'},
        'title_vector': {'type': 'knn_vector',
         'store': True,
         'dime

In [10]:
for document_data in document_vectors:
    client.index(index='dp_vec_test2', body=document_data)

In [12]:
query_raw_sentences = ['which data prodcut has information on medicine']
search_vector = sentence_to_vector(query_raw_sentences)[0].tolist()

In [13]:
query = {
    "size": 30,
    "query": {
        "knn": {
            "title_vector": {
                "vector": search_vector,
                "k": 30
            }
        }
    }
}

res = client.search(index="dp_vec_test2", body=query)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title'])
    print()

Score: 0.70633495
Title: Healthcare_360

Score: 0.70633495
Title: Healthcare_360

Score: 0.5300336
Title: Customer_360

Score: 0.5037399
Title: Finance_360



In [35]:
query_term = {
    "size": 30,
    "query": {
        "match": {
            "target_audience": "engineering"  
        }
    }
}

res = client.search(index="dp_vec_test2", body=query_term)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title']) 
    print()


Score: 0.29379332
Title: Customer_360



In [53]:
query_fuzzy = {
    "size": 30,
    "query": {
        "fuzzy": {
            "description": {
                "value": "integration",
                "fuzziness": "AUTO"
            }
        }
    }
}
res = client.search(index="dp_vec_test2", body=query_fuzzy)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title']) 
    print()

Score: 0.23707777
Title: Customer_360

Score: 0.23537624
Title: Finance_360

Score: 0.15104464
Title: Healthcare_360

Score: 0.15104464
Title: Healthcare_360



In [56]:
query_wildcard = {
    "size": 30,
    "query": {
        "wildcard": {
            "description": "integrat*"
        }
    }
}
res = client.search(index="dp_vec_test2", body=query_wildcard)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title']) 
    print()

Score: 1.0
Title: Customer_360

Score: 1.0
Title: Finance_360

Score: 1.0
Title: Healthcare_360

Score: 1.0
Title: Healthcare_360



In [60]:
query_fuzzy_with_match = {
    "size": 30,
    "query": {
        "bool": {
            "should": [
                {
                    "fuzzy": {
                        "description": {
                            "value": "medical",
                            "fuzziness": "AUTO"
                        }
                    }
                },
                {
                    "match": {
                        "description": {
                            "query": "healthcare",
                            "fuzziness": "AUTO"
                        }
                    }
                }
            ]
        }
    }
}
res = client.search(index="dp_vec_test2", body=query_fuzzy_with_match)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title']) 
    print()

Score: 0.49478108
Title: Healthcare_360

Score: 0.49478108
Title: Healthcare_360

