In [1]:
import functools
import json
import os
import re
import time
from getpass import getpass

import numpy as np
import torch
from opensearchpy import OpenSearch, RequestsHttpConnection
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertModel

# Model id handed to `from_pretrained` for both the tokenizer and the model in
# sentence_to_vector.
model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"


#Mean Pooling - Take attention mask into account for correct averaging
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings.
        attention_mask: Mask tensor; it is broadcast over the embedding axis and
            used as a per-token weight (0 drops the token from the average).

    Returns:
        The mask-weighted sum over dimension 1 divided by the mask total, so
        padded positions do not dilute the mean.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * mask).sum(dim=1)
    # Clamp so a fully masked row divides by a tiny number instead of zero.
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_total / token_counts


@functools.lru_cache(maxsize=None)
def _load_encoder(name):
    """Load the tokenizer and model for `name` once and reuse them on later calls."""
    tokenizer = DistilBertTokenizer.from_pretrained(name)
    model = DistilBertModel.from_pretrained(name)
    return tokenizer, model


def sentence_to_vector(raw_inputs):
    """Encode one sentence or a list of sentences into mean-pooled embeddings.

    The tokenizer and model are loaded on the first call only; reloading them on
    every call dominated the runtime because this function is invoked once per
    JSON field.

    Args:
        raw_inputs: A string or a list of strings to embed.

    Returns:
        The tensor produced by `mean_pooling`, indexed by input sentence
        (callers take `[0]` for a single sentence).
    """
    tokenizer, model = _load_encoder(model_name)
    # truncation=True keeps over-long inputs within the tokenizer's maximum length
    # instead of failing inside the model.
    inputs_tokens = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs_tokens)

    return mean_pooling(outputs, inputs_tokens['attention_mask'])

In [2]:
# import torch
# from transformers import AutoModel, AutoTokenizer

# tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-l')
# model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-l', add_pooling_layer=False)

# # Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output.last_hidden_state  
#     masked_embeddings = token_embeddings * attention_mask.unsqueeze(-1).float()
#     mean_embeddings = torch.mean(masked_embeddings, dim=1)
#     return mean_embeddings

# def sentence_to_vector(raw_inputs):
#     inputs_tokens = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt", max_length=512)
    
#     with torch.no_grad():
#         outputs = model(**inputs_tokens)  # Directly access the embeddings and apply mean pooling

#     sentence_embeddings = mean_pooling(outputs, inputs_tokens['attention_mask'])
#     return sentence_embeddings


In [3]:
# Connection settings are read from the environment so that no secret is stored in the
# notebook. The endpoint keeps its previous value as a default; credentials are prompted
# for when the variables are not set.
# NOTE(review): the password that was previously committed here should be rotated.
host = os.environ.get(
    "OPENSEARCH_HOST",
    "search-costplus1-lammv6fxll6v6kj3cguiq5jy3u.us-east-1.es.amazonaws.com",
)  # cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com
region = 'us-east-1'
service = 'es'
auth = (
    os.environ.get("OPENSEARCH_USER") or input("OpenSearch user: "),
    os.environ.get("OPENSEARCH_PASSWORD") or getpass("OpenSearch password: "),
)

# Create the client with SSL/TLS enabled and certificate verification turned on.
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

In [21]:
# Drop any previous copy of the index so the notebook can be re-run from scratch.
# ignore=404 keeps a fresh run (index not created yet) from raising, mirroring the
# ignore=400 used when the index is created.
client.indices.delete(index = "dp_vec_test2", ignore=404)

{'acknowledged': True}

In [5]:
def convert_to_vectors(data, show_progress=True):
    """Recursively attach sentence embeddings to a JSON-like structure.

    For a dict, every key is copied unchanged and a sibling `<key>_vector` entry is
    added: a nested result for dict values, a list of per-item results for list
    values, and the embedding of `str(value)` for anything else. A non-dict input
    is embedded directly.

    NOTE(review): nested dicts/lists end up under `<key>_vector` (e.g.
    `datasets_vector`), while the `knn_index` mapping declares the vector fields
    inside `datasets` / `lens` themselves — confirm the two are meant to line up.

    Args:
        data: A dict, or any value to be embedded via `str(data)`.
        show_progress: Show a tqdm bar over the keys of this dict. Recursive calls
            pass False so that each nested dict does not open its own bar.

    Returns:
        A dict mirroring `data` with the added `*_vector` entries, or a plain
        list of Python floats when `data` is not a dict.
    """
    if not isinstance(data, dict):
        # .tolist() yields native floats rather than numpy scalars.
        return sentence_to_vector(str(data))[0].tolist()

    items = data.items()
    if show_progress:
        items = tqdm(items)

    vectors = {}
    for key, value in items:
        vectors[key] = value
        if isinstance(value, dict):
            vectors[key + "_vector"] = convert_to_vectors(value, show_progress=False)
        elif isinstance(value, list):
            vectors[key + "_vector"] = [
                convert_to_vectors(item, show_progress=False) for item in value
            ]
        else:
            vectors[key + "_vector"] = sentence_to_vector(str(value))[0].tolist()
    return vectors

# Source documents to embed; each file is expected to hold one JSON document.
document_files = ["H360.json", "F360.json", "C360.json"]
document_vectors = []

for filename in tqdm(document_files):
    # Read explicitly as UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(filename, encoding="utf-8") as f:
        doc = json.load(f)
    document_vectors.append(convert_to_vectors(doc))

# def convert_to_vectors(data):
#     vectors = {}
#     if isinstance(data, dict):
#         for key, value in data.items():
#             if isinstance(value, dict):
#                 vectors[key+"_vector"] = convert_to_vectors(value)
#             elif isinstance(value, list):
#                 vectors[key+"_vector"] = [convert_to_vectors(item) for item in value]
#             else:
#                 vectors[key] = value
#                 vectors[key+"_vector"] = list(np.array(sentence_to_vector(str(value)))[0])
#     else:
#         vectors = list(np.array(sentence_to_vector(str(data)))[0])
#     return vectors

# document_files = ["H360.json", "F360.json", "C360.json"]
# document_vectors = []
# for filename in tqdm(document_files):
#     with open(filename) as f:
#         doc = json.load(f)
#     document_vectors.append(convert_to_vectors(doc))

  0%|                                                                                                                 | 0/3 [00:00<?, ?it/s]
  0%|                                                                                                                 | 0/7 [00:00<?, ?it/s][A
 14%|███████████████                                                                                          | 1/7 [00:04<00:27,  4.56s/it][A
 29%|██████████████████████████████                                                                           | 2/7 [00:06<00:14,  2.82s/it][A
 43%|█████████████████████████████████████████████                                                            | 3/7 [00:08<00:09,  2.48s/it][A
 57%|████████████████████████████████████████████████████████████                                             | 4/7 [00:09<00:05,  1.89s/it][A
 71%|███████████████████████████████████████████████████████████████████████████                              | 5/7 [00:10<00:03,  1.57s/it

In [6]:
def _text_and_vector_fields(field_names):
    """Build mapping properties pairing each text field with its `<name>_vector` field."""
    properties = {}
    for field_name in field_names:
        properties[field_name] = {"type": "text"}
        properties[field_name + "_vector"] = {
            "type": "knn_vector",
            "dimension": 768,
            "store": True,
        }
    return properties


def _nested(properties):
    """Wrap a properties dict as a `nested` field definition."""
    return {"type": "nested", "properties": properties}


# Fields of each entry under datasets.table.
_table_properties = _text_and_vector_fields([
    "title",
    "description",
    "version",
    "version_date",
    "quality_score",
    "completeness",
    "recency",
    "frequency",
])

# Fields of each lens entry, plus its nested entities.
_lens_properties = _text_and_vector_fields([
    "name",
    "description",
    "stack",
    "published",
    "updated_at",
    "version",
])
_lens_properties["entities"] = _nested(_text_and_vector_fields(["entity_1", "entity_2"]))

# Top-level document fields followed by the two nested sections.
_root_properties = _text_and_vector_fields([
    "title",
    "description",
    "purpose_and_objective",
    "target_audience",
])
_root_properties["datasets"] = _nested({"table": _nested(_table_properties)})
_root_properties["lens"] = _nested(_lens_properties)

# Index body: k-NN enabled with cosine similarity, every text field paired with a
# 768-dimensional stored knn_vector field.
knn_index = {
    "settings": {
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
    },
    "mappings": {"properties": _root_properties},
}


In [22]:
# Create the index using the knn_index settings and mappings.
# NOTE(review): ignore=400 presumably makes the client return the error body instead of
# raising on HTTP 400 (e.g. index already exists); that would also hide a rejected
# mapping — confirm this is intended.
client.indices.create(index="dp_vec_test2", body=knn_index, ignore=400)

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dp_vec_test2'}

In [8]:
# Read the index definition back from the cluster to inspect the mapping it holds.
client.indices.get(index="dp_vec_test2")

{'dp_vec_test2': {'aliases': {},
  'mappings': {'properties': {'datasets': {'type': 'nested',
     'properties': {'table': {'type': 'nested',
       'properties': {'completeness': {'type': 'text'},
        'completeness_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'description': {'type': 'text'},
        'description_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'frequency': {'type': 'text'},
        'frequency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'quality_score': {'type': 'text'},
        'quality_score_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'recency': {'type': 'text'},
        'recency_vector': {'type': 'knn_vector',
         'store': True,
         'dimension': 768},
        'title': {'type': 'text'},
        'title_vector': {'type': 'knn_vector',
         'store': True,
         'dime

In [23]:
# Index every converted document. No id is passed, so re-running this cell without
# recreating the index adds the documents again.
for document_data in document_vectors:
    client.index(index='dp_vec_test2', body=document_data)

# Refresh once so the documents are searchable by the cells below even when the
# notebook is run straight through.
client.indices.refresh(index='dp_vec_test2')

In [24]:
# Embed the free-text question with the same sentence_to_vector encoder used for the
# documents; [0] selects the single sentence's embedding and .tolist() turns it into a
# plain list for the query body.
# NOTE(review): 'prodcut' looks like a typo for 'product'; left unchanged here because
# editing the text changes the embedding and therefore the recorded scores.
query_raw_sentences = ['which data prodcut has information on medicine']
search_vector = sentence_to_vector(query_raw_sentences)[0].tolist()

In [25]:
# k-NN search on the title embedding: request the 30 nearest neighbours of search_vector.
title_knn = {"vector": search_vector, "k": 30}
query = {"size": 30, "query": {"knn": {"title_vector": title_knn}}}

res = client.search(index="dp_vec_test2", body=query)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 0.70633495
Title: Healthcare_360

Score: 0.5300335
Title: Customer_360

Score: 0.5037399
Title: Finance_360



In [26]:
# Full-text match on the target_audience field.
audience_match = {"match": {"target_audience": "engineering"}}
query_term = {"size": 30, "query": audience_match}

res = client.search(index="dp_vec_test2", body=query_term)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")


Score: 0.29379332
Title: Customer_360



In [27]:
# Fuzzy term lookup on description; AUTO lets the engine pick the allowed edit distance.
fuzzy_description = {"value": "integration", "fuzziness": "AUTO"}
query_fuzzy = {"size": 30, "query": {"fuzzy": {"description": fuzzy_description}}}

res = client.search(index="dp_vec_test2", body=query_fuzzy)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 0.23707777
Title: Customer_360

Score: 0.15213588
Title: Finance_360

Score: 0.1482098
Title: Healthcare_360



In [39]:
# Prefix-style search for descriptions containing the phrase "contains information...".
# `description` is an analyzed text field, so a wildcard pattern is compared against
# single terms and a pattern with a space ("contains information*") can never match.
# match_phrase_prefix expresses the same intent on analyzed text: the words must appear
# in order and the last one is treated as a prefix.
query_wildcard = {
    "size": 30,
    "query": {
        "match_phrase_prefix": {
            "description": {
                "query": "contains information"
            }
        }
    }
}
res = client.search(index="dp_vec_test2", body=query_wildcard)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    print("Title:", hit['_source']['title'])
    print()

In [29]:
# Either clause may match (bool/should): a fuzzy term on description, or a fuzzy-tolerant
# full-text match on the same field.
medical_or_healthcare = [
    {"fuzzy": {"description": {"value": "medical", "fuzziness": "AUTO"}}},
    {"match": {"description": {"query": "healthcare", "fuzziness": "AUTO"}}},
]
query_fuzzy_with_match = {
    "size": 30,
    "query": {"bool": {"should": medical_or_healthcare}},
}

res = client.search(index="dp_vec_test2", body=query_fuzzy_with_match)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 1.8584799
Title: Healthcare_360



In [33]:
# Five optional (bool/should) clauses combined into one query.
# NOTE(review): in knn_index, `datasets` and `lens` are mapped as nested objects and
# `quality_score` is only declared under datasets.table, so the last three clauses may
# never match as written — confirm against the indexed documents.
comprehensive_clauses = [
    {"fuzzy": {"title": {"value": "cust", "fuzziness": "AUTO"}}},
    {"match_phrase_prefix": {"description": {"query": "optimize marketing"}}},
    {"match": {"datasets": {"query": "clickstream", "boost": 2}}},
    {"match": {"lens": {"query": "Purchase History", "boost": 1.5}}},
    {"range": {"quality_score": {"lte": "90%"}}},
]
comprehensive_query = {
    "size": 30,
    "query": {"bool": {"should": comprehensive_clauses}},
}

res = client.search(index="dp_vec_test2", body=comprehensive_query)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 0.57952344
Title: Customer_360



In [31]:
# Three optional (bool/should) clauses across description, title and purpose_and_objective.
health_clauses = [
    {"fuzzy": {"description": {"value": "medical", "fuzziness": "AUTO"}}},
    {"match": {"title": {"query": "health", "fuzziness": "AUTO"}}},
    {"match_phrase_prefix": {"purpose_and_objective": {"query": "patient"}}},
]
query_multiple = {
    "size": 30,
    "query": {"bool": {"should": health_clauses}},
}

res = client.search(index="dp_vec_test2", body=query_multiple)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 1.7616134
Title: Healthcare_360



In [32]:
# Six optional (bool/should) clauses aimed at the finance document, mixing fuzzy, prefix,
# term, boosted match and phrase lookups.
finance_clauses = [
    {"fuzzy": {"description": {"value": "finance", "fuzziness": "AUTO"}}},
    {"match_phrase_prefix": {"title": {"query": "financial"}}},
    {"term": {"description": "data"}},
    {"match": {"title": {"query": "finance", "boost": 2}}},
    {"match": {"description": {"query": "finance", "boost": 1.5}}},
    {"match_phrase": {"target_audience": "Finance 360 Data Product"}},
]
expanded_query = {
    "size": 30,
    "query": {"bool": {"should": finance_clauses}},
}

res = client.search(index="dp_vec_test2", body=expanded_query)
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    # end="\n\n" emits the blank separator line after each hit.
    print("Title:", hit['_source']['title'], end="\n\n")

Score: 3.3045292
Title: Finance_360

Score: 0.39752436
Title: Customer_360

Score: 0.24957798
Title: Healthcare_360

