In [31]:
import json
import os
import re
import time

import numpy as np
import sagemaker
import torch
from opensearchpy import OpenSearch, RequestsHttpConnection
from transformers import AutoTokenizer, AutoModel
from transformers import DistilBertTokenizer, DistilBertModel

# Checkpoint id that sentence_to_vector passes to from_pretrained for both the
# tokenizer and the encoder model.
model_name = "sentence-transformers/distilbert-base-nli-stsb-mean-tokens"


def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dim 1, weighted by the attention mask.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (presumably (batch, seq_len, hidden) -- confirm
            against the model in use).
        attention_mask: Mask tensor that is unsqueezed on its last dim and
            expanded to the embeddings' size; positions with 0 contribute
            nothing to the average.

    Returns:
        The masked sum of the embeddings over dim 1 divided by the mask sum
        over dim 1.
    """
    per_token = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(per_token.size()).float()
    weighted_total = (per_token * weights).sum(1)
    # Clamp the denominator so a fully masked row divides by 1e-9 instead of 0.
    denominator = weights.sum(1).clamp(min=1e-9)
    return weighted_total / denominator


# Tokenizer/model pairs already loaded in this kernel session, keyed by checkpoint name.
_ENCODER_CACHE = {}


def _load_encoder(name):
    """Return the (tokenizer, model) pair for `name`, loading it only on first use."""
    if name not in _ENCODER_CACHE:
        _ENCODER_CACHE[name] = (
            DistilBertTokenizer.from_pretrained(name),
            DistilBertModel.from_pretrained(name),
        )
    return _ENCODER_CACHE[name]


def sentence_to_vector(raw_inputs):
    """Embed one sentence or a list of sentences with the `model_name` checkpoint.

    The tokenizer and model are loaded once per checkpoint and reused; loading
    them on every call repeated the same from_pretrained work for each title,
    description and query.

    Args:
        raw_inputs: Text accepted by the tokenizer -- the notebook passes either
            a single string or a list of strings.

    Returns:
        The tensor returned by mean_pooling: one pooled row per input sentence
        (callers index row 0 for a single input).
    """
    tokenizer, model = _load_encoder(model_name)
    # truncation=True cuts inputs at the tokenizer's configured maximum length so
    # a long description does not exceed what the model can take
    # (NOTE(review): relies on the checkpoint defining model_max_length -- confirm).
    inputs_tokens = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed for the forward pass.
    with torch.no_grad():
        outputs = model(**inputs_tokens)

    return mean_pooling(outputs, inputs_tokens['attention_mask'])

In [74]:
# Cluster endpoint, for example: my-test-domain.us-east-1.es.amazonaws.com
# OPENSEARCH_HOST overrides the default domain without editing the notebook.
host = os.environ.get(
    "OPENSEARCH_HOST",
    'search-costplus1-lammv6fxll6v6kj3cguiq5jy3u.us-east-1.es.amazonaws.com',
)
region = 'us-east-1'
service = 'es'

# Credentials come from the environment so they are never stored in the notebook
# or its outputs.
# NOTE(review): a username/password pair was previously committed in this cell;
# treat it as leaked and rotate it.
try:
    auth = (os.environ["OPENSEARCH_USER"], os.environ["OPENSEARCH_PASSWORD"])
except KeyError as missing:
    raise RuntimeError(
        "Set the OPENSEARCH_USER and OPENSEARCH_PASSWORD environment variables "
        "before creating the OpenSearch client."
    ) from missing

# Create the client with SSL/TLS enabled and certificate verification turned on.
client = OpenSearch(
    hosts = [{'host': host, 'port': 443}],
    http_compress = True, # enables gzip compression for request bodies
    http_auth = auth,
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection,
    pool_maxsize = 20
)

In [54]:
# JSON documents to embed; each is read for its 'title' and 'description' keys.
document_files = ["H360_updated.json", "F360_updated.json", "C360_updated.json"]

# One dict per file: the raw title/description plus their embedding tensors.
document_vectors = []

for filename in document_files:
    # Read as UTF-8 explicitly rather than depending on the platform default encoding.
    with open(filename, encoding="utf-8") as f:
        doc = json.load(f)

    title = doc.get('title', None)
    description = doc.get('description', None)

    # A missing field would otherwise reach the tokenizer as None and fail there
    # without saying which file was at fault.
    missing_fields = [
        field for field, value in (("title", title), ("description", description))
        if value is None
    ]
    if missing_fields:
        raise ValueError(f"{filename}: missing field(s): {', '.join(missing_fields)}")

    title_vector = sentence_to_vector(title)
    description_vector = sentence_to_vector(description)

    document_vectors.append({"title": title, "description": description, "title_vector": title_vector, "description_vector": description_vector})

In [55]:
# Field templates shared by the mapping below; each use takes its own copy.
_VECTOR_FIELD = {"type": "knn_vector", "dimension": 768, "store": True}
_TEXT_FIELD = {"type": "text", "store": True}

# Index definition: k-NN enabled with the cosinesimil space type, two stored
# 768-dimensional knn_vector fields and the two stored text fields they belong to.
knn_index = {
    "settings": {
        "index.knn": True,
        "index.knn.space_type": "cosinesimil",
    },
    "mappings": {
        "properties": {
            "title_vector": dict(_VECTOR_FIELD),
            "description_vector": dict(_VECTOR_FIELD),
            "title": dict(_TEXT_FIELD),
            "description": dict(_TEXT_FIELD),
        },
    },
}

In [53]:
# ignore=[404] keeps this cell re-runnable: once the index is gone, a repeated
# delete would otherwise fail with a not-found error and stop a Run All.
# NOTE(review): index "02" is not referenced by any other visible cell -- confirm
# this cleanup is still wanted.
client.indices.delete(index="02", ignore=[404])

{'acknowledged': True}

In [56]:
# Create index "01" only when it does not exist yet. An explicit existence check
# keeps re-runs harmless without ignore=400, which would also hide genuine 400
# responses such as a rejected mapping or settings body.
if client.indices.exists(index="01"):
    print('Index "01" already exists; leaving it unchanged.')
else:
    print(client.indices.create(index="01", body=knn_index))

{'acknowledged': True, 'shards_acknowledged': True, 'index': '01'}

In [57]:
# Display the stored definition of index "01" (aliases, mappings, settings) so the
# knn_vector fields and k-NN settings can be checked against knn_index.
client.indices.get(index="01")

{'01': {'aliases': {},
  'mappings': {'properties': {'description': {'type': 'text', 'store': True},
    'description_vector': {'type': 'knn_vector',
     'store': True,
     'dimension': 768},
    'title': {'type': 'text', 'store': True},
    'title_vector': {'type': 'knn_vector', 'store': True, 'dimension': 768}}},
  'settings': {'index': {'replication': {'type': 'DOCUMENT'},
    'number_of_shards': '5',
    'provided_name': '01',
    'knn.space_type': 'cosinesimil',
    'knn': 'true',
    'creation_date': '1713258553633',
    'number_of_replicas': '1',
    'uuid': 'FpZghV_QT8aMQFhDexookA',
    'version': {'created': '136327827'}}}}}

In [58]:
# Index every embedded document into "01".
# Row 0 of each embedding tensor is converted with .tolist(), the same conversion
# the query vector uses, so the request body holds plain Python floats instead of
# numpy scalars.
# NOTE(review): no document id is supplied, so re-running this cell presumably
# adds the same documents again -- confirm whether stable ids are wanted.
for document_data in document_vectors:
    client.index(index='01', body={
        "title_vector": document_data["title_vector"][0].tolist(),
        "description_vector": document_data["description_vector"][0].tolist(),
        "title": document_data["title"],
        "description": document_data["description"]
    })

In [83]:
# Query text, wrapped in a one-element list.
query_raw_sentences = ['Shopping']
# Embed the query with the same function used for the documents, take row 0 (the
# only sentence) and convert it to a plain list for use in the search body.
search_vector = sentence_to_vector(query_raw_sentences)[0].tolist()

In [84]:
# k-NN query against the title embeddings; the same value (30) is used for the
# number of neighbours requested and for the result page size.
top_k = 30
knn_clause = {"title_vector": {"vector": search_vector, "k": top_k}}
query = {"size": top_k, "query": {"knn": knn_clause}}

res = client.search(index="01", body=query)

# Print score, title and description for every hit, separated by a blank line.
for hit in res['hits']['hits']:
    print("Score:", hit['_score'])
    fields = hit['_source']
    print("Title:", fields['title'])
    print("Description:", fields['description'])
    print()

Score: 0.68041146
Title: Retail_360
Description: Comprehensive retail data integration platform for a unified view of customer shopping behaviors, inventory management, and sales performance.

Score: 0.5940871
Title: Customer_360
Description: Customer data from various sources, enabling a holistic view of customers.

Score: 0.55193615
Title: Healthcare_360
Description: Comprehensive healthcare data integration platform for a unified view of patient records and medical information.

