In [None]:
!pip install opensearch-py
!pip install python-dotenv

In [None]:
import json

# Read the session records from disk; the file is parsed as one JSON value.
with open('sessions_info.json', mode='r', encoding='utf-8') as sessions_file:
    raw_sessions = sessions_file.read()
documents = json.loads(raw_sessions)

# Show the first record as a quick sanity check of the loaded data.
first_document = documents[0]
print(first_document)

In [None]:
import boto3
from botocore.config import Config

# AWS region handed to init_bedrock_client() when the Bedrock Runtime client is built.
# NOTE(review): a later cell reassigns `region` to 'us-east-1'; re-running the
# embedding cell after that one would create the Bedrock client in that region
# instead — confirm whether that is intended.
region = 'us-west-2'
# Model identifier sent as `modelId` on every invoke_model call made by embed_document().
model_id = "amazon.titan-embed-text-v2:0"

def init_bedrock_client(region: str):
    """Create a Bedrock Runtime client for ``region``.

    The client is configured with botocore's "standard" retry mode and a
    maximum of 10 attempts per request.

    Args:
        region: AWS region name used for both the botocore config and the client.

    Returns:
        The object returned by ``boto3.client("bedrock-runtime", ...)``.
    """
    retry_policy = {"max_attempts": 10, "mode": "standard"}
    client_config = Config(region_name=region, retries=retry_policy)
    bedrock_runtime = boto3.client(
        "bedrock-runtime",
        region_name=region,
        config=client_config,
    )
    return bedrock_runtime

def _embed_text(text, client, model):
    """Return the embedding that ``client`` produces for ``text`` using ``model``."""
    response = client.invoke_model(
        modelId=model,
        body=json.dumps({"inputText": text}),
    )
    # The response body is a file-like object; read it once and decode the JSON payload.
    return json.loads(response['body'].read())['embedding']


def embed_document(document, client=None, model=None):
    """Attach title and synopsis embeddings to a session document.

    The document is modified in place: ``title_embedding`` and
    ``synopsis_embedding`` keys are added (or overwritten) and the same dict
    is returned. Nothing is written to the document unless both embedding
    requests succeed.

    Args:
        document: Dict with at least ``'title'`` and ``'synopsis'`` entries.
        client: Object exposing ``invoke_model(modelId=..., body=...)``.
            Defaults to the module-level ``boto3_client``.
        model: Model identifier to invoke. Defaults to the module-level ``model_id``.

    Returns:
        The same ``document`` object, now carrying both embeddings.

    Raises:
        KeyError: If ``document`` lacks ``'title'`` or ``'synopsis'``, or the
            model response has no ``'embedding'`` entry.
    """
    # Resolve the defaults at call time so the notebook may define the globals
    # after this function.
    if client is None:
        client = boto3_client
    if model is None:
        model = model_id

    # Compute both vectors before touching the document so a failed request
    # does not leave it half-updated.
    title_embedding = _embed_text(document['title'], client, model)
    synopsis_embedding = _embed_text(document['synopsis'], client, model)

    document['title_embedding'] = title_embedding
    document['synopsis_embedding'] = synopsis_embedding
    return document

# Build the Bedrock client that embed_document() uses for its model calls.
boto3_client = init_bedrock_client(region)

# Embed every loaded session; embed_document() returns the document it was
# given with the two embedding fields added.
document_with_emb = [embed_document(session) for session in documents]

# Show the first enriched record as a quick check.
print(document_with_emb[0])

In [None]:
# Persist the embedded documents as indented UTF-8 JSON; non-ASCII
# characters are written as-is rather than as \u escapes.
output_file = 'session_info_with_emb.json'

with open(output_file, mode='w', encoding='utf-8') as emb_file:
    json.dump(
        document_with_emb,
        emb_file,
        indent=4,
        ensure_ascii=False,
    )

In [None]:
from opensearchpy import OpenSearch, RequestsHttpConnection
from dotenv import load_dotenv
import os

# Load variables from a .env file (if present) and read the OpenSearch
# connection settings from the environment.
load_dotenv()
host = os.getenv('OPENSEARCH_HOST')
user = os.getenv('OPENSEARCH_USER')
password = os.getenv('OPENSEARCH_PASSWORD')

# The client cell calls host.replace(...); without this check a missing
# variable only surfaces there as an AttributeError on None.
if not host:
    raise RuntimeError(
        "OPENSEARCH_HOST is not set; define it in the environment or in a .env file."
    )

# NOTE(review): this overwrites the 'us-west-2' value assigned to `region`
# for the Bedrock client, and no later cell shown here reads it — confirm
# whether it is still needed.
region = 'us-east-1'
# Name of the index that is created, filled and queried by the cells below.
index_name = 'reinvent_session'

In [None]:
# The host/port form of `hosts` takes a bare hostname, so drop the scheme and
# any trailing slash copied along with the endpoint URL (a leftover "/" would
# end up in front of the port and break the connection URL).
opensearch_hostname = host.replace("https://", "").rstrip("/")

# HTTPS connection on port 443 with certificate verification, authenticated
# with the user/password pair read from the environment.
os_client = OpenSearch(
    hosts = [{'host': opensearch_hostname, 'port': 443}],
    http_auth = (user, password),
    use_ssl = True,
    verify_certs = True,
    connection_class = RequestsHttpConnection
)

# Index definition: k-NN is enabled at the index level, the descriptive
# fields are stored as keyword/text, and both embedding fields share one
# 1024-dimension HNSW (faiss, L2) vector definition.
mapping = {
    "settings": {
        "index": {"knn": True, "knn.algo_param.ef_search": 512},
    },
    "mappings": {
        "properties": {
            # Scalar fields, in the order they appear in the index definition.
            **{
                field_name: {"type": field_type}
                for field_name, field_type in (
                    ("code", "keyword"),
                    ("title", "text"),
                    ("synopsis", "text"),
                    ("topics", "keyword"),
                    ("aws_services", "keyword"),
                    ("target_audience", "keyword"),
                    ("session_format", "keyword"),
                )
            },
            # Each vector field gets its own copy of the same definition.
            **{
                vector_name: {
                    "type": "knn_vector",
                    "dimension": 1024,
                    "method": {
                        "name": "hnsw",
                        "space_type": "l2",
                        "engine": "faiss",
                        "parameters": {"ef_construction": 512, "m": 16},
                    },
                }
                for vector_name in ("title_embedding", "synopsis_embedding")
            },
        }
    },
}


In [None]:
def init_opensearch_index(os_client, index_name, mapping):
    """Create ``index_name`` from ``mapping``, replacing any existing index.

    If an index with that name already exists it is deleted first, so every
    call starts from an empty index.

    Args:
        os_client: Client whose ``indices`` attribute offers ``exists``,
            ``delete`` and ``create``.
        index_name: Name of the index to (re)create.
        mapping: Body passed to the index creation call.
    """
    index_api = os_client.indices
    already_present = index_api.exists(index=index_name)
    if already_present:
        # Drop the old index so the new settings/mappings apply cleanly.
        index_api.delete(index=index_name)
    index_api.create(index=index_name, body=mapping)

# (Re)create the index; an existing index with this name is deleted first.
init_opensearch_index(
    os_client=os_client,
    index_name=index_name,
    mapping=mapping,
)

In [None]:
# Documents per bulk request. Every document carries two embedding vectors,
# so a single request holding all of them can exceed the cluster's HTTP
# payload limit; batching keeps each request bounded.
bulk_batch_size = 100

bulk_data = []
successful = 0
failed = 0
for batch_start in range(0, len(document_with_emb), bulk_batch_size):
    batch = document_with_emb[batch_start:batch_start + bulk_batch_size]

    # A bulk body alternates an action line with the document it applies to.
    # The session code is used as the document id, so indexing the same
    # session again replaces the earlier copy.
    bulk_data = []
    for doc in batch:
        bulk_data.append({"index": {"_index": index_name, "_id": doc['code']}})
        bulk_data.append(doc)

    response = os_client.bulk(body=bulk_data)
    for item in response['items']:
        result = item['index']
        if result['status'] in (200, 201):
            successful += 1
        else:
            failed += 1
            # Surface the reason so a failed document can be diagnosed.
            print(f"Failed to index document {result.get('_id')}: {result.get('error')}")

if document_with_emb:
    print(f"Indexed {successful} documents successfully.")
    print(f"Failed to index {failed} documents.")
else:
    print("No data to index.")


In [None]:
# Documents written by the bulk cell are only searchable after a refresh;
# force one so this check also works when the notebook is run top to bottom.
os_client.indices.refresh(index=index_name)

# Fetch one randomly scored document as a smoke test of the index.
search_response = os_client.search(
    index=index_name,
    body={
        "size": 1,
        "query": {
            "function_score": {
                "random_score": {}
            }
        }
    }
)

hits = search_response['hits']['hits']
if hits:
    sample_source = hits[0]['_source']
    print(sample_source['title'])
    print(sample_source['synopsis'])
else:
    # An empty index would otherwise fail with IndexError on hits[0].
    print(f"No documents found in index '{index_name}'.")