In [2]:
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from langchain.embeddings import SagemakerEndpointEmbeddings
from langchain.embeddings.sagemaker_endpoint import EmbeddingsContentHandler
from langchain.vectorstores import OpenSearchVectorSearch
from typing import List
import json
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection
from langchain.vectorstores import OpenSearchVectorSearch

# --- Connection settings: fill these in before running this cell -------------
# AWS region where your OpenSearch Service domain is deployed
region = ''

# Host name of your Amazon OpenSearch Service domain; it is used below as the
# 'host' entry of the client's `hosts` list, alongside port 443.
host = ""


# Name of the SageMaker endpoint serving the embedding model
endpoint_name = ""

# Get credentials from the default boto3 session
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when no AWS credentials can be resolved.
    raise RuntimeError(
        "No AWS credentials found; configure a profile, environment variables "
        "or an IAM role before creating the OpenSearch client."
    )

# Sign requests with SigV4 for the 'es' service. A raw
# (access_key, secret_key, token) tuple is not usable as `http_auth` with
# RequestsHttpConnection: requests treats only 2-tuples as basic auth and
# otherwise expects a callable auth object.
auth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'es',
    session_token=credentials.token,
)

# OpenSearch client
aos_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=auth,  # SigV4 signer built above
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

class BulkSagemakerEndpointEmbeddings(SagemakerEndpointEmbeddings):
    """SageMaker embeddings client that sends texts to the endpoint in batches."""

    def embed_documents(self, texts: List[str], chunk_size: int = 1) -> List[List[float]]:
        """Embed ``texts`` by calling the endpoint once per batch.

        Args:
            texts: Texts to embed.
            chunk_size: Number of texts sent per endpoint call; must be >= 1.

        Returns:
            The concatenated results of every batch call, in input order.

        Raises:
            ValueError: If ``chunk_size`` is smaller than 1.
        """
        # A negative step would produce an empty range and silently return no
        # embeddings at all; reject it up front (zero already fails in range()).
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

        results = []
        for start in range(0, len(texts), chunk_size):
            results.extend(self._embedding_func(texts[start:start + chunk_size]))
        return results

class EmbeddingContentHandler(EmbeddingsContentHandler):
    """Serializes embedding requests to JSON and parses the JSON response."""

    content_type = "application/json"
    accepts = "application/json"

    def transform_input(self, prompts: List[str], model_kwargs=None) -> bytes:
        """Build the UTF-8 encoded JSON request body.

        Args:
            prompts: Texts to embed; sent under the ``"inputs"`` key.
            model_kwargs: Optional extra model parameters, merged into the
                request payload next to ``"inputs"``.

        Returns:
            The JSON payload encoded as UTF-8 bytes.
        """
        # Model parameters belong in the request body. Passing them as keyword
        # arguments to json.dumps() would raise TypeError for any key that is
        # not a json.dumps option, and would never reach the model.
        payload = {"inputs": prompts, **(model_kwargs or {})}
        return json.dumps(payload).encode('utf-8')

    def transform_output(self, output) -> List:
        """Extract the embeddings from the endpoint response.

        Args:
            output: Readable response body; ``.read()`` is called on it and the
                result is decoded as UTF-8 JSON.

        Returns:
            The value stored under the ``"vectors"`` key of the response.
        """
        response_json = json.loads(output.read().decode("utf-8"))
        return response_json["vectors"]

# Embedding client backed by the SageMaker endpoint. The `endpoint_name`
# setting defined earlier in this cell is used when it is non-empty; while it
# is still blank, fall back to the previously hard-coded endpoint.
embeddings = BulkSagemakerEndpointEmbeddings(
    endpoint_name=endpoint_name or "huggingface-pytorch-inference-2024-01-28-17-40-16-789",
    region_name=region,
    content_handler=EmbeddingContentHandler()
)

# NOTE(review): not referenced by any cell visible here, which use
# "test-index" instead — confirm whether this name is still needed.
embedding_index_name = 'opensearch_test0'


In [4]:
import langchain
from langchain.document_loaders.pdf import PyPDFLoader
# Load "file.pdf" and split it with the loader's load_and_split(); the
# resulting `pages` list is what gets indexed into OpenSearch further down.
loader = PyPDFLoader("file.pdf")
pages = loader.load_and_split()

In [20]:
import boto3
from requests_aws4auth import AWS4Auth
from opensearchpy import OpenSearch, RequestsHttpConnection

# Your AWS region where your OpenSearch Service is deployed
region = 'me-central-1'

# Host URL of your Amazon OpenSearch Service
host = "vpc-aura-dev-opensrch-f6cr4ow5q3qkvy7vpiv2njpove.me-central-1.es.amazonaws.com"

# Creating AWS4Auth instance using boto3 to retrieve credentials
session = boto3.Session()
credentials = session.get_credentials()
if credentials is None:
    # get_credentials() returns None when no AWS credentials can be resolved;
    # fail here with a clear message instead of an AttributeError below.
    raise RuntimeError(
        "No AWS credentials found; configure a profile, environment variables "
        "or an IAM role before creating the OpenSearch client."
    )

# SigV4 signer for the 'es' service; the session token is forwarded so
# temporary credentials work as well.
awsauth = AWS4Auth(
    credentials.access_key,
    credentials.secret_key,
    region,
    'es',
    session_token=credentials.token
)

# OpenSearch client
aos_client = OpenSearch(
    hosts=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection
)

In [21]:
# Connectivity smoke test: ask the cluster for its health and print the reply.
# Any failure is reported as a message rather than a traceback.
try:
    health = aos_client.cluster.health()
    print("Cluster health response:", health)
except Exception as connection_error:
    print("Error connecting to OpenSearch:", connection_error)

Cluster health response: {'cluster_name': '583504607797:aura-dev-opensrch', 'status': 'green', 'timed_out': False, 'number_of_nodes': 2, 'number_of_data_nodes': 2, 'discovered_master': True, 'discovered_cluster_manager': True, 'active_primary_shards': 28, 'active_shards': 55, 'relocating_shards': 0, 'initializing_shards': 0, 'unassigned_shards': 0, 'delayed_unassigned_shards': 0, 'number_of_pending_tasks': 0, 'number_of_in_flight_fetch': 0, 'task_max_waiting_in_queue_millis': 0, 'active_shards_percent_as_number': 100.0}


In [8]:
# Embed `pages` with `embeddings` and index them into "test-index".
# The connection keyword arguments mirror the ones used for `aos_client`
# (SigV4 auth, TLS on port 443, requests-based connection).
# NOTE(review): `timeout=300` is presumably in seconds — confirm.
docsearch = OpenSearchVectorSearch.from_documents(
    pages,
    embeddings,
    opensearch_url=[{'host': host, 'port': 443}],
    http_auth=awsauth,
    timeout=300,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
    index_name="test-index",
)



In [9]:
# Query the freshly built index; k=1 requests a single result.
docs = docsearch.similarity_search(
    "What is feature selection",
    k=1,
)

In [10]:
# Display the retrieved documents as the cell output.
# NOTE(review): the saved output below contains text marked "Confidential";
# consider clearing outputs before sharing this notebook.
docs

[Document(page_content='Classification : Confidential \\ FAB Technology Confidential  Technology Defaults for Database services on AWS  UAE  \n \n1. Introduction and scope  \nThis document provides the technology defaults for FAB service owners, architect s, developers and \nanyone who needs to select a database for usage by FAB applications on AWS UAE Region . The \ntechnology defaults are to be used while designing new business application , modernizing existing \napplications or migrating business application to AWS (UAE) cloud platform.  \nThe d ocument provides databa se type s and deployment recommendations to achieve  high \navailability, performance,  security,  and optimized cost.  \nFor feedback and questions, contact FAB EA Team FABEATeam@bankfab.com . \n \n2. Objectives  of the technology defaults  \nThe technology defaults were selected to enable FAB to build systems which are highly available, \nsecure,  and cost optimized. Th e following  were considered:  \n• Improved R

In [28]:
import boto3
from opensearchpy import OpenSearch, RequestsHttpConnection
from requests_aws4auth import AWS4Auth
from langchain.vectorstores import OpenSearchVectorSearch

# Reuses `awsauth` and `embeddings` from the setup cells above; both must be
# defined before this cell runs.

# Name of your existing index
index_name = 'test-index'

# Full URL (scheme included) of the OpenSearch domain
host2  = "https://vpc-aura-dev-opensrch-f6cr4ow5q3qkvy7vpiv2njpove.me-central-1.es.amazonaws.com"

# Initialize OpenSearchVectorSearch with the existing index.
# The vector store builds its own client from `opensearch_url` plus the extra
# keyword arguments. It has no `opensearch_client` parameter, so a pre-built
# client passed under that name is ignored and requests go out unsigned.
# Pass the connection settings explicitly instead, as in the indexing cell.
vector_search = OpenSearchVectorSearch(
    opensearch_url=host2,  # Use the host URL directly as a string
    index_name=index_name,
    embedding_function=embeddings,  # Embeddings object used to embed queries
    http_auth=awsauth,
    use_ssl=True,
    verify_certs=True,
    connection_class=RequestsHttpConnection,
)

# Example: run a similarity search against the existing index.
# Failures are printed rather than raised so the cell always completes.
try:
    search_query = "First abu dhabi bank"
    search_results = vector_search.search(search_query, k=1, search_type="similarity")
    print(search_results)
except Exception as e:
    print("Error during search:", e)


[Document(page_content='Classification : Confidential \\ FAB Technology Confidential  Technology Defaults for Database services on AWS  UAE  \n \n1. Introduction and scope  \nThis document provides the technology defaults for FAB service owners, architect s, developers and \nanyone who needs to select a database for usage by FAB applications on AWS UAE Region . The \ntechnology defaults are to be used while designing new business application , modernizing existing \napplications or migrating business application to AWS (UAE) cloud platform.  \nThe d ocument provides databa se type s and deployment recommendations to achieve  high \navailability, performance,  security,  and optimized cost.  \nFor feedback and questions, contact FAB EA Team FABEATeam@bankfab.com . \n \n2. Objectives  of the technology defaults  \nThe technology defaults were selected to enable FAB to build systems which are highly available, \nsecure,  and cost optimized. Th e following  were considered:  \n• Improved R