# ElasticsearchStore Integrations (Recommended)

## Install Packages

In [8]:
%pip install langchain
%pip install openai 
%pip install langchain-community 
%pip install python-dotenv
%pip install elasticsearch
%pip install tiktoken


Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.6 kB)
Collecting regex>=2022.1.18 (from tiktoken)
  Downloading regex-2024.7.24-cp312-cp312-macosx_11_0_arm64.whl.metadata (40 kB)
Downloading tiktoken-0.7.0-cp312-cp312-macosx_11_0_arm64.whl (906 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m906.7/906.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading regex-2024.7.24-cp312-cp312-macosx_11_0_arm64.whl (279 kB)
Installing collected packages: regex, tiktoken
Successfully installed regex-2024.7.24 tiktoken-0.7.0
Note: you may need to restart the kernel to use updated packages.


## Import packages

In [39]:
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import ElasticsearchStore
from langchain.text_splitter import CharacterTextSplitter
from urllib.request import urlopen
import os, json

# Load settings from a .env file (if present) into the process environment.
load_dotenv()

# OpenAI credential, read from the OPENAI_API_KEY2 variable.
openai_api_key = os.environ.get('OPENAI_API_KEY2')

# Elasticsearch connection settings. Elastic Cloud (ES_CLOUD_ID) is not used here.
elastic_user = os.environ.get('ES_USER')
elastic_password = os.environ.get('ES_PASSWORD')
elastic_endpoint = os.environ.get('ES_ENDPOINT')

# Name of the index that the cells below write to and search.
elastic_index_name = 'byte-discuss-elasticsearch-store'


## Add documents

### Let's download the sample dataset and deserialize the documents.

In [28]:
url = "https://raw.githubusercontent.com/ashishtiwari1993/langchain-elasticsearch-RAG/main/data.json"

# Fetch the sample dataset. The context manager closes the HTTP response as
# soon as the body has been read, even if JSON decoding fails.
with urlopen(url) as response:
    workplace_docs = json.loads(response.read())

### Split Documents into Passages

In [17]:
# Text body of every source document, in dataset order.
content = [doc["content"] for doc in workplace_docs]

# Matching metadata record for every document (same order as `content`).
metadata = [
    {
        "name": doc["name"],
        "summary": doc["summary"],
        "rolePermissions": doc["rolePermissions"],
    }
    for doc in workplace_docs
]

# NOTE: the recorded run warns that most chunks (roughly 200-300 characters)
# are longer than chunk_size=50.
text_splitter = CharacterTextSplitter(chunk_size=50, chunk_overlap=0)
docs = text_splitter.create_documents(content, metadatas=metadata)

Created a chunk of size 245, which is longer than the specified 50
Created a chunk of size 288, which is longer than the specified 50
Created a chunk of size 204, which is longer than the specified 50
Created a chunk of size 281, which is longer than the specified 50
Created a chunk of size 249, which is longer than the specified 50
Created a chunk of size 285, which is longer than the specified 50
Created a chunk of size 298, which is longer than the specified 50
Created a chunk of size 270, which is longer than the specified 50
Created a chunk of size 224, which is longer than the specified 50
Created a chunk of size 288, which is longer than the specified 50
Created a chunk of size 260, which is longer than the specified 50
Created a chunk of size 199, which is longer than the specified 50
Created a chunk of size 290, which is longer than the specified 50
Created a chunk of size 251, which is longer than the specified 50
Created a chunk of size 195, which is longer than the specifie

### Index data into Elasticsearch

In [32]:
# Elasticsearch connection settings (local node).
elastic_host = "localhost"
elastic_port = 9200

# Credentials are read from the environment / .env file (ES_USER, ES_PASSWORD)
# rather than hard-coded, so the password never lands in the saved notebook.
elastic_user = os.getenv("ES_USER", "elastic")
elastic_password = os.getenv("ES_PASSWORD")
if not elastic_password:
    raise RuntimeError(
        "ES_PASSWORD is not set; define it in your environment or .env file."
    )

# Create the OpenAI embedding client.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Create the ElasticsearchStore and store the documents in `elastic_index_name`.
es = ElasticsearchStore.from_documents(
    docs,
    es_url=f"http://{elastic_host}:{elastic_port}",
    index_name=elastic_index_name,
    es_user=elastic_user,
    es_password=elastic_password,
    embedding=embeddings,
)

es

<langchain_community.vectorstores.elasticsearch.ElasticsearchStore at 0x168409b20>

## Show Result

In [21]:
def showResults(output):
  """Print how many results were returned, then each result on its own line.

  Args:
    output: A sized iterable of search results (e.g. a list of documents).
  """
  print("Total results: ", len(output))
  # Iterate over the items directly instead of indexing through range(len(...)).
  for item in output:
    print(item)

## Similarity / Vector Search (Approximate KNN Search) - ApproxRetrievalStrategy()

In [22]:
# Search the indexed passages with the store's current retrieval strategy
# and print every hit.
query = "work from home policy"
result = es.similarity_search(query)
showResults(result)

Total results:  4
page_content='The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.
Scope' metadata={'name': 'Work From Home Policy', 'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'rolePermissions': ['demo', 'manager']}
page_content='This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.
Questions and Concerns' metadata={'name': 'Work From Home Policy'

## Hybrid Search (Approximate KNN + Keyword Search) - ApproxRetrievalStrategy()

In [33]:
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Attach to the index that was already populated above instead of calling
# from_documents again: re-indexing would embed every passage a second time
# and store duplicate copies of each document in the same index.
es = ElasticsearchStore(
    index_name=elastic_index_name,
    embedding=embeddings,
    es_url=f"http://{elastic_host}:{elastic_port}",
    es_user=elastic_user,
    es_password=elastic_password,
    strategy=ElasticsearchStore.ApproxRetrievalStrategy(hybrid=True),
)

es.similarity_search("work from home policy")

[Document(metadata={'name': 'Work From Home Policy', 'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'rolePermissions': ['demo', 'manager']}, page_content='This work-from-home policy will be reviewed periodically and updated as necessary, taking into account changes in public health guidance, business needs, and employee feedback.\nQuestions and Concerns'),
 Document(metadata={'name': 'Work From Home Policy', 'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data

## Exact KNN Search (Brute Force) - ExactRetrievalStrategy()

In [34]:
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Reuse the already-populated index: only the retrieval strategy changes here,
# so there is no need to embed and index the same passages again (which would
# also leave duplicate documents in the index).
es = ElasticsearchStore(
    index_name=elastic_index_name,
    embedding=embeddings,
    es_url=f"http://{elastic_host}:{elastic_port}",
    es_user=elastic_user,
    es_password=elastic_password,
    strategy=ElasticsearchStore.ExactRetrievalStrategy(),
)

es.similarity_search("work from home policy")

[Document(metadata={'name': 'Work From Home Policy', 'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'rolePermissions': ['demo', 'manager']}, page_content='The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\nScope'),
 Document(metadata={'name': 'Work From Home Policy', 'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time t

## Index / Search Documents using ELSER - SparseVectorRetrievalStrategy()

In [38]:
# ELSER model used for sparse-vector indexing and search. It must already be
# downloaded and deployed on the cluster: the recorded run failed with a 404
# "Could not find trained model" for this id. Point this at whichever ELSER
# model id is deployed on your cluster.
elser_model_id = ".elser_model_1"

# No client-side embedding model is passed: with the sparse-vector strategy the
# text expansion is performed by the ELSER model inside Elasticsearch.
es = ElasticsearchStore.from_documents(
    docs,
    es_url=f"http://{elastic_host}:{elastic_port}",
    es_user=elastic_user,
    es_password=elastic_password,
    # Separate index so the ELSER mapping does not clash with the dense-vector index.
    index_name=elastic_index_name + "-" + "elser",
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(model_id=elser_model_id),
)

es.similarity_search("work from home policy")

NotFoundError: NotFoundError(404, 'resource_not_found_exception', 'Could not find trained model [.elser_model_1]')