Install packages and import modules

In [None]:
%pip install --upgrade

In [2]:
# Force-reinstall boto3, awscli and botocore at or above the listed minimum
# versions, keep ipython below 8, and install elasticsearch and langchain.
# NOTE(review): the 1.28.57 / 1.29.57 / 1.31.57 floors are presumably the first
# releases that include the "bedrock-runtime" client — confirm.
%pip install --no-build-isolation --force-reinstall "boto3>=1.28.57" "awscli>=1.29.57" "botocore>=1.31.57" "ipython<8" elasticsearch langchain    

Collecting boto3>=1.28.57
  Obtaining dependency information for boto3>=1.28.57 from https://files.pythonhosted.org/packages/0a/88/e68eb04a86e1022676196cbaf130563c241e80e95b38758f60d86cd940d8/boto3-1.28.66-py3-none-any.whl.metadata
  Using cached boto3-1.28.66-py3-none-any.whl.metadata (6.7 kB)
Collecting awscli>=1.29.57
  Obtaining dependency information for awscli>=1.29.57 from https://files.pythonhosted.org/packages/99/f0/cb7c948e6235db6efa5409b4f982820cce7d6678cc5472c18afd801febfb/awscli-1.29.66-py3-none-any.whl.metadata
  Using cached awscli-1.29.66-py3-none-any.whl.metadata (11 kB)
Collecting botocore>=1.31.57
  Obtaining dependency information for botocore>=1.31.57 from https://files.pythonhosted.org/packages/9f/85/66f93685c7006f32617ba74eaa984ce0fbf8b13312e6255887e509e4a036/botocore-1.31.66-py3-none-any.whl.metadata
  Using cached botocore-1.31.66-py3-none-any.whl.metadata (6.1 kB)
Collecting ipython<8
  Downloading ipython-7.34.0-py3-none-any.whl (793 kB)
[2K     [90m━━━━━━━

In [None]:
# Quietly install/upgrade langchain, elasticsearch and boto3 (no version pins).
# NOTE(review): these three packages are also installed by the preceding
# install cell; this cell looks redundant, and its unpinned upgrade of boto3
# may conflict with the botocore/awscli versions chosen there — confirm it is
# still needed.
%pip install -qU langchain elasticsearch boto3

In [3]:
from getpass import getpass
from urllib.request import urlopen
from langchain.vectorstores import ElasticsearchStore
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.bedrock import BedrockEmbeddings
from langchain.llms.bedrock import Bedrock
from langchain.chains import RetrievalQA
import boto3
import json

Initialize the Bedrock client

In [4]:
# Region used when the prompt is answered with an empty line.
default_region = "us-east-1"
# Strip the reply so that a whitespace-only answer also falls back to the
# default instead of being used verbatim as the region name.
AWS_REGION = input(f"AWS Region [default: {default_region}]: ").strip() or default_region

def get_bedrock_client(region):
    """Create a boto3 client for the "bedrock-runtime" service.

    Args:
        region: AWS region name, forwarded to boto3 as ``region_name``.

    Returns:
        The client object returned by ``boto3.client``.
    """
    return boto3.client("bedrock-runtime", region_name=region)

AWS Region [default: us-east-1]:  


Connect to Elasticsearch

In [5]:
# Prompt for the deployment secrets without echoing them: the Cloud ID first,
# then the password for the fixed "elastic" user.
CLOUD_ID = getpass("Elastic deployment Cloud ID: ")
CLOUD_USERNAME = "elastic"
CLOUD_PASSWORD = getpass("Elastic deployment Password: ")

# Store bound to the "workplace_index" index, configured with the
# sparse-vector retrieval strategy.
vector_store = ElasticsearchStore(
    index_name="workplace_index",
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
    es_cloud_id=CLOUD_ID,
    es_user=CLOUD_USERNAME,
    es_password=CLOUD_PASSWORD,
)

Elastic deployment Cloud ID:  ········
Elastic deployment Password:  ········


Download the dataset

In [6]:
# Location of the sample workplace-search dataset (a JSON document).
url = "https://raw.githubusercontent.com/elastic/elasticsearch-labs/main/example-apps/workplace-search/data/data.json"

# Use the response as a context manager so the HTTP connection is closed as
# soon as the payload has been read, even if reading or parsing fails.
with urlopen(url) as response:
    workplace_docs = json.loads(response.read())

Split Documents into Passages

In [7]:
# Fields copied from each source document into its metadata dict.
metadata_fields = ("name", "summary", "rolePermissions")

content = []
metadata = []
for doc in workplace_docs:
    content.append(doc["content"])
    metadata.append({field: doc[field] for field in metadata_fields})

# Split the texts into passages, passing the parallel list of metadata dicts.
# chunk_size is a target rather than a hard cap: the recorded output shows
# chunks of 866 and 1120 characters being created.
text_splitter = CharacterTextSplitter(chunk_size=800, chunk_overlap=400)
docs = text_splitter.create_documents(content, metadatas=metadata)

Created a chunk of size 866, which is longer than the specified 800
Created a chunk of size 1120, which is longer than the specified 800


Index data into Elasticsearch

In [8]:
# Index the passage chunks into the "workplace_index" index, using the same
# connection settings and retrieval strategy as `vector_store`.
# NOTE(review): `from_documents` looks like a classmethod that returns a store,
# so `documents` probably holds an ElasticsearchStore rather than a list of
# documents — confirm before relying on the name.
documents = ElasticsearchStore.from_documents(
    docs,
    index_name="workplace_index",
    strategy=ElasticsearchStore.SparseVectorRetrievalStrategy(),
    es_cloud_id=CLOUD_ID,
    es_user=CLOUD_USERNAME,
    es_password=CLOUD_PASSWORD,
)

Initialize Bedrock LLM

In [12]:
# Model used when the prompt is answered with an empty line.
default_model_id = "cohere.command-text-v14"
# Strip the reply so that a whitespace-only answer also falls back to the
# default instead of being used verbatim as the model id.
AWS_MODEL_ID = input(f"AWS model [default: {default_model_id}]: ").strip() or default_model_id

def create_bedrock_llm(bedrock_client, model_version_id):
    """Build a LangChain ``Bedrock`` LLM wrapper around an existing client.

    Args:
        bedrock_client: Client object passed to ``Bedrock`` as ``client``.
        model_version_id: Model identifier passed to ``Bedrock`` as ``model_id``.

    Returns:
        The ``Bedrock`` instance, created with ``temperature`` set to 0.
    """
    return Bedrock(
        client=bedrock_client,
        model_id=model_version_id,
        model_kwargs={"temperature": 0},
    )

# Create the client for the chosen region, then wrap it in an LLM for the
# chosen model.
bedrock_client = get_bedrock_client(region=AWS_REGION)
llm = create_bedrock_llm(bedrock_client=bedrock_client, model_version_id=AWS_MODEL_ID)

AWS model [default: cohere.command-text-v14]:  anthropic.claude-v2


In [13]:
# Expose the Elasticsearch store as a retriever for the QA chain.
retriever = vector_store.as_retriever()

# return_source_documents=True keeps the retrieved passages in the result
# (read below as "source_documents") so they can be shown with the answer.
qa = RetrievalQA.from_llm(llm=llm, retriever=retriever, return_source_documents=True)

# Sample questions; only the first one is asked below.
questions = [
    "What is the nasa sales team?",
    "What is our work from home policy?",
    "Does the company own my personal project?",
    "What job openings do we have?",
    "How does compensation work?",
]
question = questions[0]
print(f"Question: {question}\n")

# Run the chain; the result is read via its "result" and "source_documents" keys.
ans = qa({"query": question})

# The \033[..m sequences are ANSI escape codes wrapped around the two headings.
print("\033[92m ---- Answer ---- \033[0m")
print(f"{ans['result']}\n")
print("\033[94m ---- Sources ---- \033[0m")
for doc in ans["source_documents"]:
    print(f"Name: {doc.metadata['name']}")
    print(f"Content: {doc.page_content}")
    print("-------\n")

Question: What is the nasa sales team?

[92m ---- Answer ---- [0m
 Based on the context provided, the NASA sales team refers to the sales team for the North America and South America region. This region includes the United States, Canada, Mexico, as well as Central and South America. The NASA sales team is led by two Area Vice-Presidents - Laura Martinez who is the AVP for North America, and Gary Johnson who is the AVP for South America. The NASA sales team is responsible for sales activities in both North and South America.

[94m ---- Sources ---- [0m
Name: Sales Organization Overview
Content: Our sales organization is structured to effectively serve our customers and achieve our business objectives across multiple regions. The organization is divided into the following main regions:

The Americas: This region includes the United States, Canada, Mexico, as well as Central and South America. The North America South America region (NASA) has two Area Vice-Presidents: Laura Martinez 