In [3]:
from langchain.document_loaders import JSONLoader
# Load hotels.json and select the value under its top-level "hotels" key.
# NOTE(review): json_lines=True makes the loader parse the file line by line —
# confirm that hotels.json really stores its JSON on a single line.
loader = JSONLoader(
    file_path="hotels.json",
    jq_schema=".hotels",
    json_lines=True,
    text_content=False,
)
docs = loader.load()

# Show the text content of every loaded document.
for doc in docs:
    print(doc.page_content)

[{"name": "Hotel One", "location": "City A", "rooms": 100, "rating": 4.5, "reviews": [{"reviewer": "John Doe", "comment": "Great stay!", "rating": 5}, {"reviewer": "Jane Smith", "comment": "Average service.", "rating": 3}]}, {"name": "Hotel Two", "location": "City B", "rooms": 150, "rating": 4, "reviews": [{"reviewer": "Alice Brown", "comment": "Very comfortable.", "rating": 4}, {"reviewer": "Bob Johnson", "comment": "Will visit again!", "rating": 5}]}]


# Ingestion Phase
## JSON data is converted to vectors and ingested into the Upstash Vector DB

### When data is inserted into the S3 bucket, this Lambda function needs to be triggered automatically and the data needs to be ingested into the Upstash Vector DB

In [4]:
from langchain.text_splitter import CharacterTextSplitter,RecursiveCharacterTextSplitter
#text_splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=50)

#documents = text_splitter.split_documents(docs)
#print(documents)
# Split the loaded documents into overlapping chunks; the splitter is built via
# from_tiktoken_encoder with the cl100k_base encoding.
chunk_size, chunk_overlap = 300, 30
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base", chunk_size=chunk_size, chunk_overlap=chunk_overlap
)

# Run the splitter and display how many chunks were produced.
splits = text_splitter.split_documents(docs)
len(splits)

1

In [5]:
from openai import OpenAI
from tqdm.auto import tqdm

# OpenAI SDK client; no credentials are passed explicitly here.
oai_client = OpenAI()

# Build one record per split: its position as id, its embedding, and the
# original text kept as metadata.
chunks = []
for i, chunk in tqdm(enumerate(splits), total=len(splits)):
    text = chunk.page_content
    embedding = oai_client.embeddings.create(
        model="text-embedding-3-small",
        input=text,
    ).data[0].embedding
    chunks.append({"id": i, "embedding": embedding, "metadata": {"text": text}})

print(chunks)

100%|██████████| 1/1 [00:00<00:00,  1.67it/s]

[{'id': 0, 'embedding': [-0.05493985489010811, -0.004505396354943514, 0.053818631917238235, -0.02841339446604252, -0.017830155789852142, 0.01227190624922514, -0.0372190847992897, -0.026786254718899727, 0.02247912436723709, -0.006412839982658625, -0.03694561496376991, -0.06016310304403305, -0.013974248431622982, -0.003334608394652605, 0.011772825382649899, 0.049197010695934296, -0.015628734603524208, 0.05688147991895676, 0.016845669597387314, 0.0636088103055954, 0.003167108865454793, 0.006327380891889334, 0.06103820353746414, -0.04750150442123413, -0.008682630024850368, 0.03358194977045059, -0.0067273289896547794, 0.026922989636659622, 0.0312027744948864, -0.028850942850112915, 0.01557403989136219, -0.0009639770141802728, 0.06393697112798691, 0.004833558574318886, -0.06103820353746414, -0.027661355212330818, 0.005346312187612057, 0.04159458354115486, 0.05335373431444168, 0.009564566425979137, -0.008074162527918816, 0.031257469207048416, 0.04927905276417732, -0.02803053706884384, -0.0471




In [6]:
from upstash_vector import Index, Vector

# Connect to the Upstash Vector index. Index.from_env() reads the REST URL and
# token from the UPSTASH_VECTOR_REST_URL / UPSTASH_VECTOR_REST_TOKEN environment
# variables, so no secret is stored in the notebook.
# NOTE(review): the token that used to be hardcoded in this cell must be treated
# as leaked and rotated.
index = Index.from_env()

# Turn every embedded chunk into an Upstash Vector; the numeric chunk id is
# converted to a string for the vector id.
vectors = [
    Vector(id=str(chunk["id"]), vector=chunk["embedding"], metadata=chunk["metadata"])
    for chunk in chunks
]

In [7]:
# Inspect the first prepared Vector (its repr includes the full embedding list).
vectors[0]

Vector(id='0', vector=[-0.05493985489010811, -0.004505396354943514, 0.053818631917238235, -0.02841339446604252, -0.017830155789852142, 0.01227190624922514, -0.0372190847992897, -0.026786254718899727, 0.02247912436723709, -0.006412839982658625, -0.03694561496376991, -0.06016310304403305, -0.013974248431622982, -0.003334608394652605, 0.011772825382649899, 0.049197010695934296, -0.015628734603524208, 0.05688147991895676, 0.016845669597387314, 0.0636088103055954, 0.003167108865454793, 0.006327380891889334, 0.06103820353746414, -0.04750150442123413, -0.008682630024850368, 0.03358194977045059, -0.0067273289896547794, 0.026922989636659622, 0.0312027744948864, -0.028850942850112915, 0.01557403989136219, -0.0009639770141802728, 0.06393697112798691, 0.004833558574318886, -0.06103820353746414, -0.027661355212330818, 0.005346312187612057, 0.04159458354115486, 0.05335373431444168, 0.009564566425979137, -0.008074162527918816, 0.031257469207048416, 0.04927905276417732, -0.02803053706884384, -0.047145

In [8]:
# Send all prepared vectors to the Upstash index in one upsert call; the call's
# return value is displayed as the cell output.
index.upsert(vectors)

'Success'

# Retrieval phase
## A question is asked; it will be the input when we create a Lambda function URL

In [15]:
from upstash_vector import Vector
from langchain_community.vectorstores.upstash import UpstashVectorStore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.multi_query import MultiQueryRetriever,LineListOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import BaseOutputParser
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

import os

# Embedding model for queries — the same model name the ingestion cell used
# when it embedded the stored chunks.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")

# Vector store over the existing Upstash index. The URL and token are read from
# the environment so that no secret is stored in the notebook.
# NOTE(review): the token that used to be hardcoded in this cell must be treated
# as leaked and rotated.
store = UpstashVectorStore(
    embedding=embeddings,
    index_url=os.environ["UPSTASH_VECTOR_REST_URL"],
    index_token=os.environ["UPSTASH_VECTOR_REST_TOKEN"],
)

# NOTE: `OpenAI` here is langchain.llms.OpenAI (imported in this cell); it
# rebinds the name that the embedding cell imported from the `openai` package,
# so that cell needs its own import re-run before `OpenAI()` is called again.
llm = OpenAI()
output_parser = LineListOutputParser()
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    # A separator is placed before the placeholder so the question text is not
    # fused to the preceding word in the rendered prompt.
    template="Generate exact response from the question: {question}",
)

# Prompt -> LLM -> line parser; this chain is handed to the multi-query retriever.
llm_chain = QUERY_PROMPT | llm | output_parser
retriever = MultiQueryRetriever(
    retriever=store.as_retriever(), llm_chain=llm_chain, parser_key="lines"
)

# "stuff" QA chain that answers using the documents returned by the retriever.
qa_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=retriever)

# Example query
query = "what cities does the hotels located in?"
response = qa_chain.run(query)
print(response)
    




 The first hotel, Hotel One, is located in City A and the second hotel, Hotel Two, is located in City B.
