In [1]:
from langchain_community.document_loaders import WikipediaLoader
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai.chat_models import ChatOpenAI
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [2]:
loader = WikipediaLoader(query="nashville, tennessee")
documents = loader.load()

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

split_documents = text_splitter.split_documents(documents)

In [4]:
embedding = OpenAIEmbeddings(model="text-embedding-3-small")

vector_store = InMemoryVectorStore(embedding=embedding)

vector_store.add_documents(split_documents)

['10b25a14-1e7d-491b-9ab1-2e7b33d3caaa',
 'c04cf2f8-aee9-455a-ab6d-394b74c4dd69',
 'f61ed86c-bb09-4ee4-84e9-3b867b0801fa',
 '975cb1e9-880d-4fd2-becc-8b286e290506',
 'ece709cf-985e-43cb-9818-7f18716fe593',
 'e4afb114-a1c8-42d6-afe4-687b37caf03c',
 'f661d0a2-0e97-4f3a-a27a-322f471155cd',
 'ce3aa746-8d67-4536-97e5-0c755187d45f',
 'c60b4573-cc47-4718-a89c-1b26ab269e3a',
 '960e6b70-84a1-4d24-83a8-05adf51c1aac',
 '984c334d-5040-447e-b0da-214404d35128',
 '6770111e-5d8a-4912-922b-fb6859f6410e',
 '3d2d21c1-84c1-4fd3-9e53-8091d1f7a1f7',
 '0cb1a15a-b4b0-4793-ae2c-b1b1c5c002c5',
 'ab3df530-794f-41c6-b0d0-6b58e6e95bd2',
 '73a00e6c-927f-4220-9b89-a03a55f00c13',
 'afb49fe0-d0f3-4dee-b532-5acfb01b7d74',
 'e6f50b0e-7a0d-4a9b-8cb6-73e4c526c482',
 'a355d4c7-e3e6-4876-b50e-4f9772168c9e',
 'cf19271d-637c-42ff-80a3-051600564dde',
 '7649c14b-c0d1-4b03-aa4a-464601fd08a2',
 'b7cee7ab-da17-48d3-885d-a20e6f41ff1c',
 '59c3fa57-a7ea-4669-830d-a145dade038c',
 '85b907d6-7474-4dc6-8bf3-a60251c9fe57',
 '01ca8449-477a-

In [5]:
llm = ChatOpenAI(model="gpt-4o-mini")

system_prompt = (
    "You are a helpful assistant that answers questions based only on the provided context. "
    "If the question cannot be sourced based on the provided context, then politely refuse to answer the question. "
)

query = "What is the capitol of Tennessee?"

messages = [SystemMessage(content=system_prompt), HumanMessage(content=query)]

llm.invoke(messages)

AIMessage(content="I'm sorry, but I can't provide information outside of the provided context.", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 15, 'prompt_tokens': 55, 'total_tokens': 70, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-d6810a45-ab99-4d85-b8cb-1f02b9a2baf3-0', usage_metadata={'input_tokens': 55, 'output_tokens': 15, 'total_tokens': 70, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

In [6]:
results = vector_store.similarity_search(query)

messages.append(HumanMessage(content=f"context: {results}"))

messages

[SystemMessage(content='You are a helpful assistant that answers questions based only on the provided context. If the question cannot be sourced based on the provided context, then politely refuse to answer the question. ', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='What is the capitol of Tennessee?', additional_kwargs={}, response_metadata={}),
 HumanMessage(content='context: [Document(id=\'46f97dfd-5ce6-481e-8e6a-b6d053fe1e29\', metadata={\'title\': \'Tennessee\', \'summary\': \'Tennessee ( , locally ), officially the State of Tennessee, is a landlocked state in the Southeastern region of the United States. It borders Kentucky to the north, Virginia to the northeast, North Carolina to the east, Georgia, Alabama, and Mississippi to the south, Arkansas to the southwest, and Missouri to the northwest. Tennessee is the 36th-largest by area and the 15th-most populous of the 50 states. According to the United States Census Bureau, the state\\\'s estimated populatio

In [7]:
llm.invoke(messages)

AIMessage(content='The capital of Tennessee is Nashville.', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 8, 'prompt_tokens': 2957, 'total_tokens': 2965, 'completion_tokens_details': {'audio_tokens': 0, 'reasoning_tokens': 0, 'accepted_prediction_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_d02d531b47', 'finish_reason': 'stop', 'logprobs': None}, id='run-86ecb7d1-c77a-4b17-aa29-5e8a4bab045e-0', usage_metadata={'input_tokens': 2957, 'output_tokens': 8, 'total_tokens': 2965, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})