### Initialization chapter 7 - QA across documents

In [4]:
import os
from dotenv import load_dotenv

from langchain_community.document_loaders import WikipediaLoader, Docx2txtLoader, PyPDFLoader, TextLoader, DirectoryLoader

from langchain_chroma import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough


In [11]:
# ============================================================================
# ENV SETUP (fail-fast)
# ============================================================================
load_dotenv()

OPENROUTER_API_KEY = os.getenv("OPENROUTER_API_KEY")
OPENROUTER_BASE_URL = os.getenv("OPENROUTER_BASE_URL")  # e.g. https://openrouter.ai/api/v1
if not OPENROUTER_API_KEY:
    raise RuntimeError("Missing OPENROUTER_API_KEY in .env")
if not OPENROUTER_BASE_URL:
    raise RuntimeError("Missing OPENROUTER_BASE_URL in .env")

In [15]:
embeddings_model = OpenAIEmbeddings(
    api_key=OPENROUTER_API_KEY,        # API key for authentication
    base_url=OPENROUTER_BASE_URL,      # OpenRouter endpoint URL
    model="text-embedding-3-small",    # Small, fast embedding model (1536 dimensions)
)

# Initialize LLM
chatbot = ChatOpenAI(
    api_key=OPENROUTER_API_KEY,        # API key for authentication
    base_url=OPENROUTER_BASE_URL,      # OpenRouter endpoint URL
    model="openai/gpt-4o",             # GPT-4o model for high-quality responses
    temperature=0,                   # Low temperature for more deterministic, focused answers
)

### Indexing pipeline for RAG (chapter 7 - QA across documents)

In [16]:
vector_db = Chroma("tourist_info", embeddings_model)

In [17]:
wikipedia_loader = WikipediaLoader(query="Paestum")

In [18]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=0)

In [19]:
wikipedia_chunks = text_splitter.split_documents(
    wikipedia_loader.load())

In [None]:
vector_db.add_documents(wikipedia_chunks)

In [25]:
def split_and_import(loader):
     chunks = text_splitter.split_documents(loader.load())
     vector_db.add_documents(chunks)
     print(f"Ingested chunks created by {loader}")

In [None]:
wikipedia_loader = WikipediaLoader(query="Paestum")
split_and_import(wikipedia_loader)

In [30]:
word_loader = Docx2txtLoader("Paestum/Paestum-Britannica.docx")
split_and_import(word_loader)

Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x0000023627FDF0B0>


In [31]:
pdf_loader = PyPDFLoader("Paestum/PaestumRevisited.pdf")
split_and_import(pdf_loader)

Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x0000023627FE3980>


In [29]:
txt_loader = TextLoader("Paestum/Paestum-Encyclopedia.txt")
split_and_import(txt_loader)

Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x00000236281C3740>


In [24]:
for i, chunk in enumerate(wikipedia_chunks[:5], 1):
    print(f"=== Chunk {i} ===")
    print(chunk.page_content)
    print("\n" + "-"*80 + "\n")

=== Chunk 1 ===
Paestum ( PEST-əm, US also  PEE-stəm, Latin: [ˈpae̯stũː]) was a major ancient Greek city on the coast of the Tyrrhenian Sea, in Magna Graecia. The ruins of Paestum are famous for their three ancient Greek temples in the Doric order dating from about 550 to 450 BCE that are in an excellent state of preservation. The city walls and amphitheatre are largely intact, and the bottom of the walls of many other structures remain, as well as paved roads. The site is open to the public, and there is a

--------------------------------------------------------------------------------

=== Chunk 2 ===
modern national museum within it, which also contains the finds from the associated Greek site of Foce del Sele.

--------------------------------------------------------------------------------

=== Chunk 3 ===
Paestum was established around 600 BCE by settlers from Sybaris, a Greek colony in southern Italy, under the name of Poseidonia (Ancient Greek: Ποσειδωνία). The city thrived as

In [8]:
loader_classes = {
    'docx': Docx2txtLoader,
    'pdf': PyPDFLoader,
    'txt': TextLoader
}

In [33]:
def get_loader(filename):
    _, file_extension = os.path.splitext(filename) #A Extract the file extension
    file_extension = file_extension.lstrip('.') #B Remove the leading dot from the extension

    loader_class = loader_classes.get(
        file_extension) #C Get the loader class from the dictionary

    if loader_class:
        return loader_class(filename) #D Instantiate and return the correct loader
    else:
        raise ValueError(f"No loader available for file extension '{file_extension}'")

In [34]:
folder_path = "CilentoTouristInfo" #A Path to the folder containing the documents

for filename in os.listdir(folder_path): #B iterate over the files in the path
    file_path = os.path.join(folder_path, filename) #C Construct the full path to the file

    if os.path.isfile(file_path): #D Check if it is a file (not a directory)
        try:
            loader = get_loader(file_path) #E Instantiate the correct loader for the file
            print(f"Loader for {filename}: {loader}")
            split_and_import(loader) #F Split and ingest
        except ValueError as e:
            print(e)

Loader for Acciaroli.pdf: <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x0000023627FE3AD0>
Ingested chunks created by <langchain_community.document_loaders.pdf.PyPDFLoader object at 0x0000023627FE3AD0>
Loader for Cape Palinuro.txt: <langchain_community.document_loaders.text.TextLoader object at 0x0000023628814E60>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x0000023628814E60>
Loader for Casalvelino.txt: <langchain_community.document_loaders.text.TextLoader object at 0x0000023626AC6420>
Ingested chunks created by <langchain_community.document_loaders.text.TextLoader object at 0x0000023626AC6420>
Loader for Cilentan coast.docx: <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x0000023626AC4AA0>
Ingested chunks created by <langchain_community.document_loaders.word_document.Docx2txtLoader object at 0x0000023626AC4AA0>
Loader for Cilento Coast Map and Travel Guide.docx: <langchain_community.docum

###  Generation pipeline for RAG (chapter 7 - QA across documents)

In [40]:
query = "Where was Poseidonia and who renamed it to Paestum?"
results = vector_db.similarity_search(query, 4) # four clostest results
print(results)

[Document(id='44140363-5032-4c5b-92df-542fb746bab8', metadata={'source': 'https://en.wikipedia.org/wiki/Paestum', 'title': 'Paestum', 'summary': 'Paestum ( PEST-əm, US also  PEE-stəm, Latin: [ˈpae̯stũː]) was a major ancient Greek city on the coast of the Tyrrhenian Sea, in Magna Graecia. The ruins of Paestum are famous for their three ancient Greek temples in the Doric order dating from about 550 to 450 BCE that are in an excellent state of preservation. The city walls and amphitheatre are largely intact, and the bottom of the walls of many other structures remain, as well as paved roads. The site is open to the public, and there is a modern national museum within it, which also contains the finds from the associated Greek site of Foce del Sele.\nPaestum was established around 600 BCE by settlers from Sybaris, a Greek colony in southern Italy, under the name of Poseidonia (Ancient Greek: Ποσειδωνία). The city thrived as a Greek settlement for about two centuries, witnessing the develop

In [41]:
len(results)

4

In [44]:
rag_prompt_template = """Use the following pieces of context
to answer the question at the end.
If you don't know the answer, just say that you don't know,
don't try to make up an answer.
Use three sentences maximum and keep the
answer as concise as possible.
{context}
Question: {question}
Helpful Answer:"""

rag_prompt = PromptTemplate.from_template(rag_prompt_template)

In [45]:
retriever = vector_db.as_retriever()

In [47]:
question_feeder = RunnablePassthrough()

In [48]:
# set up RAG chain

rag_chain = {"context": retriever,
             "question": question_feeder}|rag_prompt|chatbot

In [49]:
def execute_chain(chain, question):
    answer = chain.invoke(question)
    return answer

In [51]:
question = """Where was Poseidonia and who renamed
it to Paestum. Also tell me the source."""

In [52]:
answer = execute_chain(rag_chain, question)

In [53]:
print(answer.content)

Poseidonia was an ancient Greek city in southern Italy, established by settlers from Sybaris. It was renamed Paestum by the Romans when they took over in 273 BCE. The source of this information is the Wikipedia article on Paestum.


In [54]:
print(answer)

content='Poseidonia was an ancient Greek city in southern Italy, established by settlers from Sybaris. It was renamed Paestum by the Romans when they took over in 273 BCE. The source of this information is the Wikipedia article on Paestum.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 52, 'prompt_tokens': 1980, 'total_tokens': 2032, 'completion_tokens_details': {'accepted_prediction_tokens': None, 'audio_tokens': None, 'reasoning_tokens': 0, 'rejected_prediction_tokens': None, 'image_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0, 'video_tokens': 0}, 'cost': 0.00547, 'is_byok': False, 'cost_details': {'upstream_inference_cost': None, 'upstream_inference_prompt_cost': 0.00495, 'upstream_inference_completions_cost': 0.00052}}, 'model_provider': 'openai', 'model_name': 'openai/gpt-4o', 'system_fingerprint': 'fp_a0e9480a2f', 'id': 'gen-1766241567-UV60d6rbKO5YggHnlsjf', 'finish_reason': 'stop', 'logprobs': None} id

In [55]:
question = """And then, what they do?
Tell me only if you know.
Also tell me the source"""
answer = execute_chain(rag_chain, question)

In [56]:
print(answer.content)

I don't know.
