In [1]:
import bs4
import os
from dotenv import load_dotenv

# Environment setup. These statements run before the LangChain imports below,
# so the variables are already in place when those packages are imported.
# Load variables from a local .env file if you use one.
load_dotenv()
# Identify outgoing HTTP requests. The assignment is unconditional, so it
# replaces any USER_AGENT value that load_dotenv() may have just loaded.
# NOTE(review): the "yourwebsite.com" URL looks like a placeholder -- confirm
# and replace it with a real contact page.
os.environ["USER_AGENT"] = (
    "Mozilla/5.0 (compatible; RAG-TutorialBot/1.0; +https://yourwebsite.com/bot)"
)
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning for the HuggingFaceEmbeddings fallback used later -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter
from langchain.schema import Document
from langchain.load import dumps, loads

In [2]:
# Presumably the LangSmith project name under which runs should be traced.
# NOTE(review): this constant is not referenced anywhere else in the visible
# notebook; confirm whether it was meant to be exported (e.g. via os.environ).
LANGSMITH_PROJECT = "rag-virtual-assistant-course"

In [3]:
# Load the source articles from the web.
loader = WebBaseLoader(
    web_paths=[
        "https://www.reuters.com/world/europe/greece-ask-eu-fiscal-leeway-defence-spending-minister-says-2025-04-29/",
        "https://www.ekathimerini.com/economy/1264299/moodys-upgrade-of-the-greek-economy-is-significant-says-govt-spox/",
        "https://www.imf.org/en/News/Articles/2025/04/04/pr2589-greece-imf-executive-board-concludes-2025-article-iv-consultation",
        "https://economy-finance.ec.europa.eu/economic-surveillance-eu-economies/greece/economic-forecast-greece_en",
        "https://www.reuters.com/markets/europe/greece-repay-first-bailout-loans-by-2031-10-years-early-2025-04-11/",
        "https://www.reuters.com/world/europe/bribery-scandals-greeces-public-sector-show-persistence-corruption-2025-03-27",
        "https://www.reuters.com/markets/europe/greek-economy-surges-after-decade-pain-2024-04-18/",
    ],
    bs_kwargs={
        # Optional: you can remove `bs_kwargs` if the websites don't need specific filtering
        "parse_only": bs4.SoupStrainer(
            ["article", "body", "main", "section", "div", "p"]
        )
    },
)
docs = loader.load()

# Split documents into overlapping chunks (1000 characters, 200 overlap).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Fail early with an actionable message: without any chunks there is nothing
# to index, and every retrieval step further down would be meaningless.
if not splits:
    raise ValueError(
        "No text chunks were produced from the configured URLs; the pages may "
        "have blocked the request or returned no parsable content."
    )

# Check for available API keys
openai_key = os.getenv("OPENAI_API_KEY")
# NOTE(review): groq_key is read but not used anywhere in the visible notebook.
groq_key = os.getenv("GROQ_API_KEY")

# Choose the embedding backend: OpenAI when a key is configured, otherwise a
# local sentence-transformers model through Hugging Face.
if openai_key:
    embedder = OpenAIEmbeddings()
else:
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

# Embed every chunk and index it in a Chroma vector store.
# NOTE(review): no collection name or persist directory is given; confirm
# whether re-running this cell in a live kernel appends duplicate chunks.
vectorstore = Chroma.from_documents(documents=splits, embedding=embedder)

# Retriever with default search settings.
retriever = vectorstore.as_retriever()

# Multi-query retrieval

In [4]:
# Multi Query: Different Perspectives
# Prompt that asks the model for five rephrasings of the user's question,
# one per line.
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_queries(raw_output: str) -> list[str]:
    """Turn the model's newline-separated reply into a clean list of queries.

    Each line is stripped of surrounding whitespace, and blank lines are
    dropped so that no empty string is ever passed on as a search query.

    Args:
        raw_output: The text returned by the string output parser.

    Returns:
        The non-empty lines of ``raw_output``, in their original order.
    """
    return [line.strip() for line in raw_output.splitlines() if line.strip()]


# Chain: prompt -> chat model -> plain string -> list of query strings.
generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [5]:
question = "What were the main factors that led to the Greek debt crisis?"

# Ask the model for alternative phrasings of the question.
queries = generate_queries.invoke({"question": question})

# Print the variants as a 1-based numbered list.
# NOTE(review): the saved output shows doubled numbering ("1. 1. ..."), so the
# model apparently numbers its own lines as well.
for number, query in enumerate(queries, start=1):
    print(f"{number}. {query}")

1. 1. What were the primary causes of the Greek debt crisis?
2. 2. Can you outline the key factors that contributed to the Greek debt crisis?
3. 3. What were the underlying reasons behind the Greek debt crisis?
4. 4. What factors played a significant role in triggering the Greek debt crisis?
5. 5. What were the major influences that precipitated the Greek debt crisis?


In [6]:
def get_unique_union(documents: list[list]):
    """Merge several result lists into one list without duplicates.

    Every document is serialized with ``dumps`` so it can be compared and
    hashed as a string; duplicates are dropped, and the survivors are
    rebuilt with ``loads``.

    The first occurrence of each document wins and the original order is
    kept, so the result is the same on every run for the same input.

    Args:
        documents: One list of retrieved documents per query.

    Returns:
        A flat list of the distinct documents, in first-seen order.
    """
    # Flatten the list of lists and serialize each document lazily.
    serialized = (dumps(doc) for sublist in documents for doc in sublist)
    # dict keys are unique and remember insertion order, unlike a set.
    unique_serialized = dict.fromkeys(serialized)
    return [loads(doc) for doc in unique_serialized]


# Multi-query retrieval: generate the query variants, run the retriever once
# per variant (`.map()`), then merge the per-query results into a single
# de-duplicated list.
retrieval_chain = generate_queries | retriever.map() | get_unique_union
# NOTE(review): this rebinds `docs`, which earlier held the pages returned by
# loader.load(); from here on it holds the retrieved documents instead.
docs = retrieval_chain.invoke({"question": question})
# Display how many distinct documents were retrieved across all variants.
len(docs)

  return [loads(doc) for doc in unique_docs]


7

In [7]:
# RAG
# Prompt that answers the question from the retrieved context only.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)


def _join_page_contents(retrieved):
    """Render retrieved documents as plain text for the prompt.

    Args:
        retrieved: Iterable of objects exposing a ``page_content`` string.

    Returns:
        The page contents joined by blank lines (an empty string when
        nothing was retrieved).
    """
    return "\n\n".join(doc.page_content for doc in retrieved)


# The dict runs both branches on the same input: "context" performs the
# multi-query retrieval and flattens the documents to text, so the prompt
# receives the passages themselves rather than the repr of a list of
# Document objects; "question" forwards the user's question unchanged.
final_rag_chain = (
    {
        "context": retrieval_chain | _join_page_contents,
        "question": itemgetter("question"),
    }
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

"The main factors that led to the Greek debt crisis included high levels of public debt, fiscal mismanagement, tax evasion, economic imbalances, and structural weaknesses in the Greek economy. Additionally, the global financial crisis of 2008 exacerbated the situation by exposing the vulnerabilities in Greece's economy and financial system."

# Query Decomposition

In [8]:
# Chat prompt that asks the model to break one question into 2-3 focused
# subquestions, one per line.
rewrite_prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are a helpful assistant that rewrites the user's question "
            "into multiple focused subqueries to improve document retrieval.",
        ),
        (
            "human",
            "Original question: {question}\n\nRewrite it into 2-3 targeted subquestions, "
            "each on a new line, without bullet points.",
        ),
    ]
)

# Decomposition chain: prompt -> LLM -> plain string.
print("\n Preparing query decomposition prompt...")
decomposition_chain = rewrite_prompt | llm | StrOutputParser()

# Run the decomposition on a complex user question.
question = "How did tourism affect the Greek economy in 2023 compared to 2022?"
print(f"\n Original Question:\n{question}")

rewrites = decomposition_chain.invoke({"question": question})
# Keep one subquery per non-blank line, with surrounding whitespace removed.
rewrite_list = [line for line in map(str.strip, rewrites.splitlines()) if line]

print("\n Rewritten Subqueries:")
for number, subquery in enumerate(rewrite_list, start=1):
    print(f"{number}. {subquery}")

# Simulated retrieval: the subqueries above are only printed, and the
# passages below are hard-coded stand-ins for vector-store results.
print("\n Simulating retrieval of relevant documents...")
simulated_passages = (
    "In 2023, tourism contributed 25% more revenue compared to 2022, driven by record-breaking arrivals in July and August.",
    "The Greek economy in 2022 saw a 20% recovery in tourism, following the pandemic lows of 2020 and 2021.",
    "The GDP from tourism rose from 15% of total GDP in 2022 to nearly 18% in 2023, according to ELSTAT.",
)
retrieved_docs = [Document(page_content=text) for text in simulated_passages]

print("\n Retrieved Documents:")
for number, passage in enumerate(retrieved_docs, start=1):
    print(f"Doc {number}: {passage.page_content}")

# Answer generation restricted to the supplied context.
print("\n Generating the final answer using retrieved context...")
answer_prompt = ChatPromptTemplate.from_template(
    "Answer using only the context below. "
    "If insufficient, share any partial info you have **and** explicitly say "
    "\"I don't know\" where details are missing.\n\n"
    "{context}\n\n"
    "Question: {question}\nAnswer:"
)
# Despite its name, GEN_PROMPT is the whole chain (prompt -> LLM -> string
# parser), so invoking it returns the answer text.
GEN_PROMPT = answer_prompt | llm | StrOutputParser()

# Join the passages with blank lines to form the prompt context.
context = "\n\n".join(passage.page_content for passage in retrieved_docs)
final_answer = GEN_PROMPT.invoke({"context": context, "question": question})

# Present the final answer.
print("\n🎯 Final Answer:")
print(final_answer)


 Preparing query decomposition prompt...

 Original Question:
How did tourism affect the Greek economy in 2023 compared to 2022?

 Rewritten Subqueries:
1. How did the tourism industry contribute to the Greek economy in 2022?
2. How did the tourism industry impact the Greek economy in 2023?
3. What were the key differences in the economic effects of tourism between 2022 and 2023 in Greece?

 Simulating retrieval of relevant documents...

 Retrieved Documents:
Doc 1: In 2023, tourism contributed 25% more revenue compared to 2022, driven by record-breaking arrivals in July and August.
Doc 2: The Greek economy in 2022 saw a 20% recovery in tourism, following the pandemic lows of 2020 and 2021.
Doc 3: The GDP from tourism rose from 15% of total GDP in 2022 to nearly 18% in 2023, according to ELSTAT.

 Generating the final answer using retrieved context...

🎯 Final Answer:
Tourism significantly boosted the Greek economy in 2023 compared to 2022, with a 25% increase in revenue and a rise in

# HyDE (Hypothetical Document Embeddings)

In [9]:
# Build a retriever over the existing vector store that returns 3 results.
# NOTE(review): this rebinds the notebook-level `retriever` that was created
# earlier without search_kwargs; cells re-run after this one will pick up
# the k=3 retriever.
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

parser = StrOutputParser()

# HyDE step 1: have the model write a plausible, hypothetical answer.
hyde_prompt = ChatPromptTemplate.from_template(
    "Write a plausible, informed answer to the question below as if you were "
    "confident and had access to expert data.\n\n"
    "Question: {question}\n\nAnswer:"
)
hyde_chain = hyde_prompt | llm | parser

question = "Greek inflation"
print("🔮 Generating hypothetical answer...")
hypothetical_answer = hyde_chain.invoke({"question": question})
print("\n📝 Hypothetical Answer:\n", hypothetical_answer)

# HyDE step 2: search with the hypothetical answer instead of the question.
retrieved_docs = retriever.invoke(hypothetical_answer)

print("\n📚 Retrieved Documents:")
for number, chunk in enumerate(retrieved_docs, start=1):
    # Show at most the first 500 characters of each chunk.
    preview = chunk.page_content.strip()[:500]
    print(f"\nDoc {number}:\n{preview}")

# HyDE step 3: answer the original question from the retrieved chunks only.
GEN_PROMPT = ChatPromptTemplate.from_template(
    "Answer the user's question using ONLY the context below. "
    "If the context is insufficient, say \"I don't know\".\n\n"
    "Context:\n{context}\n\n"
    "Question: {question}\nAnswer:"
)
gen_chain = GEN_PROMPT | llm | parser

context = "\n\n".join(chunk.page_content for chunk in retrieved_docs)
final_answer = gen_chain.invoke({"context": context, "question": question})

print("\n🎯 Final Answer:\n")
print(final_answer)

🔮 Generating hypothetical answer...

📝 Hypothetical Answer:
 Greek inflation has been relatively low in recent years, with the country experiencing an average inflation rate of around 0.5% in 2020. This is largely due to the economic challenges faced by Greece in the aftermath of the financial crisis, which led to a period of deflation and slow economic growth.

However, with the country's economy gradually recovering and the government implementing structural reforms, there is potential for inflation to increase in the coming years. Factors such as rising energy prices, increased consumer demand, and supply chain disruptions could all contribute to higher inflation rates in Greece.

It is important for policymakers to closely monitor inflation trends and take appropriate measures to ensure price stability and sustainable economic growth. The European Central Bank also plays a role in influencing inflation in Greece through its monetary policy decisions. Overall, while Greek inflation 

# Evaluate the outcome with another LLM (LLM as a jury)

In [10]:

parser = StrOutputParser()

# Load jury model: a separate model from `llm`, which produced the answer.
jury_llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# ✨ Strict evaluation prompt with injected inputs
# The question, the retrieved context and the generated answer are all
# injected so the jury can grade the answer against its sources.
prompt = PromptTemplate.from_template(
    """
You are an expert evaluator for a question answering system.

Evaluate the answer based on the following:
User Question: {question}

Retrieved Context:
{context}

Final Answer:
{answer}

Evaluate the answer on a scale of 1 to 5 for each criterion and give a short reason.

Format your response like this:
Faithfulness: x/5 — ...
Relevance: x/5 — ...
Fluency: x/5 — ...
Completeness: x/5 — ...
"""
)

# Build the chain and run.
# The jury model does the grading; routing this through `llm` would have the
# answer evaluated by the same model that wrote it.
evaluation_chain = prompt | jury_llm | parser
# `question`, `context` and `final_answer` come from the preceding cell.
result = evaluation_chain.invoke({
    "question": question,
    "context": context,
    "answer": final_answer,
})

print("📊 LLM Jury Evaluation:\n")
print(result)


📊 LLM Jury Evaluation:

Faithfulness: 5/5 — The answer accurately addresses the user's question about Greek inflation, providing specific data and projections for headline inflation and inflation excluding energy and food.
Relevance: 5/5 — The answer is highly relevant as it directly addresses the user's query about Greek inflation and provides detailed information on the topic.
Fluency: 4/5 — The answer is well-written and coherent, but there are some areas where the information could be presented more clearly for better understanding.
Completeness: 5/5 — The answer is comprehensive, covering various aspects of Greek inflation, including factors influencing it and future projections.
