In [1]:
!which python

/Users/pantost/Desktop/codehub/rag-virtual-assistant-course/.venv/bin/python


In [2]:
import bs4
import os
from dotenv import load_dotenv

# Load from .env if you use one
load_dotenv()
os.environ["USER_AGENT"] = (
    "Mozilla/5.0 (compatible; RAG-TutorialBot/1.0; +https://yourwebsite.com/bot)"
)
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import PromptTemplate
from langchain.prompts import ChatPromptTemplate
from operator import itemgetter

In [3]:
LANGSMITH_PROJECT = "rag-virtual-assistant-course"

In [4]:
# Load documents from the web
loader = WebBaseLoader(
    web_paths=[
        "https://www.reuters.com/world/europe/greece-ask-eu-fiscal-leeway-defence-spending-minister-says-2025-04-29/",
        "https://www.ekathimerini.com/economy/1264299/moodys-upgrade-of-the-greek-economy-is-significant-says-govt-spox/",
        "https://www.imf.org/en/News/Articles/2025/04/04/pr2589-greece-imf-executive-board-concludes-2025-article-iv-consultation",
        "https://economy-finance.ec.europa.eu/economic-surveillance-eu-economies/greece/economic-forecast-greece_en",
        "https://www.reuters.com/markets/europe/greece-repay-first-bailout-loans-by-2031-10-years-early-2025-04-11/",
        "https://www.reuters.com/world/europe/bribery-scandals-greeces-public-sector-show-persistence-corruption-2025-03-27",
        "https://www.reuters.com/markets/europe/greek-economy-surges-after-decade-pain-2024-04-18/",
    ],
    bs_kwargs={
        # Optional: you can remove `bs_kwargs` if the websites don't need specific filtering
        "parse_only": bs4.SoupStrainer(
            ["article", "body", "main", "section", "div", "p"]
        )
    },
)
docs = loader.load()

# Split documents into manageable chunks
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)

# Check for available API keys
openai_key = os.getenv("OPENAI_API_KEY")
groq_key = os.getenv("GROQ_API_KEY")

# Create vectorstore with OpenAI embeddings
if os.getenv("OPENAI_API_KEY"):
    embedder = OpenAIEmbeddings()
else:
    embedder = HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

vectorstore = Chroma.from_documents(documents=splits, embedding=embedder)

retriever = vectorstore.as_retriever()

# Multi query

In [5]:
# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five 
different versions of the given user question to retrieve relevant documents from a vector 
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search. 
Provide these alternative questions separated by newlines. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)


generate_queries = (
    prompt_perspectives
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    | (lambda x: x.split("\n"))
)

In [6]:
question = "What were the main factors that led to the Greek debt crisis?"

question = "Greek growth?"

queries = generate_queries.invoke({"question": question})

for i, q in enumerate(queries, 1):
    print(f"{i}. {q}")

1. 1. What are the factors contributing to economic growth in Greece?
2. 2. How has the Greek economy been performing in terms of growth?
3. 3. Can you provide insights into the growth trajectory of Greece?
4. 4. What is the current state of economic expansion in Greece?
5. 5. How has Greece's growth compared to other European countries?


In [7]:
from langchain.load import dumps, loads


def get_unique_union(documents: list[list]):
    """Unique union of retrieved docs"""
    # Flatten list of lists, and convert each Document to string
    flattened_docs = [dumps(doc) for sublist in documents for doc in sublist]
    # Get unique documents
    unique_docs = list(set(flattened_docs))
    # Return
    return [loads(doc) for doc in unique_docs]


retrieval_chain = generate_queries | retriever.map() | get_unique_union
docs = retrieval_chain.invoke({"question": question})
len(docs)

  return [loads(doc) for doc in unique_docs]


5

In [8]:
# RAG
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

llm = ChatOpenAI(temperature=0)

final_rag_chain = (
    {"context": retrieval_chain, "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)

final_rag_chain.invoke({"question": question})

'Greek growth is expected to remain robust, with real GDP growth averaging 2.1% in 2024 and projected to remain high at 2.1% in 2025. Investment, private consumption, and steady real income growth are key factors driving this growth. Additionally, private consumption is set to continue expanding at a robust pace, and investment is forecasted to accelerate further.'

# Query Decomposition

In [9]:
# 1. Load your LLM (e.g., GPT-4o or GPT-3.5-turbo)
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# 2. Prompt that teaches the LLM to do decomposition
prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are a helpful assistant that rewrites the user's question "
     "into multiple focused subqueries to improve document retrieval."),
    ("human", 
     "Original question: {question}\n\nRewrite it into 2-3 targeted subquestions, "
     "each on a new line, without bullet points.")
])

# 3. Chain
chain = prompt | llm | StrOutputParser()

# 4. Run it
question = "How did tourism affect the Greek economy in 2023 compared to 2022?"
output = chain.invoke({"question": question})


print(output)


What was the impact of tourism on the Greek economy in 2023?  
How did the tourism sector contribute to the Greek economy in 2022?  
What are the differences in the economic effects of tourism in Greece between 2022 and 2023?


In [10]:
from langchain.schema import Document

# -----------------------------
# STEP 1: Load the LLM (GPT‑4o)
# -----------------------------
print("🔧 Loading GPT‑4o model...")
llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# -------------------------------------
# STEP 2: Define a prompt for rewriting
# -------------------------------------
print("\n📜 Preparing query decomposition prompt...")
rewrite_prompt = ChatPromptTemplate.from_messages([
    ("system", 
     "You are a helpful assistant that rewrites the user's question "
     "into multiple focused subqueries to improve document retrieval."),
    ("human", 
     "Original question: {question}\n\nRewrite it into 2-3 targeted subquestions, "
     "each on a new line, without bullet points.")
])
decomposition_chain = rewrite_prompt | llm | StrOutputParser()

# ---------------------------------------------------
# STEP 3: Run decomposition on a complex user question
# ---------------------------------------------------
question = "How did tourism affect the Greek economy in 2023 compared to 2022?"
print(f"\n🤖 Original Question:\n{question}")

rewrites = decomposition_chain.invoke({"question": question})
rewrite_list = [r.strip() for r in rewrites.splitlines() if r.strip()]

print("\n Rewritten Subqueries:")
for i, q in enumerate(rewrite_list, 1):
    print(f"{i}. {q}")

# ---------------------------------------------------
# STEP 4: Simulate retrieved documents (from a vector DB)
# ---------------------------------------------------
print("\n📚 Simulating retrieval of relevant documents...")
retrieved_docs = [
    Document(page_content="In 2023, tourism contributed 25% more revenue compared to 2022, driven by record-breaking arrivals in July and August."),
    Document(page_content="The Greek economy in 2022 saw a 20% recovery in tourism, following the pandemic lows of 2020 and 2021."),
    Document(page_content="The GDP from tourism rose from 15% of total GDP in 2022 to nearly 18% in 2023, according to ELSTAT."),
]

print("\n Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"Doc {i}: {doc.page_content}")

# ---------------------------------------------------
# STEP 5: Feed context into the final answer generation prompt
# ---------------------------------------------------
print("\n✏️ Generating the final answer using retrieved context...")
GEN_PROMPT = ChatPromptTemplate.from_template(
    "Answer using only the context below. "
    "If insufficient, share any partial info you have **and** explicitly say "
    "\"I don't know\" where details are missing.\n\n"
    "{context}\n\n"
    "Question: {question}\nAnswer:"
) | llm | StrOutputParser()

context = "\n\n".join(doc.page_content for doc in retrieved_docs)
final_answer = GEN_PROMPT.invoke({"context": context, "question": question})

# ---------------------------------------------------
# STEP 6: Present the final answer
# ---------------------------------------------------
print("\n🎯 Final Answer:")
print(final_answer)

🔧 Loading GPT‑4o model...

📜 Preparing query decomposition prompt...

🤖 Original Question:
How did tourism affect the Greek economy in 2023 compared to 2022?

 Rewritten Subqueries:
1. What was the impact of tourism on the Greek economy in 2023?
2. How did the tourism sector contribute to the Greek economy in 2022?
3. What are the differences in the economic effects of tourism in Greece between 2022 and 2023?

📚 Simulating retrieval of relevant documents...

 Retrieved Documents:
Doc 1: In 2023, tourism contributed 25% more revenue compared to 2022, driven by record-breaking arrivals in July and August.
Doc 2: The Greek economy in 2022 saw a 20% recovery in tourism, following the pandemic lows of 2020 and 2021.
Doc 3: The GDP from tourism rose from 15% of total GDP in 2022 to nearly 18% in 2023, according to ELSTAT.

✏️ Generating the final answer using retrieved context...

🎯 Final Answer:
Tourism positively affected the Greek economy in 2023 compared to 2022. Revenue from tourism inc

# HYDE

In [11]:
# Your existing vectorstore and retriever are already running
# from main app, so we'll just use the retriever here
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

parser = StrOutputParser()

# HYDE step: create a plausible hypothetical answer
hyde_prompt = ChatPromptTemplate.from_template(
    "Write a plausible, informed answer to the question below as if you were confident and had access to expert data.\n\nQuestion: {question}\n\nAnswer:"
)
hyde_chain = hyde_prompt | llm | parser

question = "Greek inflation"
print("🔮 Generating hypothetical answer...")
hypothetical_answer = hyde_chain.invoke({"question": question})
print("\n📝 Hypothetical Answer:\n", hypothetical_answer)

# Use that as the query for the vectorstore
retrieved_docs = retriever.invoke(hypothetical_answer)

print("\n📚 Retrieved Documents:")
for i, doc in enumerate(retrieved_docs, 1):
    print(f"\nDoc {i}:\n{doc.page_content.strip()[:500]}")  # Truncated for clarity

# Final generation using retrieved context
GEN_PROMPT = ChatPromptTemplate.from_template(
    "Answer the user's question using ONLY the context below. "
    "If the context is insufficient, say \"I don't know\".\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"
)
gen_chain = GEN_PROMPT | llm | parser

context = "\n\n".join(doc.page_content for doc in retrieved_docs)
final_answer = gen_chain.invoke({"context": context, "question": question})

print("\n🎯 Final Answer:\n")
print(final_answer)

🔮 Generating hypothetical answer...

📝 Hypothetical Answer:
 As of the latest data available, Greek inflation has been influenced by a combination of domestic and international factors. In recent years, Greece, like many other European countries, has experienced fluctuations in inflation rates due to the COVID-19 pandemic's economic impact, supply chain disruptions, and energy price volatility.

In 2022, Greece saw a significant rise in inflation, driven primarily by increased energy costs and supply chain issues. The war in Ukraine further exacerbated these pressures, leading to higher prices for natural gas and oil, which are critical imports for Greece. This situation was compounded by the global supply chain disruptions that affected the availability and cost of goods.

The Greek government and the European Central Bank (ECB) have been actively monitoring the situation. The ECB's monetary policy, including interest rate adjustments, aims to stabilize inflation across the Eurozone, 

# Evaluate the outcome with another llm (LLM as a jury)

In [12]:

parser = StrOutputParser()

# Load jury model
jury_llm = ChatOpenAI(model_name="gpt-4o", temperature=0)

# ✨ Strict evaluation prompt with injected inputs
prompt = PromptTemplate.from_template(
    """
You are an expert evaluator for a question answering system.

Evaluate the answer based on the following:
User Question: {question}

Retrieved Context:
{context}

Final Answer:
{answer}

Evaluate the answer on a scale of 1 to 5 for each criterion and give a short reason.

Format your response like this:
Faithfulness: x/5 — ...
Relevance: x/5 — ...
Fluency: x/5 — ...
Completeness: x/5 — ...
"""
)

# Build the chain and run
evaluation_chain = prompt | llm | parser
result = evaluation_chain.invoke({
    "question": question,
    "context": context,
    "answer": final_answer,
})

print("📊 LLM Jury Evaluation:\n")
print(result)


📊 LLM Jury Evaluation:

Faithfulness: 5/5 — The answer accurately reflects the information provided in the retrieved context, including specific figures and trends related to Greek inflation.

Relevance: 5/5 — The answer directly addresses the user's question about Greek inflation, providing detailed information on current and projected inflation rates.

Fluency: 5/5 — The answer is well-written and easy to understand, with clear and concise language that effectively communicates the information.

Completeness: 5/5 — The answer includes all relevant details from the context, such as the comparison to the euro area average, projections for future years, and factors influencing inflation trends.
