In [None]:
# Install the requirements from the txt file so users can easily set up the needed packages.
#pip install -r requirements
#conda install -c conda-forge pyarrow
#conda install -c conda-forge datasets
#conda install -c conda-forge langchain
#pip install unstructured
#pip install openai
#pip install python-dotenv
#pip install langchain-community
#pip install langchain-openai
#pip install chromadb

## 1. Setup

In [5]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Load variables from .env file (override=True lets the file win over
# values already present in the process environment)
load_dotenv(dotenv_path="news.env.txt", override=True)

# Access the variables
openai_key = os.getenv("OPENAI_API_KEY")
langsmith_key = os.getenv("LANGSMITH_API_KEY")

# Report key status BEFORE building the clients: if client construction
# raises because a key is missing, the diagnostic lines have already been shown.
print(f"OpenAI key loaded: {'Yes' if openai_key else 'No'}")
print(f"LangSmith key loaded: {'Yes' if langsmith_key else 'No'}")

# Pass the API key explicitly to the LangChain classes
llm = ChatOpenAI(temperature=0, openai_api_key=openai_key)
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)

OpenAI key loaded: Yes
LangSmith key loaded: Yes


In [6]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from rich import print
from rich.console import Console

# Rebuild the chat model with the key loaded in Setup so this client is
# configured the same way as the one created there.
llm = ChatOpenAI(temperature=0.0, openai_api_key=openai_key)
# Rich console with a fixed width of 100 columns.
console = Console(width=100)

USER_AGENT environment variable not set, consider setting it to identify your requests.


## 2A. Load Dataset (AG News)

In [14]:
from datasets import load_dataset
from langchain.docstore.document import Document

# Load the AG News dataset. Use the 'train' split for knowledge base.
ag_news_dataset = load_dataset("ag_news", split="train")

# Convert the dataset entries into LangChain Document objects.
# Each entry has a 'text' and 'label' column; the label is kept as metadata.
documents = [
    Document(page_content=entry["text"], metadata={"label": entry["label"]})
    for entry in ag_news_dataset
]

# Print the first document to verify the format
print(documents[0])

# Split the documents into overlapping chunks before embedding them
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed through Chroma. Reuse the embeddings client built in Setup (it was
# given the API key explicitly) instead of constructing a second one.
# NOTE(review): every chunk of the whole 'train' split is embedded here, so
# this cell may be slow and costly to re-run.
embedding = embeddings
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)

  from .autonotebook import tqdm as notebook_tqdm


## 2B. Load Dataset (BBC News)

In [15]:
from datasets import load_dataset
from langchain.schema import Document
from datetime import datetime

# Load latest BBC dataset from Hugging Face
bbc = load_dataset("RealTimeData/bbc_latest")

bbc_docs = []
for item in bbc["train"]:
    text = item.get("text", "")
    # Skip rows without body text: an empty page_content gives the
    # retriever nothing to match on.
    if not (text and text.strip()):
        continue

    date_str = item.get("date", "")

    # Try to parse the date if available. fromisoformat raises ValueError
    # for a malformed/empty string and TypeError for a non-string value;
    # anything else is unexpected and is allowed to surface.
    try:
        published = datetime.fromisoformat(date_str)
    except (TypeError, ValueError):
        published = datetime.now()  # fallback to "now"

    metadata = {
        "source": "BBC",
        # `or ""` also covers a title that is present but None
        "title": item.get("title") or "",
        "published": str(published),
        "recency_score": 1.0  # give BBC docs a boost
    }
    bbc_docs.append(Document(page_content=text, metadata=metadata))

In [16]:
# Add the BBC documents to the Chroma store that already holds the AG News
# chunks, then build a retriever over it that requests k=5 results per query.
vectorstore.add_documents(documents=bbc_docs)
base_retriever = vectorstore.as_retriever(search_kwargs=dict(k=5))

def custom_retriever(query):
    """Retrieve documents for ``query`` and move recency-boosted ones to the front.

    Documents are fetched from the module-level ``base_retriever`` and then
    ordered by their ``recency_score`` metadata (missing score counts as 0),
    highest first.

    Args:
        query: Search string passed to the base retriever.

    Returns:
        A new list with the retrieved documents, highest ``recency_score``
        first; documents with equal scores keep the retriever's order.
    """
    # `.invoke` is the call style the rest of the notebook uses for
    # retrievers; `get_relevant_documents` is the deprecated spelling.
    results = base_retriever.invoke(query)
    # sorted() is stable, so ties preserve the original similarity ranking.
    # Negating the score puts larger boosts first.
    return sorted(results, key=lambda doc: -doc.metadata.get("recency_score", 0))

# Chaining Logic

## Step 1: Query Understanding and Rewriting

In [7]:
# Prompt asking the model to rewrite the user's question as three numbered
# search queries; `user_input` is its only placeholder.
query_prompt = PromptTemplate(
    input_variables=["user_input"],
    template="""You are an expert at rephrasing user questions into concise, search-friendly queries.
    Given the user's question, generate 3 search queries that will help find the answer.
    The queries should be in a numbered list.
    User Question: {user_input}
    Queries:""",
)
# Chain wiring the query prompt to the shared LLM, with StrOutputParser as
# the output parser.
query_chain = LLMChain(
    prompt=query_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

  query_chain = LLMChain(llm=llm, prompt=query_prompt, output_parser=StrOutputParser())


## Step 2: Draft Answer

In [8]:
# Prompt that drafts an answer restricted to the supplied context.
# Placeholders: `retrieved_docs` (context text) and `user_input` (question).
draft_prompt = PromptTemplate(
    input_variables=["retrieved_docs", "user_input"],
    template="""You are a professional assistant. Using only the following context, write a precise and well-structured answer to the user's question.
    Do not add any information that is not explicitly found in the context.
    Context: {retrieved_docs}
    User Question: {user_input}
    Draft Answer:""",
)
# Chain wiring the draft prompt to the shared LLM, with StrOutputParser as
# the output parser.
draft_chain = LLMChain(
    prompt=draft_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

## Step 3: Fact-Check

In [9]:
# Prompt that asks the model to audit a draft answer against the context:
# unsupported claims, vague/biased wording, and suggested corrections.
# Placeholders: `retrieved_docs` and `draft_answer`.
factcheck_prompt = PromptTemplate(
    template="""You are a fact-checking assistant.
Review the draft answer and compare it to the context.

Tasks:
1. Identify any claims NOT supported by the context.
2. Flag vague or biased language.
3. Suggest corrections.

Context:
{retrieved_docs}

Draft Answer:
{draft_answer}

Fact-Check Report:
""",
    input_variables=["retrieved_docs", "draft_answer"],
)
# Chain wiring the fact-check prompt to the shared LLM (no explicit output
# parser is passed for this chain).
factcheck_chain = LLMChain(prompt=factcheck_prompt, llm=llm)

## Step 4: Citation Acquisition

In [10]:
# --- New citation-aware final prompt ---
# Placeholders filled at run time: `context_list` and `user_input`.
# The literal text "{doc_id}" shown to the model is written as "{{doc_id}}"
# so the template engine does not treat it as a third, unfilled variable.
citation_prompt = PromptTemplate(
    template="""
You are a professional assistant. 
Using only the retrieved documents, write a clear, precise answer to the user's question. 

Each statement you make MUST be supported by a citation. 
Cite sources inline in the form [Source: {{doc_id}}] where doc_id is provided in the context list. 

Context Documents (ID → snippet):
{context_list}

User Question: {user_input}

Final Answer (with inline citations):
""",
    input_variables=["context_list", "user_input"],
)

# Chain wiring the citation prompt to the shared LLM, with StrOutputParser
# as the output parser.
citation_chain = LLMChain(llm=llm, prompt=citation_prompt, output_parser=StrOutputParser())

## Step 5: Final Answer

In [11]:

# Prompt for the last pipeline step: keep the draft as-is when the
# fact-check notes confirm it, otherwise return the draft followed by the
# notes that flag missing or unsupported information.
# Placeholders: `draft_answer`, `fact_report`, `retrieved_docs`, `user_input`.
final_prompt = PromptTemplate(
    input_variables=["draft_answer", "fact_report", "retrieved_docs", "user_input"],
    template="""
You are a careful assistant.

Your job is to decide whether to keep the draft answer or reject it based on the fact-check notes.

Rules:
- If the fact-check notes confirm the draft answer is fully accurate, return the draft answer exactly as written (do not rephrase).
- If the fact-check notes indicate any missing or unsupported information, respond with the draft answer exactly as written and then the fact-check notes but only those that indicate any missing or unsupported information.

Draft Answer:
{draft_answer}

Fact-check Notes:
{fact_report}

Retrieved Docs (with Source IDs):
{retrieved_docs}

User Question: {user_input}

Final Answer (with inline citations):
""",
)
# Chain wiring the final prompt to the shared LLM, with StrOutputParser as
# the output parser.
final_chain = LLMChain(
    prompt=final_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

## Finished Pipeline Function

In [12]:
def _first_search_query(llm_text: str, fallback: str) -> str:
    """Pick one clean search query out of the query chain's list output.

    Returns the first numbered/bulleted item with its list marker and any
    wrapping quotes removed; if no item is found, the first non-empty line;
    if the text is empty, ``fallback``.
    """
    import re  # local import: the notebook's import cells do not bring in `re`

    list_marker = re.compile(r"^(?:\d+[.)]|[-*])\s+")
    lines = [line.strip() for line in llm_text.splitlines() if line.strip()]
    for line in lines:
        if list_marker.match(line):
            cleaned = list_marker.sub("", line).strip().strip("\"'")
            return cleaned or fallback
    return lines[0] if lines else fallback


def get_final_answer(user_input: str):
    """Run the full pipeline for one question and return the final answer text.

    Steps: rewrite the question into search queries, retrieve documents for
    the first query, draft an answer from the retrieved text, fact-check the
    draft against the same text, then let the final chain decide what to
    return.

    Args:
        user_input: The user's question.

    Returns:
        The ``"text"`` entry of the final chain's result.
    """
    # Step 1: Query Understanding
    queries = query_chain.invoke({"user_input": user_input})
    # Use the first generated query without its "1. " prefix; fall back to
    # the raw question if the model returned nothing usable.
    search_query = _first_search_query(queries["text"], fallback=user_input)

    # Step 2: Document Retrieval
    docs = base_retriever.invoke(search_query)
    retrieved_texts = "\n".join(doc.page_content for doc in docs)

    # Numbered source IDs plus a 200-character, single-line snippet per doc
    context_text = "\n".join(
        f"Source-{position}: " + doc.page_content[:200].replace("\n", " ") + "..."
        for position, doc in enumerate(docs, start=1)
    )

    # Step 3: Answer Drafting
    # The chain returns a dict (inputs plus "text"); only the generated text
    # must be forwarded, otherwise the whole dict is rendered into the next
    # prompt.
    draft = draft_chain.invoke(
        {"retrieved_docs": retrieved_texts, "user_input": user_input}
    )["text"]

    # Step 4: Fact-Check
    fact_report = factcheck_chain.invoke(
        {"retrieved_docs": retrieved_texts, "draft_answer": draft}
    )["text"]

    # Step 5: Final Answer
    final = final_chain.invoke(
        {
            "draft_answer": draft,
            "fact_report": fact_report,
            "retrieved_docs": context_text,
            "user_input": user_input,
        }
    )
    return final["text"]

## Evaluation

In [None]:
import pandas as pd

# Make DataFrame display easier to read
pd.set_option("display.width", 1500)
pd.set_option("display.max_colwidth", None)

sample_questions = [
    "Who is the CEO of Microsoft?",
    "What are the latest trends in renewable energy?",
    "What happened in the 2008 financial crisis?",
    "Who won the 2016 U.S. presidential election?",
    "What is quantum computing?"
]

results = []

for q in sample_questions:
    pipeline_ans = get_final_answer(q)
    baseline_ans = llm.invoke(q).content  # single-shot baseline

    results.append({
        "question": q,
        "pipeline_answer": pipeline_ans,
        "baseline_answer": baseline_ans,
        # Match the "Source-<n>" IDs the pipeline hands to the model no
        # matter which bracket style wraps them: "(Source-1)", "[Source-1]", ...
        "has_citation": "Source-" in pipeline_ans
    })

# One row per question: pipeline answer, baseline answer, citation flag
df = pd.DataFrame(results)
df

Unnamed: 0,question,pipeline_answer,baseline_answer,has_citation,correct?
0,Who is the CEO of Microsoft?,The CEO of Microsoft is Steve Ballmer.\n\nFact-check Notes:\n1. The draft answer correctly identifies Steve Ballmer as the CEO of Microsoft based on the context provided.,"As of September 2021, the CEO of Microsoft is Satya Nadella.",False,
1,What are the latest trends in renewable energy?,"The latest trends in renewable energy include an increasing focus on alternative sources such as wind, solar, and sea energy. Rising fuel prices and concerns about global warming have accelerated the shift towards renewable energy sources. The wind industry is making efforts to overcome public resistance to wind turbines, with the European wind energy industry aiming to eventually supply all of the continent's electricity. Additionally, provinces like Ontario are unveiling new projects to create electricity from renewable resources. (Source-1, Source-2, Source-3, Source-4, Source-5)\n\nFact-check Notes:\n1. The draft answer accurately reflects the context provided.","1. Offshore wind power: Offshore wind farms are becoming increasingly popular due to their higher wind speeds and larger potential for energy generation compared to onshore wind farms.\n\n2. Solar energy storage: Advances in battery technology are making it easier to store excess solar energy for use during times when the sun is not shining, increasing the reliability and efficiency of solar power systems.\n\n3. Floating solar panels: Installing solar panels on bodies of water, such as lakes or reservoirs, is a growing trend that maximizes land use and reduces water evaporation while generating clean energy.\n\n4. Green hydrogen production: Green hydrogen, produced using renewable energy sources, is gaining traction as a clean alternative to traditional fossil fuels for transportation and industrial applications.\n\n5. Community solar projects: Community solar projects allow multiple households or businesses to share the benefits of a solar energy system, making renewable energy more accessible and affordable for a wider range of people.\n\n6. 
Microgrids: Microgrids are small-scale, localized energy systems that can operate independently or in conjunction with the main power grid, providing increased resilience and reliability during power outages or emergencies.\n\n7. Energy efficiency retrofits: Retrofitting existing buildings and infrastructure with energy-efficient technologies and renewable energy systems is a growing trend to reduce energy consumption and carbon emissions in the built environment.",True,
2,What happened in the 2008 financial crisis?,"The 2008 financial crisis was triggered by a combination of factors, including rising interest rates, a reliance on credit, risky investments by UK banks, a run on Ukraine's banks due to political worries, and the bursting of the housing bubble. Additionally, the long-term economic health of the United States was threatened by a significant amount of government debts and liabilities coming due as baby boomers began to retire. \n\nFact-check Notes:\n1. The draft answer correctly mentions the factors that contributed to the 2008 financial crisis, such as rising interest rates, a reliance on credit, risky investments by UK banks, a run on Ukraine's banks, and the bursting of the housing bubble.","The 2008 financial crisis, also known as the global financial crisis, was a severe worldwide economic crisis that began in 2007 and continued into 2008. It was triggered by the collapse of the housing market in the United States, which led to a domino effect that spread throughout the global financial system.\n\nSome key events and factors that contributed to the crisis include:\n\n1. Subprime mortgage crisis: Banks and financial institutions in the U.S. had been issuing risky subprime mortgages to borrowers who were unable to repay them. When the housing bubble burst in 2007, many of these borrowers defaulted on their loans, leading to a wave of foreclosures and a sharp decline in housing prices.\n\n2. Financial institutions collapse: The crisis led to the collapse of several major financial institutions, including Lehman Brothers, Bear Stearns, and AIG. This caused panic in the financial markets and led to a credit crunch, as banks became reluctant to lend to each other.\n\n3. Stock market crash: The crisis also triggered a sharp decline in stock prices, with major stock indexes around the world plummeting. This led to a loss of trillions of dollars in market value and wiped out many investors' savings.\n\n4. 
Government intervention: In response to the crisis, governments around the world implemented various measures to stabilize the financial system, including bailouts of banks and other financial institutions, stimulus packages to boost economic growth, and regulatory reforms to prevent future crises.\n\nThe 2008 financial crisis had far-reaching consequences, including a global recession, high levels of unemployment, and a loss of confidence in the financial system. It also exposed weaknesses in the regulatory framework and led to calls for reform to prevent similar crises in the future.",False,
3,Who won the 2016 U.S. presidential election?,"George W. Bush won the 2004 U.S. presidential election and was re-elected as the 43rd President of the United States. (Source-3)\n\nFact-check Notes:\n1. The draft answer incorrectly states that George W. Bush won the 2016 U.S. presidential election. The context provided clearly states that he won the 2004 election, not 2016.",Donald Trump won the 2016 U.S. presidential election.,True,
4,What is quantum computing?,"Quantum computing is a cutting-edge technology that involves the use of quantum bits, or qubits, to perform computations. Recently, scientists at the University of Bonn in Germany have made significant progress in this field by building the fundamental memory component of a quantum computer, known as a ""register"", using caesium atoms trapped inside a laser beam. This development could pave the way for a more reliable method of building a working quantum computer compared to other techniques. Additionally, quantum computing has the potential to revolutionize data security, with advancements such as quantum cryptography systems that offer increased speed and range, making them commercially viable for applications in various industries. (Source-1, Source-2, Source-4)\n\nFact-check Notes:\n1. The draft answer correctly mentions the development of the fundamental memory component of a quantum computer using caesium atoms by scientists at the University of Bonn in Germany. (Source-1, Source-2)\n2. The draft answer correctly highlights the potential of quantum computing to revolutionize data security with advancements like quantum cryptography systems. (Source-4)","Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data. Unlike classical computers, which use bits to represent information as either 0 or 1, quantum computers use quantum bits, or qubits, which can represent both 0 and 1 simultaneously due to superposition. This allows quantum computers to perform certain calculations much faster than classical computers, making them potentially powerful tools for solving complex problems in fields such as cryptography, optimization, and material science.",True,


In [None]:
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

# Evaluator prompt: asks the model to grade the pipeline answer as CORRECT,
# INCORRECT or INSUFFICIENT, using the retrieved documents as ground truth.
# Placeholders: `question`, `pipeline_answer`, `baseline_answer`, `retrieved_docs`.
eval_prompt = PromptTemplate(
    input_variables=["question", "pipeline_answer", "baseline_answer", "retrieved_docs"],
    template="""
You are an impartial evaluator. Compare the pipeline answer with the baseline answer. 
Use the retrieved documents as the ONLY ground truth. Do not trust tone or length.

Rules:
1. Mark an answer CORRECT only if:
   - It matches the retrieved documents AND
   - It includes inline citations (e.g. [Source-1]) that are consistent with the retrieved docs.
2. If the pipeline answer and baseline disagree, mark INCORRECT unless the pipeline clearly 
   cites retrieved docs that directly support its claim.
3. If the pipeline answer contains outdated or factually wrong info, mark INCORRECT.
4. If the pipeline answer copies fact-check notes instead of giving a clean factual answer, 
   mark INCORRECT.
5. If there is not enough verified evidence in the retrieved docs, mark as "INSUFFICIENT".

Format:
Return only one of the following:
- CORRECT (factually correct & cited properly)
- INCORRECT (factually wrong, outdated, or poorly cited)
- INSUFFICIENT (not enough info to judge)

Question: {question}
Pipeline Answer: {pipeline_answer}
Baseline Answer: {baseline_answer}
Retrieved Docs: {retrieved_docs}

Final Evaluation:
""",
)

# Chain wiring the evaluator prompt to the shared LLM.
evaluator_chain = LLMChain(prompt=eval_prompt, llm=llm)

# Run evaluation
results = []
for q in sample_questions:
    pipeline_ans = get_final_answer(q)
    baseline_ans = llm.invoke(q).content
    # Fetch top docs with `.invoke`, the same call style the pipeline uses.
    # NOTE(review): this retrieves with the raw question, while
    # get_final_answer retrieves with a rewritten query, so the documents
    # given to the evaluator may differ from those the pipeline saw.
    retrieved = base_retriever.invoke(q)

    # `.invoke` returns a dict; the generated verdict is under "text".
    eval_result = evaluator_chain.invoke({
        "question": q,
        "pipeline_answer": pipeline_ans,
        "baseline_answer": baseline_ans,
        "retrieved_docs": "\n".join(d.page_content for d in retrieved),
    })["text"]

    results.append({
        "question": q,
        "pipeline_answer": pipeline_ans,
        "baseline_answer": baseline_ans,
        "evaluation": eval_result.strip()
    })

# One row per question: both answers plus the evaluator's verdict
df = pd.DataFrame(results)
df

Unnamed: 0,question,pipeline_answer,baseline_answer,evaluation
0,Who is the CEO of Microsoft?,The CEO of Microsoft is Steve Ballmer.\n\nFact-check Notes:\n1. The draft answer correctly identifies Steve Ballmer as the CEO of Microsoft based on the context provided.,"As of September 2021, the CEO of Microsoft is Satya Nadella.",INCORRECT
1,What are the latest trends in renewable energy?,"The latest trends in renewable energy include an increasing share of power coming from renewable sources such as wind, solar, and potentially the energy of the sea. Rising fuel prices, concerns about global warming, and the environmental merits of renewable energy are driving the shift towards these alternative energy sources. The wind energy industry in Europe is thriving, with the potential to eventually supply all of the continent's electricity, although public resistance to eyesore turbines remains a challenge. Additionally, provinces like Ontario are launching new projects to create electricity from renewable resources.\n\nFact-check Notes:\n- No corrections are needed for the draft answer.","1. Offshore wind power: Offshore wind farms are becoming increasingly popular due to their higher wind speeds and larger potential for energy generation compared to onshore wind farms.\n\n2. Solar energy storage: Advances in battery technology are making it easier to store excess solar energy for use during times when the sun is not shining, increasing the reliability and efficiency of solar power systems.\n\n3. Green hydrogen production: Green hydrogen, produced using renewable energy sources, is gaining traction as a clean alternative to traditional fossil fuels for use in transportation, industry, and heating.\n\n4. Floating solar panels: Installing solar panels on bodies of water, such as lakes and reservoirs, is a growing trend that maximizes land use efficiency and reduces water evaporation.\n\n5. Community solar projects: Community solar projects allow multiple households or businesses to share the benefits of a single solar installation, making renewable energy more accessible and affordable for a wider range of people.\n\n6. 
Microgrids: Microgrids are small-scale, localized energy systems that can operate independently or in conjunction with the main power grid, providing increased resilience and reliability during power outages.\n\n7. Electrification of transportation: The shift towards electric vehicles is driving the demand for renewable energy sources to power charging infrastructure, reducing greenhouse gas emissions from the transportation sector.",INCORRECT
2,What happened in the 2008 financial crisis?,"The 2008 financial crisis was triggered by a combination of factors, including rising interest rates, a reliance on credit, and risky investments made by UK banks. The crisis was further exacerbated by the impending retirement of baby boomers in the United States, which highlighted the country's significant government debts and liabilities. Additionally, the burst of the housing bubble in 2008 was confirmed by the first year-on-year drop in mortgage lending in four years. \n\nFact-check Notes:\n1. The draft answer correctly mentions the factors that contributed to the 2008 financial crisis, such as rising interest rates, a reliance on credit, risky investments by UK banks, and significant government debts and liabilities in the United States.","The 2008 financial crisis, also known as the global financial crisis, was a severe worldwide economic crisis that began in 2007 and continued into 2008. It was triggered by the collapse of the housing market in the United States, which led to a domino effect that spread throughout the global financial system.\n\nSome key events and factors that contributed to the crisis include:\n\n1. Subprime mortgage crisis: Banks and financial institutions in the U.S. had been issuing risky subprime mortgages to borrowers who were unable to repay them. When the housing bubble burst in 2007, many of these borrowers defaulted on their loans, leading to a wave of foreclosures and a sharp decline in housing prices.\n\n2. Financial institutions collapse: The crisis led to the collapse of several major financial institutions, including Lehman Brothers, Bear Stearns, and AIG. This caused panic in the financial markets and led to a credit crunch, as banks became reluctant to lend to each other.\n\n3. Stock market crash: The crisis also triggered a sharp decline in stock prices, with major stock indexes around the world plummeting. 
This led to a loss of trillions of dollars in market value and wiped out many investors' savings.\n\n4. Government intervention: In response to the crisis, governments around the world implemented various measures to stabilize the financial system, including bailouts of banks and other financial institutions, stimulus packages to boost economic growth, and regulatory reforms to prevent future crises.\n\nThe 2008 financial crisis had far-reaching consequences, including a global recession, high levels of unemployment, and a loss of confidence in the financial system. It also exposed weaknesses in the regulatory framework and led to calls for greater oversight of the financial industry.",CORRECT
3,Who won the 2016 U.S. presidential election?,"George W. Bush won the 2004 U.S. presidential election and was re-elected as the 43rd President of the United States. (Source-3)\n\nFact-check Notes:\n1. The draft answer incorrectly states that George W. Bush won the 2016 U.S. presidential election. The context provided clearly states that he won the 2004 election, not 2016.",Donald Trump won the 2016 U.S. presidential election.,INCORRECT
4,What is quantum computing?,"Quantum computing is a cutting-edge technology that involves the use of quantum bits, or qubits, to perform computations. Recently, scientists at the University of Bonn in Germany have made significant progress in this field by building the fundamental memory component of a quantum computer, known as a ""register"", using caesium atoms trapped inside a laser beam. This development could pave the way for a more reliable method of building a working quantum computer compared to other techniques. Additionally, quantum computing has the potential to revolutionize data security through the development of super-secure quantum networks that can detect and alert administrators to any attempts at eavesdropping. (Source-1, Source-2, Source-5)","Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data. Unlike classical computers, which use bits to represent information as either 0 or 1, quantum computers use quantum bits, or qubits, which can represent both 0 and 1 simultaneously due to superposition. This allows quantum computers to perform certain calculations much faster than classical computers, making them potentially powerful tools for solving complex problems in fields such as cryptography, optimization, and material science.",CORRECT


## Single Question Test

In [None]:
if __name__ == "__main__":
    user_q = "Are video games good for children?"

    # An expression inside an `if` block is not echoed as cell output, so
    # the answer has to be printed explicitly to be visible.
    print(get_final_answer(user_q))