In [None]:
# Install the packages this notebook needs (alternatively, install them from a requirements txt file with the commented pip command below).
#pip install -r requirements
%conda install -c conda-forge pyarrow
%conda install -c conda-forge datasets
%conda install -c conda-forge langchain
%pip install unstructured
%pip install openai
%pip install python-dotenv
%pip install langchain-community
%pip install langchain-openai
%pip install chromadb

## 1. Setup

In [4]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings

# Pull settings from the local .env file; override=True is passed so its values take precedence
load_dotenv(dotenv_path=".env", override=True)

# Look up both API keys in the process environment (None when a variable is absent)
openai_key = os.environ.get("OPENAI_API_KEY")
langsmith_key = os.environ.get("LANGSMITH_API_KEY")

# Hand the OpenAI key to both LangChain clients explicitly
embeddings = OpenAIEmbeddings(openai_api_key=openai_key)
llm = ChatOpenAI(temperature=0, openai_api_key=openai_key)

# Report only whether each key was found, never the key itself
print(f"OpenAI key loaded: {'Yes' if openai_key else 'No'}")
print(f"LangSmith key loaded: {'Yes' if langsmith_key else 'No'}")

OpenAI key loaded: Yes
LangSmith key loaded: Yes


In [64]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain.chains import LLMChain
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

from rich import print
from rich.console import Console

# Rebinds `llm` (also assigned in the Setup cell) to a ChatOpenAI with temperature 0.0, built without an explicit key argument
llm = ChatOpenAI(temperature=0.0)
# Rich console fixed at a width of 100
console = Console(width=100)

## 2. Load Dataset (AG News)

In [110]:
from datasets import load_dataset
from langchain.docstore.document import Document

# Knowledge base: the 'train' split of the AG News dataset
ag_news_dataset = load_dataset("ag_news", split="train")

# Wrap every row as a LangChain Document: 'text' becomes the content, 'label' goes into the metadata
documents = [
    Document(page_content=row['text'], metadata={"label": row['label']})
    for row in ag_news_dataset
]

# Show the first document as a quick format check
print(documents[0])

# Cut the documents into chunks (chunk_size=1000, chunk_overlap=200)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(documents)

# Embed the chunks with OpenAI embeddings and index them in a Chroma vector store
embedding = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding)

In [None]:
from datasets import load_dataset
from langchain.schema import Document
from datetime import datetime

# Load latest BBC dataset from Hugging Face
bbc = load_dataset("RealTimeData/bbc_latest")

# NOTE(review): assumes each item exposes 'text', 'date' and 'title' keys — confirm against the dataset schema;
# missing keys silently become empty strings here.
bbc_docs = []
for item in bbc["train"]:
    text = item.get("text", "")
    date_str = item.get("date", "")

    # Parse the date when it is a valid ISO string. Only the errors fromisoformat raises for
    # bad input (wrong type / malformed string) trigger the fallback, so unrelated failures
    # are no longer swallowed.
    try:
        published = datetime.fromisoformat(date_str)
    except (TypeError, ValueError):
        published = datetime.now()  # fallback to "now"

    metadata = {
        "source": "BBC",
        "title": item.get("title", ""),
        "published": str(published),
        "recency_score": 1.0  # give BBC docs a boost
    }
    bbc_docs.append(Document(page_content=text, metadata=metadata))

In [115]:
# Append to AG News Chroma
# NOTE(review): presumably every run of this cell inserts bbc_docs again — confirm before re-running.
vectorstore.add_documents(bbc_docs)
base_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})


def custom_retriever(query):
    """Retrieve documents for `query`, listing those with a higher recency score first.

    Args:
        query: The search string handed to `base_retriever`.

    Returns:
        The retrieved documents ordered by descending ``recency_score`` metadata
        (documents without that key count as 0). Documents with equal scores keep
        the order the retriever returned them in, because the sort is stable.
    """
    # `.invoke` is the runnable entry point already used by the pipeline function;
    # it replaces the deprecated `get_relevant_documents`.
    results = base_retriever.invoke(query)
    return sorted(results, key=lambda doc: -doc.metadata.get("recency_score", 0))

# Chaining Logic

## Step 1: Query Understanding and Rewriting

In [26]:
# Prompt asking the model to rewrite the user's question as three numbered search queries
query_prompt = PromptTemplate(
    input_variables=["user_input"],
    template="""You are an expert at rephrasing user questions into concise, search-friendly queries.
    Given the user's question, generate 3 search queries that will help find the answer.
    The queries should be in a numbered list.
    User Question: {user_input}
    Queries:""",
)

# Chain the prompt with the shared LLM and parse the output as a string
query_chain = LLMChain(
    prompt=query_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

## Step 2: Document Retrieval Step (already done with the retriever object)

In [None]:
# This step is handled implicitly by using the retriever in the next chain.

## Step 3: Draft Answer

In [27]:
# Prompt that drafts an answer restricted to the supplied context
draft_prompt = PromptTemplate(
    input_variables=["retrieved_docs", "user_input"],
    template="""You are a professional assistant. Using only the following context, write a precise and well-structured answer to the user's question.
    Do not add any information that is not explicitly found in the context.
    Context: {retrieved_docs}
    User Question: {user_input}
    Draft Answer:""",
)

# Chain the prompt with the shared LLM and parse the output as a string
draft_chain = LLMChain(
    prompt=draft_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

## Step 4: Fact-Check

In [40]:
# Prompt that compares a draft answer against the context and produces a fact-check report
factcheck_prompt = PromptTemplate(
    template="""You are a fact-checking assistant.
Review the draft answer and compare it to the context.

Tasks:
1. Identify any claims NOT supported by the context.
2. Flag vague or biased language.
3. Suggest corrections.

Context:
{retrieved_docs}

Draft Answer:
{draft_answer}

Fact-Check Report:
""",
    input_variables=["retrieved_docs", "draft_answer"],
)

# Chain the prompt with the shared LLM
factcheck_chain = LLMChain(prompt=factcheck_prompt, llm=llm)

## Step 5: Citation Acquisition

In [None]:
# --- New citation-aware final prompt ---
# Citation-aware answer prompt.
# The citation format shows a literal "{doc_id}" placeholder to the model. Its braces are doubled
# so the f-string template does not treat doc_id as a third input variable; only context_list
# and user_input have to be supplied when the chain is invoked.
# NOTE(review): citation_chain is not called anywhere in the visible notebook — confirm whether it is still needed.
citation_prompt = PromptTemplate(
    template="""
You are a professional assistant. 
Using only the retrieved documents, write a clear, precise answer to the user's question. 

Each statement you make MUST be supported by a citation. 
Cite sources inline in the form [Source: {{doc_id}}] where doc_id is provided in the context list. 

Context Documents (ID → snippet):
{context_list}

User Question: {user_input}

Final Answer (with inline citations):
""",
    input_variables=["context_list", "user_input"],
)

citation_chain = LLMChain(llm=llm, prompt=citation_prompt, output_parser=StrOutputParser())

## Step 6: Final Answer

In [81]:

# Prompt that decides, from the fact-check notes, whether the draft is returned as-is
# or returned together with the notes that flag missing/unsupported information
final_prompt = PromptTemplate(
    input_variables=["draft_answer", "fact_report", "retrieved_docs", "user_input"],
    template="""
You are a careful assistant.

Your job is to decide whether to keep the draft answer or reject it based on the fact-check notes.

Rules:
- If the fact-check notes confirm the draft answer is fully accurate, return the draft answer exactly as written (do not rephrase).
- If the fact-check notes indicate any missing or unsupported information, respond with the draft answer exactly as written and then the fact-check notes but only those that indicate any missing or unsupported information.

Draft Answer:
{draft_answer}

Fact-check Notes:
{fact_report}

Retrieved Docs (with Source IDs):
{retrieved_docs}

User Question: {user_input}

Final Answer (with inline citations):
""",
)

# Chain the prompt with the shared LLM and parse the output as a string
final_chain = LLMChain(
    prompt=final_prompt,
    llm=llm,
    output_parser=StrOutputParser(),
)

## Pipeline Function

In [None]:
def get_final_answer(user_input: str) -> str:
    """Answer a question by chaining query rewriting, retrieval, drafting, fact-checking and a final pass.

    Args:
        user_input: The user's question.

    Returns:
        The text produced by `final_chain`.
    """
    import re  # local import: only used to clean up the rewritten query

    # Step 1: Query Understanding
    queries = query_chain.invoke({"user_input": user_input})
    # The chain result is a dict; its 'text' entry holds the numbered list of queries.
    query_lines = [line.strip() for line in queries["text"].splitlines() if line.strip()]
    first_query = query_lines[0] if query_lines else ""
    # Strip list numbering such as "1. " or "1) " so it is not part of the search string,
    # and fall back to the raw question when the rewrite produced nothing usable.
    search_query = re.sub(r"^\d+[.)]\s*", "", first_query).strip() or user_input

    # Step 2: Document Retrieval
    # NOTE(review): this uses base_retriever; the recency-boosting custom_retriever is not applied here.
    docs = base_retriever.invoke(search_query)
    retrieved_texts = "\n".join(doc.page_content for doc in docs)

    # Numbered source IDs with a 200-character, single-line snippet each, for citations
    context = []
    for i, doc in enumerate(docs, start=1):
        source_id = f"Source-{i}"
        snippet = doc.page_content[:200].replace("\n", " ")
        context.append(f"{source_id}: {snippet}...")
    context_text = "\n".join(context)

    # Step 3: Answer Drafting
    # Keep only the generated text: the invoke() result also echoes the inputs, and passing the
    # whole dict on would format its repr (including all retrieved text) into the next prompt.
    draft = draft_chain.invoke(
        {"retrieved_docs": retrieved_texts, "user_input": user_input}
    )["text"]

    # Step 4: Fact-Check
    fact_report = factcheck_chain.invoke(
        {"retrieved_docs": retrieved_texts, "draft_answer": draft}
    )["text"]

    # Step 5: Final Answer
    final = final_chain.invoke(
        {
            "draft_answer": draft,
            "fact_report": fact_report,
            "retrieved_docs": context_text,
            "user_input": user_input,
        }
    )
    return final["text"]

## Evaluation

In [116]:
# Example evaluation of pipeline vs baseline

# pandas is imported before its first use so this cell also runs on a fresh kernel
import pandas as pd

# Optionally, you can also set the display width to avoid line wrapping for wide tables
pd.set_option('display.width', 1000)

sample_questions = [
    "Who is the CEO of Microsoft?",
    "What are the latest trends in renewable energy?",
    "What happened in the 2008 financial crisis?",
    "Who won the 2016 U.S. presidential election?",
    "What is quantum computing?"
]

# One record per question: the pipeline's answer next to a single-shot LLM answer
results = []

for q in sample_questions:
    pipeline_ans = get_final_answer(q)
    baseline_ans = llm.invoke(q).content  # single-shot baseline

    results.append({
        "question": q,
        "pipeline_answer": pipeline_ans,
        "baseline_answer": baseline_ans
    })

df = pd.DataFrame(results)
df

Unnamed: 0,question,pipeline_answer,baseline_answer
0,Who is the CEO of Microsoft?,"The CEO of Microsoft is Steve Ballmer.\n\nFact-check Notes:\n1. The draft answer correctly identifies Steve Ballmer as the CEO of Microsoft, which is supported by the context.","As of September 2021, the CEO of Microsoft is Satya Nadella."
1,What are the latest trends in renewable energy?,"The latest trends in renewable energy include an increasing share of power coming from renewable sources such as wind, solar, and the energy of the sea. With escalating oil prices and global warming concerns, the quest for renewable energy sources has intensified, making alternative energy a viable power source for the future.\n\nFact-check Notes:\n- No corrections or suggestions are needed for the draft answer.","1. Offshore wind power: Offshore wind farms are becoming increasingly popular due to their higher wind speeds and less visual impact on land. Countries like the UK, Germany, and Denmark are leading the way in developing offshore wind projects.\n\n2. Solar power with energy storage: The combination of solar panels with energy storage systems, such as batteries, is becoming more common. This allows for the storage of excess energy generated during the day for use during peak demand times or at night.\n\n3. Floating solar panels: Floating solar panels are being installed on bodies of water, such as reservoirs and lakes, to maximize energy production and reduce land use. This technology is particularly popular in countries with limited available land for solar installations.\n\n4. Green hydrogen production: Green hydrogen, produced using renewable energy sources like wind or solar power, is gaining traction as a clean alternative to traditional hydrogen production methods. It can be used in various sectors, including transportation and industry.\n\n5. Community solar projects: Community solar projects allow multiple individuals or organizations to invest in and benefit from a shared solar installation. This model makes solar energy more accessible to those who may not be able to install panels on their own property.\n\n6. Electrification of transportation: The shift towards electric vehicles (EVs) is driving the demand for renewable energy sources to power these vehicles. 
This trend is leading to the development of more charging infrastructure and incentives for EV adoption.\n\n7. Microgrids: Microgrids are small-scale, localized energy systems that can operate independently or in conjunction with the main grid. They are being used to increase energy resilience, improve grid stability, and integrate renewable energy sources more effectively."
2,What happened in the 2008 financial crisis?,"The 2008 financial crisis was triggered by a number of factors, including rising interest rates and a reliance on credit, which led to an increased number of personal bankruptcies in the UK. Additionally, UK banks had built up risky investment positions that could have potentially triggered a financial crisis if a sudden rush to sell occurred. The Bank of England warned about the potential risks associated with these investments (Source-1, Source-2, Source-3, Source-4).","The 2008 financial crisis, also known as the global financial crisis, was a severe worldwide economic crisis that began in 2007 and continued into 2008. It was triggered by the collapse of the housing market in the United States, which led to a domino effect that spread throughout the global financial system.\n\nSeveral factors contributed to the crisis, including the proliferation of subprime mortgages, the securitization of these mortgages into complex financial products, and the excessive risk-taking and leverage by financial institutions. As the housing market collapsed, banks and financial institutions faced massive losses on their mortgage-backed securities, leading to a liquidity crisis and a credit crunch.\n\nThe crisis resulted in the failure or near-collapse of several major financial institutions, including Lehman Brothers, Bear Stearns, and AIG. Stock markets around the world plummeted, and many countries entered into recession. Governments and central banks implemented various measures to stabilize the financial system, including bailouts of banks and other financial institutions, monetary stimulus, and regulatory reforms.\n\nThe 2008 financial crisis had far-reaching consequences, including a global economic downturn, high levels of unemployment, and a loss of confidence in the financial system. It also led to significant changes in financial regulation and oversight to prevent a similar crisis from happening in the future."
3,Who won the 2016 U.S. presidential election?,"President Bush won the 2016 U.S. presidential election, securing re-election and leading the Republicans to expand their majorities in both the Senate and House of Representatives. The Democrats faced defeat and are now tasked with analyzing what went wrong in the election.\n\nFact-check Notes:\n1. The draft answer incorrectly states that President Bush won the 2016 U.S. presidential election. The context provided is about a previous election, not the 2016 election.\n3. The draft answer should be corrected to accurately reflect the context provided about a previous election, not the 2016 U.S. presidential election.",Donald Trump won the 2016 U.S. presidential election.
4,What is quantum computing?,"Quantum computing is a type of computing that utilizes the principles of quantum mechanics to perform operations. It involves using quantum bits, or qubits, which can exist in multiple states simultaneously, allowing for faster and more efficient processing compared to classical computing. The recent development of a quantum computer's memory component using a string of atoms suggests a promising route towards building a reliable quantum computer. (Source-1, Source-2, Source-3, Source-4)","Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data. Unlike classical computers, which use bits to represent information as either 0 or 1, quantum computers use quantum bits, or qubits, which can represent both 0 and 1 simultaneously due to superposition. This allows quantum computers to perform certain calculations much faster than classical computers, making them potentially powerful tools for solving complex problems in fields such as cryptography, optimization, and material science."


In [100]:
# Pull the pipeline answer for the row labelled 0 out of the results table
cell_value = df.loc[0, 'pipeline_answer']
# Print it in full
print(cell_value)

In [114]:
def factcheck_answer(answer, docs):
    """Run `factcheck_chain` on an answer and return its report with surrounding whitespace removed.

    Args:
        answer: Text passed to the chain as ``draft_answer``.
        docs: Text passed to the chain as ``retrieved_docs``.

    Returns:
        The stripped ``"text"`` entry of the chain result.
    """
    payload = {"draft_answer": answer, "retrieved_docs": docs}
    report = factcheck_chain.invoke(payload)
    return report["text"].strip()

# Attach a fact-check report for both answers of every result row.
# NOTE(review): the context handed to the fact-checker is the literal string "N/A",
# so these reports are produced without any retrieved documents.
for row in results:
    for column, answer_key in (
        ("Pipeline Factcheck", "pipeline_answer"),
        ("Baseline Factcheck", "baseline_answer"),
    ):
        row[column] = factcheck_answer(row[answer_key], "N/A")

# Rebuild the table so it includes the two new columns
df = pd.DataFrame(results)
df

Unnamed: 0,question,pipeline_answer,baseline_answer,Pipeline Factcheck,Baseline Factcheck
0,Who is the CEO of Microsoft?,"The CEO of Microsoft is Steve Ballmer.\n\nFact-check Notes:\n1. The draft answer correctly identifies Steve Ballmer as the CEO of Microsoft, which is supported by the context.","As of September 2021, the CEO of Microsoft is Satya Nadella.",The draft answer is accurate and supported by the context. No corrections or flags are needed.,1. The claim that Satya Nadella is the CEO of Microsoft as of September 2021 is supported by the context.\n2. The language used is clear and unbiased.\n3. No corrections are needed for this fact-check.
1,What are the latest trends in renewable energy?,"The latest trends in renewable energy include an increasing share of power coming from renewable sources such as wind, solar, and the energy of the sea. With escalating oil prices and concerns about global warming, the quest for renewable energy sources has intensified, making alternative energy a viable power source for the future.\n\nFact-check Notes:\n1. The draft answer correctly mentions the increasing share of power coming from renewable sources such as wind, solar, and the energy of the sea, which is supported by the context.","1. Offshore wind power: Offshore wind farms are becoming increasingly popular due to their higher wind speeds and less visual impact on land. Countries like the UK, Germany, and Denmark are leading the way in developing offshore wind projects.\n\n2. Solar power with energy storage: The combination of solar panels with energy storage systems, such as batteries, is becoming more common. This allows for the storage of excess energy generated during the day for use during peak demand times or at night.\n\n3. Floating solar panels: Floating solar panels are being installed on bodies of water, such as reservoirs and lakes, to maximize energy production and reduce land use. This technology is particularly popular in countries with limited available land for solar installations.\n\n4. Green hydrogen: Green hydrogen, produced using renewable energy sources like wind or solar power, is gaining traction as a clean alternative to traditional hydrogen production methods. It can be used for fuel cells, energy storage, and industrial processes.\n\n5. Community solar projects: Community solar projects allow multiple individuals or businesses to invest in a shared solar installation, typically located off-site. This allows for greater access to solar energy for those who may not be able to install panels on their own property.\n\n6. 
Electric vehicles and charging infrastructure: The adoption of electric vehicles (EVs) is driving the need for more charging infrastructure powered by renewable energy sources. This trend is expected to continue as more countries set targets to phase out internal combustion engine vehicles.\n\n7. Microgrids: Microgrids are small-scale, localized energy systems that can operate independently or in conjunction with the main grid. They are often powered by renewable energy sources and can provide increased resilience and reliability during power outages.","Overall, the draft answer is factually accurate and supported by the context. No claims are made that are not supported by the information provided. There are no instances of vague or biased language. No corrections are needed.","1. The claim about offshore wind farms having less visual impact on land is not supported by the context. While it is true that offshore wind farms have higher wind speeds, the visual impact on land is subjective and may vary depending on the location.\n\n2. The language used in this section is clear and unbiased.\n\n3. The claim about floating solar panels being particularly popular in countries with limited available land for solar installations is not supported by the context. While floating solar panels do maximize energy production and reduce land use, their popularity may not be solely based on land availability.\n\n4. The claim about green hydrogen being used for fuel cells, energy storage, and industrial processes is accurate and supported by the context.\n\n5. The language used in this section is clear and unbiased.\n\n6. The claim about the adoption of electric vehicles driving the need for more charging infrastructure powered by renewable energy sources is accurate and supported by the context.\n\n7. The claim about microgrids providing increased resilience and reliability during power outages is accurate and supported by the context."
2,What happened in the 2008 financial crisis?,"During the 2008 financial crisis, UK banks had built up risky investment positions that could trigger a financial crisis if a sudden rush to sell occurred. The crisis was caused by a combination of higher interest rates and a reliance on credit, leading to an increased number of British personal bankruptcies. The Bank of England warned of the potential for a financial crisis due to these risky investments.\n\nFact-check Notes:\n1. The draft answer incorrectly states that the 2008 financial crisis is being discussed in the context provided. The context actually discusses a recent increase in British personal bankruptcies and a warning from the Bank of England about risky investments that could trigger a financial crisis.\n2. The language in the draft answer is vague and misleading, as it inaccurately connects the 2008 financial crisis to the current situation described in the context.\n3. The draft answer should be revised to accurately reflect the context provided, focusing on the recent increase in British personal bankruptcies and the warning from the Bank of England about risky investments.","The 2008 financial crisis, also known as the global financial crisis, was a severe worldwide economic crisis that began in 2007 and continued into 2008. It was triggered by the collapse of the housing market in the United States, which led to a domino effect that spread throughout the global financial system.\n\nSeveral factors contributed to the crisis, including the proliferation of subprime mortgages, the securitization of these mortgages into complex financial products, and the excessive risk-taking and leverage by financial institutions. 
As the housing market collapsed, banks and financial institutions faced massive losses on their mortgage-backed securities, leading to a liquidity crisis and a credit crunch.\n\nThe crisis resulted in the failure or near-collapse of several major financial institutions, including Lehman Brothers, Bear Stearns, and AIG. Stock markets around the world plummeted, and many countries entered into recession. Governments and central banks implemented various measures to stabilize the financial system, including bailouts of banks and other financial institutions, monetary stimulus, and regulatory reforms.\n\nThe 2008 financial crisis had far-reaching consequences, including a global economic downturn, high levels of unemployment, and a loss of confidence in the financial system. It also led to significant changes in financial regulation and oversight to prevent a similar crisis from happening in the future.","1. The claim about the 2008 financial crisis is not supported by the context provided.\n2. The language in the draft answer is vague and misleading.\n3. The draft answer should be revised to accurately reflect the context provided, focusing on the recent increase in British personal bankruptcies and the warning from the Bank of England about risky investments.","1. The draft answer does not make any claims that are not supported by the context provided.\n\n2. The language used in the draft answer is clear and objective, without any vague or biased language.\n\n3. No corrections or suggestions are needed for the draft answer."
3,Who won the 2016 U.S. presidential election?,"President Bush won the 2004 U.S. presidential election, securing re-election and expanding the Republican majorities in both the Senate and House of Representatives. Democrats faced defeat and are now tasked with analyzing what went wrong in the election. \n\nFact-check Notes:\n1. The draft answer incorrectly states that President Bush won the 2016 U.S. presidential election. This is not supported by the context, which refers to the 2004 election.\n3. The correct information should be provided regarding the election year. The answer should state that President Bush won the 2004 U.S. presidential election, not 2016.",Donald Trump won the 2016 U.S. presidential election.,"1. The claim that President Bush won the 2016 U.S. presidential election is not supported by the context, which refers to the 2004 election.\n2. The language is clear and unbiased.\n3. The answer should be corrected to state that President Bush won the 2004 U.S. presidential election, not 2016.",1. The claim that Donald Trump won the 2016 U.S. presidential election is supported by the context.\n2. The language used in the answer is clear and unbiased.\n3. No corrections are needed for this fact-check.
4,What is quantum computing?,"Quantum computing is a cutting-edge technology that involves the use of quantum bits, or qubits, to perform computations. Recently, scientists at the University of Bonn in Germany have made significant progress in this field by building the fundamental memory component of a quantum computer, known as a ""register"", using caesium atoms trapped inside a laser beam. This breakthrough could pave the way for a more reliable method of constructing a working quantum computer compared to other existing techniques (Source-3, Source-4).","Quantum computing is a type of computing that uses quantum-mechanical phenomena, such as superposition and entanglement, to perform operations on data. Unlike classical computers, which use bits to represent information as either 0 or 1, quantum computers use quantum bits, or qubits, which can represent both 0 and 1 simultaneously due to superposition. This allows quantum computers to perform certain calculations much faster than classical computers, making them potentially powerful tools for solving complex problems in fields such as cryptography, optimization, and material science.","1. The claim that the scientists at the University of Bonn in Germany have made significant progress in building the fundamental memory component of a quantum computer using caesium atoms trapped inside a laser beam is not supported by the context provided. The context does not mention any specific breakthroughs or advancements made by the scientists at the University of Bonn.\n\n2. The language used in the draft answer is mostly clear and factual, but it could be improved by providing more specific details about the progress made by the scientists at the University of Bonn in the field of quantum computing. Additionally, the use of terms like ""significant progress"" and ""more reliable method"" could be considered slightly biased without further evidence to support these claims. \n\n3. 
Corrections:\n- Provide specific details about the progress made by the scientists at the University of Bonn in the field of quantum computing.\n- Avoid using biased language such as ""significant progress"" and ""more reliable method"" without further evidence to support these claims.",1. The draft answer does not make any unsupported claims.\n2. The language used in the draft answer is clear and unbiased.\n3. No corrections are needed for the draft answer.


## Single Test

In [None]:
if __name__ == "__main__":
    user_q = "Are video games good for children?"

    # An expression nested inside the `if` block is not echoed by the notebook,
    # so the answer is printed explicitly.
    print(get_final_answer(user_q))