# Create a Q&A Chatbot with LangChain Project

### Set the OpenAI API Key as an Environment Variable

In [None]:
%load_ext dotenv
%dotenv

### Import the Libraries

In [None]:
from pathlib import Path
import re
import os

from langchain_community.document_loaders.pdf import PyPDFLoader

from langchain_text_splitters import (MarkdownHeaderTextSplitter, 
                                      TokenTextSplitter)

from langchain_core.documents import Document
from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (PromptTemplate,
                                    HumanMessagePromptTemplate, 
                                    ChatPromptTemplate)
from langchain_core.runnables import (RunnablePassthrough, 
                                      RunnableLambda, 
                                      chain)

from langchain_openai import (ChatOpenAI, 
                              OpenAIEmbeddings)

from langchain_chroma.vectorstores import Chroma


### Load the Course Transcript

In [None]:
# Prefer a transcript PDF sitting next to the notebook. When the file is
# absent the loader stays unset, which signals "no PDF available".
pdf_path = Path('tableau_course_transcript.pdf')
if pdf_path.exists():
    loader_pdf = PyPDFLoader(str(pdf_path))
else:
    loader_pdf = None


In [None]:
if loader_pdf is None:
    # No PDF loader: use a tiny markdown-style transcript so the rest of the
    # notebook still has something to split and index.
    sample_transcript = """# Section: Calculations
## Lecture: Adding a custom calculation
In this lecture, we build GM% and explain why SUM is used for aggregation.
### Notes
Tableau computes calculations at different levels depending on dimensions in view.

# Section: Visual Analytics
## Lecture: Building charts
We compare bar charts and line charts for trend analysis.
"""
    fallback_metadata = {"source": "fallback_sample"}
    docs_list = [Document(page_content=sample_transcript, metadata=fallback_metadata)]
else:
    # A loader exists, so read the documents from the PDF.
    docs_list = loader_pdf.load()

print(f'Loaded documents: {len(docs_list)}')


In [None]:
# Merge the text of every loaded document into one string, separating
# documents with a blank line.
page_texts = [doc.page_content for doc in docs_list]
string_list_concat = "\n\n".join(page_texts)
print('Combined transcript character count:', len(string_list_concat))


### Split the Course Transcript with MarkdownHeaderTextSplitter

In [None]:
# Header depth N ("#" repeated N times) maps to the N-th metadata key below:
# "#" -> section, "##" -> lecture, "###" -> topic.
headers_to_split_on = [
    ("#" * depth, metadata_key)
    for depth, metadata_key in enumerate(("section", "lecture", "topic"), start=1)
]

# strip_headers=False is passed so header lines are not stripped from the
# split text.
md_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)


In [None]:
# Split the combined transcript with the markdown-header splitter and report
# how many pieces came back.
docs_list_md_split = md_splitter.split_text(string_list_concat)
print('Markdown-split document count:', len(docs_list_md_split))


### Create a Chain to Correct the Course Transcript

In [None]:
# Pull the raw text out of each markdown-split document so it can be cleaned
# as plain strings.
string_list_split = [doc.page_content for doc in docs_list_md_split]
print('Segments prepared for cleanup:', len(string_list_split))


In [None]:
# Instructions for the transcript-cleanup model. The text contains no template
# placeholders. (A stray trailing double quote after the last bullet was
# removed so it no longer leaks into the prompt.)
PROMPT_FORMATTING_S = '''Improve the following Tableau lecture transcript by:
- Splitting the text into meaningful paragraphs
- Correcting any misplaced punctuation
- Fixing mistranscribed words (e.g., changing 'tableaux' to 'Tableau')
'''

# Human-message template; `{lecture_transcript}` is its only template variable.
PROMPT_TEMPLATE_FORMATTING_H = '''This is the transcript:
{lecture_transcript}
'''

In [None]:
# Kept for notebook continuity; the system text has no template variables.
prompt_formatting_s = PromptTemplate.from_template(PROMPT_FORMATTING_S)
prompt_template_formatting_h = HumanMessagePromptTemplate.from_template(PROMPT_TEMPLATE_FORMATTING_H)

# Use the cleanup instructions themselves as the system message. A generic
# hard-coded system message was used here before, so the instructions in
# PROMPT_FORMATTING_S never reached the model.
chat_prompt_template_formatting = ChatPromptTemplate.from_messages([
    SystemMessage(content=PROMPT_FORMATTING_S),
    prompt_template_formatting_h,
])


In [None]:
# Optional LLM model (kept optional so notebook works without API credentials).
# The gate is the presence of OPENAI_API_KEY: gating on the PDF loader would
# still try to build the client when a PDF exists but no key is configured.
chat = ChatOpenAI(model='gpt-4o-mini', temperature=0) if os.getenv('OPENAI_API_KEY') else None


In [None]:
# Shared string output parser instance for chains built in this notebook.
str_output_parser = StrOutputParser()


In [None]:
# For deterministic local execution, we use a regex-based cleanup function.
# (If desired, this can be replaced by an LLM chain using `chat_prompt_template_formatting | chat | str_output_parser`.)
# The name is left bound to None so no LLM formatting chain exists by default.
chain_formatting = None


In [None]:
def cleanup_transcript(text: str) -> str:
    """Normalize whitespace, product-name spelling and punctuation spacing.

    Steps, in order:
      1. Collapse every run of whitespace (including newlines) into a single
         space and trim both ends.
      2. Rewrite any casing of "tableau"/"tableaux" to "Tableau".
      3. Remove whitespace that precedes , . ! ? ; or :

    Args:
        text: Raw transcript segment.

    Returns:
        The cleaned segment as a single line of text.
    """
    text = re.sub(r'\s+', ' ', text).strip()
    # Case-insensitive so capitalized mis-transcriptions such as "Tableaux"
    # or "TABLEAU" are corrected too; the optional "x" folds both spellings
    # into one substitution.
    text = re.sub(r'tableaux?', 'Tableau', text, flags=re.IGNORECASE)
    text = re.sub(r'\s+([,.!?;:])', r'\1', text)
    return text

# Run the cleanup over every segment, keeping the original order.
string_list_formatted = list(map(cleanup_transcript, string_list_split))
print('Formatted transcript segments:', len(string_list_formatted))


In [None]:
# Override docs_list_md_split so each Document stores cleaned transcript text.
# The documents are mutated in place; only page_content is reassigned here.
# zip pairs each document with the cleaned string at the same position.
for doc, cleaned_text in zip(docs_list_md_split, string_list_formatted):
    doc.page_content = cleaned_text


In [None]:
# Preview at most the first 300 characters of the first cleaned segment.
print('Sample cleaned segment:')
if docs_list_md_split:
    print(docs_list_md_split[0].page_content[:300])
else:
    print('No segments available')


### Split the Lectures with TokenTextSplitter

In [None]:
# Token-based splitter configured with a chunk size of 350 and an overlap of
# 50 between consecutive chunks.
token_splitter = TokenTextSplitter(
    chunk_size=350,
    chunk_overlap=50,
)


In [None]:
# Apply the token splitter to the cleaned markdown-split documents and report
# the resulting chunk count.
docs_list_tokens_split = token_splitter.split_documents(docs_list_md_split)
print('Token-split chunk count:', len(docs_list_tokens_split))


### Create Embeddings, Vector Store, and Retriever

In [None]:
# Later cells use this flag to decide whether OpenAI-backed objects are built.
openai_api_key_present = bool(os.getenv('OPENAI_API_KEY'))

# Only construct the embedding client when a key is configured.
embedding = OpenAIEmbeddings(model='text-embedding-3-small') if openai_api_key_present else None

print('Embedding initialized:', embedding is not None)


In [None]:
if embedding is None:
    # No embedding client, so nothing can be indexed.
    vectorstore = None
else:
    # Index the token-split chunks in a Chroma collection.
    vectorstore = Chroma.from_documents(
        documents=docs_list_tokens_split,
        embedding=embedding,
        collection_name='tableau_qa_collection',
    )

print('Vector store initialized:', vectorstore is not None)


In [None]:
# Build a retriever with search_kwargs k=4 when a vector store exists;
# otherwise leave it unset.
retriever = vectorstore.as_retriever(search_kwargs={'k': 4}) if vectorstore is not None else None

print('Retriever initialized:', retriever is not None)


### Create Prompts and Prompt Templates for the Q&A Chatbot Chain

In [None]:
# Template that renders a student question from three fields:
# question_lecture, question_title and question_body.
PROMPT_CREATING_QUESTION = """Lecture: {question_lecture}
Title: {question_title}
Body: {question_body}"""

# System instructions for the Q&A model. Rule 4 fixes the citation format
# "[Section: ..., Lecture: ...]".
PROMPT_RETRIEVING_S = """You are a helpful teaching assistant for a Tableau course.
You will receive a student question and supporting context passages.

Rules:
1) Answer ONLY using the supplied context.
2) If context is insufficient, say exactly: "I don't have enough context to answer confidently."
3) Add a short "Citations" section at the end.
4) Each citation must use this format:
   - [Section: <section>, Lecture: <lecture>]
5) Do not invent citations.
"""

# Human-message template with two variables: {question} and {context}.
PROMPT_TEMPLATE_RETRIEVING_H = """Question:
{question}

Context:
{context}"""

prompt_creating_question = PromptTemplate.from_template(PROMPT_CREATING_QUESTION)
prompt_retrieving_s = PromptTemplate.from_template(PROMPT_RETRIEVING_S)
prompt_template_retrieving_h = HumanMessagePromptTemplate.from_template(PROMPT_TEMPLATE_RETRIEVING_H)

# The chat prompt uses the raw system text as a fixed SystemMessage followed
# by the templated human message.
chat_prompt_template_retrieving = ChatPromptTemplate.from_messages([
    SystemMessage(content=PROMPT_RETRIEVING_S),
    prompt_template_retrieving_h,
])


### Create the First Version of the Q&A Chatbot Chain

In [None]:
if openai_api_key_present and retriever is not None:
    llm_for_qa = ChatOpenAI(model='gpt-4o-mini', temperature=0)

    # First render the payload dict (question_lecture / question_title /
    # question_body) into one plain string. Previously the retriever was fed
    # the raw payload dict and the 'question' slot received a PromptValue
    # object instead of text.
    question_as_text = prompt_creating_question | RunnableLambda(
        lambda prompt_value: prompt_value.to_string()
    )

    chain_retrieving = (
        question_as_text
        | {
            # The rendered question string is passed through unchanged ...
            'question': RunnablePassthrough(),
            # ... and is also the retrieval query; the retrieved documents
            # are joined into one context string.
            'context': retriever | RunnableLambda(lambda docs: '\n\n'.join(d.page_content for d in docs)),
        }
        | chat_prompt_template_retrieving
        | llm_for_qa
        | StrOutputParser()
    )
else:
    # Missing credentials or retriever: no chain can be built.
    chain_retrieving = None

print('Retrieval chain initialized:', chain_retrieving is not None)


In [None]:
if chain_retrieving is None:
    # Without a chain, surface a plain explanation instead of an answer.
    result = 'Chain not executed: configure OPENAI_API_KEY to run LLM retrieval.'
else:
    sample_question = {
        "question_lecture": "Adding a custom calculation",
        "question_title": "Why are we using SUM here? It's unclear to me.",
        "question_body": "This question refers to calculating the GM%.",
    }
    result = chain_retrieving.invoke(sample_question)

result


In [None]:
# Show the value produced by the first retrieval chain cell.
result

### Create a Runnable Function to Format the Context

In [None]:
def format_context(retrieved_docs):
    """Render retrieved documents as numbered, citation-labelled passages.

    Each passage is a header line "[i] Section: <section> | Lecture: <lecture>"
    (numbering starts at 1; missing metadata falls back to "Unknown Section" /
    "Unknown Lecture") followed by the document's stripped text. Passages are
    separated by a blank line. An empty input yields an empty string.
    """
    def _render(position, doc):
        # Read metadata first, then the text, one document at a time.
        meta = doc.metadata
        section_name = meta.get('section', 'Unknown Section')
        lecture_name = meta.get('lecture', 'Unknown Lecture')
        body = doc.page_content.strip()
        return f"[{position}] Section: {section_name} | Lecture: {lecture_name}\n{body}"

    return '\n\n'.join(
        _render(position, doc)
        for position, doc in enumerate(retrieved_docs, start=1)
    )


def extract_citations(answer_text: str):
    """Return every (section, lecture) pair cited in *answer_text*.

    A citation is any "[Section: ..., Lecture: ...]" span; both captured
    parts are stripped of surrounding whitespace. Pairs are returned in the
    order they appear, and the list is empty when nothing matches.
    """
    pattern = r"\[Section:\s*(.*?),\s*Lecture:\s*(.*?)\]"
    found = []
    for match in re.finditer(pattern, answer_text):
        section_name, lecture_name = match.groups()
        found.append((section_name.strip(), lecture_name.strip()))
    return found


def validate_citations(citations, retrieved_docs):
    """Check cited (section, lecture) pairs against the retrieved documents.

    A citation is valid when it exactly equals the stripped, stringified
    (section, lecture) metadata of some retrieved document; missing metadata
    counts as "Unknown Section" / "Unknown Lecture".

    Returns:
        (ratio, invalid): the fraction of citations that are valid, and the
        list of invalid citations in their original order. With no citations
        the result is (0.0, []).
    """
    allowed = set()
    for doc in retrieved_docs:
        meta = doc.metadata
        section_name = str(meta.get('section', 'Unknown Section')).strip()
        lecture_name = str(meta.get('lecture', 'Unknown Lecture')).strip()
        allowed.add((section_name, lecture_name))

    if not citations:
        return 0.0, []

    invalid = [pair for pair in citations if pair not in allowed]
    valid_count = len(citations) - len(invalid)
    return valid_count / len(citations), invalid


def compute_confidence(answer_text: str, retrieved_docs, citation_valid_ratio: float):
    """Blend retrieval coverage, citation validity and answer length.

    With no retrieved documents the score is a flat 0.05. Otherwise it is
    0.4 * coverage + 0.4 * citation_valid_ratio + 0.2 * nonempty, rounded to
    four decimals, where coverage is len(retrieved_docs) / 4 capped at 1.0
    and nonempty is 1.0 only when the stripped answer is longer than 20
    characters.
    """
    if not retrieved_docs:
        return 0.05

    coverage = min(len(retrieved_docs) / 4.0, 1.0)

    # A falsy answer (None or "") never counts as substantive.
    if answer_text and len(answer_text.strip()) > 20:
        nonempty = 1.0
    else:
        nonempty = 0.0

    score = 0.4 * coverage + 0.4 * citation_valid_ratio + 0.2 * nonempty
    return round(float(score), 4)


def answer_with_validation(question_payload: dict):
    """Answer a student question and report how well it is grounded.

    Reads the module-level `retriever` and `llm_for_qa_improved`; when either
    is None a "not configured" result is returned without calling anything.
    Otherwise the payload is rendered with `prompt_creating_question`, the
    rendered text is used as the retrieval query, and the model is asked to
    answer from the formatted context.

    Returns:
        dict with keys 'answer', 'confidence', 'citations_valid',
        'invalid_citations' and 'citations'. 'citations_valid' is True when
        the valid-citation ratio is greater than zero.
    """
    def _ungrounded(answer, confidence):
        # Result shape shared by both early exits: no citations at all.
        return {
            'answer': answer,
            'confidence': confidence,
            'citations_valid': False,
            'invalid_citations': [],
            'citations': [],
        }

    if retriever is None or llm_for_qa_improved is None:
        return _ungrounded(
            'Retrieval/LLM not configured. Set OPENAI_API_KEY and initialize retriever.',
            0.0,
        )

    question_text = prompt_creating_question.format(**question_payload)
    retrieved_docs = retriever.invoke(question_text)
    if not retrieved_docs:
        return _ungrounded("I don't have enough context to answer confidently.", 0.05)

    context_text = format_context(retrieved_docs)
    qa_chain = chat_prompt_template_retrieving | llm_for_qa_improved | StrOutputParser()
    answer_text = qa_chain.invoke({'question': question_text, 'context': context_text})

    citations = extract_citations(answer_text)
    citation_valid_ratio, invalid = validate_citations(citations, retrieved_docs)
    confidence = compute_confidence(answer_text, retrieved_docs, citation_valid_ratio)

    # When no citation validated, a placeholder citation block is appended to
    # the answer text; the returned 'citations' list is left as extracted.
    if citation_valid_ratio == 0.0:
        answer_text += "\n\nCitations:\n- [Section: Unknown Section, Lecture: Unknown Lecture]"

    return {
        'answer': answer_text,
        'confidence': confidence,
        'citations_valid': citation_valid_ratio > 0,
        'invalid_citations': invalid,
        'citations': citations,
    }


In [None]:
# The improved pipeline needs both credentials and a retriever.
qa_backend_ready = openai_api_key_present and retriever is not None
llm_for_qa_improved = ChatOpenAI(model='gpt-4o-mini', temperature=0) if qa_backend_ready else None

# Retained variable name for notebook continuity
chain_retrieving_improved = llm_for_qa_improved
print('Improved retrieval backend initialized:', llm_for_qa_improved is not None)


In [None]:
# Sample question; the keys match the variables of PROMPT_CREATING_QUESTION.
question_payload = {
    "question_lecture": "Adding a custom calculation",
    "question_title": "Why are we using SUM here? It's unclear to me.",
    "question_body": "This question refers to calculating the GM%."
}

# Run the validated pipeline; the result is a dict (answer, confidence,
# citation fields).
result_improved = answer_with_validation(question_payload)
result_improved


In [None]:
# Show the validated result dict again.
result_improved


### Stream the Response

In [None]:
if openai_api_key_present and retriever is not None and llm_for_qa_improved is not None:
    # Render the question once and use the same text both as the retrieval
    # query and as the prompt's question, mirroring answer_with_validation.
    # A separate hard-coded query string was used for retrieval before, so
    # the context could be fetched for a different query than the question.
    streamed_question = prompt_creating_question.format(
        question_lecture='Adding a custom calculation',
        question_title="Why are we using SUM here? It's unclear to me.",
        question_body='This question refers to calculating the GM%.'
    )

    result_streamed = (
        chat_prompt_template_retrieving
        | llm_for_qa_improved
        | StrOutputParser()
    ).stream({
        'question': streamed_question,
        'context': format_context(retriever.invoke(streamed_question)),
    })
    streaming_ready = True
else:
    # Empty iterable so the streaming loop in the next cell is a no-op.
    result_streamed = []
    streaming_ready = False

# Report whether a real stream exists. The previous `is not None` check was
# always True because the fallback is an empty list, not None.
print('Streaming object prepared:', streaming_ready)


In [None]:
# Create a for-loop to stream the response
# Each chunk is printed as soon as it arrives: end='' keeps the chunks on one
# continuous line and flush=True pushes every piece out immediately.
for chunk in result_streamed:
    print(chunk, end='', flush=True)
