# In [None]:
# ---------------------------------------------------------
# STEP 1: SETUP & IMPORTS
# ---------------------------------------------------------
import os
from dotenv import load_dotenv

# LangChain Imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import MarkdownHeaderTextSplitter, TokenTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_chroma import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Read key/value pairs from a local .env file into the process environment
load_dotenv()

# Stop right away if the OpenAI key is missing or empty, before any API call
if not os.environ.get("OPENAI_API_KEY"):
    raise ValueError("Please set your OPENAI_API_KEY in the .env file")

print("Libraries imported and Environment loaded successfully.")

# ---------------------------------------------------------
# STEP 2: DATA LOADING & PREPROCESSING
# ---------------------------------------------------------
def load_and_split_documents(pdf_path):
    """Load a PDF and split it into token-sized, header-tagged chunks.

    The PDF pages are merged into one text, split on Markdown headers
    (``#`` -> 'Section Title', ``##`` -> 'Lecture Title') and then split
    again into chunks of at most 500 ``cl100k_base`` tokens with a
    50-token overlap.

    Args:
        pdf_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The list of chunk documents produced by
        ``TokenTextSplitter.split_documents``.
    """
    print(f"Loading {pdf_path}...")
    loader = PyPDFLoader(pdf_path)
    raw_documents = loader.load()

    # Merge pages into a single string for Markdown splitting.
    # Pages are joined with a newline: the header splitter only recognises
    # a header at the start of a line, so gluing pages together with ""
    # would hide any header that opens a page (and fuse the words around
    # the page break).
    full_text = "\n".join(doc.page_content for doc in raw_documents)

    # Split 1: Logical Split by Headers (Sections/Lectures)
    print("Splitting by Markdown headers...")
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Section Title"),
            ("##", "Lecture Title")
        ]
    )
    md_docs = markdown_splitter.split_text(full_text)

    # Split 2: Token Split (Chunking for Embedding)
    print("Splitting by Tokens...")
    token_splitter = TokenTextSplitter(
        encoding_name="cl100k_base",
        chunk_size=500,
        chunk_overlap=50
    )
    token_docs = token_splitter.split_documents(md_docs)

    print(f"Total chunks created: {len(token_docs)}")
    return token_docs

# Run the loading function
pdf_path = "data/Introduction_to_Tableau.pdf"  # Ensure file is in 'data' folder
documents = load_and_split_documents(pdf_path)

# ---------------------------------------------------------
# STEP 3: VECTOR STORE & EMBEDDINGS
# ---------------------------------------------------------
# Initialize OpenAI Embeddings
embedding_model = OpenAIEmbeddings(model='text-embedding-3-small')

# Create Vector Store (Chroma)
# Note: We use a local directory to persist the database
persist_directory = "./chroma_db"

# Give every chunk a stable ID derived from the source path and its position.
# Without explicit IDs each run stores the chunks under new random IDs, so
# re-running this cell keeps appending duplicate copies to the persisted
# collection and retrieval starts returning the same passage several times.
# NOTE(review): entries written by earlier runs (random IDs, or chunks beyond
# the current chunk count) are not removed; delete ./chroma_db once to reset.
document_ids = [f"{pdf_path}::chunk-{index}" for index in range(len(documents))]

print("Creating Vector Store (this may take a moment)...")
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding_model,
    ids=document_ids,
    persist_directory=persist_directory
)

print("Vector Store created and persisted.")

# Create Retriever
# MMR search returning 5 chunks; lambda_mult=0.8 leans towards relevance
# over diversity.
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 5, "lambda_mult": 0.8}
)

# ---------------------------------------------------------
# STEP 4: RAG CHAIN CONSTRUCTION
# ---------------------------------------------------------
# Chat model used to generate the final answer (GPT-4o at temperature 0)
llm = ChatOpenAI(temperature=0, model="gpt-4o")

# Prompt text: {context} receives the retrieved passages and {question}
# receives the user's question.
template = """
You are a helpful Q&A chatbot for the 'Introduction to Tableau' course.
Use the following pieces of retrieved context to answer the question.
If you don't know the answer, just say that you don't know.

Context:
{context}

Question:
{question}

At the end of your answer, explicitly list the 'Lecture Title' and 'Section Title' 
from the metadata of the context you used.
"""

# Wrap the raw text in a chat prompt template
prompt = ChatPromptTemplate.from_template(template)

# Formatting function: joins retrieved docs and keeps their title metadata
def format_docs(docs):
    """Render retrieved documents as a single context string for the prompt.

    Each document's text is preceded by its 'Section Title' and
    'Lecture Title' metadata values when they are present. The prompt asks
    the model to cite these titles, so they have to be part of the text it
    actually sees. Documents without those metadata keys are rendered as
    their text only.

    Args:
        docs: Iterable of objects exposing ``page_content`` and, optionally,
            a ``metadata`` mapping.

    Returns:
        The rendered documents joined by blank lines ("" for no documents).
    """
    rendered = []
    for doc in docs:
        metadata = getattr(doc, "metadata", None) or {}
        title_lines = [
            f"{key}: {metadata[key]}"
            for key in ("Section Title", "Lecture Title")
            if metadata.get(key)
        ]
        rendered.append("\n".join([*title_lines, doc.page_content]))
    return "\n\n".join(rendered)

# Build the Chain (LCEL)
# First step: fan the incoming question out into the two prompt variables.
# "context" runs the retriever and formats its hits; "question" is passed
# through unchanged.
prompt_inputs = {
    "context": retriever | format_docs,
    "question": RunnablePassthrough(),
}

# Then: fill the prompt, call the model, and reduce the reply to plain text
rag_chain = prompt_inputs | prompt | llm | StrOutputParser()

print("RAG Chain initialized.")

# ---------------------------------------------------------
# STEP 5: INTERACTIVE CHAT (Streaming)
# ---------------------------------------------------------
def chat_with_bot(question):
    """Ask the RAG chain one question and print the answer as it streams.

    Args:
        question: The question text, passed unchanged to ``rag_chain``.
    """
    divider = "-" * 50
    print(f"\nQuestion: {question}\n")
    print("Answer: ", end="", flush=True)

    # Print each streamed piece immediately so the answer appears incrementally
    for piece in rag_chain.stream(question):
        print(piece, end="", flush=True)
    print("\n" + divider)

# Example Usage: run a couple of sample questions through the chatbot
for sample_question in (
    "How do I create a calculated field in Tableau?",
    "What is the difference between a join and a blend?",
):
    chat_with_bot(sample_question)