# Create a Q&A Chatbot with LangChain Project

### Set the OpenAI API Key as an Environment Variable

In [22]:
# Load the dotenv IPython extension, then run its %dotenv magic, which looks for
# a .env file. That file is expected to define OPENAI_API_KEY, which the OpenAI
# clients created later in this notebook need.
%load_ext dotenv
%dotenv

The dotenv extension is already loaded. To reload it, use:
  %reload_ext dotenv
cannot find .env file


### Install the Libraries

In [23]:
!pip install langchain-community langchain-text-splitters langchain-core langchain-openai langchain-chroma
!pip install pypdf python-dotenv



### Import the Libraries

Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting langchain-text-splitters
  Downloading langchain_text_splitters-1.1.0-py3-none-any.whl.metadata (2.7 kB)
Collecting langchain-openai
  Downloading langchain_openai-1.1.9-py3-none-any.whl.metadata (3.1 kB)
Collecting langchain-chroma
  Downloading langchain_chroma-1.1.0-py3-none-any.whl.metadata (1.9 kB)
Collecting langchain-classic<2.0.0,>=1.0.0 (from langchain-community)
  Downloading langchain_classic-1.0.1-py3-none-any.whl.metadata (4.2 kB)
Collecting requests<3.0.0,>=2.32.5 (from langchain-community)
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting dataclasses-json<0.7.0,>=0.6.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-core
  Downloading langchain_core-1.2.11-py3-none-any.whl.metadata (4.4 kB)
Collecting chromadb<2.0.0,>=1.3.5 (from langchain-chroma)
  Downloading

In [5]:
from langchain_community.document_loaders.pdf import PyPDFLoader

from langchain_text_splitters import (MarkdownHeaderTextSplitter,
                                      TokenTextSplitter)

from langchain_core.output_parsers.string import StrOutputParser
from langchain_core.messages import SystemMessage
from langchain_core.prompts import (PromptTemplate,
                                    HumanMessagePromptTemplate,
                                    ChatPromptTemplate)
from langchain_core.runnables import (RunnablePassthrough,
                                      RunnableLambda,
                                      chain)

from langchain_openai import (ChatOpenAI,
                              OpenAIEmbeddings)

from langchain_chroma.vectorstores import Chroma

### Load the Course Transcript

In [6]:
# Read the course transcript PDF from the working directory. load() returns a
# list of documents, each exposing its text as `page_content`.
loader_pdf = PyPDFLoader("Introduction_to_Tableau.pdf")
docs_list = loader_pdf.load()

In [7]:
len(docs_list)

49

In [8]:
# Stitch the text of every loaded document into a single string (no separator),
# so the transcript can be re-split on its Markdown headers.
string_list_concat = "".join(doc.page_content for doc in docs_list)

In [9]:
string_list_concat



### Split the Course Transcript with MarkdownHeaderTextSplitter

In [11]:
# Split the transcript on Markdown headers. Each tuple pairs a header marker
# with the metadata key it is registered under: "#" -> "Section Title",
# "##" -> "Lecture Title".
md_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on = [("#", "Section Title"),
                           ("##", "Lecture Title")]
)

# Result is a list of documents (each with `page_content`), one per split.
docs_list_md_split = md_splitter.split_text(string_list_concat)

In [12]:
len(docs_list_md_split)

22

### Create a Chain to Correct the Course Transcript

In [13]:
# Plain-text content of every header-split document, in order; this is the
# batch input for the transcript-correction chain.
string_list_split = [doc.page_content for doc in docs_list_md_split]

In [14]:
string_list_split

["Hi, everyone.\nI'm Ned and I'll be your instructor for this\ncourse.\nTableau is an invaluable tool.\nOne needs to learn on their journey to become a\nsuccessful business intelligence analyst or\ndata scientist.\nThe art of these professions is storytelling\nusing data to tell stories and convince top\nmanagement of the right course of action.\nBy completing this part of the program, you\nwill know how to create charts and dashboards\nin tableaux.\nThis is an essential step on your way to a data\nscientist role.",
 "Tableau has grown to become one of the most\npopular business intelligence tools in the\nentire world.\nIt is A B I software that allows non technical\nusers to visualize their data and work with it\nalmost immediately lowering,\nknow how barriers dramatically in the past.\nBusiness analysts needed the help of it\npersonnel who could assist them in gathering\nraw data and preprocessing it.\nOnly then could business analysts start working\non the visualization of such data

In [15]:
# System prompt for the transcript clean-up chain: lists the corrections the
# model should apply. The text must not contain stray quote characters, since
# every character here is sent to the model verbatim.
PROMPT_FORMATTING_S = '''Improve the following Tableau lecture transcript by:
- Splitting the text into meaningful paragraphs
- Correcting any misplaced punctuation
- Fixing mistranscribed words (e.g., changing 'tableaux' to 'Tableau')
'''

# Human-message template; {lecture_transcript} is filled with one transcript
# chunk per call.
PROMPT_TEMPLATE_FORMATTING_H = '''This is the transcript:
{lecture_transcript}
'''

In [16]:
# Wrap the fixed system instructions and the per-transcript human template as
# message objects.
prompt_formatting_s = SystemMessage(content=PROMPT_FORMATTING_S)
prompt_template_formatting_h = HumanMessagePromptTemplate.from_template(template=PROMPT_TEMPLATE_FORMATTING_H)

# Chat prompt: the system message first, then the human message that carries
# the transcript.
chat_prompt_template_formatting = ChatPromptTemplate(messages=[prompt_formatting_s,
                                                               prompt_template_formatting_h])

In [17]:
# Chat model shared by every chain in this notebook. A fixed seed and zero
# temperature are set, presumably to keep responses as repeatable as possible.
# Construction fails with OpenAIError unless OPENAI_API_KEY is set in the
# environment (or an api_key is passed explicitly).
chat = ChatOpenAI(model_name='gpt-4o',
                  seed=365,
                  temperature=0)

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [18]:
str_output_parser = StrOutputParser()

In [19]:
# Transcript-correction chain: fill the chat prompt, send it to the chat model,
# then parse the model's reply with the string output parser.
chain_formatting = (chat_prompt_template_formatting
                    | chat
                    | str_output_parser)

NameError: name 'chat' is not defined

In [20]:
string_list_formatted = chain_formatting.batch(string_list_split)

NameError: name 'chain_formatting' is not defined

In [21]:
string_list_formatted

NameError: name 'string_list_formatted' is not defined

In [None]:
# Show each corrected transcript followed by a dashed separator.
for formatted_text in string_list_formatted:
    print(formatted_text, '''
-------------------
    ''', sep="\n")

In [None]:
# Overwrite each header-split document's text with its corrected version.
# zip() pairs the two sequences by position and stops at the shorter one.
# NOTE(review): this mutates docs_list_md_split in place, so any cell that
# reads its page_content afterwards (including a re-run of an earlier cell)
# sees the corrected text, not the raw transcript.
for i, j in zip(docs_list_md_split, string_list_formatted):
    i.page_content = j

In [None]:
# Show the (now corrected) text of every document, each followed by a dashed
# separator.
for corrected_doc in docs_list_md_split:
    print(corrected_doc.page_content, '''
-------------------
    ''', sep="\n")

In [None]:
len(docs_list_md_split)

### Split the Lectures with TokenTextSplitter

In [None]:
# Token-based splitter configured with the "cl100k_base" encoding, a chunk size
# of 500 and an overlap of 50 between consecutive chunks.
# NOTE(review): sizes are presumably counted in tokens of that encoding — confirm.
token_splitter = TokenTextSplitter(encoding_name="cl100k_base",
                                   chunk_size=500,
                                   chunk_overlap=50)

In [None]:
docs_list_tokens_split = token_splitter.split_documents(docs_list_md_split)

In [None]:
len(docs_list_tokens_split)

### Create Embeddings, Vector Store, and Retriever

In [None]:
embedding = OpenAIEmbeddings(model='text-embedding-3-small')

In [None]:
# Open the Chroma store persisted under ./intro-to-tableau, using `embedding`
# to embed both stored documents and incoming queries.
vectorstore = Chroma(persist_directory = "./intro-to-tableau",
                     embedding_function = embedding)

# If the store holds no documents yet (e.g. first run on a fresh checkout), the
# retriever would silently return nothing. Index the token-split lectures only
# in that case, so re-running this cell never inserts duplicates.
if not vectorstore.get(limit=1)["documents"]:
    vectorstore.add_documents(docs_list_tokens_split)

In [None]:
len(vectorstore.get()["documents"])

In [None]:
# Retriever over the vector store using the 'mmr' search type, configured to
# return k=2 documents per query with lambda_mult=0.7.
# NOTE(review): 'mmr' is presumably maximal marginal relevance, with lambda_mult
# trading relevance against diversity — confirm against the LangChain docs.
retriever = vectorstore.as_retriever(search_type = 'mmr',
                                     search_kwargs = {'k':2,
                                                      'lambda_mult':0.7})

### Create Prompts and Prompt Templates for the Q&A Chatbot Chain

In [None]:
# Template that flattens the three parts of a student question (lecture, title,
# body) into a single string.
PROMPT_CREATING_QUESTION = '''Lecture: {question_lecture}
Title: {question_title}
Body: {question_body}'''

# System instructions for the answering model: answer only from the supplied
# context and finish with a "Resources:" list of section/lecture titles.
PROMPT_RETRIEVING_S = '''You will receive a question from a student taking a Tableau course, which includes a title and a body.
The corresponding lecture will also be provided.

Answer the question using only the provided context.

At the end of your response, include the section and lecture names where the context was drawn from, formatted as follows:
Resources:
Section: *Section Title*, Lecture: *Lecture Title*
...
Replace *Section Title* and *Lecture Title* with the appropriate titles.'''

# Human-message template carrying the flattened question and the context.
PROMPT_TEMPLATE_RETRIEVING_H = '''This is the question:
{question}

This is the context:
{context}'''

# Prompt objects built from the raw strings above.
prompt_creating_question = PromptTemplate.from_template(template=PROMPT_CREATING_QUESTION)
prompt_retrieving_s = SystemMessage(content=PROMPT_RETRIEVING_S)
prompt_template_retrieving_h = HumanMessagePromptTemplate.from_template(template=PROMPT_TEMPLATE_RETRIEVING_H)

# Chat prompt for the Q&A chain: system message first, then the human message.
chat_prompt_template_retrieving = ChatPromptTemplate([prompt_retrieving_s,
                                                      prompt_template_retrieving_h])

### Create the First Version of the Q&A Chatbot Chain

In [None]:
# First version of the Q&A chain. The retrieved documents are passed to the chat
# prompt as-is, without any formatting step.
chain_retrieving = (prompt_creating_question
                    # keep only the rendered question text
                    | RunnableLambda(lambda x: x.text)
                    # feed that text to the retriever ('context') and pass it
                    # through unchanged ('question')
                    | {'context': retriever,
                       'question': RunnablePassthrough()}
                    | chat_prompt_template_retrieving
                    | chat
                    | str_output_parser)

In [None]:
result = chain_retrieving.invoke({"question_lecture": "Adding a custom calculation",
                                  "question_title": "Why are we using SUM here? It's unclear to me.",
                                  "question_body": "This question refers to calculating the GM%."})

In [None]:
result

### Create a Runnable Function to Format the Context

In [None]:
@chain
def format_context(dictionary):
    """Render the retrieved documents as one labelled text block for the prompt.

    Args:
        dictionary: Mapping with a "context" entry holding the retrieved
            documents (each exposing ``metadata`` and ``page_content``) and a
            "question" entry holding the question text.

    Returns:
        A new dict with the same "question" and with "context" replaced by a
        string containing one numbered section per document. When no documents
        were retrieved, the context is an empty string.
    """
    sections = []

    for number, document in enumerate(dictionary["context"], start=1):
        # A document may lack one of the title keys (for instance text that was
        # not located under a matching header); use a placeholder instead of
        # failing the whole chain with a KeyError.
        section_title = document.metadata.get("Section Title", "N/A")
        lecture_title = document.metadata.get("Lecture Title", "N/A")

        sections.append(f'''
Document {number}
Section Title: {section_title}
Lecture Title: {lecture_title}
Content: {document.page_content}

-------------------
''')

    return {"context": "".join(sections),
            "question": dictionary["question"]}

In [None]:
# Improved Q&A chain: same as the first version, plus a format_context step that
# turns the retrieved documents into a labelled text block before prompting.
chain_retrieving_improved = (prompt_creating_question
                             # keep only the rendered question text
                             | RunnableLambda(lambda x: x.text)
                             | {'context': retriever,
                                'question': RunnablePassthrough()}
                             # replace the document list with formatted text
                             | format_context
                             | chat_prompt_template_retrieving
                             | chat
                             | str_output_parser)

In [None]:
result_improved = chain_retrieving_improved.invoke({"question_lecture": "Adding a custom calculation",
                                                    "question_title": "Why are we using SUM here? It's unclear to me.",
                                                    "question_body": "This question refers to calculating the GM%."})

In [None]:
result_improved

### Stream the Response

In [None]:
result_streamed = chain_retrieving_improved.stream({"question_lecture": "Adding a custom calculation",
                                                    "question_title": "Why are we using SUM here? It's unclear to me.",
                                                    "question_body": "This question refers to calculating the GM%."})

In [None]:
result_streamed

In [None]:
for chunk in result_streamed:
    print(chunk, end="")