# Setting environment variables

In [14]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONENT = os.getenv("PINECONE_ENVIRONENT")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

openai.api_key = OPENAI_API_KEY
openai.Engine.list()  # check we have authenticated

<OpenAIObject list at 0x7fd859119630> JSON: {
  "data": [
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-edit-001",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-similarity-babbage-001",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "code-davinci-edit-001",
      "object": "engine",
      

In [77]:
import pinecone

pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONENT,
)

In [19]:
CSV_PATH = "./FAQ Dataset.csv"
ROLE_PROMPT = """You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. 
    Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. 
    Keep a positive attitude when answering students and be professional with your wording.
    Write in the style and quality of an expert in university advising and student services with 20+ years of experience in the field.
    Give you answers using the information provided to you below in complete detail."""

In [24]:
# list of URLS to be crawled
URLS = [
    "https://engineering.utdallas.edu/academics/undergraduate-majors/undergrad-advising/advising-faq/"
]

In [22]:
import pandas as pd
from langchain.document_loaders import DataFrameLoader

def get_docs_from_csv(faq_dataset):
    df = pd.DataFrame(faq_dataset)
    df = df[["Question","source"]] # only want questions stored in the db

    loader = DataFrameLoader(df, page_content_column="Question")
    documents = loader.load()
    return documents

In [86]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

def init_vectorstore_from_csv(faq_dataset):
    docs = get_docs_from_csv(faq_dataset)
    return Pinecone.from_documents(docs, OpenAIEmbeddings(), index_name=PINECONE_INDEX)

In [21]:
import pinecone

def init_index():
    if pinecone.list_indexes() == []:
        return pinecone.create_index(PINECONE_INDEX, PINECONE_ENVIRONENT)
    else:
        return pinecone.Index(PINECONE_INDEX)

In [78]:
import pinecone

def reset_index():
    pinecone.delete_index(PINECONE_INDEX) # pinecone docs says to delete index if deleting all vectors in an index
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine") # run only once for initial creation

reset_index()

In [26]:
from haystack.pipelines import Pipeline
from haystack.nodes import Crawler, PreProcessor
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)

crawler = Crawler(
    urls=URLS,   # List of Websites to crawl
    crawler_depth=2,    # How many links to follow
    output_dir="crawled_files",  # The directory to store the crawled files, not very important, we don't use the files in this example
    filter_urls=".edu"
)
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=300,
    split_respect_sentence_boundary=False,
    split_overlap=40
)
indexing_pipeline = Pipeline()
indexing_pipeline.add_node(component=crawler, name="crawler", inputs=['File'])
indexing_pipeline.add_node(component=preprocessor, name="preprocessor", inputs=['crawler'])
indexing_pipeline.add_node(component=document_store, name="document_store", inputs=['preprocessor'])

[WDM] - Downloading: 100%|██████████| 8.79M/8.79M [00:00<00:00, 40.8MB/s]


In [None]:
# result = indexing_pipeline.run()

Vectorstores come with handy methods, similarity search being the most applicable to our use case. These methods return documents that we must handle.

In [54]:
def get_answers_from_csv(docs, faq_dataset):
  qa = []
  for doc in docs:
    answer = faq_dataset.filter(lambda x: x['Question'] == doc[0].page_content) # get row with the corresponding question in query
    if answer:
      qa.append({"Question": f"{answer['Question'][0]}", "Answer": f"{answer['Answering'][0]}", "Source": f"{answer['source'][0]}"}) # adds a dictionary of the row to list
  return qa

## Few Shot Prompts

In [89]:
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from datasets import load_dataset

def build_prompt_from_csv(query):
    faq_dataset = load_dataset("csv", data_files=CSV_PATH) # may throw error, just keep retrying
    faq_dataset = faq_dataset['train']

    index = init_index()
    if index.describe_index_stats()['total_vector_count'] == 0: # if index is empty, reinitialize it
        vectorstore = init_vectorstore_from_csv(faq_dataset)
    else: # else, just use the existing index
        vectorstore = Pinecone.from_existing_index(PINECONE_INDEX, OpenAIEmbeddings(), "text")
    
    docs = vectorstore.similarity_search_with_score(query)
    qa = get_answers_from_csv(docs, faq_dataset)
    
    example_prompt = PromptTemplate(
        input_variables=["Question", "Answer", "Source"], 
        template="Question: {Question}\n{Answer}\nSource: {Source}"
    )

    fewShotPrompt = FewShotPromptTemplate(
        examples=qa,
        example_prompt=example_prompt,
        suffix="Question: {input}",
        input_variables=["input"]
    )

    system_message_prompt = SystemMessage(content=ROLE_PROMPT)
    human_message_prompt = HumanMessagePromptTemplate(prompt=fewShotPrompt)
    chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])
    return chat_prompt

## Connecting to ChatGPT

In [79]:
query = "How do I register for classes?"

In [30]:
from langchain.chat_models import ChatOpenAI

chat = ChatOpenAI(temperature=0)

In [90]:
from langchain import LLMChain

chat_prompt = build_prompt_from_csv(query)
llm_chain = LLMChain(llm=chat, prompt=chat_prompt)

Using custom data configuration default-b08e8f99787ba199
Found cached dataset csv (/Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-b08e8f99787ba199/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 614.28it/s]
Loading cached processed dataset at /Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-b08e8f99787ba199/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-87447f5e49896e3e.arrow
Loading cached processed dataset at /Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-b08e8f99787ba199/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-cf2210f73f66d08a.arrow
Loading cached processed dataset at /Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-b08e8f99787ba199/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-50c7b3a3df8c9d94.arrow
Loading cached processed dataset at /Users/anthonyjaramillo/.cache

input_variables=['input'] output_parser=None partial_variables={} messages=[SystemMessage(content="You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. \n    Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. \n    Keep a positive attitude when answering students and be professional with your wording.\n    Write in the style and quality of an expert in university advising and student services with 20+ years of experience in the field.\n    Give you answers using the information provided to you below in complete detail.", additional_kwargs={}), HumanMessagePromptTemplate(prompt=FewShotPromptTemplate(input_variables=['input'], output_parser=None, partial_variables={}, examples=[{'Question': 'How do I reserve classroom space on campus?', 'Answer': 'For information on reserving classroom space on 

In [91]:
chat_prompt

ChatPromptTemplate(input_variables=['input'], output_parser=None, partial_variables={}, messages=[SystemMessage(content="You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. \n    Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. \n    Keep a positive attitude when answering students and be professional with your wording.\n    Write in the style and quality of an expert in university advising and student services with 20+ years of experience in the field.\n    Give you answers using the information provided to you below in complete detail.", additional_kwargs={}), HumanMessagePromptTemplate(prompt=FewShotPromptTemplate(input_variables=['input'], output_parser=None, partial_variables={}, examples=[{'Question': 'How do I reserve classroom space on campus?', 'Answer': 'For information on reservi

In [92]:
output = llm_chain.run(input=query)
print(output)

To register for classes, students can log in to their Orion account and follow the registration steps outlined in the registration guide provided by the Registrar's Office. The registration process involves searching for classes, selecting classes, and adding them to the student's schedule. It is important to note that students should register for classes as soon as possible to ensure availability of desired courses.
Source: https://registrar.utdallas.edu/registration/registration-basics/


# Making our agent

The below four options are possible tools to give to our agent when combined with data stored from the web crawler. Does not produce great results just from the FAQ csv alone

## RetrievalQAWithSourcesChain experiments

Chain_type seems to affect the outcome dramatically. Rerank provides the best answers, but do not give sources

In [93]:
from langchain.chains import RetrievalQAWithSourcesChain

chain = RetrievalQAWithSourcesChain.from_chain_type(chat, chain_type="map_rerank", retriever=vectorstore.as_retriever())

NameError: name 'vectorstore' is not defined

In [97]:
chain({
    "question": "What kind of services does the University of Texas at Dallas offer to students with disabilities?"
}, return_only_outputs=True)

{'answer': 'The University of Texas at Dallas offers services for students with disabilities, including accommodations and assistive technology. Transportation services are also available. There is no separate admissions procedure for students with disabilities, and the university offers in-state tuition to employed teaching or research assistants. \n',
 'sources': 'https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/, https://registrar.utdallas.edu/residence/faq/'}

In [109]:
chain.__call__("What kind of services does the University of Texas at Dallas offer to students with disabilities?")

{'question': 'What kind of services does the University of Texas at Dallas offer to students with disabilities?',
 'answer': 'The University of Texas at Dallas offers a range of services to students with disabilities, including academic accommodations, disability services coaching, assistive technology, and note-taking services. They also have a Student AccessAbility office that coordinates services and provides support to students with disabilities.',
 'sources': ''}

In [51]:
chain.run("What kind of services does the University of Texas at Dallas offer to students with disabilities?")

ValueError: `run` not supported when there is not exactly one output key. Got ['answer', 'sources'].

## RetrievalQA Experiments

map_rerank and map_reduce seem to work better just for RetrievalQA

In [120]:
from langchain.chains import RetrievalQA

qa = RetrievalQA.from_chain_type(llm=chat, chain_type="map_rerank", retriever=vectorstore.as_retriever(), return_source_documents=True)

In [123]:
qa.__call__("What kind of services does the University of Texas at Dallas offer to students with disabilities?")

{'query': 'What kind of services does the University of Texas at Dallas offer to students with disabilities?',
 'result': 'UT Dallas provides a range of services for students with disabilities including assistive technology, accessible housing accommodations, and disability advising. They also provide support for academic accommodations, such as extended exam time or note-taking assistance.',
 'source_documents': [Document(page_content='Does UT Dallas provide services for students with disabilities?', metadata={'source': 'https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/'}),
  Document(page_content='What transportation services are available for people with disabilities on and around campus? ', metadata={'source': 'https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/'}),
  Document(page_content='Is there a separate admissions procedure for students with disabilities?\n', metadata={'source': 'https://accessability.utda

## CSV Agent

In [214]:
from langchain.agents import create_csv_agent

csv_agent = create_csv_agent(chat, './FAQ Dataset.csv', verbose=True)

In [215]:
csv_agent.run(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to filter the dataframe to only include rows from the "Student Accessibility" label and then select the "Answering" column.
Action: python_repl_ast
Action Input: df[df['Label'] == 'Student Accessibility']['Answering'][0m
Observation: [36;1m[1;3m0     Yes, accommodations and services are provided ...
1     No. The admissions process and criteria are th...
2     No. Documentation should be submitted to ARC o...
3     Students requesting services must provide curr...
4     To ensure that accommodations will be in place...
5     The accommodation process requires disclosure ...
6     ARC evaluates documentation to determine eligi...
7     Appropriate and reasonable accommodations are ...
8     We are currently working with Human Resources ...
9     First, complete the initial registration proce...
10    Students wishing to appeal their ARC testing e...
11                                             Yes.\n\n
12 

'The University of Texas at Dallas provides various accommodations and services for students with disabilities to ensure equal access to educational programs and services as per the Section 504 of the Rehabilitation Act of 1973, the Americans with Disabilities Act (1990), and UT Dallas policy. These services are managed by the AccessAbility Resource Center (ARC). The information is contained in the "Answering" column of the filtered dataframe and has been extracted into a list.'

## Vectorstore Agent Experiments

In [220]:
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)
vectorstore_info = VectorStoreInfo(
    name="UTD QA",
    description="A FAQ over policies for The University of Texas at Dallas",
    vectorstore=vectorstore
)
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)
vectorstore_agent = create_vectorstore_agent(
    llm=chat,
    toolkit=toolkit,
    verbose=True
)

In [221]:
vectorstore_agent.run("What kind of services does the University of Texas at Dallas offer to students with disabilities?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThis seems like a question about UTD policies and resources available to a specific group of students. I should use the UTD QA tool to find the answer.
Action: UTD QA
Action Input: What kind of services does the University of Texas at Dallas offer to students with disabilities?[0m
Observation: [36;1m[1;3m UT Dallas offers a variety of services for students with disabilities, including transportation services on and around campus, a separate admissions procedure, and accommodations for students in the classroom.[0m
Thought:[32;1m[1;3mThis answer seems comprehensive enough, but it's always a good idea to provide sources for the information given.
Action: UTD QA_with_sources
Action Input: What kind of services does the University of Texas at Dallas offer to students with disabilities?[0m
Observation: [33;1m[1;3m{"answer": " The University of Texas at Dallas offers a variety of services for students with disabilities, in

'The University of Texas at Dallas offers a variety of services for students with disabilities, including accommodations, transportation services, and a separate admissions procedure. Sources for this information can be found at https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/ and https://registrar.utdallas.edu/residence/faq/.'

Agent hallucinated. There is not a seperate admissions procedure for students with disabilities at UTD.

## Serper API experiments

In [None]:
os.environ["SERPER_API_KEY"] = os.getenv("SERPER_API_KEY")

In [169]:
from langchain.utilities import GoogleSerperAPIWrapper

serper = GoogleSerperAPIWrapper()
serper.run(query)

'AccessAbility Resource Center  * Supporting Students with Disabilities.  * Student Accommodation Information for Faculty.  * Digital Course Content Accessibility.  * Adaptive Technology.  * Graduation Accommodations.  * Disability and Language.'

## SerPapi API experiments

Not good

In [146]:
from langchain.utilities import SerpAPIWrapper

params = {
    "engine": "bing",
    "gl": "us",
    "hl": "en",
}
serpapi = SerpAPIWrapper(params=params)
serpapi.run(query)

'The University of Texas at Dallas is a public research university in Richardson, Texas. It is one of the largest public universities in the Dallas area and the northernmost institution of the University of …New content will be added above the current area of focus upon selectionThe University of Texas at Dallas is a public research university in Richardson, Texas. It is one of the largest public universities in the Dallas area and the northernmost institution of the University of Texas system. It was initially founded in 1961 as a private research arm of Texas Instruments.Wikipediautdallas.edu'

## Conversational agent for chat models experiments

In [171]:
from langchain.agents import load_tools, Tool
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [203]:
openapi_format_instructions = """First, follow the instructons below:

Question: the input question you must answer
Thought: I will use the LLMChain tool first to try and answer the question. If the answer is insufficient, I will defer to other tools
Action: The action to take is the LLMChain tool in [{tool_names}]
Action Input: what to instruct the AI Action representative.
Observation: The Agent's response

Second, follow the format as shown below:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: what to instruct the AI Action representative.
Observation: The Agent's response
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer. User can't see any of my observations, API responses, links, or tools.
Final Answer: the final answer to the original input question with the right amount of detail

When responding with your Final Answer, remember that the person you are responding to CANNOT see any of your 
Thought/Action/Action Input/Observations, so if there is any relevant information there you need to include it explicitly in your response."""

In [209]:
tools = [
    Tool(
        name = "LLMChain",
        func = llm_chain.run,
        description = "Use this to answer questions about the University of Texas at Dallas. If output does not answer the user's question, use the Google search tool"
    ),
    Tool(
        name = "Google search",
        func = serper.run,
        description = "Useful for when the LLMChain does not provide a sufficient answer"
    )
]

In [222]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
agent_chain = initialize_agent(tools, chat, agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, verbose=True, memory=memory,
                               agent_kwargs={"format_instructions": openapi_format_instructions})

In [223]:
agent_chain.run(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Google search",
    "action_input": "University of Texas at Dallas services for students with disabilities"
}[0m
Observation: [33;1m[1;3mWelcome to the AccessAbility Resource Center! We provide disability and accessibility services to students and employees at the University of Texas at Dallas. Contact for: Student disability accommodations; campus accessibility resources for students. ... Contact for: Campus resources and services for students. D., pHCLE (972-721-5056; iacosta@udallas.edu). The Director assists in an informal resolution of the complaint or guides the complainant to the appropriate ... The guidelines below are intended to allow SDS to determine eligibility for services and appropriate accommodations. SDS staff are available at (972) 721-5056 ... The office that provides advocacy and arranges for academic support and campus accessibility for students with disabilities is usually called Disa

'The University of Texas at Dallas provides a variety of resources and services for students with disabilities through the AccessAbility Resource Center, including disability accommodations, campus accessibility resources, advocacy, academic support, and tools for equal educational opportunities. Students requesting accommodations based on a disability are required to register each semester with the Office of Disability Services.'

## 