# Setting environment variables

In [3]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()

# Read API credentials and Pinecone settings from the process environment
# (load_dotenv() was called just above). os.getenv returns None for a variable
# that is not set, so a missing entry is not reported by this cell itself.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_ENVIRONMENT = os.getenv("PINECONE_ENVIRONMENT")
PINECONE_INDEX = os.getenv("PINECONE_INDEX")

openai.api_key = OPENAI_API_KEY
openai.Engine.list()  # check we have authenticated (the full engine list is displayed as the cell output)

<OpenAIObject list at 0x7fbdb8af0090> JSON: {
  "data": [
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-edit-001",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "gpt-3.5-turbo-0301",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-003",
      "object": "engine",
      "owner": "openai-internal",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "ope

In [4]:
import pinecone

# Configure the Pinecone client with the key and environment read from the
# environment variables in the first cell.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONMENT,
)

In [5]:
ROLE_PROMPT = """You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. 
    Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. 
    Keep a positive attitude when answering students and be professional with your wording.
    Write in the style and quality of an expert in university advising and student services with 20+ years of experience in the field.
    Give your answers using the information provided to you below in complete detail."""

# Initializing Pinecone with FAQ dataset

In [5]:
CSV_PATH = "./FAQ Dataset.csv"

In [4]:
from datasets import load_dataset

FAQ_DATASET = load_dataset("csv", data_files=CSV_PATH)['train'] # may throw error, just keep retrying

Using custom data configuration default-b08e8f99787ba199
Found cached dataset csv (/Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-b08e8f99787ba199/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 589.67it/s]


In [6]:
import pandas as pd
from langchain.document_loaders import DataFrameLoader

def get_docs_from_csv(faq_dataset):
    """Turn the FAQ questions into LangChain documents.

    Only the "Question" and "source" columns are kept; the question text becomes
    the page content, so answers are never stored in the vector db.

    Args:
        faq_dataset: tabular FAQ data accepted by ``pd.DataFrame`` that has
            "Question" and "source" columns.

    Returns:
        The documents produced by ``DataFrameLoader.load()``.
    """
    question_frame = pd.DataFrame(faq_dataset)[["Question", "source"]]
    question_loader = DataFrameLoader(question_frame, page_content_column="Question")
    return question_loader.load()

In [7]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

def init_vectorstore_from_csv(faq_dataset):
    """Embed the FAQ questions and load them into the Pinecone index.

    Args:
        faq_dataset: FAQ data passed straight to ``get_docs_from_csv``.

    Returns:
        The vector store returned by ``Pinecone.from_documents`` for the index
        named by ``PINECONE_INDEX``.
    """
    question_docs = get_docs_from_csv(faq_dataset)
    embedder = OpenAIEmbeddings()
    return Pinecone.from_documents(question_docs, embedder, index_name=PINECONE_INDEX)

In [26]:
import pinecone

def init_index(dimension=1536, metric="cosine"):
    """Return a handle to the Pinecone index named by ``PINECONE_INDEX``.

    The index is created first when it does not exist yet. ``create_index`` is
    called only for its side effect and the handle is fetched afterwards, so
    callers get an index object on both paths.

    Args:
        dimension: vector size used when the index has to be created. The
            default of 1536 matches the dimension the existing index reports.
        metric: similarity metric used when the index has to be created.

    Returns:
        ``pinecone.Index(PINECONE_INDEX)``.
    """
    if PINECONE_INDEX not in pinecone.list_indexes():
        # The second argument of create_index is the dimension, not the environment;
        # the environment was already supplied through pinecone.init().
        pinecone.create_index(PINECONE_INDEX, dimension=dimension, metric=metric)
    return pinecone.Index(PINECONE_INDEX)

In [9]:
""" # Only use when switching datasets

import pinecone

def reset_index():
    pinecone.delete_index(PINECONE_INDEX) # pinecone docs says to delete index if deleting all vectors in an index
    pinecone.create_index(PINECONE_INDEX, dimension=1536, metric="cosine") # run only once for initial creation

reset_index() """

In [9]:
def get_answers_from_csv(docs, faq_dataset):
    """Look up the FAQ row that belongs to each retrieved question.

    Args:
        docs: iterable of ``(document, score)`` pairs, as returned by
            ``similarity_search_with_score``; only ``document.page_content``
            (the question text) is read.
        faq_dataset: iterable of row mappings with "Question", "Answering" and
            "source" keys.

    Returns:
        A list of ``{"Question", "Answer", "Source"}`` dicts with string values,
        one per retrieved question found in the dataset, in the order of
        ``docs``. Retrieved questions with no matching row are skipped.
    """
    # Index the dataset once instead of scanning it again for every retrieved
    # document. setdefault keeps the first row for duplicated questions, which
    # is the row a per-question filter(...)[0] lookup would pick.
    rows_by_question = {}
    for row in faq_dataset:
        rows_by_question.setdefault(row["Question"], row)

    qa = []
    for doc in docs:
        row = rows_by_question.get(doc[0].page_content)
        if row is not None:
            qa.append({
                "Question": f"{row['Question']}",
                "Answer": f"{row['Answering']}",
                "Source": f"{row['source']}",
            })
    return qa

In [33]:
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

def build_prompt_from_csv(query, faq_dataset):
    """Build a chat prompt whose few-shot examples are the FAQ entries closest to ``query``.

    The Pinecone index is populated from ``faq_dataset`` when it holds no
    vectors; otherwise the existing index is reused. The stored questions most
    similar to ``query`` are looked up in ``faq_dataset`` and rendered as
    question/answer/source examples ahead of the user's question.

    Args:
        query: the user's question, used for the similarity search.
        faq_dataset: FAQ data used both to seed an empty index and to resolve
            the answers for the retrieved questions.

    Returns:
        A ``ChatPromptTemplate`` made of the ``ROLE_PROMPT`` system message and a
        human message template that expects an ``input`` variable.
    """
    pinecone_index = init_index()
    index_is_empty = pinecone_index.describe_index_stats()['total_vector_count'] == 0
    if not index_is_empty:
        # vectors are already stored: wrap the existing index
        store = Pinecone(pinecone_index, OpenAIEmbeddings().embed_query, text_key="text")
    else:
        # nothing stored yet: embed and upload the FAQ questions
        store = init_vectorstore_from_csv(faq_dataset)

    scored_matches = store.similarity_search_with_score(query)
    examples = get_answers_from_csv(scored_matches, faq_dataset)

    example_template = PromptTemplate(
        template="Question: {Question}\n{Answer}\nSource: {Source}",
        input_variables=["Question", "Answer", "Source"],
    )
    few_shot = FewShotPromptTemplate(
        examples=examples,
        example_prompt=example_template,
        suffix="Question: {input}",
        input_variables=["input"],
    )

    messages = [
        SystemMessage(content=ROLE_PROMPT),
        HumanMessagePromptTemplate(prompt=few_shot),
    ]
    return ChatPromptTemplate.from_messages(messages)

# Using FAQ dataset with ChatGPT

In [13]:
query = "How do I register for classes?"

In [14]:
from langchain import LLMChain

def query_with_csv(query, llm):
    """Answer ``query`` with ``llm``, using a prompt built from the FAQ dataset.

    Args:
        query: the user's question.
        llm: the language model handed to ``LLMChain``.

    Returns:
        The output of ``LLMChain.run`` for this question.
    """
    prompt = build_prompt_from_csv(query, FAQ_DATASET)
    return LLMChain(llm=llm, prompt=prompt).run(input=query)

In [47]:
from langchain.chat_models import ChatOpenAI
from langchain.llms import OpenAI

#chat = ChatOpenAI(temperature=0)
# NOTE: despite its name, `chat` is bound to langchain.llms.OpenAI here, not to
# the ChatOpenAI chat model (that variant is the commented-out line above).
chat = OpenAI(temperature=0)
output = query_with_csv(query, chat)

100%|██████████| 1/1 [00:00<00:00, 370.91ba/s]
100%|██████████| 1/1 [00:00<00:00, 456.20ba/s]
100%|██████████| 1/1 [00:00<00:00, 373.13ba/s]
100%|██████████| 1/1 [00:00<00:00, 497.31ba/s]


In [48]:
print(output)


To register for classes, please visit the Office of the Registrar website. You will need to log in with your UTD NetID and password. Once logged in, you can search for classes, add them to your shopping cart, and then register for them.
Source: https://registrar.utdallas.edu/registration/


## Making our agent

The four options below are possible tools to give to our agent when combined with data stored from the web crawler. They do not produce great results from the FAQ CSV alone.

## RetrievalQAWithSourcesChain experiments

`chain_type` seems to affect the outcome dramatically. `map_rerank` provides the best answers, but does not return sources.

In [36]:
from langchain.chains import RetrievalQAWithSourcesChain

# Wrap the Pinecone index as a LangChain vector store and build a
# question-answering-with-sources chain on top of its retriever, using the
# "map_rerank" chain type.
index = init_index()
vectorstore = Pinecone(index, OpenAIEmbeddings().embed_query, text_key="text")
chain = RetrievalQAWithSourcesChain.from_chain_type(chat, chain_type="map_rerank", retriever=vectorstore.as_retriever())

In [39]:
chain({
    "question": "What kind of services does the University of Texas at Dallas offer to students with disabilities?"
}, return_only_outputs=True)

{'answer': 'UT Dallas offers a variety of services for students with disabilities, including accommodations for testing and classroom participation, assistive technology, and accessible housing. They also have a Disability Services office to coordinate these services.',
 'sources': ''}

In [41]:
chain.__call__("What kind of services does the University of Texas at Dallas offer to students with disabilities?")

{'question': 'What kind of services does the University of Texas at Dallas offer to students with disabilities?',
 'answer': 'UT Dallas provides a range of services, including academic accommodations, assistive technology, accessible parking, and more through their Student Accessibility Services office. They can also provide resources and referrals for off-campus services if needed. ',
 'sources': ''}

In [51]:
# `run` only supports chains with exactly one output key. This chain returns both
# 'answer' and 'sources', so the call is expected to fail; call `chain({...})`
# directly instead. The error is caught and shown so that "Run All" continues
# past this demonstration cell.
try:
    chain.run("What kind of services does the University of Texas at Dallas offer to students with disabilities?") # Doesn't work
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: `run` not supported when there is not exactly one output key. Got ['answer', 'sources'].

## RetrievalQA Experiments

map_rerank and map_reduce seem to work better just for RetrievalQA

In [45]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# Same vector store setup as the previous experiment, this time with RetrievalQA.
# return_source_documents=True gives the chain a second output key
# ('source_documents') next to 'result'.
index = init_index()
vectorstore = Pinecone(index, OpenAIEmbeddings().embed_query, text_key="text")
qa = RetrievalQA.from_chain_type(llm=OpenAI(), chain_type="map_rerank", retriever=vectorstore.as_retriever(), return_source_documents=True)

In [46]:
# `run` only supports chains with exactly one output key. With
# return_source_documents=True this chain returns 'result' and
# 'source_documents', so the call is expected to fail; use `qa({"query": ...})`
# to get both keys. The error is caught and shown so that "Run All" continues.
try:
    qa.run("What kind of services does the University of Texas at Dallas offer to students with disabilities?")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: `run` not supported when there is not exactly one output key. Got ['result', 'source_documents'].

## CSV Agent

In [214]:
from langchain.agents import create_csv_agent

# Reuse CSV_PATH so the agent reads the same FAQ file that FAQ_DATASET was loaded from.
csv_agent = create_csv_agent(chat, CSV_PATH, verbose=True)

In [215]:
csv_agent.run(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to filter the dataframe to only include rows from the "Student Accessibility" label and then select the "Answering" column.
Action: python_repl_ast
Action Input: df[df['Label'] == 'Student Accessibility']['Answering'][0m
Observation: [36;1m[1;3m0     Yes, accommodations and services are provided ...
1     No. The admissions process and criteria are th...
2     No. Documentation should be submitted to ARC o...
3     Students requesting services must provide curr...
4     To ensure that accommodations will be in place...
5     The accommodation process requires disclosure ...
6     ARC evaluates documentation to determine eligi...
7     Appropriate and reasonable accommodations are ...
8     We are currently working with Human Resources ...
9     First, complete the initial registration proce...
10    Students wishing to appeal their ARC testing e...
11                                             Yes.\n\n
12 

'The University of Texas at Dallas provides various accommodations and services for students with disabilities to ensure equal access to educational programs and services as per the Section 504 of the Rehabilitation Act of 1973, the Americans with Disabilities Act (1990), and UT Dallas policy. These services are managed by the AccessAbility Resource Center (ARC). The information is contained in the "Answering" column of the filtered dataframe and has been extracted into a list.'

## Vectorstore Agent Experiments

In [220]:
from langchain.agents.agent_toolkits import (
    create_vectorstore_agent,
    VectorStoreToolkit,
    VectorStoreInfo,
)
# Describe the existing `vectorstore` to the agent; the name and description
# are the text the agent sees when deciding whether to use this tool.
vectorstore_info = VectorStoreInfo(
    name="UTD QA",
    description="A FAQ over policies for The University of Texas at Dallas",
    vectorstore=vectorstore
)
# The toolkit exposes the store as agent tools (the recorded run below uses
# "UTD QA" and "UTD QA_with_sources").
toolkit = VectorStoreToolkit(vectorstore_info=vectorstore_info)
vectorstore_agent = create_vectorstore_agent(
    llm=chat,
    toolkit=toolkit,
    verbose=True
)

In [221]:
vectorstore_agent.run("What kind of services does the University of Texas at Dallas offer to students with disabilities?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThis seems like a question about UTD policies and resources available to a specific group of students. I should use the UTD QA tool to find the answer.
Action: UTD QA
Action Input: What kind of services does the University of Texas at Dallas offer to students with disabilities?[0m
Observation: [36;1m[1;3m UT Dallas offers a variety of services for students with disabilities, including transportation services on and around campus, a separate admissions procedure, and accommodations for students in the classroom.[0m
Thought:[32;1m[1;3mThis answer seems comprehensive enough, but it's always a good idea to provide sources for the information given.
Action: UTD QA_with_sources
Action Input: What kind of services does the University of Texas at Dallas offer to students with disabilities?[0m
Observation: [33;1m[1;3m{"answer": " The University of Texas at Dallas offers a variety of services for students with disabilities, in

'The University of Texas at Dallas offers a variety of services for students with disabilities, including accommodations, transportation services, and a separate admissions procedure. Sources for this information can be found at https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/ and https://registrar.utdallas.edu/residence/faq/.'

The agent hallucinated: there is no separate admissions procedure for students with disabilities at UTD.

## Serper API experiments

In [None]:
# load_dotenv() in the first cell has already placed .env entries into
# os.environ, so nothing needs to be copied here. Only verify the key is
# present, and fail with a clear message instead of a TypeError from assigning
# None into os.environ.
if not os.getenv("SERPER_API_KEY"):
    raise EnvironmentError(
        "SERPER_API_KEY is not set; add it to your .env file or environment "
        "before running the Serper cells."
    )

In [169]:
from langchain.utilities import GoogleSerperAPIWrapper

serper = GoogleSerperAPIWrapper()
serper.run(query)

'AccessAbility Resource Center  * Supporting Students with Disabilities.  * Student Accommodation Information for Faculty.  * Digital Course Content Accessibility.  * Adaptive Technology.  * Graduation Accommodations.  * Disability and Language.'

## SerpAPI experiments

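Not good: the Bing result returned below is a generic description of the university rather than an answer to the query.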

In [146]:
from langchain.utilities import SerpAPIWrapper

params = {
    "engine": "bing",
    "gl": "us",
    "hl": "en",
}
serpapi = SerpAPIWrapper(params=params)
serpapi.run(query)

'The University of Texas at Dallas is a public research university in Richardson, Texas. It is one of the largest public universities in the Dallas area and the northernmost institution of the University of …New content will be added above the current area of focus upon selectionThe University of Texas at Dallas is a public research university in Richardson, Texas. It is one of the largest public universities in the Dallas area and the northernmost institution of the University of Texas system. It was initially founded in 1961 as a private research arm of Texas Instruments.Wikipediautdallas.edu'

## Conversational agent for chat models experiments

In [171]:
from langchain.agents import load_tools, Tool
from langchain.memory import ConversationBufferMemory
from langchain.agents import initialize_agent
from langchain.agents import AgentType

In [203]:
openapi_format_instructions = """First, follow the instructons below:

Question: the input question you must answer
Thought: I will use the LLMChain tool first to try and answer the question. If the answer is insufficient, I will defer to other tools
Action: The action to take is the LLMChain tool in [{tool_names}]
Action Input: what to instruct the AI Action representative.
Observation: The Agent's response

Second, follow the format as shown below:
Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: what to instruct the AI Action representative.
Observation: The Agent's response
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer. User can't see any of my observations, API responses, links, or tools.
Final Answer: the final answer to the original input question with the right amount of detail

When responding with your Final Answer, remember that the person you are responding to CANNOT see any of your 
Thought/Action/Action Input/Observations, so if there is any relevant information there you need to include it explicitly in your response."""

In [209]:
# `llm_chain` only exists as a local variable inside query_with_csv, so the
# "LLMChain" tool calls query_with_csv itself. Each invocation rebuilds the
# few-shot prompt for the incoming question and answers it with `chat`.
tools = [
    Tool(
        name = "LLMChain",
        func = lambda question: query_with_csv(question, chat),
        description = "Use this to answer questions about the University of Texas at Dallas. If output does not answer the user's question, use the Google search tool"
    ),
    Tool(
        name = "Google search",
        func = serper.run,
        description = "Useful for when the LLMChain does not provide a sufficient answer"
    )
]

In [222]:
# Conversational agent over `tools`: the buffer memory is stored under the key
# "chat_history", and the custom format instructions defined earlier are handed
# to the agent through agent_kwargs.
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
agent_chain = initialize_agent(tools, chat, agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, verbose=True, memory=memory,
                               agent_kwargs={"format_instructions": openapi_format_instructions})

In [223]:
agent_chain.run(query)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Google search",
    "action_input": "University of Texas at Dallas services for students with disabilities"
}[0m
Observation: [33;1m[1;3mWelcome to the AccessAbility Resource Center! We provide disability and accessibility services to students and employees at the University of Texas at Dallas. Contact for: Student disability accommodations; campus accessibility resources for students. ... Contact for: Campus resources and services for students. D., pHCLE (972-721-5056; iacosta@udallas.edu). The Director assists in an informal resolution of the complaint or guides the complainant to the appropriate ... The guidelines below are intended to allow SDS to determine eligibility for services and appropriate accommodations. SDS staff are available at (972) 721-5056 ... The office that provides advocacy and arranges for academic support and campus accessibility for students with disabilities is usually called Disa

'The University of Texas at Dallas provides a variety of resources and services for students with disabilities through the AccessAbility Resource Center, including disability accommodations, campus accessibility resources, advocacy, academic support, and tools for equal educational opportunities. Students requesting accommodations based on a disability are required to register each semester with the Office of Disability Services.'

## 

# Initializing Pinecone with Web Crawler data 

In [176]:
# Only use when switching datasets

import pinecone

def reset_index(dimension=1536, metric="cosine"):
    """Drop (if present) and recreate the Pinecone index named by ``PINECONE_INDEX``.

    Intended for switching datasets, and also usable for the initial creation:
    the delete step is skipped when the index does not exist yet.

    Args:
        dimension: vector size of the recreated index (default 1536, the
            dimension reported by the existing index).
        metric: similarity metric of the recreated index.
    """
    # pinecone docs says to delete index if deleting all vectors in an index
    if PINECONE_INDEX in pinecone.list_indexes():
        pinecone.delete_index(PINECONE_INDEX)
    pinecone.create_index(PINECONE_INDEX, dimension=dimension, metric=metric)

#reset_index()

In [6]:
pinecone.list_indexes()

['chatbot-faq']

In [7]:
index = pinecone.Index('chatbot-faq')

In [8]:
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 1009}},
 'total_vector_count': 1009}

In [9]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings

vectorstore = Pinecone(index, OpenAIEmbeddings().embed_query, text_key="text")

## Conversation Retrieval Chain

In [41]:
from langchain.llms import OpenAI
from langchain.chains import ConversationalRetrievalChain

# Conversational retrieval chain over `vectorstore`. With
# return_source_documents=True each result carries 'source_documents' in
# addition to 'answer'.
conv_chain = ConversationalRetrievalChain.from_llm(
    OpenAI(temperature=0), 
    vectorstore.as_retriever(),
    return_source_documents=True
)

In [43]:
# The chain takes the question and the prior turns as separate inputs; start
# with an empty history.
chat_history = []
query = "How long until I graduate?"
# NOTE(review): the original note here said "not sure why it wont work", but the
# recorded result below shows this dict-style call succeeding. Presumably the
# note referred to `conv_chain.run(...)` -- confirm.
result = conv_chain({"question": query, "chat_history": chat_history})

In [44]:
result

{'question': 'How long until I graduate?',
 'chat_history': [],
 'answer': ' It depends on many variables such as the number of classes you take each semester, enrollment in summer classes, having to repeat classes, missing prerequisites, etc. To determine a possible graduation timeline, consider the following: 1. How many hours of degree requirements remain? 2. How many hours will you take each semester? 3. Can I complete all necessary prerequisites by taking ___ hours each semester? Using your degree plan and flowchart, you can make a tentative plan for classes that you will take each semester until graduation.',
 'source_documents': [Document(page_content='Undergraduate Advising\n>  Advising FAQ\n\n$(document).ready(function(){\n$(".accordion").accordion({\nactive: false,\ncollapsible: true,\nheightStyle: \'content\'\n});\n});\n\nAdvising Links\n\nAdvising\nAdvisors\nProspective Students\nDegree Resources\nGraduate Level Courses\nAdvising FAQ\nStudent Resources\n\nAdvising FAQ\n\nSe

## Question Answering with Sources

In [195]:
query = "How long until I graduate?"
docs = vectorstore.similarity_search(query)

In [196]:
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

chain = load_qa_with_sources_chain(OpenAI(temperature=0), chain_type="stuff")
query = "How long until I graduate?"
chain({"input_documents": docs, "question": query}, return_only_outputs=True)

{'output_text': ' The length of time until graduation depends on many variables such as the number of classes taken each semester, enrollment in summer classes, having to repeat classes, missing prerequisites, etc.\nSOURCES: https://engineering.utdallas.edu/engineering/academics/undergraduate-majors/undergrad-advising/advising-faq/, https://engineering.utdallas.edu/academics/undergraduate-majors/undergrad-advising/advising-faq/'}

## Retrieval Question/Answering

In [10]:
from langchain.prompts import PromptTemplate

prompt_template = ROLE_PROMPT + """

{context}

Question: {question}
Answer truthfully, do not make up answers or sources: """
PROMPT = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [12]:
from langchain.chains import RetrievalQA
from langchain.llms import OpenAI

# NOTE: PROMPT is wrapped in chain_type_kwargs but NOT applied -- the
# chain_type_kwargs argument is commented out in the call below, so the chain
# is built without the custom prompt. Source documents are likewise not returned.
chain_type_kwargs = {"prompt": PROMPT}
qa = RetrievalQA.from_chain_type(
    llm=OpenAI(),
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    #chain_type_kwargs=chain_type_kwargs,
    #return_source_documents=True
)

In [200]:
result = qa({"query": "How long until I graduate?"})

In [201]:
result

{'query': 'How long until I graduate?',
 'result': '\n\nThe amount of time it takes to complete your degree requirements and graduate depends on a variety of factors, such as the number of classes you take each semester, whether or not you have to repeat classes, or if you need to complete any prerequisites. To determine a possible timeline for your graduation, it is important to consider how many hours of degree requirements remain, how many hours you will take each semester, and if you can complete all necessary prerequisites by taking a certain number of hours each semester. \n\nTo begin, take a look at your degree plan and add up the hours of all classes remaining to figure out the total number of hours needed for graduation. Then, make sure to keep in mind the maximum hours for fall/spring is 18, and for summer is 15. Using your degree flowchart, you can make a tentative plan for classes that you will take each semester until graduation. \n\nFor more information on advising and re

In [150]:
print(result['result'])


Comet Giving Days is a University of Texas at Dallas annual campaign which raises funds for student scholarships and university initiatives. The 2021 Comet Giving Days will take place from April 11th - April 16th. For more information, please visit the UTD Comet Giving Days website.


## Agent with Vectorstore

In [49]:
from langchain.agents import Tool, initialize_agent, AgentType
from langchain.memory import ConversationBufferMemory
from langchain.chat_models import ChatOpenAI
from langchain.utilities import GoogleSerperAPIWrapper

search = GoogleSerperAPIWrapper()


def _answer_with_conv_chain(question):
    """Run ``conv_chain`` for a single question and return only its answer text.

    ``conv_chain`` takes two inputs ("question" and "chat_history") and, with
    source documents enabled, returns more than one output key, so its ``run``
    method cannot be used as a single-string tool function. The agent keeps
    its own conversation memory, so an empty history is passed here.
    """
    return conv_chain({"question": question, "chat_history": []})["answer"]


tools = [
    Tool(
        name = "QA",
        func = _answer_with_conv_chain,
        description = "Useful to answer academic questions about the University of Texas at Dallas"
    ),
    Tool(
        name = "Search",
        func = search.run,
        description = "Use this to double check if a website link is valid"
    )
]

In [15]:
memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)

In [16]:
chat = ChatOpenAI()

In [50]:
agent = initialize_agent(tools, chat, agent=AgentType.CHAT_CONVERSATIONAL_REACT_DESCRIPTION, verbose=True, memory=memory)

In [51]:
agent.run(input="https://engineering.utdallas.edu/engineering/academics/undergraduate-majors/undergrad-advising/advising-faq/")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m{
    "action": "Search",
    "action_input": "https://engineering.utdallas.edu/engineering/academics/undergraduate-majors/undergrad-advising/advising-faq/"
}[0m
Observation: [33;1m[1;3mVisit our Frequently Asked Questions (FAQ) page. For information about courses and prerequisites, review our degree plans and flowcharts. For ... Advising FAQ. Section Links. Registration and Course Selection; Academic Standing ... Major/Minor Requirements; Graduation; International and Study Abroad. Open the Sections Below to Find Your Assigned Advisor. Please email all registration requests to ecs_registration@utdallas.edu. Advisors for Biomedical ... Undergraduate Majors · Advising · Advisors · Prospective Students · Degree Resources · Graduate ... The department reviews the master's degree plans and approves. Undergraduate Majors · Advising · Advisors · Prospective Students · Degree ... Professor Jorge Cobb Director of Graduate Advising

'Based on the website you provided, I was not able to determine the specific details of what you are asking for. If you provide me with more information or a specific question, I would be happy to try and assist you.'