# Setting environment variables

In [3]:
import openai
from dotenv import load_dotenv
import os

# Copy the entries of a local .env file into the process environment.
load_dotenv()

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE: "ENVIRONENT" (sic) is kept as-is because both the environment lookup
# and the later pinecone.init() call rely on this exact spelling.
PINECONE_ENVIRONENT = os.getenv("PINECONE_ENVIRONENT")

# Fail fast with a readable message instead of letting a None credential
# surface later as an opaque client error.
_missing_settings = [
    name
    for name, value in (
        ("OPENAI_API_KEY", OPENAI_API_KEY),
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("PINECONE_ENVIRONENT", PINECONE_ENVIRONENT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variables: " + ", ".join(_missing_settings)
    )

# The openai package is imported before load_dotenv() runs, so a key that only
# lives in .env is not visible to it at import time; hand it over explicitly.
# (Keys are created in the account dropdown on the OpenAI website.)
openai.api_key = OPENAI_API_KEY

openai.Engine.list()  # check we have authenticated

<OpenAIObject list at 0x7fb7d179e4a0> JSON: {
  "data": [
    {
      "created": null,
      "id": "babbage",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "davinci",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-davinci-edit-001",
      "object": "engine",
      "owner": "openai",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "babbage-code-search-code",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "text-similarity-babbage-001",
      "object": "engine",
      "owner": "openai-dev",
      "permissions": null,
      "ready": true
    },
    {
      "created": null,
      "id": "code-davinci-edit-001",
      "object": "engine",
      

## First, get the dataset

We load the FAQ dataset from a local CSV file with `load_dataset` so that, later on, we can look up the full row (answer and URL) for each question returned by the vector database.

In [4]:
from datasets import load_dataset
from rich import print

# Parse the local FAQ CSV into a dataset dictionary; the rows are exposed
# under the 'train' split.
# Author's note: this call may throw an error -- if it does, just keep retrying.
faq_dataset = load_dataset(
    "csv",
    data_files="./FAQ Dataset.csv",
    delimiter=",",
)
faq_dataset['train']

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-8048939cc6255881


Downloading and preparing dataset csv/default to /Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-8048939cc6255881/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 3449.26it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 431.60it/s]
                                                        

Dataset csv downloaded and prepared to /Users/anthonyjaramillo/.cache/huggingface/datasets/csv/default-8048939cc6255881/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 306.56it/s]


Dataset({
    features: ['Question', 'Answering', 'URL', 'Label'],
    num_rows: 156
})

pandas lets us easily single out the 'Question' column (together with its URL), since only the questions are embedded and stored in the vector database.

In [37]:
import pandas as pd

# Build the frame in a single chain: keep only the columns that go into the
# vector database, then tag every row with where it came from.
df = (
    pd.DataFrame(faq_dataset['train'])
    .loc[:, ["Question", "URL"]]  # only want questions (and their URL) stored in the db
    .assign(Source="FAQ")         # constant source column
)
df

Unnamed: 0,Question,URL,Source
0,Does UT Dallas provide services for students with disabilities?,https://accessability.utdallas.edu/student-accommodations/frequently-asked-q...,FAQ
1,Is there a separate admissions procedure for students with disabilities?\n,https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-q...,FAQ
2,Should I send my disability documentation with my admissions application?,https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-q...,FAQ
3,What documentation is required to receive disability services?\n,https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-q...,FAQ
4,Is there a deadline for submitting disability documentation?,https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-q...,FAQ
...,...,...,...
151,When should I expect to receive my refund?,https://finaid.utdallas.edu/receiving-aid/faq/,FAQ
152,Where can I get information on the loans that I have borrowed?,https://finaid.utdallas.edu/receiving-aid/faq/,FAQ
153,"I need to purchase books, but my financial aid has not been disbursed yet. W...",https://finaid.utdallas.edu/receiving-aid/faq/,FAQ
154,What is considered a special circumstance?,https://finaid.utdallas.edu/receiving-aid/faq/,FAQ


Documents are LangChain's standard data container, used throughout the library. The handy DataFrameLoader lets us convert the rows of a DataFrame into documents.

In [50]:
from langchain.document_loaders import DataFrameLoader

# Each row becomes one Document: the "Question" value is used as page_content
# and the remaining columns ("URL", "Source") are carried along as metadata.
loader = DataFrameLoader(df, page_content_column="Question")
documents = loader.load()
documents  # display the resulting list of Document objects

[Document(page_content='Does UT Dallas provide services for students with disabilities?', metadata={'URL': 'https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/', 'Source': 'FAQ'}),
 Document(page_content='Is there a separate admissions procedure for students with disabilities?\n', metadata={'URL': 'https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-questions/', 'Source': 'FAQ'}),
 Document(page_content='Should I send my disability documentation with my admissions application?', metadata={'URL': 'https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-questions/', 'Source': 'FAQ'}),
 Document(page_content='What documentation is required to receive disability services?\n', metadata={'URL': 'https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-questions/', 'Source': 'FAQ'}),
 Document(page_content='Is there a deadline for submitting disability documentation?', metadata={'URL': 'https://Acces

## Initialize pinecone and get the index

In [51]:
import pinecone

# Configure the Pinecone client with the credentials read from the environment
# in the first cell. PINECONE_ENVIRONENT (sic) is the name defined there.
pinecone.init(
    api_key=PINECONE_API_KEY,
    environment=PINECONE_ENVIRONENT
)

LangChain lets us combine documents, an embedding procedure, and an index name: it creates the Pinecone index for us if it does not exist, or adds to the existing index with that name. Alternatively, we can instantiate the Pinecone wrapper with just an existing index and the embedding procedure.

In [None]:
INDEX_NAME = "chatbot-faq"
# Vector size of the index; presumably the output size of the OpenAI embedding
# model used further down -- confirm if the embedding model is ever changed.
EMBEDDING_DIMENSION = 1536

# Create the index only when it is missing, so this cell can be re-run safely
# (e.g. after a kernel restart) once the index already exists.
if INDEX_NAME not in pinecone.list_indexes():
    pinecone.create_index(INDEX_NAME, dimension=EMBEDDING_DIMENSION, metric="cosine")

In [63]:
pinecone.list_indexes()  # sanity check: the list of index names should include 'chatbot-faq'

['chatbot-faq']

In [65]:
index = pinecone.Index(INDEX_NAME)  # handle to the index, passed to the LangChain vector store below

In [None]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

embeddings = OpenAIEmbeddings()

# First run only: uncomment the next line to embed `documents` and upsert both
# the embeddings and the documents themselves into the index.
# vectorstore = Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
# Subsequent runs: wrap the already-populated index. "text" is presumably the
# metadata key holding each document's text -- confirm against the LangChain docs.
vectorstore = Pinecone(index, embeddings.embed_query, "text") # use this for subsequent runs

In [47]:
# Sample student question; reused for retrieval and as the final prompt input.
query = "What kinds of services does UT Dallas provide for students with disabilities?"

Vectorstores come with handy methods, similarity search being the most applicable to our use case. These methods return documents that we must handle.

In [48]:
# Retrieve the stored questions most similar to `query`.
# The result is a list of (Document, score) tuples.
docs = vectorstore.similarity_search_with_score(query)
docs

[(Document(page_content='Does UT Dallas provide services for students with disabilities?', metadata={}),
  0.976667941),
 (Document(page_content='What transportation services are available for people with disabilities on and around campus? ', metadata={}),
  0.894767225),
 (Document(page_content='Is there a separate admissions procedure for students with disabilities?\n', metadata={}),
  0.856727958),
 (Document(page_content='I am a UT Dallas student and employed as a UT Dallas teaching or research assistant. Can I get in-state tuition?', metadata={}),
  0.844034553)]

In [49]:
import warnings

# Index the FAQ rows by question text once, instead of re-filtering the whole
# dataset for every retrieved document. setdefault keeps the first row when a
# question occurs more than once, which matches the original "[0]" selection.
rows_by_question = {}
for row in faq_dataset['train']:
    rows_by_question.setdefault(row['Question'], row)

qa = []
for doc, _score in docs:  # each hit is a (Document, similarity score) pair
    row = rows_by_question.get(doc.page_content)
    if row is None:
        # The vector index may contain a question that is not in the local CSV;
        # skip it with a warning rather than failing with a bare IndexError.
        warnings.warn(f"No FAQ row found for retrieved question: {doc.page_content!r}")
        continue
    # One dictionary per matched row; values are stringified as before.
    qa.append({"Question": f"{row['Question']}", "Answer": f"{row['Answering']}", "URL": f"{row['URL']}"})
print(qa)

100%|██████████| 1/1 [00:00<00:00, 43.29ba/s]
100%|██████████| 1/1 [00:00<00:00, 336.70ba/s]
100%|██████████| 1/1 [00:00<00:00, 500.99ba/s]
100%|██████████| 1/1 [00:00<00:00, 420.14ba/s]


## Few Shot Prompts

In [14]:
from langchain.prompts.prompt import PromptTemplate

# The example dictionaries in `qa` carry the keys "Question", "Answer" and
# "URL", so the template variables must use exactly those names; a "Source"
# variable would make format(**qa[0]) fail with a KeyError. The rendered label
# stays "Source:" so the prompt text itself is unchanged.
example_prompt = PromptTemplate(
    input_variables=["Question", "Answer", "URL"],
    template="Question: {Question}\n{Answer}\nSource:{URL}",
)
print(example_prompt.format(**qa[0]))

In [15]:
from langchain.prompts.few_shot import FewShotPromptTemplate

# Few-shot prompt: every dictionary in `qa` is rendered with `example_prompt`,
# and the new question is appended through the "Question: {input}" suffix.
fewShotPrompt = FewShotPromptTemplate(
    examples=qa,
    example_prompt=example_prompt,
    suffix="Question: {input}",
    input_variables=["input"]
)

# Preview the full prompt text for the sample query.
print(fewShotPrompt.format(input=query))

In [16]:
# Persona and tone instructions; used as the content of the system message
# when the chat prompt is assembled.
role_prompt = """You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. 
Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. 
Keep a positive attitude when answering students and be professional with your wording.
Write in the style and quality of an expert in university advising and student services with 20+ years of experience in the field."""

In [17]:
# NOTE(review): SystemMessagePromptTemplate, AIMessagePromptTemplate, AIMessage
# and HumanMessage are imported but not used in the cells visible here.
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
# The system message is a fixed string (no template variables) ...
system_message_prompt = SystemMessage(content=role_prompt)
# ... while the human message is the few-shot prompt, which takes `input`.
human_message_prompt = HumanMessagePromptTemplate(prompt=fewShotPrompt)
# Chat prompt = system message followed by the templated human message.
chat_prompt = ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

## Connecting to ChatGPT

In [18]:
from langchain.chat_models import ChatOpenAI

# Chat model wrapper; temperature=0 is intended to keep answers as
# deterministic as possible.
chat = ChatOpenAI(temperature=0)

In [19]:
from langchain import LLMChain

# Tie the chat model and the assembled chat prompt together into one chain.
chain = LLMChain(llm=chat, prompt=chat_prompt)

In [21]:
# Fill the prompt's `input` variable with the sample query, call the model,
# and print the answer text.
output = chain.run(input=query)
print(output)