# Setting environment variables

In [2]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()  # copy variables from a local .env file into os.environ

# Get the OpenAI API key from the top-right dropdown on the OpenAI website.
OPENKEY_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
# NOTE(review): "ENVIRONENT" is spelled this way in both the variable and the
# environment key; it must match whatever spelling the .env file uses.
PINECONE_ENVIRONENT = os.getenv("PINECONE_ENVIRONENT")

# `import openai` ran before load_dotenv(), so the library's import-time read of
# OPENAI_API_KEY can miss a key that only lives in .env. Hand it over explicitly.
openai.api_key = OPENKEY_API_KEY

openai.Engine.list()  # check we have authenticated

ModuleNotFoundError: No module named 'openai'

## First, get the dataset

We load the FAQ dataset with `load_dataset` so that, once the vector database returns the questions most similar to a query, we can look up the matching rows (answer and source URL) for those questions.

In [2]:
from datasets import load_dataset
from rich import print

# Read the FAQ CSV as a dataset. Loading may throw an error; just keep retrying.
faq_dataset = load_dataset(
    "csv",
    data_files="./FAQ Dataset.csv",
    delimiter=",",
)
faq_dataset['train']

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-e08e05220f75e9aa
Found cached dataset csv (C:/Users/Anthony/.cache/huggingface/datasets/csv/default-e08e05220f75e9aa/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 333.33it/s]


Dataset({
    features: ['Question', 'Answering', 'URL', 'Label'],
    num_rows: 156
})

Using pandas lets us easily single out the 'Question' column, so that only the question embeddings are stored in the vector database.

In [3]:
import pandas as pd

# Only the questions are stored in the db, so keep just that column.
df = pd.DataFrame(faq_dataset['train'])[["Question"]]
df

Unnamed: 0,Question
0,Does UT Dallas provide services for students w...
1,Is there a separate admissions procedure for s...
2,Should I send my disability documentation with...
3,What documentation is required to receive disa...
4,Is there a deadline for submitting disability ...
...,...
151,When should I expect to receive my refund?
152,Where can I get information on the loans that ...
153,"I need to purchase books, but my financial aid..."
154,What is considered a special circumstance?


Documents are LangChain's standard data object, used throughout the library. A handy DataFrameLoader lets us convert dataframes to documents.

In [4]:
from langchain.document_loaders import DataFrameLoader

# Turn every row into a Document whose page_content is the "Question" text.
loader = DataFrameLoader(
    df,
    page_content_column="Question",
)
documents = loader.load()
documents

[Document(page_content='Does UT Dallas provide services for students with disabilities?', metadata={}),
 Document(page_content='Is there a separate admissions procedure for students with disabilities?\n', metadata={}),
 Document(page_content='Should I send my disability documentation with my admissions application?', metadata={}),
 Document(page_content='What documentation is required to receive disability services?\n', metadata={}),
 Document(page_content='Is there a deadline for submitting disability documentation?', metadata={}),
 Document(page_content='Who at the University will know about my disability if I register with ARC?', metadata={}),
 Document(page_content='What happens after my disability documentation is received?', metadata={}),
 Document(page_content='What accommodations and services will I be eligible for?', metadata={}),
 Document(page_content='Services and Procedures for Employees', metadata={}),
 Document(page_content='How do I take a test at the ARC Testing Center

## Initialize pinecone and get the index

In [5]:
import pinecone

# Configure the pinecone client with the credentials read from the environment.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_ENVIRONENT)

LangChain lets us combine documents, an embedding procedure, and an index name to create a Pinecone index if it does not exist, or to add to an existing index with the same name. Alternatively, we can instantiate the Pinecone wrapper with just the index and the embedding procedure.

In [6]:
# Show the index names available to the initialized pinecone client.
pinecone.list_indexes()

['utd-chatbot']

In [7]:
# Name of the Pinecone index that holds the FAQ question embeddings.
INDEX_NAME = "utd-chatbot"
# Handle to that index; passed to the LangChain Pinecone wrapper below.
index = pinecone.Index(INDEX_NAME)

In [13]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

embeddings = OpenAIEmbeddings()

# First run only: create/populate the index. This upserts the document embeddings
# as well as the documents themselves.
# vectorstore = Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME)
vectorstore = Pinecone(index, embeddings.embed_query, "text") # use this for subsequent runs

In [11]:
# Sample student question used for the similarity search and the final chat prompt.
query = "What kinds of services does UT Dallas provide for students with disabilities?"

Vectorstores come with handy methods, similarity search being the most applicable to our use case. These methods return documents that we must handle.

In [14]:
# Each result is a (Document, score) tuple for a stored question similar to `query`.
docs = vectorstore.similarity_search_with_score(query)
docs

[(Document(page_content='Does UT Dallas provide services for students with disabilities?', metadata={}),
  0.976678789),
 (Document(page_content='What transportation services are available for people with disabilities on and around campus? ', metadata={}),
  0.894735754),
 (Document(page_content='Is there a separate admissions procedure for students with disabilities?\n', metadata={}),
  0.856736541),
 (Document(page_content='I am a UT Dallas student and employed as a UT Dallas teaching or research assistant. Can I get in-state tuition?', metadata={}),
  0.843869746)]

In [15]:
# Index the FAQ rows by question text once, instead of re-filtering the whole
# dataset for every search hit. setdefault keeps the first row of a duplicated
# question, which is the same row the earlier filter(...)[0] lookup selected.
rows_by_question = {}
for row in faq_dataset['train']:
    rows_by_question.setdefault(row['Question'], row)

qa = []
for match, _score in docs:
    row = rows_by_question.get(match.page_content)
    if row is None:
        # The vector index may hold a question that is not in the CSV; skip it
        # rather than crash with an IndexError on an empty lookup result.
        continue
    # One dictionary per matched row, in the shape the few-shot example prompt expects.
    qa.append({"Question": f"{row['Question']}", "Answer": f"{row['Answering']}", "Source": f"{row['URL']}"})
print(qa)

100%|██████████| 1/1 [00:00<00:00,  9.95ba/s]
100%|██████████| 1/1 [00:00<00:00, 285.77ba/s]
100%|██████████| 1/1 [00:00<00:00, 222.13ba/s]
100%|██████████| 1/1 [00:00<00:00, 200.00ba/s]


## Few Shot Prompts

In [16]:
from langchain.prompts.prompt import PromptTemplate

# Layout used to render one retrieved FAQ entry inside the few-shot prompt.
example_prompt = PromptTemplate(
    input_variables=["Question", "Answer", "Source"],
    template="Question: {Question}\n{Answer}\nSource:{Source}",
)
print(example_prompt.format(**qa[0]))

In [17]:
from langchain.prompts.few_shot import FewShotPromptTemplate

# The retrieved FAQ entries serve as the examples; the suffix appends the
# student's own question after them.
fewShotPrompt = FewShotPromptTemplate(
    input_variables=["input"],
    suffix="Question: {input}",
    example_prompt=example_prompt,
    examples=qa,
)

print(fewShotPrompt.format(input=query))

In [18]:
# System-role instructions describing the advisor persona, level of detail and tone.
role_prompt = (
    "You are an advisor at the University of Texas at Dallas who serves students by answering questions pertaining to information found on the university's website. \n"
    "Answer all questions in complete detail to provide guidance to students and include links to webpages which you found the information from. \n"
    "Keep a positive attitude when answering students and be professional with your wording."
)

In [19]:
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
# Fixed system persona first, then the few-shot human message built from the FAQ hits.
system_message_prompt = SystemMessage(content=role_prompt)
human_message_prompt = HumanMessagePromptTemplate(prompt=fewShotPrompt)
chat_prompt = ChatPromptTemplate.from_messages(
    [
        system_message_prompt,
        human_message_prompt,
    ]
)

## Connecting to ChatGPT

In [24]:
from langchain.chat_models import ChatOpenAI

# Chat model wrapper, instantiated with temperature set to 0.
chat = ChatOpenAI(temperature=0)

ValidationError: 1 validation error for ChatOpenAI
__root__
  `openai` has no `ChatCompletion` attribute, this is likely due to an old version of the openai package. Try upgrading it with `pip install --upgrade openai`. (type=value_error)

In [None]:
from langchain import LLMChain

# Tie the chat model to the system + few-shot chat prompt.
chain = LLMChain(llm=chat, prompt=chat_prompt)

In [None]:
# Fill the prompt's {input} slot with the student's question and run the chain.
chain.run(input=query)

'UT Dallas provides a variety of services for students with disabilities, including but not limited to note-taking services, test accommodations, assistive technology, accessible campus transportation, sign language interpreters, and accessible housing. The Accessibility Resource Center (ARC) facilitates the provision of services and accommodations for students.\nSource:https://Accessibility.utdallas.edu/student-accommodations/frequently-asked-questions/'