# Install and import libraries

In [1]:
!pip install -qU pinecone-client openai datasets

In [2]:
import pandas as pd
import numpy as np

# Read FAQ data

In [3]:
faq = pd.read_csv("FAQ.csv")
questions = [s.rstrip() for s in faq['Question']] # list of all the questions

In [4]:
faq

Unnamed: 0,Question,Answer,URL,Label
0,Does UT Dallas provide services for students w...,"Yes, accommodations and services are provided ...",https://accessability.utdallas.edu/student-acc...,Student Accessibility
1,Is there a separate admissions procedure for s...,No. The admissions process and criteria are th...,https://Accessibility.utdallas.edu/student-acc...,Student Accessibility
2,Should I send my disability documentation with...,No. Documentation should be submitted to ARC o...,https://Accessibility.utdallas.edu/student-acc...,Student Accessibility
3,What documentation is required to receive disa...,Students requesting services must provide curr...,https://Accessibility.utdallas.edu/student-acc...,Student Accessibility
4,Is there a deadline for submitting disability ...,To ensure that accommodations will be in place...,https://Accessibility.utdallas.edu/student-acc...,Student Accessibility
...,...,...,...,...
151,When should I expect to receive my refund?,Please see the Bursar’s Office Refunds page fo...,https://finaid.utdallas.edu/receiving-aid/faq/,"Scholarship, Cost & AID"
152,Where can I get information on the loans that ...,All of your federal loan information can be ob...,https://finaid.utdallas.edu/receiving-aid/faq/,"Scholarship, Cost & AID"
153,"I need to purchase books, but my financial aid...",If your financial aid application is complete ...,https://finaid.utdallas.edu/receiving-aid/faq/,"Scholarship, Cost & AID"
154,What is considered a special circumstance?,A special circumstance is an event that causes...,https://finaid.utdallas.edu/receiving-aid/faq/,"Scholarship, Cost & AID"


# Setup openai

In [5]:
import openai
import os

openai.api_key = "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"
# get API key from top-right dropdown on OpenAI website

# Turn questions into vectors (embedding)

In [6]:
MODEL = "text-embedding-ada-002"

res = openai.Embedding.create(
    input=questions, engine=MODEL
)
embeds = [record['embedding'] for record in res['data']] # embeds is list of vectors

# Setup pinecone

In [7]:
import pinecone

index_name = 'semantic-search-openai'

# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key="d5580837-1cbd-46ba-8cd5-d1f9e3cff833",
    environment="eu-west1-gcp"  # find next to api key in console
)
# check if 'openai' index already exists (only create index if not)
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=len(embeds[0]))
# connect to index
index = pinecone.Index(index_name)

  from tqdm.autonotebook import tqdm


# Put questions into pinecone

In [8]:
to_upsert = zip([str(n) for n in range(len(embeds))], embeds, [{'text': line, 'faq':True} for line in faq['Question']])

In [9]:
index.upsert(vectors=list(to_upsert))

{'upserted_count': 156}

# Haystack

## Set up Haystack

In [10]:
from haystack.pipelines import Pipeline
from haystack.nodes import Crawler, PreProcessor
from haystack.document_stores import InMemoryDocumentStore

document_store = InMemoryDocumentStore(use_bm25=True)

In [27]:
URLS = [
    "https://engineering.utdallas.edu/academics/undergraduate-majors/undergrad-advising/advising-faq/",
    "https://engineering.utdallas.edu/engineering/academics/undergraduate-majors/undergrad-advising/graduate-courses/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-biomedical-engineering/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-computer-engineering/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-computer-science/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-electrical-engineering/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-mechanical-engineering/",
    "https://www.utdallas.edu/fact-sheets/ecs/bs-software-engineering/",
    "https://fye.utdallas.edu/orientation/",
    "https://fye.utdallas.edu/orientation/faq/",
    "https://oue.utdallas.edu/aleks-exam/",
    "https://www.utdallas.edu/academics/academic-resources/",
    "https://www.utdallas.edu/costs-scholarships-aid/faq/",
    "https://accessability.utdallas.edu/student-accommodations/frequently-asked-questions/",
    "https://housing.utdallas.edu/resources/frequently-asked-questions/",
    "https://services.utdallas.edu/transit/parkfaq/",
    "https://registrar.utdallas.edu/faq/",
    "https://services.utdallas.edu/bookstore/faq/",
    "https://isso.utdallas.edu/joining-ut-dallas/i20-receive/",
    "https://finaid.utdallas.edu/receiving-aid/faq/"
    "https://catalog.utdallas.edu/2022/undergraduate/home",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/biomedical-engineering",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/computer-engineering",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/computer-science",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/data-science",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/electrical-engineering",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/mechanical-engineering",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/software-engineering",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/certificates",
    "https://catalog.utdallas.edu/2022/undergraduate/programs/ecs/minors",
    "https://catalog.utdallas.edu/2022/undergraduate/curriculum",
    "https://catalog.utdallas.edu/2022/undergraduate/curriculum/core-curriculum",
    "https://catalog.utdallas.edu/2022/undergraduate/curriculum/honors-programs",
    "https://catalog.utdallas.edu/2022/undergraduate/curriculum/other-degree-requirements",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid/excessive-hours",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid/tuition-refund",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid/other-fees",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid/financial-aid",
    "https://catalog.utdallas.edu/2022/undergraduate/tuition-and-financial-aid/types-of-financial-aid",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/academic",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/graduate-courses/fasttrack",
    "https://catalog.utdallas.edu/2022/undergraduate/honors-college",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/degree-plans",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/course-policies",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/graduate-courses",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/graduation",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/registration",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/military",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/student-travel",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/education-abroad-policies",
    "https://catalog.utdallas.edu/2022/undergraduate/policies/addendum",
    "https://catalog.utdallas.edu/2022/undergraduate/resources"
]

In [26]:
from haystack.pipelines import Pipeline
from haystack.nodes import Crawler, PreProcessor
from haystack.document_stores import PineconeDocumentStore

def run_crawler(depth = 0):
    crawler = Crawler(
        urls=URLS,   # Websites to crawl
        crawler_depth=depth,    # How many links to follow
        #output_dir="crawled_files",  # The directory to store the crawled files, not very important, we don't use the files in this example
        filter_urls=["utd"],  # Only follow links that match this regex
    )

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=300,
        split_respect_sentence_boundary=False,
        split_overlap=40
    )

    indexing_pipeline = Pipeline()
    indexing_pipeline.add_node(component=crawler, name="crawler", inputs=['File'])
    indexing_pipeline.add_node(component=preprocessor, name="preprocessor", inputs=['crawler'])

    return indexing_pipeline.run()

In [184]:
data = run_crawler()

Preprocessing: 100%|██████████| 41/41 [00:00<00:00, 705.80docs/s]
Writing Documents: 1024it [00:05, 185.91it/s]                         


In [185]:
len(data['documents'])

1008

In [None]:
data

## Embed text from website

In [28]:
MODEL = "text-embedding-ada-002"

In [29]:
import openai

def get_embeddings_webtext(data):
    webtext = [i.content for i in data]
    res = openai.Embedding.create(input=webtext, engine=MODEL)
    embeds = [record['embedding'] for record in res['data']] # embeds is list of vectors
    return embeds

In [160]:
embeds = get_embeddings_webtext(data['documents'])

In [161]:
len(embeds)

1009

## Put webtext into Pinecone

In [166]:
data['documents'][0].content

'\n\nSkip to Main Content.\nSkip to Main Navigation.\nSkip to Main Footer.\n\nThe University of Texas at Dallas\n\nErik Jonsson School\nof Engineering and\nComputer Science\n\nAbout\nAcademics\nInnovation\nEngage\nNews\n\nLeadership\nCorporate Partners\nAt a Glance\n\nFacilities\nContact\nCareers\n\nUndergraduate Majors\nGraduate Education\nProfessional Education\n\nResearch\nResearch Awards\nResearch Centers\nResearch Institutes\n\nStudents\nAlumni\nCorporate Engagement\nSocial Media Directory\nGiving\n\nStories\nPulsar\nMedia Spotlight\nNewsletter\nMagazine\n\nCampus Map\nGiving\nContact\n\nMain Navigation\n\nAbout\n\nLeadership\nCorporate Partners\nAt a Glance\n\nFacilities\nContact\nCareers\nJ. Erik Jonsson\n\nAcademics\n\nUndergraduate Majors\nGraduate Education\nCertificates\n\nInnovation\n\nResearch\nResearch Awards\nResearch Centers\nResearch Institutes\n\nEngage\n\nStudents\nAlumni\nCorporate Engagement\nSocial Media Directory\nGiving\n\nNews\n\nStories\nPulsar\nMedia Spotligh

In [167]:
data['documents'][0].meta['url']

'https://engineering.utdallas.edu/academics/undergraduate-majors/undergrad-advising/advising-faq/'

In [30]:
import pinecone
import uuid

def upsert_data(index: pinecone.Index, embeds, data):
    to_upsert = zip([str(uuid.uuid4()) for n in range(len(embeds))], embeds, [{'text': doc.content, 'faq':False, 'source':doc.meta['url']} for doc in data])
    print(to_upsert)
    index.upsert(vectors=list(to_upsert))

In [31]:
def upsert_crawled_data():
    results = run_crawler()
    results = results['documents']
    embeds = get_embeddings_webtext(results)
    index = pinecone.Index('chatbot-faq')
    for i in range(0, len(embeds), 32):
        stopping_point = min(i+32, len(embeds))
        upsert_data(index, embeds[i:stopping_point], results[i:stopping_point])

In [32]:
upsert_crawled_data()

Preprocessing:   0%|          | 0/53 [00:00<?, ?docs/s]Document 70bf4d8118fe014453d22175dea30496 is 10407 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 691b596936d3ad88dbb8b48e282e5b79 is 10399 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document 59693c5e8cd0462669328dbcf10107a7 is 10287 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
Document c253733f1a5f8e9599a3022516a27402 is 10406 characters long after preprocessing, where the maximum length should be 10000. Something might be wrong with the splitting, check the document affected to prevent issues at query time.
D

<zip object at 0x7fe2d0fc8300>
<zip object at 0x7fe2c7f6eb80>
<zip object at 0x7fe2d0fce580>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>
<zip object at 0x7fe2a261b640>


# Get most similar questions

## FUNCTION: input = question, output = 5 most similar questions in pinecone

In [15]:
# returns a list of questions
def mostSimilarQs(query):
  xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']
  res = index.query([xq], top_k=5, include_metadata=True)
  mostSimilarQs = [[res['matches'][i]['metadata']['text'], res['matches'][i]['metadata']['faq']] for i, _ in enumerate(res['matches'])]
  return mostSimilarQs

In [16]:
def QtoA (question):
  row = faq[faq['Question'] == question].values[0]
  return row[1] + "\n" + row[2]

In [17]:
def QtoChunk(l):
    if l[1]:
        return "Q: " + l[0] + "\nA: " + QtoA(l[0])
    else:
        return l[0]

In [18]:
def chunks(question):
    similar = mostSimilarQs(question)
    return [QtoChunk(l) for l in similar]

# Build full query

## FUNCTION: input = question, output = full query

In [19]:
def QtoQuery(question):
  c = chunks(question)
  query =  """Please pretend you are an enthusiastic, knowledgable, friendly, and helpful counselor for the University of Texas at Dallas. 
  I will be asking you questions about the University of Texas at Dallas and you must respond with clear, concise answers. 
  Provide examples and explain procedures step by step in an ordered list. Please provide all information that would be helpful.

  {c1}

  {c2}

  {c3}

  {c4}

  {c5}

  Q: {q}
  A: """.format(c1 = c[0], c2 = c[1], c3 = c[2], c4 = c[3], c5 = c[4],q=question)
  return query



# Example

In [20]:
print(QtoQuery("How can I accelerate my curriculum?"))

Please pretend you are an enthusiastic, knowledgable, friendly, and helpful counselor for the University of Texas at Dallas. 
  I will be asking you questions about the University of Texas at Dallas and you must respond with clear, concise answers. 
  Provide examples and explain procedures step by step in an ordered list. Please provide all information that would be helpful.

  Q: How do I audit a course?
A: For more information on how to audit a course, please go to audit registration.
https://registrar.utdallas.edu/faq/

  Q: How do I reserve classroom space on campus?
A: For information on reserving classroom space on campus, please see Special Event Room Reservations.
https://registrar.utdallas.edu/faq/

  Q: I would like to change my major. What do I need to do?
A: Eligible students wishing to change their major/plan/program should review the policy listed in the Academic Catalog.

For undergraduate students: catalog.utdallas.edu/current/undergraduate/policies/degree-plans.
For g

In [21]:
def talk(question):
  response = openai.Completion.create(
    model="text-davinci-003",
    prompt=QtoQuery(question),
    temperature=0.3,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )
  print(response['choices'][0]['text'])

In [22]:
print(talk("How do I get started with fast track?"))

 Fast Track is a program that allows undergraduate students to take graduate-level courses for undergraduate credit. To get started with Fast Track, you must first meet the eligibility requirements. Eligibility requirements include:

1. Be a current UT Dallas undergraduate student in good academic standing
2. Have a minimum UT Dallas GPA of 3.0
3. Have completed at least 60 hours of undergraduate coursework
4. Have the approval of your academic advisor

Once you have met the eligibility requirements, you can download the Fast Track application and submit it to your academic advisor during the appropriate application timeframe. Your academic advisor will review the application and provide you with feedback. Once approved, you can register for the graduate-level courses.
None


In [23]:
print(QtoQuery("How long until I graduate?"))

Please pretend you are an enthusiastic, knowledgable, friendly, and helpful counselor for the University of Texas at Dallas. 
  I will be asking you questions about the University of Texas at Dallas and you must respond with clear, concise answers. 
  Provide examples and explain procedures step by step in an ordered list. Please provide all information that would be helpful.

  To determine a possible graduation timeline, consider the following:
1.	How many hours of degree requirements remain?
Using your degree plan, you can add up the hours of all classes that remain to figure out this number.
2.	How many hours will you take each semester? Remember that 18 is the maximum for fall/spring, and 15 is the max for summer.
3.	Can I complete all necessary prerequisites by taking ___ hours each semester? 

  Q: I am going to finish my degree in less time than listed on my I-20. Can you issue an I-20 with a shorter program length?
A: The Department of Homeland Security requires UT Dallas to e