# Import all your libraries

In [21]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS

In [22]:
import os

# Locate the FAQ export in the current working directory.
# `faq` holds the file name when it is present and None otherwise.
faq = "UTDFAQ.csv" if "UTDFAQ.csv" in os.listdir() else None
print(faq)

# Load the Dataset

In [23]:
from datasets import load_dataset
from rich import print
from rich import print

# Read the FAQ CSV and show the structure of the loaded dataset.
faq_dataset = load_dataset("csv", data_files=faq)
print(faq_dataset)


def _has_question_and_answer(row):
    """Return True when neither the question nor the answer is missing."""
    return not (row['Question'] is None or row['Answering'] is None)


def _spell_out_arc(row):
    """Replace the "ARC" acronym with its full name in both text columns."""
    return {
        column: row[column].replace('ARC', 'AccessAbility Resource Center')
        for column in ('Question', 'Answering')
    }


## Remove all rows with a None question or answer.
faq_dataset = faq_dataset.filter(_has_question_and_answer)

## Replace every "ARC" with "AccessAbility Resource Center".
## ARC contact details, kept here for reference:
##   Office location: Administration Building, Room 2.224
##   Email: studentaccess@utdallas.edu (Do not email attachments, upload documents to utd.link/arcupload only.)
##   Phone: (972) 883-2098
##   Fax: Please don’t fax, use utd.link/arcupload
##   Mail: AD 30, 800 West Campbell Rd., Richardson TX 75080
faq_dataset = faq_dataset.map(_spell_out_arc)

Using custom data configuration default-36daa326f93da8ce
Found cached dataset csv (/Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-7d5e8c8a8ec80ff4.arrow
Loading cached processed dataset at /Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-827989fef0daa59e.arrow


# Preprocess the dataset

## Remove rows with a missing question or answer

In [24]:
## Keep only the rows where both the question and the answer are present.
def _is_complete(row):
    """Return True when the row has both a question and an answer."""
    return not (row['Question'] is None or row['Answering'] is None)


faq_dataset = faq_dataset.filter(_is_complete)

Loading cached processed dataset at /Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-4920302071d92ab5.arrow


## Expand the abbreviations UTD (University of Texas at Dallas) and ARC (AccessAbility Resource Center)

In [25]:
def _expand_abbreviation(short, full):
    """Build a row mapper that swaps `short` for `full` in both text columns."""
    def _mapper(row):
        return {
            'Question': row['Question'].replace(short, full),
            'Answering': row['Answering'].replace(short, full),
        }
    return _mapper


# First spell out UTD, then ARC, in every question and answer.
faq_dataset = faq_dataset.map(_expand_abbreviation('UTD', 'University of Texas at Dallas'))
faq_dataset = faq_dataset.map(_expand_abbreviation('ARC', 'AccessAbility Resource Center'))

Loading cached processed dataset at /Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-eb98abd37b61135f.arrow
Loading cached processed dataset at /Users/ari/.cache/huggingface/datasets/csv/default-36daa326f93da8ce/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-387c03f3c4dfda7a.arrow


# Load the OpenAI Embeddings

In [26]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()  # take environment variables from .env.

# API key: get it from the top-right dropdown on the OpenAI website.
OPENKEY_API = os.getenv("OPENAI_API_KEY")
openai.api_key = OPENKEY_API

# Organization id (currently disabled): get it from the top-right dropdown on
# OpenAI under organization > settings.
#ORGANIZATION_ID = os.getenv("ORGANIZATION_ID")
#openai.organization = ORGANIZATION_ID

# Listing the engines doubles as the check that we have authenticated.
# The API is queried once and the result reused for the printout, instead of
# issuing the same request twice.
available_engines = openai.Engine.list()
print(available_engines)

## model of choice
MODEL = "text-embedding-ada-002"

# Initialize the Pinecone Client

In [27]:
import pinecone
# Initialize the connection to Pinecone with the key read from the environment
# (get an API key at app.pinecone.io).
PINECONE_APIKEY = os.getenv("PINECONE_API_KEY")
pinecone.init(api_key=PINECONE_APIKEY, environment="us-east-1-aws")

In [28]:
# Handle to the Pinecone index that the similarity searches below run against.
INDEX_NAME = "chatbot-faq"
index = pinecone.Index(index_name=INDEX_NAME)

In [29]:
# Display the index statistics as the cell output.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 156}},
 'total_vector_count': 156}

# Role Prompt

In [30]:
# Persona instructions for the model; create_an_standard_qa_prompt passes this
# text to SystemMessage as the system message of the chat prompt.
role_prompt = """
Please act as a University of Texas at Dallas Counselor. I will provide you with an individual 
looking for guidance at the University of Texas at Dallas, and your task is to help them 
solve their problem\n
"""

In [31]:
def post_process(ans):
    """Spell out the "ARC" acronym in an answer and append the ARC contact details.

    Args:
        ans: Answer text. ``None`` or an empty string is returned unchanged.

    Returns:
        ``ans`` untouched when it does not contain "ARC" as a standalone word.
        Otherwise the answer with every standalone "ARC" replaced by
        "AccessAbility Resource Center", followed by the contact details, one
        entry per line.
    """
    import re  # local import so the function does not rely on another cell

    # Match ARC only as a whole word, so that words which merely contain the
    # letters (for example "MARCH" or "SEARCH") are not rewritten.
    arc_word = re.compile(r"\bARC\b")
    if not ans or not arc_word.search(ans):
        return ans

    # Each entry appears exactly once and carries no source-code indentation.
    contact_lines = (
        "Office location: Administration Building, Room 2.224",
        "Email: studentaccess@utdallas.edu (Do not email attachments, upload documents to utd.link/arcupload only.)",
        "Phone: (972) 883-2098",
        "Fax: Please don’t fax, use utd.link/arcupload",
        "Mail: AD 30, 800 West Campbell Rd., Richardson TX 75080",
    )
    expanded = arc_word.sub("AccessAbility Resource Center", ans)
    return expanded + "\n" + "\n".join(contact_lines)

In [32]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Pinecone

def query_vector_database(query):
    """Retrieve the FAQ rows whose questions are most similar to `query`.

    The query is sent to the Pinecone-backed vector store, and every returned
    document is matched by its `page_content` against the "Question" column of
    `faq_dataset['train']`.

    Args:
        query: The user's question.

    Returns:
        A list of dicts with the keys "Question", "Answer" and "URL" (values
        converted to strings), in the order the vector store returned the
        documents. Documents without a matching dataset row are skipped, so
        the list may be empty.
    """
    embeddings = OpenAIEmbeddings()

    # vectorstore = Pinecone.from_documents(documents, embeddings, index_name=INDEX_NAME) # only used for initial index creation, upserts document embeddings as well as the documents themselves
    vectorstore = Pinecone(index, embeddings.embed_query, "text") # use this for subsequent runs

    docs = vectorstore.similarity_search_with_score(query)

    train_rows = faq_dataset['train']
    # Index the questions once per call instead of filtering the whole dataset
    # for every hit. setdefault keeps the first row when a question repeats.
    first_row_for_question = {}
    for row_idx, question in enumerate(train_rows['Question']):
        first_row_for_question.setdefault(question, row_idx)

    res = []
    for doc, _score in docs:
        row_idx = first_row_for_question.get(doc.page_content)
        if row_idx is None:
            # The indexed text has no counterpart in the dataset; skip it
            # rather than fail with an IndexError on an empty match.
            continue
        row = train_rows[row_idx]
        res.append({
            "Question": f"{row['Question']}",
            "Answer": f"{row['Answering']}",
            "URL": f"{row['URL']}",
        })
    return res

In [33]:
# Ad-hoc check of the vector store: run a similarity search for "hi" and
# print the raw results.
embeddings = OpenAIEmbeddings()
vectorstore = Pinecone(index, embeddings.embed_query, "text") # use this for subsequent runs
docs = vectorstore.similarity_search_with_score("hi")
print(docs)

In [34]:
from langchain.prompts.prompt import PromptTemplate
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    AIMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)

def create_an_standard_qa_prompt(res):
    """Build the chat prompt that presents retrieved FAQ rows as few-shot examples.

    Args:
        res: List of dicts with the keys "Question", "Answer" and "URL"; each
            dict is rendered as one example.

    Returns:
        A ChatPromptTemplate made of the `role_prompt` system message and a
        human message built from the examples with the suffix
        "Question: {input}".
    """
    # Layout of a single example.
    example_prompt = PromptTemplate(
        input_variables=["Question", "Answer", "URL"],
        template="Question: {Question}\n{Answer}\nSource:{URL}"
    )

    # The examples come first; the suffix carries the `input` variable.
    fewShotPrompt = FewShotPromptTemplate(
        examples=res,
        example_prompt=example_prompt,
        suffix="Question: {input}",
        input_variables=["input"]
    )

    system_message_prompt = SystemMessage(content=role_prompt)
    human_message_prompt = HumanMessagePromptTemplate(prompt=fewShotPrompt)
    return ChatPromptTemplate.from_messages([system_message_prompt, human_message_prompt])

In [35]:
from langchain.chat_models import ChatOpenAI

# Chat model used by called_llm_decoder_model; created with temperature=0.
chat = ChatOpenAI(temperature=0)

In [36]:
from langchain import LLMChain

def called_llm_decoder_model(query, prompt):
    """Run `prompt` through the module-level `chat` model and return the output.

    `query` is supplied to the chain as the `input` variable.
    """
    return LLMChain(llm=chat, prompt=prompt).run(input=query)


# Gradio

In [37]:
import gradio as gr
# NOTE(review): questions, answers and random_value are not referenced by the
# cells shown here — confirm whether they are still needed.
questions = []
answers = []
import random
# Random integer between 0 and 1000 (both inclusive), drawn once when the cell runs.
random_value = random.randint(0, 1000)

def utd_chatbot(question):
    """Answer a question from the FAQ: retrieve rows, build the prompt, ask the model.

    Args:
        question: The text entered by the user.

    Returns:
        The output of called_llm_decoder_model for the question and the
        prompt built from the retrieved FAQ rows.
    """
    retrieved_rows = query_vector_database(question)
    qa_prompt = create_an_standard_qa_prompt(retrieved_rows)
    # NOTE: post_process() is not applied to the answer.
    return called_llm_decoder_model(question, qa_prompt)

In [42]:
## Launch the Gradio demo: a text-in / text-out interface around utd_chatbot.
## The demo is launched with share=True.
demo = gr.Interface(fn=utd_chatbot, inputs="text", outputs="text")
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7860
Running on public URL: https://e750fb1d-778a-427c.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades (NEW!), check out Spaces: https://huggingface.co/spaces




  0%|          | 0/1 [00:00<?, ?ba/s]

Traceback (most recent call last):
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/gradio/routes.py", line 344, in run_predict
    output = await app.get_blocks().process_api(
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/gradio/blocks.py", line 1012, in process_api
    result = await self.call_function(
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/gradio/blocks.py", line 830, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/anyio/to_thread.py", line 31, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 937, in run_sync_in_worker_thread
    return await future
  File "/Users/ari/miniconda3/envs/aiml/lib/python3.9/site-packages/anyio/_backends/_asyncio.py", line 867, in run
    result = context.run(func, *args)
  File

In [41]:
# Close the demo
demo.close()

Closing server running on port: 7860


# Use LangChain to Generate One Example

In [40]:
from langchain import PromptTemplate


# Prompt text with a single placeholder, {product}, which is filled in when
# the template is formatted.
template = """
I want you to act as a naming consultant for new companies.

Here are some examples of good company names:

- search engine, Google
- social media, Facebook
- video sharing, YouTube

The name should be short, catchy and easy to remember.

What is a good name for a company that makes {product}?
"""

# Wrap the text in a PromptTemplate that declares "product" as its only input variable.
prompt = PromptTemplate(
    input_variables=["product"],
    template=template,
)

from langchain.llms import OpenAI
# Completion model; n and best_of are both set to 2.
llm = OpenAI(model_name="text-davinci-003", n=2, best_of=2)
from rich import print
# NOTE(review): the model is called with a fixed string, so the `prompt`
# template built earlier in this cell is never sent to it — confirm whether
# prompt.format(product=...) was intended here.
print( llm("Tell me about yourself") ) 