# Import all your libraries

In [7]:
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.faiss import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain import OpenAI, VectorDBQA

# Load the Dataset

In [8]:
from datasets import load_dataset
from rich import print
from rich import print

faq_dataset = load_dataset("csv" , data_files="/home/null/code/utd_chatbot/FAQ on UTD Student Accesebility - Sheet1.csv")
print(faq_dataset)
## remove all the None values
faq_dataset = faq_dataset.filter(lambda x: x['Question'] is not None and x['Answering'] is not None)
## reaplace all the word ARC to AccessAbility Resource Center and
'''
Office location: Administration Building, Room 2.224
Email: studentaccess@utdallas.edu (Do not email attachments, upload documents to utd.link/arcupload only.)
Phone: (972) 883-2098
Fax: Please don’t fax, use utd.link/arcupload
Mail: AD 30, 800 West Campbell Rd., Richardson TX 75080
'''
faq_dataset = faq_dataset.map(lambda x: {'Question': x['Question'].replace('ARC', 'AccessAbility Resource Center'), 'Answering': x['Answering'].replace('ARC', 'AccessAbility Resource Center')})

  from .autonotebook import tqdm as notebook_tqdm
Using custom data configuration default-b5279144beba40c5
Found cached dataset csv (/home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
100%|██████████| 1/1 [00:00<00:00, 200.57it/s]


Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-d8b863d6763fbde1.arrow
Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-0d1da929658a0122.arrow


# Preprocess the dataset

## remove all the None values

In [9]:
## remove all the None values
faq_dataset = faq_dataset.filter(lambda x: x['Question'] is not None and x['Answering'] is not None)

Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-f0be635390229960.arrow


## Replace all the ARC to Accessible Resource Center

In [10]:
# replace all the word UTD to University of Texas at Dallas
# replace all the ARC to AccessAbility Resource Center
faq_dataset = faq_dataset.map(lambda x: {'Question': x['Question'].replace('UTD', 'University of Texas at Dallas'), 'Answering': x['Answering'].replace('UTD', 'University of Texas at Dallas')})
faq_dataset = faq_dataset.map(lambda x: {'Question': x['Question'].replace('ARC', 'AccessAbility Resource Center'), 'Answering': x['Answering'].replace('ARC', 'AccessAbility Resource Center')})

Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-9c8a39eaf810fe9d.arrow
Loading cached processed dataset at /home/null/.cache/huggingface/datasets/csv/default-b5279144beba40c5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317/cache-c63ea3f7cd72048e.arrow


# Load the Open AI Emebddigns

In [11]:
import openai
from dotenv import load_dotenv
import os
load_dotenv()  # take environment variables from .env.

OPENKEY_API = os.getenv("OPENAI_KEY")
ORGANIZATION_ID = os.getenv("ORGANIZATION_ID")
openai.organization = ORGANIZATION_ID
# get this from top-right dropdown on OpenAI under organization > settings
openai.api_key = OPENKEY_API
# get API key from top-right dropdown on OpenAI website

openai.Engine.list()  # check we have authenticated
## model of choices
MODEL = "text-similarity-babbage-001"

# Ini the Pinecone Client

In [12]:
import pinecone
PINECONE_APIKEY = os.getenv("PINECONE_APIKEY")
# initialize connection to pinecone (get API key at app.pinecone.io)
pinecone.init(
    api_key = PINECONE_APIKEY,
    environment="us-east1-gcp"
)

In [13]:
index = pinecone.Index(index_name="utd-chabot")

# Query a Vector Database

In [14]:
def query_vector_database(query):
    xq = openai.Embedding.create(input=query, engine=MODEL)['data'][0]['embedding']
    res = index.query([xq], top_k=5, include_metadata=True)
    return res

# Role Prompt

In [15]:
role_prompt = """
Please act as a University of Texas at Dallas Counselor. I will provide you with an individual 
looking for guidance at the University of Texas at Dallas, and your task is to help them 
solve their problem\n
"""

# Create an standard QA Propmt

In [16]:
def create_an_standard_qa_prompt( query , res):
    import copy
    role_prompt_temp =  copy.deepcopy(role_prompt)
    for query_response in  res["matches"]:
        try:
            answer = faq_dataset['train'].filter(lambda x: x['Question'] == query_response['metadata']['text'])['Answering'][0]
            role_prompt_temp += f"Q: {query_response['metadata']['text']}\nA: {answer}\n"
        except:
            print(f"The question {query_response['metadata']['text']} is not in the dataset")
            print(f"Errors")
            pass
    prompt = role_prompt_temp + f"Q: {query}\nA:Let’s think step by step."
    return prompt

# Called a LLM Decoder Model

In [17]:
def called_llm_decoder_model(prompt):
  response = openai.Completion.create(
    model="text-davinci-002",
    prompt =  prompt,
    temperature = 0.8,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
  )
  return response["choices"][0]["text"]

# Post Process of the Output

In [18]:
def post_process(ans):
    if "ARC" in ans:
        ans = ans.replace("ARC", "AccessAbility Resource Center")
        ans += """ 
        Office location: Administration Building, Room 2.224 \n
        Email: studentaccess@utdallas.edu (Do not email attachments, upload documents to utd.link/arcupload only.) \n
        Phone: (972) 883-2098 \n
        Fax: Please don’t fax, use utd.link/arcupload \n
        Mail: AD 30, 800 West Campbell Rd., Richardson TX 75080    Office location: Administration Building, Room 2.224\n
        """
    return ans

# Gradio

In [19]:
import gradio as gr
questions = []
answers = []
import random
random_value = random.randint(0, 1000)

def utd_chatbot( question ):
    ## store the question in the csvs file
    print(question)
    res = query_vector_database(question)
    print(res)
    propmt = create_an_standard_qa_prompt(question , res)
    print(propmt)
    ans = called_llm_decoder_model(propmt)
    post_process(ans)
    try:
        import pandas as pd
        questions.append( question )
        answers.append( ans )
        data = pd.DataFrame( {"questions": questions, "answers": answers} , columns=["questions", "answers"] )
        data.to_csv(f"user_response_{random_value}.csv")
    except:
        pass
    return ans

In [None]:
## Lauch of Gradio
demo = gr.Interface(fn=utd_chatbot, inputs="text", outputs="text")
demo.launch( share = True )   

In [21]:
# Close the a demo
demo.close()

Closing server running on port: 7860


# Use Language Chain to Generate one Example

In [22]:
from langchain import PromptTemplate


template = """
I want you to act as a naming consultant for new companies.

Here are some examples of good company names:

- search engine, Google
- social media, Facebook
- video sharing, YouTube

The name should be short, catchy and easy to remember.

What is a good name for a company that makes {product}?
"""

prompt = PromptTemplate(
    input_variables=["product"],
    template=template,
)

from langchain.llms import OpenAI
llm = OpenAI(model_name="text-davinci-003", n=2, best_of=2)
from rich import print
print( llm("Tell me about yourself") ) 

# This code Example I want to store all the generated response from the LLM from the dataset of FAQ in a dataset to check if the performances

In [35]:
import time
from langchain.llms import OpenAI
## loop through the dataset
no_template = []
for split  in faq_dataset.keys():
    for i in range( len(faq_dataset[split]) ):
        ## Question
        question = faq_dataset[split]["Question"][i]
        ## Answering
        answers = faq_dataset[split]["Answering"][i]
        time.sleep( 30 ) # https://help.openai.com/en/articles/5955598-is-api-usage-subject-to-any-rate-limits
        no_template.append( [question, answers , llm(question)] )
        

RateLimitError: The server had an error while processing your request. Sorry about that!