In [None]:
!pip install websocket
!pip install websocket-client
!pip install supabase tiktoken openai langchain

from IPython.display import clear_output
clear_output()

In [None]:
import os
from Perplexity import Perplexity
from supabase import create_client, Client
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import MarkdownHeaderTextSplitter
from langchain.vectorstores import SupabaseVectorStore
from langchain.document_loaders import TextLoader
from langchain.document_loaders.csv_loader import CSVLoader
from dotenv import load_dotenv

# Run python-dotenv's loader first so the lookups below can see its values.
load_dotenv()

# Redis connection settings; read by get_all_failed_queries further down.
# os.getenv returns None for any variable that is not set.
HOSTNAME = os.getenv("HOSTNAME")
PORT = os.getenv("PORT")
PASSWORD = os.getenv("PASSWORD")

# Service credentials, kept as module-level constants.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
SUPABASE_URL = os.getenv("SUPABASE_URL")
SUPABASE_SERVICE_KEY = os.getenv("SUPABASE_SERVICE_KEY")

# Built with no explicit arguments.
# NOTE(review): presumably picks up OPENAI_API_KEY from the environment -- confirm.
embeddings = OpenAIEmbeddings()

# The Supabase settings are indexed straight from os.environ here, so a
# missing variable raises KeyError at this point instead of passing None on.
client: Client = create_client(
    os.environ["SUPABASE_URL"],
    os.environ["SUPABASE_SERVICE_KEY"],
)

In [14]:
# Build the list of majors: one entry per non-blank line of the text file,
# with leading/trailing whitespace removed from every entry.
file_path = "./major_list.txt"
major_list = []

with open(file_path, "r") as file:
    major_list.extend(entry for entry in map(str.strip, file) if entry)

# Last expression of the cell: display the loaded list.
major_list

['Architecture', 'Architectural Studies', 'Undergraduate Business', 'Bioengineering', 'Chemical And Biomolecular Engineering', 'Civil and Environmental Engineering', 'Computational and Applied Mathematics and Operations Research', 'Computer Science', 'Electrical and Computer Engineering', 'Materials Science and Nanoengineering', 'Mechanical Engineering', 'Statistics', 'Ancient Mediterranean Civilizations', 'Art History', 'Asian Studies', 'Classical Studies', 'English', 'European Studies', 'French Studies', 'German Studies', 'History', 'Latin American and Latinx Studies', 'Medieval and Early Modern Studies', 'Philosophy', 'Religion', 'Spanish, Portuguese and Latin American Studies', 'Study of Women, Gender and Sexuality', 'Visual and Dramatic Arts', 'Bassoon Performance', 'Cello Performance', 'Clarinet Performance', 'Composition', 'Double Bass Performance', 'Flute Performance', 'Harp Performance', 'Horn Performance', 'Music', 'Music History', 'Music Theory', 'Oboe Performance', 'Organ P

In [111]:
import csv
import textwrap
from tqdm import tqdm

# academic
# questions = [f'What academic resources are there for rice university students who major in {major}?' for major in major_list]
# PHD
# questions = [f'I am currently an undergrad at Rice major in {major}. I want to pursue PHD after graduation. What resouces are there at Rice to help me prepare and apply for PHD in {major}?' for major in major_list]
# programs
# questions = [f'What extra-curricular programs are there for Rice University students major in {major}?' for major in major_list]
# career
# questions = [f'What career resources are there for Rice University students major in {major}?' for major in major_list]

# questions = ['What entrepreneurship programs and resources are there at Rice?']
# questions = ['How and where to reach out Rice University Alumni networks?']
# questions = ['Which rice alumnis from Rice are working on a startp?']
# questions = ['What resources are there for Rice students for investment banking recruiting?']
questions = []
failed_questions = []
question_answer_pair = []

# Send every question to Perplexity AI and keep the answer together with a
# numbered list of the URLs it cited. Questions that raise are printed,
# remembered in `failed_questions`, and skipped.
for question in tqdm(questions):
    try:
        perplexity = Perplexity()
        response = perplexity.search(question)

        answer = response.json_answer_text['answer']
        web_results = response.json_answer_text['web_results']

        # `{[n]}` formats a one-element list, which renders as "[n]".
        citation_lines = [
            f'{[position]} {web_result["url"]} \n'
            for position, web_result in enumerate(web_results, start=1)
        ]
        answer += '\n\ncitations: \n' + ''.join(citation_lines)

        # Flatten the text so that each answer occupies a single line.
        dedented = textwrap.dedent(answer)
        answer = dedented.strip().replace('\n', ' ')

        question_answer_pair.append({'question': question, 'answer': answer})
    except Exception as e:
        print(f"Error {str(e)} occurred while processing the question: {question}")
        failed_questions.append(question)

# Persist the collected pairs to a local CSV file (overwritten on every run).
with open("./faq.csv", mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=['question', 'answer'])
    writer.writeheader()
    writer.writerows(question_answer_pair)

  0%|          | 0/1 [00:00<?, ?it/s]2023-07-29 17:00:19,673:INFO - Websocket connected
100%|██████████| 1/1 [00:08<00:00,  8.49s/it]


In [110]:
# Load the FAQ CSV written by the question/answer cell; the `question`
# column is used as the source of each loaded document.
# NOTE(review): CSVLoader presumably yields one document per CSV row -- confirm.
loader = CSVLoader(file_path="./faq.csv", source_column="question")
data = loader.load()

# Vector store backed by the `faq` table of the Supabase project, embedding
# documents with the OpenAI embeddings object created during setup.
vector_store = SupabaseVectorStore(
    client=client,
    embedding=embeddings,
    table_name='faq',
)

# NOTE(review): re-running this cell presumably inserts the same documents
# again -- confirm whether duplicates in the table are acceptable.
# Last expression of the cell: the value returned by add_documents is displayed.
vector_store.add_documents(data)

['02e71a1e-489c-49e7-bc43-31c60016d024',
 '7d9a6d0a-dd7e-4967-8368-925105e2cff9',
 '62fcb79f-e77b-4dae-b316-389b72e6e44c',
 'c0fcdeac-bdcd-4333-8f1c-9a247fbdde98',
 'bc1463f4-d700-4d01-803d-0dcefd51411a',
 '1d5661d3-e0e3-484c-8175-91c3a9bd4f76',
 '4aec2d10-e29b-4216-bec9-00a0d7d68e55',
 'fd9232d7-e84a-4673-b92d-89b56919b1ab',
 'f834bd2d-39bd-4f45-ade6-9fae72067ff1',
 '392d8e2d-fe79-4d0a-b25d-ac9773735599',
 '4eb2b276-0479-48bd-baa9-afbb911d9baa',
 'bd965405-046c-419f-bb18-e96aefd64614',
 'ab98ea07-7cee-4a46-b504-3d33d5d9aef1',
 'b19ac410-b861-42a5-b8fa-de52ae45b2c3',
 'f86521fd-b43f-455e-a70f-5cf2530faf75',
 'c1c7298b-2ed0-41e2-9d01-b50ca6fe2e95',
 'c4d2286b-882e-4ada-8fd1-c91e6f09db6e',
 'e4ac6eaa-e02a-4169-bd81-28338c7faa02',
 'ef3ac22c-cc82-49e1-bee6-a462d2e7c1eb',
 'd8c8fd7a-2c04-425c-a75f-b1f9566f3dce',
 'a5969bf2-2239-44ea-9f6c-206a684aff3c',
 'e30a433c-0604-4ace-b264-87ee20660ff7',
 '2ccd4915-835d-4a63-900b-27cad94e3476',
 '5e3c056b-9a69-40ce-9bc4-d822c64d2dfc',
 'ca22b2e8-5c99-

### Self-learn based on user chat history:

In [None]:
import redis
import json
import textwrap

def get_all_failed_queries():
    """Collect the user messages that the assistant answered with an apology.

    Connects to Redis with the module-level HOSTNAME, PORT and PASSWORD,
    visits every key matching ``chat:*``, decodes the JSON stored in the
    ``messages`` field of each hash, and records the content of every
    ``user`` message that is immediately followed by an ``assistant``
    message whose text contains "sorry" (case-insensitive).

    Chats that cannot be inspected (no ``messages`` field, malformed JSON,
    or a payload that is not a list) are skipped instead of aborting the
    whole scan.

    Returns:
        list: contents of the user messages considered failed, in the order
        in which they were encountered.
    """
    def connect_redis():
        # TLS connection; decode_responses=True makes Redis return str
        # instead of bytes, which json.loads and the key matching rely on.
        return redis.Redis(
            host=HOSTNAME,
            port=PORT,
            password=PASSWORD,
            ssl=True,
            decode_responses=True
        )

    r = connect_redis()

    all_fail_query = []

    # SCAN walks the keyspace incrementally, whereas KEYS blocks the server
    # until the whole keyspace has been examined. SCAN may report the same
    # key more than once, hence the de-duplication set.
    seen_keys = set()
    for hash_key in r.scan_iter(match='chat:*'):
        if hash_key in seen_keys:
            continue
        seen_keys.add(hash_key)

        raw_messages = r.hgetall(hash_key).get('messages')
        if not raw_messages:
            # Hash without a (non-empty) `messages` field: nothing to inspect.
            continue

        try:
            messages = json.loads(raw_messages)
        except json.JSONDecodeError as e:
            print(f"Skipping {hash_key}: 'messages' is not valid JSON ({e})")
            continue
        if not isinstance(messages, list):
            continue

        # Pair every message with the one that directly precedes it.
        for prev_message, message in zip(messages, messages[1:]):
            if message.get('role') != 'assistant':
                continue
            content = message.get('content')
            # `content` can be absent or null; only text can hold an apology.
            if not isinstance(content, str) or 'sorry' not in content.lower():
                continue
            if prev_message.get('role') == 'user' and 'content' in prev_message:
                all_fail_query.append(prev_message['content'])

    return all_fail_query


def perplexity_solve_with_rice_context(question):
    """Ask Perplexity a question, scoping it to Rice University if needed.

    When the question does not already contain "rice" (case-insensitive),
    it is prefixed with 'At Rice university: ' before being sent. The answer
    text is followed by a numbered list of the cited URLs and then flattened
    onto a single line.

    Args:
        question: the question text to search for.

    Returns:
        A ``(question, answer)`` tuple, where ``question`` is the possibly
        prefixed question that was actually sent, or ``None`` when any
        exception is raised along the way (the error is printed).
    """
    try:
        mentions_rice = "rice" in question.lower()
        if not mentions_rice:
            question = 'At Rice university: ' + question

        perplexity = Perplexity()
        response = perplexity.search(question)

        answer_text = response.json_answer_text['answer']
        web_results = response.json_answer_text['web_results']

        # `{[n]}` formats a one-element list, which renders as "[n]".
        numbered_urls = [
            f'{[position]} {web_result["url"]} \n'
            for position, web_result in enumerate(web_results, start=1)
        ]
        answer_text = answer_text + '\n\ncitations: \n' + ''.join(numbered_urls)

        # Collapse the whole answer onto one line.
        dedented = textwrap.dedent(answer_text)
        flattened = dedented.strip().replace('\n', ' ')
    except Exception as e:
        print(f"Error {str(e)} occurred while processing the question: {question}")
        return None

    return question, flattened