In [1]:
from glob import glob
from typing import Iterator
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from html2text import HTML2Text
from langchain_chroma import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import PromptTemplate
from langchain_experimental.text_splitter import SemanticChunker

In [2]:
# Sentence-transformers embedding model; it is passed to the Chroma vector
# store as its embedding function.
EMBEDDING_MODEL_NAME = 'paraphrase-MiniLM-L6-v2'
embedding_model = SentenceTransformerEmbeddings(model_name=EMBEDDING_MODEL_NAME)
# Semantic chunking is currently switched off:
# semantic_chunker = SemanticChunker(embedding_model)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [3]:
# Chroma collection "faq", persisted under ./db, embedding with embedding_model.
vector_store = Chroma(
    embedding_function=embedding_model,
    persist_directory="./db",
    collection_name="faq",
)

In [None]:
def faq_html_parser(html):
    """Extract the question/answer pair from one FAQ article page.

    Args:
        html: Markup of the page, in any form ``BeautifulSoup`` accepts.

    Returns:
        A string made of ``question: <text>``, a blank line, then
        ``answer: <text>``, using the text of the elements whose ids are
        ``kb_article_question`` and ``kb_article_text``. ``None`` when either
        element is missing from the page.
    """
    # Name the parser explicitly: with no parser argument bs4 picks whichever
    # parser happens to be installed and emits a warning, so the same page can
    # parse differently from one environment to the next.
    soup = BeautifulSoup(html, "html.parser")
    question = soup.find(id="kb_article_question")
    answer = soup.find(id="kb_article_text")

    if question is None or answer is None:
        return None

    return f"question: {question.text}\n\nanswer: {answer.text}"

class HTMLDirectoryLoader(BaseLoader):
    """Load every ``*.html`` file directly inside a directory as a ``Document``.

    Each file is read as text and handed to ``html_parser``. Files for which
    the parser returns a falsy value are skipped.
    """

    def __init__(self, dir_path: str, html_parser, encoding: str = "utf-8"):
        """Configure the loader.

        Args:
            dir_path: Directory searched (non-recursively) for ``*.html`` files.
            html_parser: Callable that takes a file's HTML text and returns the
                page content string, or a falsy value to skip that file.
            encoding: Text encoding used to read each file.
        """
        self.html_parser = html_parser
        self.dir_path = dir_path
        self.encoding = encoding

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per parsable HTML file.

        The document's ``source`` metadata is the path of the file it came from.
        """
        # glob returns matches in arbitrary order; sorting keeps ingestion
        # order stable from run to run.
        for path in sorted(glob(f'{self.dir_path}/*.html')):
            # Explicit encoding: the platform default is locale dependent and
            # can mis-decode or reject UTF-8 pages.
            with open(path, "r", encoding=self.encoding) as f:
                html_file = f.read()
            extracted = self.html_parser(html_file)

            if not extracted:
                continue

            yield Document(page_content=extracted, metadata={"source": path})

# Index the scraped FAQ archive into the vector store.
# NOTE: re-running this cell adds the same documents again to the persisted
# collection; run it once per fresh ./db.
faq_html_loader = HTMLDirectoryLoader("../web-scraper/faq-archive", faq_html_parser)

# Materialise the generator first: add_documents expects a list and walks it
# more than once, so a one-shot iterator would be exhausted after the first
# pass and the later passes would see nothing.
documents = list(faq_html_loader.lazy_load())

# Nothing parsed means nothing to embed; skip the call instead of sending an
# empty batch to the store.
if documents:
    vector_store.add_documents(documents)

In [15]:
class ManoaNewsLoader(BaseLoader):
    """Load the first headline on the UH Manoa news archive page as a ``Document``."""

    def __init__(
        self,
        base_url: str = "https://manoa.hawaii.edu/news/archive.php",
        timeout: float = 10.0,
    ) -> None:
        """Configure the loader.

        Args:
            base_url: URL of the news archive page to scrape.
            timeout: Seconds to wait for the HTTP response before giving up.
        """
        self.base_url = base_url
        self.timeout = timeout

    def lazy_load(self) -> Iterator[Document]:
        """Yield at most one ``Document`` describing the first listed article.

        Yields nothing when the page does not answer with HTTP 200 or when the
        expected ``<news>`` / ``<div>`` elements are absent.

        Raises:
            requests.RequestException: On connection failure or timeout.
        """
        # A timeout keeps a stalled server from hanging the notebook forever.
        response = requests.get(self.base_url, timeout=self.timeout)

        # Best effort: a failed fetch produces no documents. A bare return is
        # used because a value returned from a generator is never seen by the
        # for-loop consuming it.
        if response.status_code != 200:
            return

        soup = BeautifulSoup(response.content, 'html.parser')
        news = soup.find("news")
        latest = news.find("div") if news is not None else None
        if latest is None:
            # Page layout is not what this scraper expects.
            return

        title = latest.text.strip()

        # Take the article link from the page instead of a fixed article id, so
        # the URL always belongs to the headline that was just read.
        # NOTE(review): assumes the headline's <a> is inside, or wraps, the
        # first <div> -- confirm against the live markup. When no link is
        # found the archive page itself is used.
        link = latest.find("a", href=True) or latest.find_parent("a", href=True)
        news_url = urljoin(self.base_url, link["href"]) if link is not None else self.base_url

        yield Document(page_content=f"Current News: {title}\nFor more information, visit this url: {news_url}", metadata={"source": news_url, "title": title})


manoa_news_loader = ManoaNewsLoader()
for doc in manoa_news_loader.lazy_load():
    print(doc)

page_content='Current News: Mānoa: New zero-interest loan program for UH engineering students
For more information, visit this url: https://manoa.hawaii.edu/news/article.php?aId=13339' metadata={'source': 'https://manoa.hawaii.edu/news/article.php?aId=13339', 'title': 'Mānoa: New zero-interest loan program for UH engineering students'}


In [13]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory

# Chat model: Llama 3 served through Ollama, with temperature 0.
llm = ChatOllama(temperature=0, model="llama3")
# Retriever over the FAQ vector store, using its default search settings.
retriever = vector_store.as_retriever()

### Contextualize question ###
# System instruction for the rewrite step: turn a follow-up message into a
# question that can be understood without the chat history.
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question which might "
    "reference context in the chat history, formulate a standalone "
    "question which can be understood without the chat history. "
    "Do NOT answer the question, just reformulate it if needed "
    "and otherwise return it as is."
)

# Rewrite prompt: system instruction, then the prior turns, then the newest
# user message.
contextualize_q_prompt = ChatPromptTemplate.from_messages([
    ("system", contextualize_q_system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Retriever built from the LLM, the base retriever and the rewrite prompt.
history_aware_retriever = create_history_aware_retriever(
    llm=llm,
    retriever=retriever,
    prompt=contextualize_q_prompt,
)

### Answer question ###
# System instruction for the answering step; the retrieved documents are
# substituted for the {context} placeholder at the end.
system_prompt = (
    "You are an assistant for question-answering tasks. Use the "
    "following pieces of retrieved context to answer the question. "
    "If you don't know the answer, say that you don't know. "
    "Use three sentences maximum and keep the answer concise. "
    "DO NOT mention the context, users do not see it."
    "\n\n{context}"
)

# Answer prompt: system instruction (with retrieved context), prior turns,
# then the newest user message.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])

# Chain that feeds the retrieved documents to the LLM through qa_prompt.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=qa_prompt)

# Full RAG pipeline: history-aware retrieval followed by answering.
rag_chain = create_retrieval_chain(
    retriever=history_aware_retriever,
    combine_docs_chain=question_answer_chain,
)

# In-memory chat histories, keyed by session id.
store = {}


def get_session_history(session_id: str) -> BaseChatMessageHistory:
    """Return the chat history stored for ``session_id``.

    An empty ``ChatMessageHistory`` is created and stored the first time a
    session id is seen.
    """
    try:
        return store[session_id]
    except KeyError:
        history = ChatMessageHistory()
        store[session_id] = history
        return history

# RAG chain wrapped with per-session message history. Histories come from
# get_session_history; the keys below name the user-input field, the
# prompt's history placeholder and the output field holding the answer.
conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    output_messages_key="answer",
    history_messages_key="chat_history",
    input_messages_key="input",
)

In [28]:
# Reset cell: rebind store to a fresh, empty dict so every session starts over
# (get_session_history looks the name up again on each call).
store = dict()

In [27]:
# Interactive chat: read a line, answer it within session "1", and stop as
# soon as an empty line is entered.
while user_input := input():
    response = conversational_rag_chain.invoke(
        {"input": user_input},
        config={"configurable": {"session_id": "1"}},
    )
    answer = response["answer"]

    print(answer)

The phone number for ITS (Information Technology Services) at UH is 808-956-6033.
According to the context provided earlier, the phone numbers mentioned are:

* Directory Assistance (1-808-555-1212) - $2.49 per call
* Directory Assistance (9+00) - $7.95 per call

Let me know if you need any further assistance!
