In [1]:
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.chat_models import ChatOllama
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
from langchain_core.retrievers import BaseRetriever

from glob import glob
from html2text import HTML2Text
from bs4 import BeautifulSoup
import requests

In [2]:
embedding_model = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L6-v2')
# embedding_model_large = SentenceTransformerEmbeddings(model_name='dunzhang/stella_en_1.5B_v5/2_Dense_1024')
# semantic_chunker = SemanticChunker(embedding_model)

  warn_deprecated(
  from tqdm.autonotebook import tqdm, trange


In [3]:
vector_store = Chroma(collection_name="faq", persist_directory="./db", embedding_function=embedding_model)

In [10]:
def faq_html_parser(html):
    soup = BeautifulSoup(html)
    question = soup.find(id="kb_article_question")
    answer = soup.find(id="kb_article_text")

    if not question or not answer:
        return None
    
    return f"question: {question.text}\n\nanswer: {answer.text}"

class HTMLDirectoryLoader(BaseLoader):
    def __init__(self, dir_path: str, html_parser):
        self.html_parser = html_parser
        self.dir_path = dir_path

    def lazy_load(self) -> Iterator[Document]:
        for path in glob(f'{self.dir_path}/*.html'):
            with open(path, "r") as f:
                html_file = f.read()
            extracted = self.html_parser(html_file)

            if not extracted:
                continue
            
            yield Document(page_content=extracted, metadata={"source": path})

faq_html_loader = HTMLDirectoryLoader("../web-scraper/faq-archive", faq_html_parser)
documents = faq_html_loader.lazy_load()
vector_store.add_documents(documents)

['e490017c-e7f4-454f-aa57-1e6e64499433',
 '25b18c2c-275c-4baf-a832-d6b361ac19ca',
 '2ac90532-0c98-4e67-b781-717f72e871f2',
 'bb351160-6099-4bb4-acec-f4dcb00541be',
 '66107525-47ba-4968-aeca-3b80650a6829',
 '9800b842-14eb-4355-b5b4-d450e658b82e',
 'afb86087-6b93-4d7b-9240-648496beacbf',
 '19261532-5226-467e-ad21-ce1055b57474',
 '8fcd67fd-69c9-46e0-a0ce-ad24fc97dbb8',
 '04ab806b-046e-4597-bc78-2a8c0292017d',
 '09378a29-2f79-4811-98e5-c3f1fec47676',
 '71429892-31ba-478e-a1ed-3515b1956632',
 '82c64dab-6791-4cae-9ff6-58ee40149ff8',
 '386507ec-b58d-442b-8d29-1ea95277d008',
 'c767780a-91b3-4470-b361-747517fe19a3',
 '52563612-8a4d-416f-b622-2100136a1663',
 'b689209c-52c2-460c-8ad3-cd79159a8e78',
 'b6757ae6-381d-4600-9f1c-ad4ff5cb9b22',
 '9905059f-69ec-4672-b5c7-439b49f05e52',
 '2e8170c5-65f4-4ee6-b065-4847215599ad',
 '7fef2c00-5fc3-42ad-aacd-1010865078ab',
 '61b82826-c820-40e0-a66e-94acdf2959e6',
 '1704af74-76cc-4f96-9df7-589876495b9f',
 '1cda08d6-7b5d-4cdc-9a57-e826b26d7eed',
 'c0e9de61-fb81-

In [26]:
from typing import List
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents import Document

class ManoaNewsLoader(BaseLoader):
    def __init__(self) -> None:
        pass

    def lazy_load(self) -> Iterator[Document]:
        base_url = "https://manoa.hawaii.edu/news/archive.php"
        response = requests.get(base_url)

        if response.status_code != 200:
            return []
        
        soup = BeautifulSoup(response.content, 'html.parser')
        news = soup.find("news")
        latest = news.find("div")
        title = latest.text
        news_url = base_url.replace("archive", "article") + "?aId=13339"

        # expanded_response = requests.get(news_url)

        # if response.status_code != 200:
        #     return []
        
        # soup = BeautifulSoup(expanded_response.content, 'html.parser')
        # yield soup
        # expanded_news = soup.find("news")

        yield Document(page_content=f"Current News: {title}\nFor more information, visit this url: {news_url}", metadata={"source": news_url, "title": title})
        
class ManoaNewsRetriever(BaseRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        manoa_news_loader = ManoaNewsLoader()
        collection = Chroma()
        collection.reset_collection()
        news_vector_store = collection.from_documents(manoa_news_loader.lazy_load(), embedding_model)
        return news_vector_store.similarity_search(query, k=2)

manoa_news_retriever = ManoaNewsRetriever()

In [27]:
manoa_news_retriever.invoke("what is the current news?")

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='Current News: Mānoa: New zero-interest loan program for UH engineering students\nFor more information, visit this url: https://manoa.hawaii.edu/news/article.php?aId=13339')]

In [33]:
class ManoaNowNewsLoader(BaseLoader):
    def __init__(self) -> None:
        pass
    
    def lazy_load(self) -> Iterator[Document]:
        base_url = "https://www.manoanow.org/kaleo/news/"
        response = requests.get(base_url)

        h = HTML2Text()

        if response.status_code != 200:
            return []
        
        soup = BeautifulSoup(response.content, 'html.parser')
        news = soup.find_all("article")

        for article in news:
            content = h.handle(str(article))
            yield Document(page_content=f"Current News: {content}", metadata={"source": "https://www.manoanow.org/kaleo/news/"})
        
class ManoaNowNewsRetriever(BaseRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        manoa_now_news_loader = ManoaNowNewsLoader()
        collection = Chroma()
        collection.reset_collection()
        news_vector_store = collection.from_documents(manoa_now_news_loader.lazy_load(), embedding_model)
        return news_vector_store.similarity_search(query, k=2)

manoa_now_news_retriever = ManoaNowNewsRetriever()
manoa_now_news_retriever.invoke("what is the current news?")

[Document(page_content='Current News: [ Lewa i ke alahaka o\nNu‘alolo\n](/kaleo/lewa-i-ke-alahaka-o-nu-\nalolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html)\n\n####  [ Lewa i ke alahaka o Nu‘alolo](/kaleo/lewa-i-ke-alahaka-o-nu-\nalolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html)\n\n  * ʻĀnela Akiu, NHSS Research Center & Program Assistant \n  * Updated  Apr 12, 2024\n  * [ Comments ](/kaleo/lewa-i-ke-alahaka-o-nu-alolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html#comments)\n\nHawai‘inuiākea, established in 2007, is the only college of Indigenous\nknowledge in a Research I institution. The five different branches of the\nschool include: Kamakakūokalani, or Center of Hawaiian Studies; Kawaihuelani,\nor Center for Hawaiian Language; Ka Papa Lo‘i O Kānewai Cultural Gard…\n\n'),
 Document(page_content='Current News: [ ![Ka waiwai o ka Mahina ʻŌlelo\

In [4]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.retrievers import EnsembleRetriever

llm = ChatOllama(model="llama3", temperature=0)
retriever = vector_store.as_retriever(search_kwargs={"k": 1})
combined_retriever = EnsembleRetriever(retrievers=[retriever, ])

### Contextualize question ###
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, combined_retriever, contextualize_q_prompt
)

### Answer question ###
system_prompt = (
    "You are an assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If the answer DOES NOT appear in the context, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise. DO NOT mention the context, users do not see it."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [10]:
store = {}

In [11]:
while True:
    user_input = input()
    print(user_input)
    
    if not user_input:
        break

    answer = conversational_rag_chain.invoke(
        {"input": user_input},
        config={
            "configurable": {"session_id": "1"}
        },
    )["answer"]

    print(answer)

where is hawaii located?
I don't know.

