In [1]:
from typing import Iterator
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_community.embeddings import SentenceTransformerEmbeddings
from sentence_transformers import SentenceTransformer
from langchain_community.chat_models import ChatOllama
from langchain_experimental.text_splitter import SemanticChunker
from langchain_chroma import Chroma
from langchain_core.retrievers import BaseRetriever

from glob import glob
from html2text import HTML2Text
from bs4 import BeautifulSoup
import requests

  from tqdm.autonotebook import tqdm, trange


In [2]:
from langchain_huggingface import HuggingFaceEmbeddings

model_name = 'dunzhang/stella_en_400M_v5'
model_kwargs = {'device': 'cuda', "trust_remote_code": True}

embedding_model = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

Some weights of the model checkpoint at dunzhang/stella_en_400M_v5 were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
# embedding_model = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L6-v2')
# embedding_model_large = SentenceTransformerEmbeddings(model_name='dunzhang/stella_en_400M_v5')
# semantic_chunker = SemanticChunker(embedding_model)

In [3]:
vector_store = Chroma(collection_name="faq", persist_directory="./db", embedding_function=embedding_model)

In [5]:
# vector_store.reset_collection()
vector_store.similarity_search("where is the ITS building located?")

[Document(metadata={'source': '../web-scraper/faq-archive/383.html'}, page_content='question: \nWhere is the ITS Hamilton Lab located?\n\nanswer: \nThe ITS Hamilton Lab is located on the first floor of the Hamilton Library.\n'),
 Document(metadata={'source': '../web-scraper/faq-archive/1545.html'}, page_content='question: \nIT Center FAQs\n\nanswer: \nBelow you will find some of the most commonly asked questions concerning the new University of Hawaii Information Technology Center on the UH Manoa campus. The structure is designed to support modern teaching, administration and research for all ten UH campuses and students throughout the state of Hawaii. Click on a question to access its\' corresponding answer. If you have  comments/questions please email them to:  itctrcom@hawaii.edu\n\nUpdates on construction may be read on the  \tUH IT Center webpage\nFollow construction images of the building on the  UH Flickr page \n\xa0 \n\nWhat is the official name of the IT Center?\nWhat is the c

In [39]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,   # Number of characters per chunk
    chunk_overlap=50  # Number of overlapping characters between chunks
)

In [None]:
import re

def faq_html_parser(html):
    soup = BeautifulSoup(html)
    question = soup.find(id="kb_article_question")
    answer = soup.find(id="kb_article_text")

    if not question or not answer:
        return None
    
    qa = f"question: {question.text}\n\nanswer: {answer.text}"
    removed_repeating_newlines = re.sub(r'\n{3,}', '\n\n', qa)

    return removed_repeating_newlines

class HTMLDirectoryLoader(BaseLoader):
    def __init__(self, dir_path: str, html_parser):
        self.html_parser = html_parser
        self.dir_path = dir_path

    def lazy_load(self) -> Iterator[Document]:
        for path in glob(f'{self.dir_path}/*.html'):
            with open(path, "r") as f:
                html_file = f.read()
            extracted = self.html_parser(html_file)

            if not extracted:
                continue
            
            yield Document(page_content=extracted, metadata={"source": path})

faq_html_loader = HTMLDirectoryLoader("../web-scraper/faq-archive", faq_html_parser)
faq_documents = list(faq_html_loader.lazy_load())
# faq_split_documents = text_splitter.split_documents(faq_documents)
vector_store.add_documents(faq_documents)

In [5]:
vector_store.similarity_search("duo mobile")

[Document(metadata={'source': '../web-scraper/faq-archive/1859.html'}, page_content='question: \nDuo Mobile app 4.0 update\n\nanswer: \nOverview\nThe Duo Mobile 4.0 version of the app is a significant update to the user interface.\xa0 This update will be released on the following schedule:\n\nFor iOS: October 11 through 18, 2021\nFor Android: October 11 through 15, 2021\n\nDuo Mobile 4.0 requires\n\niOS 13 and up\nAndroid 8 and up\n\nOlder versions of the Duo Mobile app will continue to work.\nSummary of changes\nThe following are some of the changes to the Duo Mobile app in version 4.0.\nNew Duo Approve screen\nIn versions prior to 4.0, the Approve button was on the left and the Deny button was on the right.\xa0 In Duo Mobile 4.0, the Approve button has been moved to the right, and the Deny button moved to the left.\xa0 Duo made this change to follow the industry standard of placing positive actions on the right (move forward in the flow of action), and negative actions on the left (m

In [80]:
import json

def json_parser(d):
    return {
        "page_content": d["extracted"],
        "metadata" : {"source": d["url"]}
    }

class JSONFileLoader(BaseLoader):
    def __init__(self, json_path: str, json_parser):
        self.json_path = json_path
        self.json_parser = json_parser

    def lazy_load(self) -> Iterator[Document]:
        with open(self.json_path, "r") as f:
            json_data = json.load(f)
        
        for d in json_data:
            parsed = self.json_parser(d)

            yield Document(
                page_content=parsed["page_content"],
                metadata=parsed["metadata"]
            )

json_file_loader = JSONFileLoader("../web-scraper/data/urls.json", json_parser)
json_documents = json_file_loader.lazy_load()
json_split_documents = text_splitter.split_documents(json_documents)
vector_store.add_documents(json_split_documents)

['f45f49ee-c8bf-4f47-915a-b5b5fd36d1bd',
 'f1bf7ed1-54bd-4b1d-9fc1-6b6260a04309',
 '07dd924c-add7-41c5-879e-b1c0aef9c845',
 '79fcf1cd-08d9-42b6-b7bb-0cfe1e8413ce',
 '3eb6f3aa-bac7-4359-9156-a4bd23bb0221',
 'fb38cbcd-9cc8-412f-8b1d-2fc1c3795691',
 'c8dcff27-8cf7-4ea1-9497-9729bcf55ea9',
 'bb57fa66-d4a4-442d-93ab-ea6ea3bab4e7',
 'e0eaffab-ac7d-4cfd-99d0-f4137e1ee279',
 'cd6d5573-172c-42bd-a9aa-208611ae37a9',
 '64586433-b173-4c48-9cd8-8099a86e3154',
 'c55a9fb9-456a-4ef3-ab05-15f5e0ffebdd',
 'c44ec07b-664c-48a4-9ade-06105e08e6a6',
 'c9bf71c2-0122-4118-a27c-13f88b96df0d',
 '3720e295-35b7-4bf2-a4ac-c8ac6b359a97',
 'eb7bf848-b9ec-4ec0-9c56-022aed5232d0',
 '27677084-5865-4731-9dc0-11e4703c3b39',
 'f6c687cf-fced-4658-b166-a1a0971abecd',
 'bcdbf7cb-fb07-4ccc-89dd-3179217b2780',
 'a568b444-4e27-4146-9199-bed83e084367',
 '4a7b9fad-e802-4b15-828a-c35de0d28bdd',
 'c62e528a-ec57-45ad-b1da-d4bc8de61740',
 '09edfa60-69cd-47d9-b202-b38ae7e08ea7',
 '733c2fd9-441a-44a1-8ae1-2cd0b6eabaa5',
 '0bf2c19c-0cc9-

In [26]:
from typing import List
from langchain_core.callbacks.manager import CallbackManagerForRetrieverRun
from langchain_core.documents import Document

class ManoaNewsLoader(BaseLoader):
    def __init__(self) -> None:
        pass

    def lazy_load(self) -> Iterator[Document]:
        base_url = "https://manoa.hawaii.edu/news/archive.php"
        response = requests.get(base_url)

        if response.status_code != 200:
            return []
        
        soup = BeautifulSoup(response.content, 'html.parser')
        news = soup.find("news")
        latest = news.find("div")
        title = latest.text
        news_url = base_url.replace("archive", "article") + "?aId=13339"

        # expanded_response = requests.get(news_url)

        # if response.status_code != 200:
        #     return []
        
        # soup = BeautifulSoup(expanded_response.content, 'html.parser')
        # yield soup
        # expanded_news = soup.find("news")

        yield Document(page_content=f"Current News: {title}\nFor more information, visit this url: {news_url}", metadata={"source": news_url, "title": title})
        
class ManoaNewsRetriever(BaseRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        manoa_news_loader = ManoaNewsLoader()
        collection = Chroma()
        collection.reset_collection()
        news_vector_store = collection.from_documents(manoa_news_loader.lazy_load(), embedding_model)
        return news_vector_store.similarity_search(query, k=2)

manoa_news_retriever = ManoaNewsRetriever()

In [27]:
manoa_news_retriever.invoke("what is the current news?")

Number of requested results 2 is greater than number of elements in index 1, updating n_results = 1


[Document(page_content='Current News: Mānoa: New zero-interest loan program for UH engineering students\nFor more information, visit this url: https://manoa.hawaii.edu/news/article.php?aId=13339')]

In [33]:
class ManoaNowNewsLoader(BaseLoader):
    def __init__(self) -> None:
        pass
    
    def lazy_load(self) -> Iterator[Document]:
        base_url = "https://www.manoanow.org/kaleo/news/"
        response = requests.get(base_url)

        h = HTML2Text()

        if response.status_code != 200:
            return []
        
        soup = BeautifulSoup(response.content, 'html.parser')
        news = soup.find_all("article")

        for article in news:
            content = h.handle(str(article))
            yield Document(page_content=f"Current News: {content}", metadata={"source": "https://www.manoanow.org/kaleo/news/"})
        
class ManoaNowNewsRetriever(BaseRetriever):
    def _get_relevant_documents(self, query: str, *, run_manager: CallbackManagerForRetrieverRun) -> List[Document]:
        manoa_now_news_loader = ManoaNowNewsLoader()
        collection = Chroma()
        collection.reset_collection()
        news_vector_store = collection.from_documents(manoa_now_news_loader.lazy_load(), embedding_model)
        return news_vector_store.similarity_search(query, k=2)

manoa_now_news_retriever = ManoaNowNewsRetriever()
manoa_now_news_retriever.invoke("what is the current news?")

[Document(page_content='Current News: [ Lewa i ke alahaka o\nNu‘alolo\n](/kaleo/lewa-i-ke-alahaka-o-nu-\nalolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html)\n\n####  [ Lewa i ke alahaka o Nu‘alolo](/kaleo/lewa-i-ke-alahaka-o-nu-\nalolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html)\n\n  * ʻĀnela Akiu, NHSS Research Center & Program Assistant \n  * Updated  Apr 12, 2024\n  * [ Comments ](/kaleo/lewa-i-ke-alahaka-o-nu-alolo/article_680f1f56-d1ba-11ee-8f26-87866682a2b7.html#comments)\n\nHawai‘inuiākea, established in 2007, is the only college of Indigenous\nknowledge in a Research I institution. The five different branches of the\nschool include: Kamakakūokalani, or Center of Hawaiian Studies; Kawaihuelani,\nor Center for Hawaiian Language; Ka Papa Lo‘i O Kānewai Cultural Gard…\n\n'),
 Document(page_content='Current News: [ ![Ka waiwai o ka Mahina ʻŌlelo\

In [12]:
from langchain.chains import create_history_aware_retriever
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.chat_history import BaseChatMessageHistory
from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.runnables.history import RunnableWithMessageHistory
from langchain.retrievers import EnsembleRetriever
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

llm = ChatOllama(model="llama3", temperature=0)
retriever = vector_store.as_retriever(search_kwargs={"k": 1})
combined_retriever = EnsembleRetriever(retrievers=[retriever, ])

compressor = LLMChainExtractor.from_llm(llm)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=combined_retriever
)

### Contextualize question ###
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history, "
    "formulate a standalone question which can be understood "
    "without the chat history. Do NOT answer the question, "
    "just reformulate it if needed and otherwise return it as is."
)

contextualize_q_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", contextualize_q_system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

history_aware_retriever = create_history_aware_retriever(
    llm, retriever, contextualize_q_prompt
)

### Answer question ###
system_prompt = (
    "Answer the question given ONLY the provided context.\n"
    "If the answer DOES NOT appear in the context, say 'I don't know'.\n"
    "Use three sentences maximum and keep the answer concise.\n"
    "DO NOT mention the context, users do not see it."
    "\n\n"
    "{context}"
)

qa_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        MessagesPlaceholder("chat_history"),
        ("human", "{input}"),
    ]
)

question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

store = {}

def get_session_history(session_id: str) -> BaseChatMessageHistory:
    if session_id not in store:
        store[session_id] = ChatMessageHistory()
    return store[session_id]

conversational_rag_chain = RunnableWithMessageHistory(
    rag_chain,
    get_session_history,
    input_messages_key="input",
    history_messages_key="chat_history",
    output_messages_key="answer",
)

In [13]:
store = {}

In [14]:
while True:
    user_input = input()
    print(user_input)
    
    if not user_input:
        break

    answer = conversational_rag_chain.invoke(
        {"input": user_input},
        config={
            "configurable": {"session_id": "1"}
        },
    )["answer"]

    print(answer)
    print()

what is MFA?
MFA can refer to several things, but the most common meanings are:

1. **Master of Fine Arts**: A graduate degree that focuses on creative writing, visual arts, or performing arts. An MFA program typically takes 2-3 years to complete and provides students with advanced training in their chosen field.
2. **Multi-Factor Authentication**: A security process that requires users to provide multiple forms of identification before accessing a system, network, or application. This adds an extra layer of protection against unauthorized access and helps prevent identity theft.
3. **MFA (Mobile First Architecture)**: An architectural approach that prioritizes mobile devices as the primary platform for delivering applications and services. MFA emphasizes simplicity, ease of use, and adaptability to different screen sizes and input methods.

In general, MFA is about providing an additional layer of security, flexibility, or creativity to achieve a specific goal.


