In [2]:
import os
import re

from langchain.docstore.document import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain import PromptTemplate
import time

from langchain.utilities import GoogleSearchAPIWrapper

from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.retrievers.web_research import WebResearchRetriever

from langchain.embeddings.sentence_transformer import SentenceTransformerEmbeddings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import CTransformers
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.memory import ConversationBufferMemory
import faiss
from langchain.vectorstores import FAISS 
from langchain.docstore import InMemoryDocstore

In [8]:
def to_document(web_data):
    docs = []
    for data in web_data:
        text = data["snippet"]
        metadata = {"title" : data["title"], "source" : data["link"]}
        docs.append(Document(page_content = text, metadata = metadata))
    return docs

def split_docs(documents,chunk_size=1000,chunk_overlap=200):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    sp_docs = text_splitter.split_documents(documents)
    return sp_docs

custom_prompt_template = """ Use the following pieces that data from google search of information to answer the user's question.
If you don't know the answer, please just say that you don't know the answer, don't try to make up
an answer. 

Context : {context}
Question : {question}

Only returns the helpful and reasonable answer below and nothing else.
No need to return the question and don't return duplicate answer. Please don't show unhelpful answers.
Helpful answer:
"""

def set_custom_prompt(custom_prompt_template):
    prompt = PromptTemplate(template=custom_prompt_template, 
                            input_variables=['context','question'])
    return prompt

def load_llm():
    n_gpu_layers = 32  # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm = LlamaCpp(
        model_path="llama-2-7b-chat.ggmlv3.q8_0.bin",
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        callback_manager=callback_manager,
        verbose=True,n_ctx = 4096, temperature = 0.1, max_tokens = 4096, 
    )
    return llm


def load_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = "thenlper/gte-base",
                                       model_kwargs = {'device': 'cpu'})
    return embeddings


def check_duplicate(source_list):
    res = []
    for i in source_list:
        if i not in res:
            res.append(i)
    return res

def convert_to_website_format(urls):
    convert_urls = []
    for url in urls:
        # Remove any '.html' at the end of the URL
        url = re.sub(r'\.html$', '', url)
        # Check if the URL starts with 'www.' or 'http://'
        if url.startswith("www."):
            #url = 'https://' + url
            url = "https://" + url[4:]
        if '/index' in url:
            url = url.split('/index')[0]
        match = re.match(r'^([^ ]+)', url)
        if match:
            url = match.group(1)
        convert_urls.append(url)
    return convert_urls

def regex_source(answer):
    pattern = r"'source': '(.*?)'"
    matchs = re.findall(pattern, str(answer))
    convert_urls = convert_to_website_format(matchs)
    res_urls = check_duplicate(source_list=convert_urls)
    #res_urls = filter_similar_url(res_urls)
    return res_urls

def filter_search(db_similarity, diff_val):
    filter_list = []
    top_score = db_similarity[0][1]
    for index, score in enumerate(db_similarity) :
        if score[1] - top_score <= diff_val:
              filter_list.append(score)
    return filter_list  

def qa_retrival(llm, db, qa_prompt):
    memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True, input_key="query", output_key="result")
    qa_chain = RetrievalQA.from_chain_type(
        llm = llm,
        chain_type = "stuff",
        retriever = db.as_retriever(search_kwargs = {'k':5}), 
        return_source_documents = True,
        chain_type_kwargs = {"prompt":qa_prompt},
        memory = memory) 
    return qa_chain


In [9]:
class PrintRetrievalHandler(BaseCallbackHandler):
    def __init__(self, container):
        self.container = container.expander("Context Retrieval")

    def on_retriever_start(self, query: str, **kwargs):
        self.container.write(f"**Question:** {query}")

    def on_retriever_end(self, documents, **kwargs):
        # self.container.write(documents)
        for idx, doc in enumerate(documents):
            source = doc.metadata["source"]
            self.container.write(f"**Results from {source}**")
            self.container.text(doc.page_content)


class StreamHandler(BaseCallbackHandler):
    def __init__(self, container, initial_text=""):
        self.container = container
        self.text = initial_text

    def on_llm_new_token(self, token: str, **kwargs) -> None:
        self.text += token
        self.container.info(self.text)


In [10]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyAwuzcj5uOZHwPZUMDai2KjL1vKcJAe5Cs"
os.environ["GOOGLE_CSE_ID"] = "b6aeed6a754c34354"
search = GoogleSearchAPIWrapper()

llm = load_llm()
embeddings_model = load_embeddings()  
dimension = 768
index = faiss.IndexFlatL2(dimension)  
vectorstore = FAISS(embeddings_model.embed_query, index, InMemoryDocstore({}), {})

web_retriever = WebResearchRetriever.from_llm(
    vectorstore=vectorstore,
    llm=llm, 
    search=search,
)


llama.cpp: loading model from llama-2-7b-chat.ggmlv3.q8_0.bin
llama_model_load_internal: format     = ggjt v3 (latest)
llama_model_load_internal: n_vocab    = 32000
llama_model_load_internal: n_ctx      = 4096
llama_model_load_internal: n_embd     = 4096
llama_model_load_internal: n_mult     = 256
llama_model_load_internal: n_head     = 32
llama_model_load_internal: n_head_kv  = 32
llama_model_load_internal: n_layer    = 32
llama_model_load_internal: n_rot      = 128
llama_model_load_internal: n_gqa      = 1
llama_model_load_internal: rnorm_eps  = 1.0e-06
llama_model_load_internal: n_ff       = 11008
llama_model_load_internal: freq_base  = 10000.0
llama_model_load_internal: freq_scale = 1
llama_model_load_internal: ftype      = 7 (mostly Q8_0)
llama_model_load_internal: model size = 7B
llama_model_load_internal: ggml ctx size =    0.08 MB
llama_model_load_internal: mem required  = 7354.73 MB (+ 2048.00 MB per state)
llama_new_context_with_model: kv self size  = 2048.00 MB
AVX = 1 | AVX

In [11]:
question = "Who is the prime minister of Thailand"
document = web_retriever.get_relevant_documents(question)

# sp_docs = split_docs(document)
# db = FAISS.from_documents(sp_docs, embeddings_model)
# qa_prompt = set_custom_prompt(custom_prompt_template)
# qa_chain = qa_retrival(llm, db, qa_prompt)
# start = time.time()

# # response = qa_chain({'query': question})
# # st.write(response["result"])

# # answer = st.empty()
# # answer.info('`Answer:`\n' + response["result"])

# # retrieval_streamer_cb = PrintRetrievalHandler(st.container())
# stream_handler = StreamHandler(answer, initial_text="`Answer:`\n\n")
# # response = qa_chain({"query": question},callbacks=[retrieval_streamer_cb, stream_handler])
# response = qa_chain({"query": question},callbacks=[stream_handler])
# answer.info('`Answer:`\n\n' + response['result'])


# urls = regex_source(response)
# for count, url in enumerate(urls):
#     print(str(count+1)+":", url)
# end = time.time()
# print("Respone time:",int(end-start),"sec")

  Sure, here are three Google search queries that are similar to the given question:
1. Who is the current prime minister of Japan?
2. Which country has the longest-serving prime minister in history?
3. Who was the first female prime minister of Australia?


llama_print_timings:        load time = 33779.98 ms
llama_print_timings:      sample time =    37.83 ms /    59 runs   (    0.64 ms per token,  1559.57 tokens per second)
llama_print_timings: prompt eval time = 33779.92 ms /    92 tokens (  367.17 ms per token,     2.72 tokens per second)
llama_print_timings:        eval time = 22321.21 ms /    58 runs   (  384.85 ms per token,     2.60 tokens per second)
llama_print_timings:       total time = 56384.51 ms


RuntimeError: asyncio.run() cannot be called from a running event loop

In [14]:
from langchain.callbacks.base import BaseCallbackHandler
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.retrievers.web_research import WebResearchRetriever
import os
import faiss
from langchain.vectorstores import FAISS 
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.docstore import InMemoryDocstore  
os.environ["GOOGLE_API_KEY"] = "AIzaSyAwuzcj5uOZHwPZUMDai2KjL1vKcJAe5Cs"
os.environ["GOOGLE_CSE_ID"] = "b6aeed6a754c34354"


def load_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name = "thenlper/gte-base",
                                       model_kwargs = {'device': 'cpu'})
    return embeddings

def load_llm():
    n_gpu_layers = 32  # Change this value based on your model and your GPU VRAM pool.
    n_batch = 512  # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])
    llm = LlamaCpp(
        model_path="llama-2-7b-chat.ggmlv3.q8_0.bin",
        n_gpu_layers=n_gpu_layers,
        n_batch=n_batch,
        callback_manager=callback_manager,
        verbose=True,n_ctx = 4096, temperature = 0.1, max_tokens = 4096, 
    )
    return llm

embeddings_model = load_embeddings() 
vectorstore_public = FAISS()