In [32]:
from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv())
import requests
import xmltodict
from bs4 import BeautifulSoup
from haystack.pipelines import Pipeline
from haystack.nodes import PreProcessor
from haystack.document_stores import ElasticsearchDocumentStore
from haystack.utils import print_answers
from haystack.nodes import EmbeddingRetriever, DensePassageRetriever, MultihopEmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker
from haystack.nodes import TransformersQueryClassifier
from qdrant_haystack.document_stores import QdrantDocumentStore
from haystack.schema import Document
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")
import time
from pprint import pprint
import requests
import xmltodict
import gradio as gr
from bs4 import BeautifulSoup
from IPython.display import clear_output

In [33]:
class Chatbot():
    """Retrieval-augmented chat assistant built on Haystack retrievers and the OpenAI chat API.

    Usage order (as exercised by the cells below the class):

    1. ``make_document_stores_from_sitemap`` to crawl and index a site, or
       ``get_document_stores`` to attach to indices that already exist;
    2. ``make_document_pipeline`` to wire the retrievers, joiner and ranker;
    3. ``get_response`` for a single turn, or ``launch`` for the Gradio UI.
    """

    def __init__(self):
        """Create the preprocessing/ranking nodes and the default chat settings.

        Document stores, retrievers and the retrieval pipeline are not created
        here; see ``get_document_stores`` / ``make_document_stores`` and
        ``make_document_pipeline``.
        """
        self.preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by="word",
            split_length=200,
            split_respect_sentence_boundary=True,
            split_overlap=10,
        )

        # NOTE(review): the classifier is instantiated but no method of this class uses it.
        self.query_classifier = TransformersQueryClassifier()

        self.join_documents = JoinDocuments("concatenate")

        self.ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

        # Per-node parameters handed to the pipeline; keys match the node names
        # registered in make_document_pipeline.
        self.doc_params = {"Ranker": {"top_k":4}, "OpenAIRetriever": {"top_k":20}, "DPRRetriever": {"top_k":20}, "MultihopRetriever": {"top_k":20}, "BM25Retriever": {"top_k":20}}
        self.context_delimiter = "####"
        self.query_delimiter = "````"
        self.HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

        # The role is a noun phrase: the system prompt renders it as
        # "You are a {role}.", so it must not itself start with "You are a".
        self.role = "helpful and friendly assistant"

        self.full_chat_history = []      # every turn since the last clear_chat()
        self.current_chat_history = []   # sliding window sent back to the model
        self.default_message = {"role": "system", "content": f"You are a {self.role}"}
        self.messages = [self.default_message]
        self.len_chat_history = 4        # number of user/assistant *pairs* kept in the window

        self.model = "gpt-3.5-turbo"
        self.temperature = 0

    def set_doc_params(self, doc_params):
        """Replace the per-node parameters passed to the retrieval pipeline."""
        self.doc_params = doc_params

    def set_role(self, role):
        """Set the persona inserted into the system prompt as ``You are a {role}.``."""
        self.role = role

    def set_model(self, model):
        """Set the default OpenAI chat model used by ``get_response``."""
        self.model = model

    def set_temperature(self, temperature):
        """Set the default sampling temperature used by ``get_response``."""
        self.temperature = temperature

    def extract_text_from_url(self, url, headers=None):
        """Download ``url`` and return its text content with blank lines removed.

        Args:
            url: Page to fetch.
            headers: HTTP headers for the request; defaults to ``self.HEADERS``.

        Returns:
            The page text, one stripped non-empty line per line.
        """
        if headers is None:
            headers = self.HEADERS
        html = requests.get(url, headers=headers).text
        soup = BeautifulSoup(html, features="html.parser")
        stripped_lines = (line.strip() for line in soup.get_text().splitlines())
        return '\n'.join(line for line in stripped_lines if line)

    @staticmethod
    def _as_list(value):
        """Return ``value`` as a list (``None`` -> ``[]``, scalar -> ``[scalar]``).

        xmltodict yields a single dict, not a one-element list, when an element
        occurs exactly once, so sitemap children must be normalised before iteration.
        """
        if value is None:
            return []
        return value if isinstance(value, list) else [value]

    def get_urls(self, sitemap_url, headers=None):
        """Collect page URLs from a sitemap, following nested sitemap indexes.

        Args:
            sitemap_url: URL of a ``<urlset>`` sitemap or a ``<sitemapindex>``.
            headers: HTTP headers for the request(s); defaults to ``self.HEADERS``.
                The same headers are used for nested sitemaps.

        Returns:
            List of ``loc`` values found. Parsing problems are reported on stdout
            and whatever was collected up to that point is returned (best effort).
        """
        if headers is None:
            headers = self.HEADERS
        urls = []
        sitemap_xml = requests.get(sitemap_url, headers=headers).text
        try:
            parsed = xmltodict.parse(sitemap_xml)
            if 'sitemapindex' in parsed:
                for entry in self._as_list(parsed['sitemapindex']['sitemap']):
                    urls += self.get_urls(entry['loc'], headers=headers)
            else:
                for entry in self._as_list(parsed['urlset']['url']):
                    urls.append(entry['loc'])
        except Exception as exc:
            # Deliberately best-effort, but let KeyboardInterrupt/SystemExit through
            # and say why the sitemap was skipped.
            print(f"Error parsing sitemap {sitemap_url}: {exc!r}")
        return urls

    def get_pages(self, urls):
        """Fetch every URL and return ``[{'text': ..., 'source': url}, ...]``.

        URLs whose download or parsing raises are reported on stdout and skipped.
        """
        pages = []
        for url in urls:
            try:
                pages.append({'text': self.extract_text_from_url(url), 'source': url})
            except Exception as e:
                print(e)
        return pages

    def get_documents(self, pages):
        """Wrap pages (as returned by ``get_pages``) in Haystack ``Document`` objects
        and run them through ``self.preprocessor``.

        Each document keeps its origin under ``meta["url"]``, which ``get_context``
        later reports as the source.
        """
        docs = [
            Document(content=page['text'], meta={"url": page['source']}, content_type="text")
            for page in pages
        ]
        return self.preprocessor.process(docs)

    def _connect_document_stores(self, index, recreate_index=False):
        """Create the four Elasticsearch stores for ``index`` and their retrievers.

        ``recreate_index`` is only forwarded when true, so attaching to existing
        indices keeps relying on the store's own default.
        """
        extra = {"recreate_index": True} if recreate_index else {}

        self.bm25_document_store = ElasticsearchDocumentStore(
            index=f"bm25_{index}",
            **extra,
        )
        # One store per dense retriever; embedding_dim differs per store
        # (presumably matching each retriever's model output size - verify
        # if a model is swapped).
        self.openai_document_store = ElasticsearchDocumentStore(
            index=f"openai_{index}",
            embedding_dim=1536,
            **extra,
        )
        self.multihop_document_store = ElasticsearchDocumentStore(
            index=f"multihop_{index}",
            embedding_dim=384,
            **extra,
        )
        self.dpr_document_store = ElasticsearchDocumentStore(
            index=f"dpr_{index}",
            embedding_dim=768,
            similarity="dot_product",
            **extra,
        )

        self.get_retrievers()

    def make_document_stores(self, index, docs, path="src/data/qdrant"):
        """(Re)create the indices for ``index``, write ``docs`` and compute embeddings.

        Args:
            index: Suffix for the four index names (``bm25_``, ``openai_``,
                ``multihop_``, ``dpr_``).
            docs: Preprocessed documents, e.g. from ``get_documents``.
            path: Currently unused; kept so existing callers do not break.
        """
        self._connect_document_stores(index, recreate_index=True)

        self.bm25_document_store.write_documents(docs)

        self.openai_document_store.write_documents(docs)
        self.openai_document_store.update_embeddings(self.openai_retriever)

        self.multihop_document_store.write_documents(docs)
        self.multihop_document_store.update_embeddings(self.multihop_retriever)

        self.dpr_document_store.write_documents(docs)
        self.dpr_document_store.update_embeddings(self.dpr_retriever)

    def make_document_stores_from_sitemap(self, index, sitemap_url):
        """Crawl ``sitemap_url``, preprocess every page and index it under ``index``."""
        urls = self.get_urls(sitemap_url)
        pages = self.get_pages(urls)
        docs = self.get_documents(pages)
        self.make_document_stores(index, docs)

    def get_document_stores(self, index, path="src/data/qdrant"):
        """Attach to the indices for ``index`` without writing documents, and build the retrievers.

        Args:
            index: Suffix for the four index names.
            path: Currently unused; kept so existing callers do not break.
        """
        self._connect_document_stores(index)

    def get_retrievers(self):
        """Create one retriever per document store (BM25, OpenAI embeddings, multihop, DPR).

        Requires the ``*_document_store`` attributes to exist already.
        """
        self.bm25_retriever = BM25Retriever(document_store=self.bm25_document_store)

        self.openai_retriever = EmbeddingRetriever(
            document_store=self.openai_document_store,
            batch_size=8,
            embedding_model="text-embedding-ada-002",
            api_key=os.getenv("OPENAI_API_KEY"),
            max_seq_len=1536,
            progress_bar=False,
        )

        self.multihop_retriever = MultihopEmbeddingRetriever(
            "sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.multihop_document_store,
            model_format='sentence_transformers',
            progress_bar=False,
        )

        self.dpr_retriever = DensePassageRetriever(
            document_store=self.dpr_document_store,
            query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
            passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
            progress_bar=False,
        )

    def make_document_pipeline(self):
        """Build ``self.document_pipeline``: four retrievers in parallel -> join -> ranker.

        The node names used here are the keys expected in ``self.doc_params``.
        """
        pipe = Pipeline()
        pipe.add_node(component=self.openai_retriever, name="OpenAIRetriever", inputs=["Query"])
        pipe.add_node(component=self.dpr_retriever, name="DPRRetriever", inputs=["Query"])
        pipe.add_node(component=self.multihop_retriever, name="MultihopRetriever", inputs=["Query"])
        pipe.add_node(component=self.bm25_retriever, name="BM25Retriever", inputs=["Query"])
        pipe.add_node(component=self.join_documents, name="JoinDocuments", inputs=["OpenAIRetriever","DPRRetriever","MultihopRetriever","BM25Retriever"])
        pipe.add_node(component=self.ranker, name="Ranker", inputs=["JoinDocuments"])

        self.document_pipeline = pipe

    def get_context(self, query, pipe=None, return_sources=False, *args, **kwargs):
        """Run the retrieval pipeline and concatenate the returned documents.

        Args:
            query: Text to search for.
            pipe: Pipeline to run; defaults to ``self.document_pipeline``.
            return_sources: When true, also return the documents' ``meta["url"]`` values.
            *args, **kwargs: Forwarded to ``pipe.run``. When no extra positional
                arguments are given and ``params`` is not among the keyword
                arguments, ``params=self.doc_params`` is supplied.

        Returns:
            The context string (each document's content followed by a newline),
            or ``(context, sources)`` when ``return_sources`` is true.
        """
        if pipe is None:
            pipe = self.document_pipeline
        # **kwargs is always a dict (never None), so the default has to be applied
        # by key. Skipped when positional extras are present because one of them
        # could already be ``params``.
        if not args and "params" not in kwargs:
            kwargs["params"] = self.doc_params
        result = pipe.run(query, *args, **kwargs)

        sources = []
        contents = []
        for document in result['documents']:
            contents.append(document.content + '\n')
            url = document.meta.get('url')
            if url is not None:  # documents indexed without a URL have no source to report
                sources.append(url)
        context_string = "".join(contents)

        if return_sources:
            return (context_string, sources)
        return context_string

    def get_conversation(self):
        """Render the sliding-window history as alternating ``Human:`` / ``AI:`` lines.

        The history is stored as user/assistant pairs; an unpaired trailing
        message (which normal use does not produce) is ignored.
        """
        history = self.current_chat_history
        lines = []
        for human, ai in zip(history[::2], history[1::2]):
            lines.append("Human: " + human['content'] + '\n')
            lines.append("AI: " + ai['content'] + '\n')
        return "".join(lines)

    def _check_query_relation(self, query, conversation, model, temperature):
        """Ask the model whether ``query`` depends on ``conversation``.

        Returns the model's one-token answer, stripped and lower-cased
        (``"y"`` means the query refers to the history).
        """
        check_relation_prompt = f"""
            You have been given a conversation history between a human and an AI which is enclosed by {self.context_delimiter} and the most recent query
            of the human which is enclosed by {self.query_delimiter}. Your job is to determine if the most recent query of the human refers to some information
            or context present in the conversation history or if it is a standalone query which is not related to the conversation history.
            Respond with a Y or N character, with no punctuation:
            Y - If the query refers to something in the conversation history or is related to the conversation history
            N - otherwise

            Output a single letter only.

            Context:
            {self.context_delimiter} {conversation} {self.context_delimiter}

            Query:
            {self.query_delimiter} {query} {self.query_delimiter}
            """
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": check_relation_prompt}],
            temperature=temperature,
            max_tokens=1,
        )
        return completion.choices[0]['message']['content'].strip().lower()

    def _condense_query(self, query, conversation, model, temperature):
        """Ask the model to rewrite ``query`` as a standalone question using ``conversation``."""
        condense_question_prompt = f'''
                You are given a part of a conversation history between a human and an AI which is enclosed by {self.context_delimiter} and the most recent query 
                of the human which is enclosed by {self.query_delimiter}.
                If the query is not related to the conversation history then do not modify the query and return the query as is.
                Otherwise your role is to form a standalone question using the conversation history and the query of the human which can then be answered by an AI 
                without needing to know the conversation history.

                Context: 
                {self.context_delimiter} {conversation} {self.context_delimiter}
                
                Query: 
                {self.query_delimiter} {query} {self.query_delimiter}

                Standalone Question:
                '''
        completion = openai.ChatCompletion.create(
            model=model,
            messages=[{"role": "user", "content": condense_question_prompt}],
            temperature=temperature,
        )
        return completion.choices[0]['message']['content']

    def _build_system_prompt(self, context):
        """Return the system prompt embedding the current role and retrieved ``context``."""
        return f'''
            You are a {self.role}.
            You are given some context which is enclosed by {self.context_delimiter} and a query which is enclosed by {self.query_delimiter}.
            Your role is to assist customers by providing accurate information, offering helpful recommendations, and guiding them towards the solutions of their issues. 
            Feel free to ask clarifying questions only if needed, to better understand the customer's needs and preferences. 
            Leverage the provided context and information in the question itself to answer the question effectively without generating false or fictional information. 
            Double check your response for accuracy. Your responses should be short, friendly and humanlike.
            Respond only to the following question using only the context and the information given in the question.
            Only use your existing knowledge for generic information and not for specific information. Do not make up any figures or facts.
            If you don't know the answer respond with "May I connect you with an expert in this topic to discuss this in detail?":

            Context: 
            {self.context_delimiter} {context} {self.context_delimiter}

            Remember, your expertise and helpfulness are key in assisting customers in making informed choices.'''

    def get_response(self, query, model=None, temperature=None, return_sources=False, debug=False):
        """Answer ``query`` using retrieved context and the recent conversation.

        Steps: decide whether the query depends on the chat history, condense it
        into a standalone question if so, retrieve context for that question,
        ask the chat model, then record the turn.

        Args:
            query: The user's message.
            model: OpenAI chat model; defaults to ``self.model``.
            temperature: Sampling temperature; defaults to ``self.temperature``.
            return_sources: When true, return ``(response, sources)``.
            debug: When true, pretty-print the intermediate values and the raw completion.

        Returns:
            The assistant's reply, or ``(reply, source_urls)`` when ``return_sources`` is true.
        """
        if model is None:
            model = self.model
        if temperature is None:
            temperature = self.temperature
        conversation = self.get_conversation()

        # With no history there is nothing the query could refer to, so the
        # classification request is skipped on the first turn.
        if conversation:
            related = self._check_query_relation(query, conversation, model, temperature)
        else:
            related = "n"
        if debug:
            pprint(related)

        if related == "y":
            modified_query = self._condense_query(query, conversation, model, temperature)
        else:
            modified_query = query
        if debug:
            pprint(modified_query)

        context, sources = self.get_context(modified_query, self.document_pipeline, return_sources=True, params=self.doc_params)
        if debug:
            pprint(context)
            pprint(sources)

        system_message = {"role": "system", "content": self._build_system_prompt(context)}
        user_message = {"role": "user", "content": f"{self.query_delimiter}{modified_query}{self.query_delimiter}"}
        # Build the request from local values: if the API call raises, no
        # half-finished turn is left behind in the instance state.
        request_messages = [system_message] + self.current_chat_history + [user_message]
        completion = openai.ChatCompletion.create(
            model=model,
            messages=request_messages,
            temperature=temperature,
        )
        if debug:
            pprint(completion)
        response = completion.choices[0]['message']['content']

        self.full_chat_history.append({"role": "user", "content": f"{query}"})
        self.full_chat_history.append({"role": "assistant", "content": response})

        # The window stores the user's original wording, not the condensed query.
        self.current_chat_history.append({"role": "user", "content": f"{query}"})
        self.current_chat_history.append({"role": "assistant", "content": response})
        max_messages = 2 * self.len_chat_history
        if max_messages <= 0:
            self.current_chat_history = []
        elif len(self.current_chat_history) > max_messages:
            self.current_chat_history = self.current_chat_history[-max_messages:]

        self.messages = [system_message] + self.current_chat_history
        if return_sources:
            return (response, sources)
        return response

    def clear_chat(self):
        """Forget the conversation: empty both histories and reset ``self.messages``."""
        self.current_chat_history.clear()
        self.messages = [self.default_message]
        self.full_chat_history = []

    def launch(self, share=False, show_sources=False, debug=False):
        """Start a Gradio chat UI backed by ``get_response``.

        Args:
            share: Forwarded to ``demo.launch(share=...)``.
            show_sources: Append the source URLs (one per line) to each reply.
            debug: Forwarded to ``get_response``.
        """
        with gr.Blocks() as demo:
            chatbot = gr.Chatbot()
            msg = gr.Textbox()
            clear = gr.Button("Clear")

            def user(user_message, history):
                # Clear the textbox and add the message with a pending (None) reply.
                return "", history + [[user_message, None]]

            def bot(history):
                response, sources = self.get_response(history[-1][0], debug=debug, return_sources=True)
                if show_sources:
                    bot_message = response + "\n" + "".join(f"{url}\n" for url in sources)
                else:
                    bot_message = response
                # Yield the reply one character at a time for a typing effect.
                history[-1][1] = ""
                for character in bot_message:
                    history[-1][1] += character
                    time.sleep(0.01)
                    yield history

            msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
                bot, chatbot, chatbot
            )
            # clear_chat returns None, which is what gets written to the chat widget.
            clear.click(self.clear_chat, None, chatbot, queue=False)
        demo.queue()
        demo.launch(share=share)

In [34]:
# Create the assistant and set its persona and sampling temperature first;
# neither setting is read by the store/pipeline wiring below.
chatbot = Chatbot()
role = "customer support guide representing SBNRI, a reputable online platform known for solving the banking needs of NRI's in India"
chatbot.set_role(role)
chatbot.set_temperature(0.3)

# Attach to the existing "sbnri" indices. To crawl the site and rebuild the
# indices from scratch instead, run the commented call in place of get_document_stores:
# chatbot.make_document_stores_from_sitemap("sbnri","https://sbnri.com/sitemap.xml")
chatbot.get_document_stores("sbnri")
chatbot.make_document_pipeline()

  return self.fget.__get__(instance, owner)()


In [35]:
# Start the Gradio chat UI. show_sources appends the source URLs to each reply;
# debug pretty-prints the intermediate query, retrieved context and raw completion.
chatbot.launch(
    show_sources=True,
    debug=True,
)

  s = socket.socket()  # create a socket object
  s = socket.socket()  # create a socket object
  s = socket.socket()  # create a socket object


Running on local URL:  http://127.0.0.1:7867

To create a public link, set `share=True` in `launch()`.


In [36]:
# from haystack.nodes.retriever import EmbeddingRetriever
# from haystack.document_stores import InMemoryDocumentStore
# from haystack.nodes.question_generator.question_generator import QuestionGenerator
# from haystack.nodes.label_generator.pseudo_label_generator import PseudoLabelGenerator

# document_store = InMemoryDocumentStore()
# document_store.write_documents(chatbot.bm25_document_store.get_all_documents())

# retriever = EmbeddingRetriever(document_store=document_store, 
#                                embedding_model="sentence-transformers/msmarco-distilbert-base-tas-b", 
#                                model_format="sentence_transformers",
#                                max_seq_len=200)
# document_store.update_embeddings(retriever)

In [37]:
# qg = QuestionGenerator(model_name_or_path="doc2query/msmarco-t5-base-v1", max_length=64, split_length=200, batch_size=12)
# psg = PseudoLabelGenerator(qg, retriever)
# output, _ = psg.run(documents=document_store.get_all_documents()) 
# retriever.train(output["gpl_labels"])