In [2]:
from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv())
import requests
import xmltodict
from bs4 import BeautifulSoup
from haystack.pipelines import Pipeline
from haystack.nodes import PreProcessor
from haystack.document_stores import WeaviateDocumentStore, ElasticsearchDocumentStore
from haystack.utils import print_answers
from haystack.nodes import EmbeddingRetriever, DensePassageRetriever, MultihopEmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker
from haystack.nodes.prompt import PromptTemplate
from haystack.nodes import OpenAIAnswerGenerator
from haystack.nodes import TransformersQueryClassifier, JoinAnswers, AnswerParser
from haystack.nodes import FARMReader
from haystack.nodes import PromptTemplate, PromptNode, PromptModel
from qdrant_haystack.document_stores import QdrantDocumentStore
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")
from pprint import pprint

2023-06-04 22:33:32.800556: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-04 22:33:33.463928: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-06-04 22:33:33.463975: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory


In [3]:
import requests
import xmltodict
from bs4 import BeautifulSoup

# Browser-like User-Agent used for every request unless the caller passes its own headers.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

def extract_text_from_url(url, headers = None, timeout = 30):
    """Download a page and return its text content with blank lines removed.

    Args:
        url: Address of the page to fetch.
        headers: HTTP headers for the request; falls back to HEADERS when None.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would stall the whole crawl.

    Returns:
        The text extracted by BeautifulSoup, with every line stripped and
        empty lines dropped, joined by newlines.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection errors or timeouts.
    """
    if headers is None:
        headers = HEADERS
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail on 4xx/5xx instead of returning the text of an error page,
    # which would otherwise be stored as if it were real content.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")
    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped_lines if line)

def _as_list(node):
    """Normalize an xmltodict child node to a list (None -> [], single item -> [item])."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]

def get_urls(sitemap_url, headers = None, timeout = 30):
    """Collect every page URL reachable from a sitemap or sitemap index.

    A sitemap index is followed recursively; a plain sitemap contributes the
    `loc` of each of its `url` entries.

    Args:
        sitemap_url: Address of the sitemap (or sitemap index) XML document.
        headers: HTTP headers for the requests; falls back to HEADERS when
            None. The same headers are used for nested sitemaps.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        A list of URL strings. A sitemap that cannot be parsed is reported on
        stdout and contributes whatever was collected before the failure.

    Raises:
        requests.RequestException: If fetching `sitemap_url` itself fails.
            A failure while fetching a nested sitemap is reported against the
            parent sitemap instead and stops processing of its remaining
            children.
    """
    if headers is None:
        headers = HEADERS
    urls = []
    sitemap = requests.get(sitemap_url, headers=headers, timeout=timeout).text
    try:
        parsed = xmltodict.parse(sitemap)
        if 'sitemapindex' in parsed:
            # xmltodict yields a dict (not a list) when there is exactly one
            # child element, and None for an empty element; normalize both.
            children = _as_list((parsed['sitemapindex'] or {}).get('sitemap'))
            for entry in children:
                urls += get_urls(entry['loc'], headers=headers, timeout=timeout)
        else:
            for entry in _as_list((parsed['urlset'] or {}).get('url')):
                urls.append(entry['loc'])
    except Exception as exc:
        # Best-effort crawl: report the reason and keep what was gathered.
        print(f"Error parsing sitemap {sitemap_url}: {exc!r}")
    return urls

def get_pages(urls):
    """Fetch the text of each URL and pair it with its address.

    Args:
        urls: Iterable of page addresses.

    Returns:
        A list of ``{'text': ..., 'source': ...}`` dicts, one per URL that
        could be fetched. URLs whose download raises are skipped after the
        exception is printed.
    """
    collected = []
    for page_url in urls:
        try:
            page_text = extract_text_from_url(page_url)
        except Exception as err:
            print(err)
        else:
            collected.append({'text': page_text, 'source': page_url})
    return collected

In [4]:
# Resolve the site's sitemap into page URLs, then download the text of each page.
urls = get_urls(sitemap_url="https://sbnri.com/sitemap.xml")
pages = get_pages(urls)

Error parsing sitemap https://sbnri.com/sbnri-blog-definitions-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-entertainment-and-lifestyle-2-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-nri-bank-accounts-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-nri-income-tax-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-nri-investment-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-passport-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-real-estate-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-blog-visa-sitemap.xml
Error parsing sitemap https://sbnri.com/sbnri-send-money-sitemap.xml


In [5]:
from haystack.schema import Document

# Wrap each scraped page in a Document, keeping its origin address under meta["url"].
docs = [
    Document(content=page['text'], content_type="text", meta={"url": page['source']})
    for page in pages
]

In [6]:
# Cleaning and chunking rules applied to the scraped pages before indexing.
preprocessor = PreProcessor(
    # --- splitting ---
    split_by="word",
    split_length=200,
    split_overlap=10,
    split_respect_sentence_boundary=True,
    # --- cleaning ---
    clean_whitespace=True,
    clean_empty_lines=True,
    clean_header_footer=True,
)

# NOTE: `docs` is replaced by its own processed version, so running this cell a
# second time would feed the already-split chunks back into the preprocessor.
docs = preprocessor.process(docs)

Preprocessing:   0%|          | 0/1680 [00:00<?, ?docs/s]

We found one or more sentences whose word count is higher than the split length.


In [20]:
# One Elasticsearch index per retrieval strategy. recreate_index=True is passed
# for each of them, so this cell is the "build from scratch" variant.
document_store_1 = ElasticsearchDocumentStore(index="bm25_sbnri", recreate_index=True)
document_store_2 = ElasticsearchDocumentStore(index="openai_sbnri", recreate_index=True, embedding_dim=1536)
document_store_3 = ElasticsearchDocumentStore(index="multihop_sbnri", recreate_index=True, embedding_dim=384)
document_store_4 = ElasticsearchDocumentStore(index="dpr_sbnri", recreate_index=True, embedding_dim=768)

# Keyword retriever over the first index.
bm25_retriever = BM25Retriever(document_store=document_store_1)

# OpenAI embeddings; the API key is read from the environment.
openai_retriever = EmbeddingRetriever(
    embedding_model="text-embedding-ada-002",
    document_store=document_store_2,
    api_key=os.getenv("OPENAI_API_KEY"),
    batch_size=8,
    max_seq_len=1536,
)

# Multi-hop retriever backed by a sentence-transformers model.
multihop_retriever = MultihopEmbeddingRetriever(
    "sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store_3,
    model_format='sentence_transformers',
)

# Dense Passage Retrieval with separate question and passage encoders.
dpr_retriever = DensePassageRetriever(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    document_store=document_store_4,
)

  return self.fget.__get__(instance, owner)()


In [21]:
# The BM25 index only needs the raw documents.
document_store_1.write_documents(docs)

# Each embedding-based index gets the documents first and is then filled with
# vectors computed by the retriever that will query it.
for embedding_store, embedder in (
    (document_store_2, openai_retriever),
    (document_store_3, multihop_retriever),
    (document_store_4, dpr_retriever),
):
    embedding_store.write_documents(docs)
    embedding_store.update_embeddings(embedder)

Updating embeddings:   0%|          | 0/2140 [00:00<?, ? Docs/s]

Calculating embeddings:   0%|          | 0/268 [00:00<?, ?it/s]

Updating embeddings:   0%|          | 0/2140 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/67 [00:00<?, ?it/s]

Updating embeddings:   0%|          | 0/2140 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/2144 [00:00<?, ? Docs/s]

In [9]:
# Attach to the four indices by name. Unlike the build cell, no recreate_index
# argument is passed here.
document_store_1 = ElasticsearchDocumentStore(
    index="bm25_sbnri",
)
document_store_2 = ElasticsearchDocumentStore(
    index="openai_sbnri",
    embedding_dim=1536,
)
document_store_3 = ElasticsearchDocumentStore(
    index="multihop_sbnri",
    embedding_dim=384,
)
document_store_4 = ElasticsearchDocumentStore(
    index="dpr_sbnri",
    embedding_dim=768,
)

In [10]:
# Keyword retriever over the first index.
bm25_retriever = BM25Retriever(document_store=document_store_1)

# OpenAI embeddings; the API key is read from the environment.
openai_retriever = EmbeddingRetriever(
    embedding_model="text-embedding-ada-002",
    document_store=document_store_2,
    api_key=os.getenv("OPENAI_API_KEY"),
    batch_size=8,
    max_seq_len=1536,
)

# Multi-hop retriever backed by a sentence-transformers model.
multihop_retriever = MultihopEmbeddingRetriever(
    "sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store_3,
    model_format='sentence_transformers',
)

# Dense Passage Retrieval with separate question and passage encoders.
dpr_retriever = DensePassageRetriever(
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    document_store=document_store_4,
)

In [11]:
# NOTE(review): query_classifier is instantiated here but is not added to the
# pipeline assembled in this cell.
query_classifier = TransformersQueryClassifier()
join_documents = JoinDocuments("merge")
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

# Retrievers in registration order; each one receives the query directly.
retriever_nodes = {
    "OpenAIRetriever": openai_retriever,
    "DPRRetriever": dpr_retriever,
    "MultihopRetriever": multihop_retriever,
    "BM25Retriever": bm25_retriever,
}

pipe = Pipeline()
for node_name, node_component in retriever_nodes.items():
    pipe.add_node(component=node_component, name=node_name, inputs=["Query"])

# All retriever outputs are joined, then the ranker runs on the joined list.
pipe.add_node(component=join_documents, name="JoinDocuments", inputs=list(retriever_nodes))
pipe.add_node(component=ranker, name="Ranker", inputs=["JoinDocuments"])

In [12]:
def get_context(query,pipe,return_docs = False,*args,**kwargs):
    """Run the pipeline for a query and concatenate the returned document texts.

    Args:
        query: The query passed to ``pipe.run``.
        pipe: Object exposing ``run(query, *args, **kwargs)`` whose result has
            a ``'documents'`` entry of objects with a ``content`` attribute.
        return_docs: When true, also return the raw pipeline result.
        *args, **kwargs: Forwarded unchanged to ``pipe.run``.

    Returns:
        The contents of all documents, each followed by a newline, as one
        string; or a ``(string, pipeline_result)`` tuple if ``return_docs``.
    """
    result = pipe.run(query, *args, **kwargs)
    context_string = "".join(doc.content + '\n' for doc in result['documents'])
    return (context_string, result) if return_docs else context_string

In [13]:
# Per-node run parameters: every retriever proposes 20 candidates and the
# ranker keeps the best 5 of the merged set.
params = {
    "Ranker": {"top_k": 5},
    **{
        retriever_name: {"top_k": 20}
        for retriever_name in ("OpenAIRetriever", "DPRRetriever", "MultihopRetriever", "BM25Retriever")
    },
}

# Markers that fence the retrieved context and the user query inside the prompt.
context_delimiter = "#" * 4
query_delimiter = "`" * 4

In [16]:
# Interactive support chat. Each turn retrieves fresh context for the question,
# rebuilds the system prompt around it and sends the running conversation to
# the chat model. Type "exit" to leave the loop.

# Number of past user/assistant messages kept besides the system prompt, so the
# request does not keep growing until it exceeds the model's context window.
MAX_HISTORY_MESSAGES = 10

# Slot 0 always holds the system prompt; it is overwritten on every turn.
messages=[{"role": "system", "content": ""}]
while True:
    user_input = input("Enter your query: ").strip()
    if user_input.lower() == "exit":
        break
    if not user_input:
        # Nothing to answer: skip retrieval and the API call.
        continue

    # A delimiter inside the text would let it escape its fenced section of the
    # prompt, so delimiters are removed from both the query and the context.
    safe_query = user_input.replace(query_delimiter, "")
    context = get_context(user_input,pipe,params = params).replace(context_delimiter, "")
    system_prompt = f'''
    You are a customer support guide representing SBNRI, a reputable online platform known for solving the banking needs of NRI's in India. 
    You are given some context which is enclosed by {context_delimiter} and a query which is enclosed by {query_delimiter}.
    Your role is to assist customers by providing accurate information, offering helpful recommendations, and guiding them towards the solutions of their issues. 
    Feel free to ask clarifying questions only if needed, to better understand the customer's needs and preferences. 
    Leverage the provided context and information in the question itself to answer the question effectively without generating false or fictional information. 
    Double check your response for accuracy. Your responses should be short, friendly and humanlike.
    Respond only to the following question using only the context and the information given in the question.
    Only use your existing knowledge for generic information and not for specific information. Do not make up any figures or facts.
    If you don't know the answer respond with "May I connect you with an expert in this topic to discuss this in detail?":

    Context: 
    {context_delimiter} {context} {context_delimiter}

    Remember, your expertise and helpfulness are key in assisting customers in making informed choices.'''

    messages[0]={"role": "system", "content": system_prompt}
    # The delimited question is only part of this request; the history is not
    # touched until the call succeeds, so a failed call leaves it consistent.
    request_messages = messages + [
        {"role": "user", "content": f"{query_delimiter}{safe_query}{query_delimiter}"}
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=request_messages,
            # NOTE(review): 0.9 is a high sampling temperature for answers that
            # must stick to the supplied context; consider lowering it.
            temperature=0.9,
        )
    except openai.error.OpenAIError as exc:
        # Keep the session alive on rate limits, timeouts and similar failures.
        print(f"Request failed, please try again: {exc}")
        continue

    answer = completion.choices[0]['message']['content']
    # The history stores the plain question (no delimiters) and the reply.
    messages.append({"role": "user", "content": user_input})
    messages.append({"role": "assistant", "content": answer})
    # Drop the oldest exchanges beyond the cap; slot 0 (system prompt) is kept.
    del messages[1:-MAX_HISTORY_MESSAGES]
    print(answer)

Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

I apologize, but I'm not sure I understand your question. The context provided is about the SWIFT payment system used by financial institutions. Could you please provide more information or context about your question?


Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

I apologize, but I'm not sure what you are referring to. Could you please provide more context or information about what you are trying to find out? This will help me better understand your question and provide an accurate response.


Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Mutual funds are a type of investment where money is pooled from multiple investors to invest in a diverse range of financial instruments such as stocks, bonds, and other securities. These funds are managed by professional fund managers who use their expertise to allocate the pooled money into different investments. After deducting the required fees and expenses, the return on investment is distributed proportionally among the investors depending on their contribution to the fund. Mutual funds are flexible investment options and investors can enter and exit a mutual fund at any time. They are popular among Indian as well as Non-Resident Indian (NRI) investors as they offer high liquidity and diversification opportunities.


Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

There are a variety of mutual funds available in the market that cater to different investment needs and risk appetite. Some good ones to consider are:

1. SBI Bluechip Fund
2. HDFC Equity Fund
3. Mirae Asset Large Cap Fund
4. Axis Focused 25 Fund
5. ICICI Prudential Equity & Debt Fund

It is important to note that past performance is not a guarantee of future results and it's always recommended to study the fund prospectus and underlying investments before investing in any mutual fund. Additionally, it's important to consider your investment goals, risk tolerance, and investment horizon when selecting a mutual fund.
