In [73]:
from dotenv import load_dotenv, find_dotenv
import os
load_dotenv(find_dotenv())
import requests
import xmltodict
from bs4 import BeautifulSoup
from haystack.pipelines import Pipeline
from haystack.nodes import PreProcessor
from haystack.document_stores import WeaviateDocumentStore, ElasticsearchDocumentStore
from haystack.utils import print_answers
from haystack.nodes import EmbeddingRetriever, DensePassageRetriever, MultihopEmbeddingRetriever, BM25Retriever, JoinDocuments, SentenceTransformersRanker
from haystack.nodes.prompt import PromptTemplate
from haystack.nodes import OpenAIAnswerGenerator
from haystack.nodes import TransformersQueryClassifier, JoinAnswers, AnswerParser
from haystack.nodes import FARMReader
from haystack.nodes import PromptTemplate, PromptNode, PromptModel
from qdrant_haystack.document_stores import QdrantDocumentStore
import openai
openai.api_key = os.getenv("OPENAI_API_KEY")
from pprint import pprint

In [74]:
import requests
import xmltodict
from bs4 import BeautifulSoup

# Browser-like User-Agent; the fetch helpers in this cell fall back to it when no headers are passed.
HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/113.0.0.0 Safari/537.36'}

def extract_text_from_url(url, headers = None, timeout = 30):
    """Download a web page and return its text with blank lines removed.

    Args:
        url: Address of the page to fetch.
        headers: Optional request headers; the module-level ``HEADERS`` is
            used when this is ``None``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host.

    Returns:
        The page text as produced by ``BeautifulSoup.get_text()``, with every
        line stripped and empty lines dropped, joined by ``"\\n"``.

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP 4xx/5xx response.
    """
    if headers is None:
        headers = HEADERS
    response = requests.get(url, headers=headers, timeout=timeout)
    # Refuse error pages: otherwise a 404/500 body would be returned (and
    # later indexed) as if it were the article text.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, features="html.parser")
    stripped_lines = (line.strip() for line in soup.get_text().splitlines())
    return '\n'.join(line for line in stripped_lines if line)

def _as_list(node):
    """Normalise an xmltodict child node (None, single dict, or list) to a list."""
    if node is None:
        return []
    if isinstance(node, list):
        return node
    return [node]

def get_urls(sitemap_url, headers = None, timeout = 30):
    """Collect every page URL listed in a sitemap, following sitemap indexes.

    Args:
        sitemap_url: Address of a ``<urlset>`` sitemap or a ``<sitemapindex>``.
        headers: Optional request headers; the module-level ``HEADERS`` is
            used when this is ``None``. The same headers are forwarded to
            nested sitemaps.
        timeout: Seconds to wait for each sitemap request.

    Returns:
        A list of the ``loc`` values found. Parsing problems are reported
        with ``print`` and whatever was collected so far is returned, so the
        call stays best-effort.

    Raises:
        requests.RequestException: If fetching ``sitemap_url`` itself fails.
            Failures while fetching nested sitemaps are reported, not raised.
    """
    if headers is None:
        headers = HEADERS
    urls = []
    xml_text = requests.get(sitemap_url, headers=headers, timeout=timeout).text
    try:
        parsed = xmltodict.parse(xml_text)
        if 'sitemapindex' in parsed:
            # xmltodict yields a dict (not a list) when there is exactly one
            # child element, so normalise before iterating.
            for entry in _as_list(parsed['sitemapindex'].get('sitemap')):
                urls += get_urls(entry['loc'], headers=headers, timeout=timeout)
        else:
            for entry in _as_list(parsed['urlset'].get('url')):
                urls.append(entry['loc'])
    except Exception as exc:
        # Stay best-effort, but surface the cause and let KeyboardInterrupt /
        # SystemExit propagate (a bare ``except:`` would swallow them too).
        print(f"Error parsing sitemap {sitemap_url}: {exc}")
    return urls

def get_pages(urls):
    """Fetch the text of each URL and return one record per successful page.

    Each record is a dict with the page text under ``'text'`` and the URL it
    came from under ``'source'``. A URL whose extraction raises is reported
    with ``print`` and skipped, so one bad page does not abort the batch.
    """
    collected = []
    for page_url in urls:
        try:
            body = extract_text_from_url(page_url)
        except Exception as err:
            print(err)
            continue
        collected.append({'text': body, 'source': page_url})
    return collected

In [75]:
# Scrape the one article that the following cells turn into documents and index.
pages = get_pages(["https://sbnri.com/blog/nri-mutual-fund"])

In [76]:
from haystack.schema import Document

# Wrap every scraped page in a haystack Document, keeping its origin URL in the metadata.
docs = [
    Document(content=page['text'], meta={"url": page['source']}, content_type="text")
    for page in pages
]

In [77]:
# Clean the scraped text and split it by words into chunks of length 200 with an
# overlap of 10, keeping sentence boundaries intact.
preprocessor = PreProcessor(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=True,
    split_by="word",
    split_length=200,
    split_respect_sentence_boundary=True,
    split_overlap=10
)

# NOTE(review): this rebinds `docs` to the processed output, so re-running this
# cell alone feeds already-processed documents back through the preprocessor.
docs = preprocessor.process(docs)

Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]

We found one or more sentences whose word count is higher than the split length.


In [78]:
# One Elasticsearch index per retrieval strategy. Every store is created with
# recreate_index=True, so running this cell starts each index from scratch.
# Keyword (BM25) index: no embedding dimension configured.
document_store_1 = ElasticsearchDocumentStore(
    index="bm25",
    recreate_index=True,
)

# Index paired with the OpenAI embedding retriever below.
document_store_2 = ElasticsearchDocumentStore(
    index="openai",
    embedding_dim=1536,
    recreate_index=True,
)
# Index paired with the multihop sentence-transformers retriever below.
document_store_3 = ElasticsearchDocumentStore(
    index="multihop",
    embedding_dim=384,
    recreate_index=True,
)
# Index paired with the DPR retriever below.
document_store_4 = ElasticsearchDocumentStore(
    index="dpr",
    embedding_dim=768,
    recreate_index=True,
)

# Sparse keyword retriever over the "bm25" index.
bm25_retriever = BM25Retriever(document_store=document_store_1)

# Dense retriever using an OpenAI embedding model; the key is read from the environment.
openai_retriever = EmbeddingRetriever(
   document_store=document_store_2,
   batch_size=8,
   embedding_model="text-embedding-ada-002",
   api_key=os.getenv("OPENAI_API_KEY"),
   max_seq_len=1536,
)

# Multihop dense retriever backed by a sentence-transformers model.
multihop_retriever = MultihopEmbeddingRetriever("sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store_3,model_format='sentence_transformers')

# Dense Passage Retrieval with separate query and passage encoders.
dpr_retriever = DensePassageRetriever(
    document_store=document_store_4,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

In [79]:
# The BM25 index only needs the raw documents.
document_store_1.write_documents(docs)

# Each dense index gets the documents first, then embeddings from its own retriever.
dense_indexes = (
    (document_store_2, openai_retriever),
    (document_store_3, multihop_retriever),
    (document_store_4, dpr_retriever),
)
for dense_store, dense_retriever in dense_indexes:
    dense_store.write_documents(docs)
    dense_store.update_embeddings(dense_retriever)

Updating embeddings:   0%|          | 0/19 [00:00<?, ? Docs/s]

Calculating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

Updating embeddings:   0%|          | 0/19 [00:00<?, ? Docs/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Updating embeddings:   0%|          | 0/19 [00:00<?, ? Docs/s]

Create embeddings:   0%|          | 0/32 [00:00<?, ? Docs/s]

In [80]:
# Rebind the store handles to the same four index names, this time without
# passing recreate_index, so this cell does not request that the indexes be rebuilt.
document_store_1 = ElasticsearchDocumentStore(index = "bm25")
document_store_2 = ElasticsearchDocumentStore(index = "openai", embedding_dim=1536)
document_store_3 = ElasticsearchDocumentStore(index = "multihop", embedding_dim=384)
document_store_4 = ElasticsearchDocumentStore(index = "dpr", embedding_dim=768)

In [81]:
# Rebuild the four retrievers against the store handles created in the previous cell.
# NOTE(review): these definitions repeat the retriever setup from the indexing
# cell earlier in the notebook; keep the two copies in sync if either changes.
bm25_retriever = BM25Retriever(document_store=document_store_1)

# Dense retriever using an OpenAI embedding model; the key is read from the environment.
openai_retriever = EmbeddingRetriever(
   document_store=document_store_2,
   batch_size=8,
   embedding_model="text-embedding-ada-002",
   api_key=os.getenv("OPENAI_API_KEY"),
   max_seq_len=1536,
)

# Multihop dense retriever backed by a sentence-transformers model.
multihop_retriever = MultihopEmbeddingRetriever("sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store_3,model_format='sentence_transformers')

# Dense Passage Retrieval with separate query and passage encoders.
dpr_retriever = DensePassageRetriever(
    document_store=document_store_4,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base"
)

In [82]:
# NOTE(review): query_classifier is instantiated here but not attached to the pipeline in this cell.
query_classifier = TransformersQueryClassifier()
join_documents = JoinDocuments("merge")
ranker = SentenceTransformersRanker(model_name_or_path="cross-encoder/ms-marco-MiniLM-L-12-v2")

# Retriever nodes in the order they are registered; each one is fed directly by the query.
retriever_nodes = [
    ("OpenAIRetriever", openai_retriever),
    ("DPRRetriever", dpr_retriever),
    ("MultihopRetriever", multihop_retriever),
    ("BM25Retriever", bm25_retriever),
]

pipe = Pipeline()
for node_name, retriever_component in retriever_nodes:
    pipe.add_node(component=retriever_component, name=node_name, inputs=["Query"])

# Merge the four result lists, then let the cross-encoder re-rank the merged set.
pipe.add_node(
    component=join_documents,
    name="JoinDocuments",
    inputs=[registered_name for registered_name, _component in retriever_nodes],
)
pipe.add_node(component=ranker, name="Ranker", inputs=["JoinDocuments"])

In [83]:
# Each retriever proposes 20 candidates; the ranker keeps the best 5 of the merged set.
results = pipe.run(
    query="Can NRI invest in Indian mutual funds?",
    params={
        "Ranker": {"top_k": 5},
        "OpenAIRetriever": {"top_k": 20},
        "DPRRetriever": {"top_k": 20},
        "MultihopRetriever": {"top_k": 20},
        "BM25Retriever": {"top_k": 20},
    },
)

Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [84]:
# Concatenate the ranked passages, one newline-terminated passage after another.
context_string = "".join(document.content + '\n' for document in results['documents'])

In [85]:
# Show the assembled context followed by its character count on the next line.
print(context_string, len(context_string), sep='\n')

The simple answer is yes NRIs, as well as PIOs, can invest in Indian mutual funds provided they adhere to the regulations of the Foreign Exchange Management Act (FEMA). NRIs can invest in mutual funds in India on repatriation as well as a non-repatriation basis. However, there are only a few asset management companies (AMCs) that accept mutual fund applications from NRIs in the USA and Canada. So, NRIs from these countries must check when investing in Indian mutual funds. NRIs are offered most of the benefits and conveniences as resident investors while investing. They can invest in equity funds, debt funds, or hybrid funds depending on their investment goals and risk tolerance. NRIs can repatriate the redemption proceeds as and when they wish to. Benefits of mutual funds for NRIsIndia is one of the fastest growing major economies in the world, attracting staggering foreign investment. NRIs can also be a part of India’s growth and for non-resident individuals who have dependents in Ind

In [86]:
# Single-turn request: retrieved context first, then the question, in one user message.
prompt = f"{context_string} given this context tell me Can NRI invest in Indian mutual funds?"
completion = openai.ChatCompletion.create(
    model="gpt-3.5-turbo",
    messages=[dict(role="user", content=prompt)],
)

In [87]:
# Display the raw response object returned by openai.ChatCompletion.create.
completion

<OpenAIObject chat.completion id=chatcmpl-7Njjtp0L5mrGGuaDEikgQjKS5z3qF at 0x7f0f45dd7ba0> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Yes, NRIs can invest in Indian mutual funds as long as they adhere to the regulations of the Foreign Exchange Management Act (FEMA). They can invest in equity funds, debt funds, or hybrid funds based on their investment goals and risk tolerance. NRIs can manage their portfolio online from the convenience of their home and can opt for systematic online transfers from one scheme to another. NRIs must have an NRE or NRO bank account in India to invest in mutual funds and must know about the mutual fund schemes they can invest in, investment procedure, regulations, and tax on capital gains.",
        "role": "assistant"
      }
    }
  ],
  "created": 1685891721,
  "id": "chatcmpl-7Njjtp0L5mrGGuaDEikgQjKS5z3qF",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage":

In [88]:
def get_context(query,pipe,return_docs = False,*args,**kwargs):
    """Run a retrieval pipeline and flatten the returned documents into one string.

    Args:
        query: Question passed as the first positional argument to ``pipe.run``.
        pipe: Object exposing ``run(query, *args, **kwargs)`` whose result has a
            ``'documents'`` entry of items with a ``content`` attribute.
        return_docs: When true, also return the raw pipeline output.
        *args, **kwargs: Forwarded unchanged to ``pipe.run``.

    Returns:
        The document contents joined with a trailing newline after each one,
        or a ``(context_string, pipeline_output)`` tuple if ``return_docs`` is true.
    """
    pipeline_output = pipe.run(query, *args, **kwargs)
    context_string = "".join(hit.content + '\n' for hit in pipeline_output['documents'])
    return (context_string, pipeline_output) if return_docs else context_string

In [92]:
# Interactive retrieval-augmented query: read a question, retrieve context, ask the model.
query = input("Enter an input: ")
# Per-node top_k settings passed through get_context to pipe.run.
params = {"Ranker": {"top_k":5}, "OpenAIRetriever": {"top_k":20}, "DPRRetriever": {"top_k":20}, "MultihopRetriever": {"top_k":20}, "BM25Retriever": {"top_k":20}}
# Markers that fence off the retrieved context and the user question inside the prompt.
# NOTE(review): neither `query` nor `context` is checked for these markers before interpolation.
context_delimiter = "####"
query_delimiter = "````"
context = get_context(query,pipe,params = params)
# The prompt text below is sent verbatim (including its line breaks) as a single user message.
message = f'''
You are given some context which is enclosed by {context_delimiter} and a query which is enclosed by {query_delimiter}.
Given the context try to answer the question as accurately as possibly without making up facts.

Context = {context_delimiter}{context}{context_delimiter}

Question = {query_delimiter}{query}{query_delimiter}
'''
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    {"role": "user", "content": message}
  ]
)

Calculating embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Querying:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [93]:
# Display the raw response object for the interactive query above.
completion

<OpenAIObject chat.completion id=chatcmpl-7Njkj9NekB4mfx3eJKJr91OrhpW4E at 0x7f0f2c3c4860> JSON: {
  "choices": [
    {
      "finish_reason": "stop",
      "index": 0,
      "message": {
        "content": "Some good mutual funds to invest in for NRIs in Singapore, UAE-based NRIs, and NRIs in Australia are mentioned in the context, which include SBI mutual fund, HDFC mutual fund, ICICI prudential mutual fund, among others. However, for NRIs from the USA and Canada, the options for mutual fund investment in India can be limited.",
        "role": "assistant"
      }
    }
  ],
  "created": 1685891773,
  "id": "chatcmpl-7Njkj9NekB4mfx3eJKJr91OrhpW4E",
  "model": "gpt-3.5-turbo-0301",
  "object": "chat.completion",
  "usage": {
    "completion_tokens": 75,
    "prompt_tokens": 1281,
    "total_tokens": 1356
  }
}

In [98]:
# Shared conversation history, seeded with the system prompt.
messages = [dict(role="system", content="You are a helpful assistant")]

def build_chat(user_input: str = "", asistant_input: str = ""):
  """Append the given turns to the shared ``messages`` history.

  A non-empty ``user_input`` is recorded as a ``"user"`` message and a
  non-empty ``asistant_input`` as an ``"assistant"`` message, in that order.
  Empty strings are ignored, so calling with no arguments is a no-op.
  """
  for role, text in (("user", user_input), ("assistant", asistant_input)):
    if text != "":
      messages.append({"role": role, "content": text})

def chat(input: str):
  """Send one user turn to the chat model and return the assistant's reply.

  The user message and the model's answer are both appended to the shared
  ``messages`` history, so consecutive calls form one conversation.

  Args:
    input: The user's message. (The name shadows the builtin ``input`` inside
      this function; it is kept for compatibility with existing callers.)

  Returns:
    The assistant's reply text.

  Raises:
    Whatever ``openai.ChatCompletion.create`` raises. In that case the history
    is restored to its state before the call, so a retry does not leave a
    dangling or duplicated user turn.
  """
  history_length = len(messages)
  build_chat(user_input=input)
  try:
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo",
      messages=messages
    )
    chat_gpt_answer = completion.choices[0]['message']['content']
  except Exception:
    # Roll back the user turn added above; mutate in place so the shared list object is kept.
    del messages[history_length:]
    raise
  build_chat(asistant_input=chat_gpt_answer)
  return chat_gpt_answer

In [99]:
# Follow-up question sent through the history-keeping chat helper.
# NOTE(review): chat() sends only `messages`; no retrieved context is added to this request.
chat("What are some good ones?")

"Thank you! I'm here to help you in any way I can. Here are some possible suggestions for how I can assist you:\n\n1. Provide information on a particular topic or answer your questions\n2. Help you with research or finding resources online\n3. Set reminders or help you manage your schedule\n4. Make suggestions or recommendations based on your preferences\n5. Assist you with tasks or projects\n6. Review and edit your work\n7. Proofread your writing\n8. Offer advice or support\n9. Help you troubleshoot a problem\n10. Provide entertainment or engage in conversation."

In [97]:
# Inspect the accumulated conversation history.
# NOTE(review): this cell's execution count (97) is lower than that of the cells
# defining and calling chat() (98, 99), so the output shown was presumably
# produced by an earlier run — re-run top to bottom to refresh it.
messages

[{'role': 'system', 'content': 'You are a helpful assistant'},
 {'role': 'user', 'content': 'Can NRI invest in Indian mutual funds?'},
 {'role': 'assistant', 'content': 'Y'},
 {'role': 'user', 'content': 'What are some good ones?'},
 {'role': 'assistant', 'content': 'T'}]