In [25]:
import os
import re
import time
from urllib.parse import urljoin

import pinecone
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer, util
# Sentence-embedding model used by addData (upload) and find_match (query).
# Its vectors are 384-dimensional, the same dimension the Pinecone index is created with.
model = SentenceTransformer('all-MiniLM-L6-v2') #384 dimensional

### Implement Scraper

In [26]:
def get_initial_list(link = "https://www.mutualofamerica.com/sitemap.xml", timeout = 30):
    """
    Collect the page URLs listed in a sitemap

    Downloads the document at `link` and returns the text of every <loc>
    element, in document order.

    Parameters
    ----------
    link : str
        URL of the sitemap to download.
    timeout : float
        Seconds to wait for the server before giving up. Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    list of str
        The <loc> values, stripped of surrounding whitespace.

    Raises
    ------
    requests.RequestException
        If the request fails, times out, or returns an HTTP error status.
    """
    response = requests.get(link, timeout=timeout)
    # Fail loudly on 4xx/5xx: an error page contains no <loc> tags and would
    # otherwise be indistinguishable from an empty sitemap.
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'lxml')
    return [loc.get_text().strip() for loc in soup.find_all("loc")]

In [27]:
def get_html_content(url, timeout = 30):
    """
    Extract raw content from a single web page

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float
        Seconds to wait for the server before giving up, so that one stalled
        page cannot hang the whole scraping loop.

    Returns
    -------
    bytes
        The undecoded response body. The HTTP status is not checked, so the
        body of an error page is returned like any other page.

    Raises
    ------
    requests.RequestException
        If the connection fails or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    return response.content

In [28]:
def get_text_chunks(html_content):
    """
    Turn HTML content into text chunks

    The visible text of the document is split into lines, and each line is
    split again wherever two consecutive spaces occur. Every piece is
    stripped of surrounding whitespace; empty pieces are NOT removed here
    (see filter_text_chunks).

    Parameters
    ----------
    html_content : str or bytes
        Raw HTML markup.

    Returns
    -------
    generator of str
        Lazily produced chunks; it can be iterated only once.
    """
    # Name the parser explicitly: leaving it out triggers a warning and lets
    # the result depend on which parsers happen to be installed. 'lxml' is the
    # parser already used for the sitemap.
    soup = BeautifulSoup(html_content, 'lxml')
    text = soup.get_text()

    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))

    return chunks

In [29]:
def filter_text_chunks(chunks, min_length = 50):
    """
    Keep only the chunks that carry enough text to be useful

    Empty chunks are dropped, and so is every chunk whose length does not
    exceed `min_length` characters. The original order is preserved and the
    result is always a list, even when `chunks` is a generator.
    """
    kept = []
    for piece in chunks:
        if piece and len(piece) > min_length:
            kept.append(piece)
    return kept

In [30]:
def get_filtered_text(url):
    """
    Run the full scraping pipeline for one page

    Downloads the page at `url`, splits its text into chunks and returns the
    list of chunks that pass filter_text_chunks with its default settings.
    """
    return filter_text_chunks(get_text_chunks(get_html_content(url)))

In [31]:
# Scrape every page listed in the sitemap.
# scraped_dict maps each page URL to the list of filtered text chunks found on that page.
scraped_dict = {}
for url in get_initial_list():
    scraped_dict[url] = get_filtered_text(url)
    
# Optional: save url and content to a .csv (disabled)
# import pandas as pd
# pd.DataFrame([scraped_dict]).transpose().to_csv('scraped_data_10032023.csv')

In [32]:
# Inspect the scraped content (page URL -> list of text chunks).
scraped_dict

{'https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/changes-over-time': ["How the Retirement Funds' Mix of Equities, Fixed Income and Money Market Funds Changes Over Time",
  "Over time, the Retirement Funds' mix of investment allocations (equities, fixed income and money market funds) will change based on a predetermined strategy. Generally, except for the Retirement Income Fund, the more time that remains until a target-date Retirement Fund approaches its target retirement date, the more emphasis that Fund will place on achieving capital appreciation by investing more heavily in equity funds.",
  'Alternatively, the less time that remains until a target-date Retirement Fund approaches its target retirement date, the more emphasis that Fund will place on preserving capital while also seeking to produce income, by investing more in fixed income and short-term investments.',
  'As each target-date Retirement Fund approaches its targ

### Upload website data to the vector database

In [13]:
# Erase and re-initialize the vector database.
# The API key is read from the PINECONE_API_KEY environment variable so that it is
# never stored in the notebook. Any key that was previously committed here should be revoked.
pinecone.init(api_key=os.environ["PINECONE_API_KEY"], environment="gcp-starter")
# Only delete the index when it exists, so the cell also works on a fresh project.
if "apptest" in pinecone.list_indexes():
    pinecone.delete_index("apptest")
# The dimension must equal the embedding size of `model` (384).
pinecone.create_index("apptest", dimension=384)
index = pinecone.Index("apptest")
index

<pinecone.index.Index at 0x12a57d37f70>

In [14]:
# Check the state of the newly created index before uploading anything.
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [15]:
def addData(corpusData,url, id_, batch_size = 100):
    """
    Upload text and the corresponding embeddings to the vector database

    Each chunk is stored as one vector whose metadata holds the source URL
    under 'title' and the chunk text under 'context'. Chunks are embedded and
    upserted in batches, so the number of model calls and network requests is
    len(corpusData) / batch_size instead of one per chunk.

    Parameters
    ----------
    corpusData : list of str
        Text chunks of a single page. Carriage returns are removed before
        embedding and storing.
    url : str
        Page the chunks come from.
    id_ : int
        Id given to the first chunk; chunk i is stored under str(id_ + i).
    batch_size : int
        Maximum number of vectors sent per upsert request; must be positive.
    """
    chunks = [text.replace('\r', '') for text in corpusData]
    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        # Encoding a list returns one embedding per chunk, in the same order.
        embeddings = model.encode(batch).tolist()
        vectors = [
            (str(id_ + start + offset), embedding, {'title': url, 'context': chunk})
            for offset, (chunk, embedding) in enumerate(zip(batch, embeddings))
        ]
        index.upsert(vectors=vectors)

In [17]:
# Upload every scraped page.
# id_ is a running offset: it advances by the number of chunks of each page,
# so the ids assigned by addData never collide across pages.
id_ = 0
for url, plain_text_chunks in scraped_dict.items(): 
#     print(f"Uploading data from url: {url}")
    addData(plain_text_chunks,url, id_)
    id_+=len(plain_text_chunks)

In [33]:
def find_match(query,k):
    """
    Select the k text chunks that are the closest to the query from a semantic perspective

    Parameters
    ----------
    query : str
        Question to embed and search for.
    k : int
        Maximum number of matches to retrieve.

    Returns
    -------
    (list of str, list of str)
        The 'title' metadata (source URL) and the 'context' metadata (chunk
        text) of the matches, in the order returned by the index. Both lists
        have the same length, which is smaller than k when the index returns
        fewer matches.
    """
    query_em = model.encode(query).tolist()
    result = index.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what was actually returned instead of range(k): indexing
    # k entries raises IndexError when the index holds fewer than k vectors.
    matches = result['matches'][:k]
    titles = [match['metadata']['title'] for match in matches]
    contexts = [match['metadata']['context'] for match in matches]
    return titles, contexts

### Create an answering agent based on GPT-3.5

In [34]:
import openai
# Read the key from the OPENAI_API_KEY environment variable so that a real key never
# has to be pasted into the notebook; the placeholder is kept as the fallback value.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'sk-get_your_key')

def create_prompt(context,query):
    """
    Aggregate the context and the query to create a prompt

    The prompt is a fixed instruction header followed by the query and then
    the context. The header asks for a three-step answer: identify the key
    passages (or state that the context is insufficient), formulate and
    detail an answer, then print the context elements that were used.

    Parameters
    ----------
    context : str
        Text the answer must be based on.
    query : str
        Question to answer.

    Returns
    -------
    str
        The assembled prompt.
    """
    
    # Earlier single-instruction header, kept for reference:
#     header = "Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query'. At the end of yout answer, display the key text elements that you used and explain their meaning  \n"
    # The header is sent verbatim, including the indentation inside the triple-quoted string.
    header = """You will be provided with a context and a query. Your answer will be structures in 3 steps. 
    1/ Based on the query identify the key passages of the context. If the answer is not contained 
    within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query'. 
                
    2/ Then based on these key element, formulate a synthetic answer to the query. Then elaborate and provide the details 
    
    3/ For each of the 3 main concepts used to provide an answer, Exctract the complete context element and print them"""
    prompt = f"{header} \n query: {query} \n context: {context} "
    
    return prompt

In [20]:
# Retrieve the 7 chunks closest to the question:
# docs receives their source URLs, res receives the chunk texts.
query = "Give me the phone number of a rollover specialist"
docs,res = find_match(query,7)

In [21]:
# Source URLs of the retrieved chunks, in the order returned by find_match.
docs

['https://www.mutualofamerica.com/insights-and-tools/resource-center/nrsm',
 'https://www.mutualofamerica.com/individuals/roth-ira',
 'https://www.mutualofamerica.com/individuals',
 'https://www.mutualofamerica.com/individuals/rollover-ira',
 'https://www.mutualofamerica.com/individuals/flexible-premium-annuity',
 'https://www.mutualofamerica.com/individuals/interest-account-and-investment-options',
 'https://www.mutualofamerica.com/individuals/traditional-ira']

In [22]:
# Join the retrieved chunks into one newline-separated context string and build the prompt.
context= "\n".join(res)
prompt = create_prompt(context,query)
# print(prompt)

In [23]:
# Inspect the context string that is inserted into the prompt.
context

'For more assistance from a Rollover Specialist, call 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00\xa0a.m. to 8:00\xa0p.m.\xa0ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00\xa0a.m. to 8:00\xa0p.m.\xa0ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00\xa0a.m. to 8:00\xa0p.m.\xa0ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00\xa0a.m. to 8:00\xa0p.m.\xa0ET.\nGet one-on-one assistance from a Rollover Specialist by calling 866.939.7655, Monday through Friday, 9:00\xa0a.m. to 8:00\xa0p.m.\xa0ET.'

In [36]:
def generate_answer(prompt, model_name = 'gpt-3.5-turbo', temperature = 1):
    """
    Manages the API call to OpenAI and gets the model answer

    Parameters
    ----------
    prompt : str
        Full prompt sent as the user message.
    model_name : str
        Chat model to call, e.g. 'gpt-3.5-turbo' (default) or 'gpt-4'.
    temperature : float
        Sampling temperature forwarded to the API. The default of 1 keeps the
        previous behaviour; lower values give more repeatable answers.

    Returns
    -------
    str
        Content of the first choice returned by the API.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": f'{prompt}'},
    ]
    response = openai.ChatCompletion.create(
        model = model_name,
        temperature = temperature,
        messages = messages,
    )
    return response.choices[0].message.content

In [14]:
# Send the prompt to the chat model and print the answer.
reply = generate_answer(prompt)
print(reply)

Step 1: Identify key passages

Key passages from the context include:
"For more assistance from a Rollover Specialist, call 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET."
"Our Representatives and Retirement Plan Rollover Specialists can help your employees understand the savings and investment options available under their retirement plan."
"You should consider the investment objectives, risks, and charges and expenses of the variable annuity contract and the underlying investment funds carefully before investing. This and other information is contained in the contract prospectus and underlying funds prospectuses and summary prospectuses, which can be obtained by calling 800.468.3785 or visiting mutualofamerica.com."

Step 2: Formulate a synthetic answer

To get the phone number of a rollover specialist, you can call 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET. The rollover specialist can provide assistance with understanding retirement savings and 

In [37]:
# Full pipeline for a second question: retrieve 7 chunks, build the prompt, ask the model.
query = "what are the conditions for a tax payer to contribute to a roth IRA ?"
docs,res = find_match(query,7)
context= "\n".join(res)
prompt = create_prompt(context,query)
reply = generate_answer(prompt)
reply

In [39]:
# reply

'Key passages:\n- "Eligibility to make contributions to a Roth IRA is limited by income and filing status."\n- "All contributions to a Roth IRA are made on an after-tax basis."\n- "For single taxpayers with an AGI between $138,000 and $153,000, the maximum allowable contribution to a Roth IRA is proportionately reduced."\n- "With a Roth IRA, unlike other types of IRAs, you are not required to begin taking a distribution at any specific age, and you can continue to make contributions as long as you have earned income."\n- "Withdrawals of your after-tax contributions are not subject to federal income taxes."\n- "Withdrawals of your interest and any investment earnings are not subject to federal income taxes if taken at least five years after you first contribute to your Roth IRA, and you have attained age 59½; or withdrawals for a qualified first-time home purchase (up to a $10,000 maximum lifetime limit); as a result of your death or disability, or up to $5,000 to pay for expenses relat

In [40]:
# Question unrelated to the scraped website; the recorded reply shows the
# 'Sorry Not Sufficient context to answer query' response requested by the prompt.
query = "What is the air speed velocity of a swallow ?"
docs,res = find_match(query,7)
context= "\n".join(res)
prompt = create_prompt(context,query)
reply = generate_answer(prompt)
reply

'1/ Based on the query "what is the air speed velocity of a swallow?", there is not sufficient context to answer the query.\n\n2/ Sorry Not Sufficient context to answer query.\n\n3/ Sorry Not Sufficient context to answer query.'