In [1]:
import os
import re
import time
from urllib.parse import urljoin

import pinecone
import requests
from bs4 import BeautifulSoup
from sentence_transformers import SentenceTransformer,util
# Sentence-embedding model used both when uploading chunks (addData) and when
# embedding a query (find_match). Its output size has to match the index dimension.
model = SentenceTransformer('all-MiniLM-L6-v2') #384 dimensional

  from tqdm.autonotebook import tqdm


### Implement Scraper

In [2]:
def get_html_content(url, timeout=30):
    """
    Extract raw content from a single web page

    Parameters
    ----------
    url : str
        Address of the page to download.
    timeout : float or tuple, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, so one unresponsive page would stall
        the whole scraping loop.

    Returns
    -------
    bytes
        The undecoded body of the HTTP response.
    """
    response = requests.get(url, timeout=timeout)
    return response.content

In [3]:
def get_plain_text(html_content):
    """
    Turn HTML content into text

    Every <script> element is detached from the parsed tree first, so the
    JavaScript source it contains is not part of the returned string.
    """
    parsed = BeautifulSoup(html_content, 'html.parser')
    for script_tag in parsed.find_all("script"):
        script_tag.extract()
    return parsed.get_text()

In [4]:
def split_text_into_chunks(plain_text, max_chars=2000):
    """
    Cut text into chunks of at most ``max_chars`` characters, on line boundaries

    Lines are accumulated (joined by a single space) until adding the next one
    would exceed ``max_chars``; the accumulated text is then emitted as a chunk
    and a new chunk is started with that line.

    Parameters
    ----------
    plain_text : str
        Text to split; it is broken on "\\n".
    max_chars : int, optional
        Size budget for one chunk, counted before surrounding whitespace is
        stripped.

    Returns
    -------
    list of str
        The non-empty, whitespace-stripped chunks, in document order. Empty or
        whitespace-only input yields an empty list.

    Notes
    -----
    Lines are never cut: a single line longer than ``max_chars`` is returned as
    one chunk of its own, which is then larger than ``max_chars``.
    """
    text_chunks = []
    current_chunk = ""
    for line in plain_text.split("\n"):
        if len(current_chunk) + len(line) + 1 <= max_chars:
            current_chunk += line + " "
            continue
        # The line does not fit: close the running chunk. It can be blank here
        # (e.g. the very first line is already too long), in which case there
        # is nothing worth emitting.
        finished = current_chunk.strip()
        if finished:
            text_chunks.append(finished)
        current_chunk = line + " "
    # Flush the remainder, again skipping whitespace-only leftovers.
    remainder = current_chunk.strip()
    if remainder:
        text_chunks.append(remainder)
    return text_chunks

In [5]:
def scrape_text_from_url(url, max_chars=2000):
    """
    Run the full scraping pipeline on one page.

    The page is downloaded, converted to plain text and cut into chunks of
    roughly ``max_chars`` characters; the list of chunks is returned.
    """
    page_text = get_plain_text(get_html_content(url))
    return split_text_into_chunks(page_text, max_chars)

In [6]:
def get_single_url(href, pattern_href = re.compile(r"(https://www\.mutualofamerica\.com(?=[/?])[^#]+)(#.*)?")):
    """
    Return the URL if it's in the scope of the web scraping

    Parameters
    ----------
    href : str
        Absolute URL to check.
    pattern_href : compiled regular expression, optional
        Pattern applied with ``match``; its first group is the part of the URL
        that is kept. The default accepts only https URLs on the host
        www.mutualofamerica.com followed by a path or a query string. The dots
        are escaped and the host must be followed by "/" or "?", so look-alike
        hosts such as "www.mutualofamerica.com.evil.example" are rejected.

    Returns
    -------
    str or None
        The URL with any "#fragment" removed, or None when the URL is out of
        scope.
    """
    m = pattern_href.match(href)
    if m is None:
        return None
    return m.group(1)

In [7]:
"""
Loop over the urls to find all the relevant web pages
"""

def get_url_list(to_explore, delay=5, timeout=30):
    """
    Crawl outward from a list of pages and collect the in-scope URLs they link to.

    Each page in ``to_explore`` is downloaded and its <a> links are resolved
    against the page URL and filtered through ``get_single_url``. Every
    in-scope link is recorded once. Links containing a "?" are also queued
    and crawled in the next round; the crawl stops when a round queues
    nothing new.

    Parameters
    ----------
    to_explore : list of str
        URLs of the pages to start from. They are only used as starting
        points: a starting URL appears in the result only if some crawled
        page links to it.
    delay : float, optional
        Seconds to sleep after each request.
    timeout : float or tuple, optional
        Seconds to wait for each response before giving up, so that one
        unresponsive page cannot block the crawl forever.

    Returns
    -------
    list of str
        Sorted list of the discovered URLs that do not contain a "?".
    """
    urls = set()
    while len(to_explore) > 0:
        next_round = []
        for url in to_explore:
            response = requests.get(url, timeout=timeout)
            time.sleep(delay)
            soup = BeautifulSoup(response.content, 'html.parser')
            for link in soup.find_all("a"):
                # Anchors without an href attribute carry no link to follow.
                raw_href = link.get("href")
                if raw_href is None:
                    continue
                href = get_single_url(urljoin(url, raw_href))
                if href is not None and href not in urls:
                    urls.add(href)
                    if "?" in href:
                        next_round.append(href)
        to_explore = next_round

    return sorted(url for url in urls if "?" not in url)

In [8]:
def get_initial_list(link = "https://www.mutualofamerica.com/sitemap.xml", timeout=30):
    """
    Read a sitemap and return the URLs it lists.

    Parameters
    ----------
    link : str, optional
        Address of the sitemap to download.
    timeout : float or tuple, optional
        Seconds to wait for the response before giving up.

    Returns
    -------
    list of str
        The text of every <loc> element, in document order, with surrounding
        whitespace removed so each entry can be used directly as a URL.
    """
    response = requests.get(link, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    return [loc.getText().strip() for loc in soup.find_all("loc")]

In [9]:
# Scrape every page listed in the sitemap.
# scraped_dict maps each page URL to the list of text chunks extracted from that page.
# One HTTP request is made per URL, so this cell can take a while to run.
scraped_dict = {}
for url in get_initial_list():
    scraped_dict[url] = scrape_text_from_url(url)
    
# # Save url and content to a .csv
# import pandas as pd
# pd.DataFrame([scraped_dict]).transpose().to_csv('scraped_data_10032023.csv')



### Upload website data to the vector database

In [5]:
# Connect to the vector database.
# The API key is taken from the PINECONE_API_KEY environment variable so it does not
# have to be written into the notebook; the placeholder is only used when it is unset.
pinecone.init(api_key=os.environ.get("PINECONE_API_KEY", "your_key"), environment="gcp-starter")

# To erase and re-initialize the vector database, uncomment the two lines below.
# The dimension must match the size of the embeddings produced by `model`.
# pinecone.delete_index("apptest")
# pinecone.create_index("apptest", dimension=384)
index = pinecone.Index("apptest")
index

<pinecone.index.Index at 0x2c58a33b1f0>

In [6]:
# Sanity check of the index: dimension and number of stored vectors
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.01029,
 'namespaces': {'': {'vector_count': 1029}},
 'total_vector_count': 1029}

In [12]:
def addData(corpusData,url, id_, batch_size=100):
    """
    Upload text and the corresponding embeddings to the vector database

    Parameters
    ----------
    corpusData : list of str
        Text chunks to store. Carriage returns are removed from each chunk
        before it is embedded and stored.
    url : str
        Stored as the 'title' metadata of every chunk.
    id_ : int
        Id of the first chunk; chunk number i is stored under ``str(id_ + i)``.
    batch_size : int, optional
        Maximum number of vectors sent in one upsert request. Sending the
        vectors in batches avoids one network round-trip per chunk.

    Returns
    -------
    None
    """
    chunks = [chunk.replace('\r', '') for chunk in corpusData]
    if not chunks:
        # Nothing to embed or upload for a page without text.
        return

    # Encode all chunks in one call instead of one model call per chunk.
    embeddings = model.encode(chunks).tolist()
    vectors = [
        (str(id_ + i), embedding, {'title': url, 'context': chunk})
        for i, (chunk, embedding) in enumerate(zip(chunks, embeddings))
    ]
    for start in range(0, len(vectors), batch_size):
        index.upsert(vectors=vectors[start:start + batch_size])

In [7]:
# id_ = 0
# for url, plain_text_chunks in scraped_dict.items(): 
# #     print(f"Uploading data from url: {url}")
#     addData(plain_text_chunks,url, id_)
#     id_+=len(plain_text_chunks)

In [8]:
def find_match(query,k):
    """
    Select the k text chunks that are the closest to the query from a semantic perspective

    Parameters
    ----------
    query : str
        Question to search for; it is embedded with ``model``.
    k : int
        Maximum number of matches requested from the index.

    Returns
    -------
    tuple of (list, list)
        The 'title' metadata (source URL) and the 'context' metadata (chunk
        text) of the matches, in the order returned by the index. Both lists
        hold fewer than ``k`` items when the index returns fewer matches.
    """
    query_em = model.encode(query).tolist()
    result = index.query(query_em, top_k=k, includeMetadata=True)

    # Iterate over what was actually returned rather than assuming exactly k
    # matches; indexing range(k) fails when the index returns fewer than k.
    matches = result['matches']
    titles = [match['metadata']['title'] for match in matches]
    contexts = [match['metadata']['context'] for match in matches]
    return titles, contexts

### Create an answering agent based on GPT-3.5

In [9]:
import openai
# The key is taken from the OPENAI_API_KEY environment variable so it does not have to
# be written into the notebook; the placeholder is only used when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", 'get-your-key')

def create_prompt(context,query):
    """
    Build the full prompt sent to the language model.

    A fixed block of instructions describing a three-step answer format comes
    first, followed by the query and then by the retrieved context.
    """
    instructions = """You will be provided with a context and a query. Your answer will be structures in 3 steps. 
    1/ Based on the query identify the key passages of the context. If the answer is not contained 
    within the text and requires some latest information to be updated, print 'Sorry Not Sufficient context to answer query'. 
                
    2/ Then based on these key element, formulate a synthetic answer to the query. Then elaborate and provide the details 
    
    3/ For each of the 3 main concepts used to provide an answer, Exctract the complete context element and print them"""
    return f"{instructions} \n query: {query} \n context: {context} "

In [10]:
# Example question. Retrieve up to 7 chunks for it:
# docs holds the source URLs, res holds the matching chunk texts.
query = "Give me the phone number of a rollover specialist"
docs,res = find_match(query,7)

In [11]:
# Source URLs of the retrieved chunks, in the order returned by the index query
docs

['https://www.mutualofamerica.com/insights-and-tools/resource-center/nrsm',
 'https://www.mutualofamerica.com/insights-and-tools/resource-center/withdrawals-and-loans',
 'https://www.mutualofamerica.com/employers',
 'https://www.mutualofamerica.com/individuals/flexible-premium-annuity',
 'https://www.mutualofamerica.com/individuals/rollover-ira',
 'https://www.mutualofamerica.com/individuals/rollover-ira',
 'https://www.mutualofamerica.com/individuals/flexible-premium-annuity/features-and-benefits']

In [12]:
# Join the retrieved chunks, one per line, into a single context string,
# then combine it with the query into the prompt for the chat model.
context= "\n".join(res)
prompt = create_prompt(context,query)
# print(prompt)

In [13]:
def generate_answer(prompt, model_name='gpt-3.5-turbo', temperature=1):
    """
    Manages the API call to OpenAI and get the model answer

    Parameters
    ----------
    prompt : str
        Text sent as the user message.
    model_name : str, optional
        Name of the chat model to call (for example 'gpt-4'), so that the
        model can be switched without editing this function.
    temperature : float, optional
        Sampling temperature passed to the API.

    Returns
    -------
    str
        Content of the message in the first choice of the API response.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {'role': 'user', 'content': f'{prompt}'},
    ]
    response = openai.ChatCompletion.create(
        model = model_name,
        temperature = temperature,
        messages = messages,
    )
    return response.choices[0].message.content

In [14]:
# Send the prompt to the chat model and print its answer
reply = generate_answer(prompt)
print(reply)

Step 1: Identify key passages

Key passages from the context include:
"For more assistance from a Rollover Specialist, call 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET."
"Our Representatives and Retirement Plan Rollover Specialists can help your employees understand the savings and investment options available under their retirement plan."
"You should consider the investment objectives, risks, and charges and expenses of the variable annuity contract and the underlying investment funds carefully before investing. This and other information is contained in the contract prospectus and underlying funds prospectuses and summary prospectuses, which can be obtained by calling 800.468.3785 or visiting mutualofamerica.com."

Step 2: Formulate a synthetic answer

To get the phone number of a rollover specialist, you can call 866.939.7655, Monday through Friday, 9:00 a.m. to 8:00 p.m. ET. The rollover specialist can provide assistance with understanding retirement savings and 