In [2]:
import pinecone
from sentence_transformers import SentenceTransformer,util
import requests
from bs4 import BeautifulSoup
import time
from urllib.parse import urljoin
import re
# Load the sentence-embedding model once, at the top of the notebook.
# NOTE(review): `model` is not used in the cells visible here; presumably it
# embeds the scraped text chunks further down. Confirm.
model = SentenceTransformer('all-MiniLM-L6-v2')  # produces 384-dimensional vectors

  from tqdm.autonotebook import tqdm


### Implement Scraper

In [3]:
def get_html_content(url, timeout=30):
    """
    Extract raw content from a single web page

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The undecoded response body as ``bytes``.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            (as its subclass ``requests.HTTPError``) when the server answers
            with a 4xx/5xx status.
    """
    # Without an explicit timeout, requests waits indefinitely on a stalled server.
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error statuses so that "404 Not Found" pages are not
    # scraped and chunked as if they were real content.
    response.raise_for_status()
    return response.content

In [4]:
def get_plain_text(html_content):
    """
    Turn HTML content into text

    ``<script>`` and ``<style>`` elements are removed before the text is
    extracted, so that JavaScript and CSS source never end up in the result
    regardless of how the installed Beautiful Soup version treats those
    elements in ``get_text()``.

    Parameters:
        html_content: HTML markup (``str`` or ``bytes``) of one page.

    Returns:
        The concatenated text of the remaining document as a ``str``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Calling the soup is shorthand for find_all(); detach every non-content tag.
    for non_content_tag in soup(["script", "style"]):
        non_content_tag.extract()
    return soup.get_text()

In [5]:
def split_text_into_chunks(plain_text, max_chars=2000):
    """
    Cut text into chunks of at most ``max_chars`` characters

    The text is processed line by line. Each line is stripped of surrounding
    whitespace, blank lines are ignored, and consecutive lines are joined with
    a single space for as long as the chunk stays within ``max_chars``.
    A single line longer than ``max_chars`` is hard-split into slices of
    ``max_chars`` characters (this may cut through a word).

    Parameters:
        plain_text: the text to split.
        max_chars: upper bound on the length of every returned chunk.

    Returns:
        A list of non-empty strings, each no longer than ``max_chars``.
        Empty or whitespace-only input yields an empty list.

    Raises:
        ValueError: if ``max_chars`` is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be a positive integer")

    text_chunks = []
    current_parts = []   # pieces of the chunk being assembled
    current_len = 0      # length of " ".join(current_parts)

    for raw_line in plain_text.split("\n"):
        line = raw_line.strip()
        if not line:
            # Text extracted from HTML is full of blank lines; they carry no
            # content and must not eat into the character budget.
            continue

        # A normal line produces exactly one piece; an overlong line is
        # sliced so that no chunk can ever exceed max_chars.
        for start in range(0, len(line), max_chars):
            piece = line[start:start + max_chars].strip()
            if not piece:
                continue

            # One extra character for the joining space, unless the chunk is empty.
            needed = len(piece) + (1 if current_parts else 0)
            if current_len + needed > max_chars:
                # Only reachable with a non-empty chunk, so no "" is emitted.
                text_chunks.append(" ".join(current_parts))
                current_parts = []
                current_len = 0
                needed = len(piece)

            current_parts.append(piece)
            current_len += needed

    if current_parts:
        text_chunks.append(" ".join(current_parts))
    return text_chunks

In [6]:
def scrape_text_from_url(url, max_chars=2000):
    """
    Run the whole scraping pipeline for one page: download the page at
    `url`, reduce its HTML to plain text, and return that text split into
    chunks using `max_chars` as the chunk size.
    """
    return split_text_into_chunks(
        get_plain_text(get_html_content(url)),
        max_chars,
    )

In [7]:
def get_single_url(
    href,
    pattern_href=re.compile(
        r"(https://www\.mutualofamerica\.com(?=[/?])[^#]+)(#.*)?"
    ),
):
    """
    Return the URL if it's in the scope of the web scraping

    The default pattern accepts only ``https://www.mutualofamerica.com``
    followed directly by a path (``/``) or a query string (``?``). The dots
    are escaped and the host must end right there, so look-alike hosts such
    as ``www.mutualofamerica.com.evil.com`` or
    ``www.mutualofamerica.com@evil.com`` are rejected.

    Parameters:
        href: absolute URL to check.
        pattern_href: compiled regular expression whose first group captures
            the URL without its ``#fragment``. It is compiled once, when the
            function is defined.

    Returns:
        The URL with any ``#fragment`` removed, or ``None`` when ``href`` is
        out of scope.
    """
    m = pattern_href.match(href)
    if m is None:
        return None
    # Group 1 is everything before the optional "#fragment".
    return m.group(1)

In [8]:
"""
Loop over the urls to find all the relevant web pages
"""

def get_url_list(to_explore, delay=5, timeout=30):
    """
    Loop over the urls to find all the relevant web pages

    Every page in ``to_explore`` is downloaded and its ``<a href>`` links are
    resolved against the page URL and filtered through ``get_single_url``.
    Newly seen in-scope links are recorded. Only links that contain a query
    string (``?``) are themselves fetched in the next round; the loop ends
    when a round discovers no new such links.

    Parameters:
        to_explore: iterable of seed URLs to start from.
        delay: seconds to pause after every request.
        timeout: seconds to wait for each response; passed to
            ``requests.get``.

    Returns:
        A sorted list of the discovered in-scope URLs, excluding those that
        contain a query string.
    """
    urls = set()
    frontier = list(to_explore)
    while frontier:
        next_frontier = []
        for url in frontier:
            try:
                response = requests.get(url, timeout=timeout)
            except requests.RequestException as exc:
                # One unreachable page should not abort the whole crawl.
                print("Skipping {}: {}".format(url, exc))
                response = None
            time.sleep(delay)
            if response is None:
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            # href=True skips anchors that have no href attribute at all.
            for link in soup.find_all("a", href=True):
                try:
                    href = get_single_url(urljoin(url, link["href"]))
                except ValueError:
                    # urljoin raises ValueError on malformed targets.
                    continue
                if href is not None and href not in urls:
                    urls.add(href)
                    if "?" in href:
                        next_frontier.append(href)
        frontier = next_frontier

    # Query-string URLs were only needed to reach further pages.
    return sorted(url for url in urls if "?" not in url)

In [9]:
# Crawl outward from the home page. get_url_list pauses 5 seconds after every
# request, so this cell takes a while to run.
url_list = get_url_list(["https://www.mutualofamerica.com/"])
# Number of distinct in-scope pages found (URLs with a query string are excluded).
len(url_list)

69

In [35]:
# Alternative URL discovery: read the page list from the site's sitemap
# instead of crawling links.
# The top-level names html, soup, xml_tag, to_explore2 and link are kept
# because later cells inspect them.
html = requests.get("https://www.mutualofamerica.com/sitemap.xml", timeout=30)
html.raise_for_status()  # do not parse an error page as if it were the sitemap

# The sitemap is XML, so parse it with the XML parser rather than the HTML one.
soup = BeautifulSoup(html.content, 'xml')
xml_tag = soup.find_all('url')

# Each <loc> element holds the address of one page.
to_explore2 = []
for link in soup.find_all("loc"):
    to_explore2.append(link.getText())

b'<?xml version="1.0" encoding="utf-8"?><urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><url><loc>https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/changes-over-time</loc><lastmod>2022-01-06T20:07:28+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/right-for-you</loc><lastmod>2022-01-05T19:37:32+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/right-for-you</loc><lastmod>2022-02-03T23:40:15+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url><url><loc>https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/changes-over-time</loc><lastmod>2022-01-07T21:13:25+

In [42]:
# Inspect the <url> elements found in the sitemap.
xml_tag

[<url><loc>https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/changes-over-time</loc><lastmod>2022-01-06T20:07:28+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url>,
 <url><loc>https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/right-for-you</loc><lastmod>2022-01-05T19:37:32+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url>,
 <url><loc>https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/right-for-you</loc><lastmod>2022-02-03T23:40:15+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url>,
 <url><loc>https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/changes-over-time</loc><lastmod>2022-01-07T21:13:25+00:00</lastmod><changefreq>weekly</changefreq><priority>0.5</priority></url>,
 <url><loc>h

In [67]:
# Number of <loc> entries collected from the sitemap.
len(to_explore2)

143

In [70]:
# Show the URLs collected from the sitemap.
# NOTE(review): this displays the whole list; a slice would keep the output short.
to_explore2

['https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/changes-over-time',
 'https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more/right-for-you',
 'https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/right-for-you',
 'https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa/changes-over-time',
 'https://www.mutualofamerica.com/individuals/interest-account-and-investment-options/performance/learn-more-fpa',
 'https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/money-market',
 'https://www.mutualofamerica.com/employers/interest-account-and-investment-options/prospectus-landing/prospectus',
 'https://www.mutualofamerica.com/employers/interest-account-and-investment-options/performance/learn-more',
 'https://www.mutualofamerica.com/emp

In [69]:
# `link` is the loop variable left over from the sitemap loop, so this is
# presumably the last <loc> entry. Confirm, since the cells were run out of order.
link.text

'https://www.mutualofamerica.com/'

In [50]:
?link