In [2]:
# Test the Pyppeteer Puppeteer wrapper


# References
# - https://scrapeops.io/python-web-scraping-playbook/python-pyppeteer/
# - Note Chromium installed at /Users/stephengodfrey/Library/Application Support/pyppeteer/local-chromium/1181205
# - https://medium.com/@alexandrdzhumurat/leveraging-chatgpt-for-html-parsing-a-game-changer-rendering-regular-expressions-obsolete-5d8779d761ba


In [1]:
import asyncio
import sys, os


from pyppeteer import launch
from bs4 import BeautifulSoup

from pydantic import BaseModel
import openai


sys.path.insert(0, "../../utils/")
# os.listdir("../../utils/")
import authentication as au




## Get page content

In [2]:

# Get a screenshot of page
# async def main():
#     browser = await launch()
#     page = await browser.newPage()
#     await page.goto('https://quotes.toscrape.com/')
#     await page.screenshot({'path': 'screenshot.png'})
#     await browser.close()

# Get the html content and view with beautiful soup
async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto('https://www.cccco.edu/')

    ## Get HTML
    html = await page.content()
    await browser.close()
    return html

html_response = await main()

## Load HTML Response Into BeautifulSoup
soup = BeautifulSoup(html_response, "html.parser")

## Get the HTML code in a string
html_code_string = str(soup)


In [3]:
# dir(soup)

In [9]:
soup

# Return all links in a tags with href values
page_urls = []
for a in soup.find_all('a', href=True):
    # print("Found the URL:", a['href'])
    # page_urls.append((a.contents, a['href']))
    page_urls.append(a['href'])

print("The full length of this page's HTML content is {} characters long".format(len(str(soup))))
print()
print("There are {} <a tags with href values on this page".format(len(page_urls)))
print("There contain {} unique URLs".format(len(set(page_urls))))

atagloc = html_code_string.find("<a")
print("here's an example of an <a tag")
print(html_code_string[atagloc:atagloc+100])
print()

page_texts = []
for p in soup.find_all('p'):
    page_texts.append(p.text)

print("There are {} <p /p> tags on this page".format(len(page_texts)))
print("here's an example of an <p tag")
ptagloc = html_code_string.find("<p")

print(html_code_string[ptagloc:ptagloc+100])



The full length of this page's HTML content is 38734 characters long

There are 94 <a tags with href values on this page
There contain 44 unique URLs
here's an example of an <a tag
<a href="#page-banner">Skip to Main Content</a></div>
<nav aria-label="Utility Links" class="utility

There are 19 <p /p> tags on this page
here's an example of an <p tag
<p>Learn about career paths and career education opportunities designed to get you into good-paying 


## Use chatGPT to convert the HTML string to structured data

In [5]:
# set API Key
au_config = au.ApiAuthentication()
os.environ["OPENAI_API_KEY"] = au_config.apis_configs["OPENAI_KEY"]
openai.api_key = os.environ["OPENAI_API_KEY"]

client = openai.OpenAI()

class HTMLExtraction(BaseModel):
    title: str
    urls: list[str]
    texts: list[str]

system_prompt = ("You are an expert at converting HTML code to structured data. "
                 "You will be given semi-structured HTML code in the form of a string. "
                 "You should extract structured data from it "
                 "and return the requested information. "
                 "Definitions of expected output: "
                 "urls can be found in the href field inside text blocks opened with <a and closed with </a>"
                 "texts can be found in inside text blocks opened with <p and closed with </p>")

html_code_string = str(soup)

completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": html_code_string}
    ],
    response_format=HTMLExtraction,
)

gpt_response = completion.choices[0].message.parsed

# get the html extraction model in a dictionary
hsd = gpt_response.model_dump()


In [10]:
print("chatGPT found {} URLs on this page representing {} unique URLs".format(len(hsd['urls']),
                                                                              len(set(hsd['urls']))))

# Compare techniques for extracting URLs
urls_in_ai_not_bs = [u for u in hsd['urls'] if u not in page_urls]
urls_in_bs_not_in_ai  = [u for u in page_urls if u not in hsd['urls']]
print("These URLs were found using BeautifulSoup but not chatGPT")
print(urls_in_ai_not_bs)
print("These URLs were found by chatGPT but not BeautifulSoup")
print(urls_in_bs_not_in_ai)


chatGPT found 51 URLs on this page representing 42 unique URLs
These URLs were found using BeautifulSoup but not chatGPT
[]
These URLs were found by chatGPT but not BeautifulSoup
['#page-banner', 'Search-Results#site-search-bar']


In [11]:
print("chatGPT found {} texts on this page".format(len(hsd['texts'])))

chatGPT found 7 texts on this page
