# Test the Pyppeteer Puppeteer wrapper


### References
 - https://scrapeops.io/python-web-scraping-playbook/python-pyppeteer/
 - Note Chromium installed at /Users/stephengodfrey/Library/Application Support/pyppeteer/local-chromium/1181205
 - https://medium.com/@alexandrdzhumurat/leveraging-chatgpt-for-html-parsing-a-game-changer-rendering-regular-expressions-obsolete-5d8779d761ba


In [2]:
import asyncio
import sys, os


from pyppeteer import launch
from bs4 import BeautifulSoup

from pydantic import BaseModel
import openai

# This is needed for the OpenAI use
# It looks for a .env file with the OpenAI Key as one entry
sys.path.insert(0, "../../utils/")
import authentication as au


## Get page content

In [3]:
# Get the html content and view with beautiful soup
# turl = "https://www.cccco.edu/"
turl = "https://www.cccco.edu/About-Us/News-and-Media/Press-Releases/2024-celebrating-apprenticeship-week"
turl = "https://en.wikipedia.org/wiki/Chaffey_College"

async def main():
    browser = await launch()
    page = await browser.newPage()
    await page.goto(url=turl)

    ## Get HTML
    html = await page.content()
    await browser.close()
    return html

html_response = await main()

## Load HTML Response Into BeautifulSoup
soup = BeautifulSoup(html_response, "html.parser")

## Get the HTML code in a string
html_code_string = str(soup)


# Get a screenshot of page
# async def main():
#     browser = await launch()
#     page = await browser.newPage()
#     await page.goto('https://quotes.toscrape.com/')
#     await page.screenshot({'path': 'screenshot.png'})
#     await browser.close()




In [4]:
# dir(soup)

In [5]:
soup

# Return all links in a tags with href values
page_urls = []
for a in soup.find_all('a', href=True):
    # print("Found the URL:", a['href'])
    # page_urls.append((a.contents, a['href']))
    page_urls.append(a['href'])

print("The full length of this page's HTML content is {} characters long".format(len(str(soup))))
print()
print("There are {} <a tags with href values on this page".format(len(page_urls)))
print("There contain {} unique URLs".format(len(set(page_urls))))

atagloc = html_code_string.find("<a")
print("here's an example of an <a tag")
print(html_code_string[atagloc:atagloc+100])
print()

page_texts = []
for p in soup.find_all('p'):
    page_texts.append(p.text)

print("There are {} <p /p> tags on this page".format(len(page_texts)))
print("here's an example of an <p tag")
ptagloc = html_code_string.find("<p")

print(html_code_string[ptagloc:ptagloc+100])



The full length of this page's HTML content is 128737 characters long

There are 470 <a tags with href values on this page
There contain 383 unique URLs
here's an example of an <a tag
<a class="mw-jump-link" href="#bodyContent">Jump to content</a>
<div class="vector-header-container"

There are 9 <p /p> tags on this page
here's an example of an <p tag
<p><b>Chaffey College</b> is a <a class="mw-redirect" href="/wiki/Public_college" title="Public coll


In [6]:
page_urls

import urllib

# Get the base URL
purl = urllib.parse.urlparse(turl)
purl = "{}://{}".format(purl.scheme, purl.netloc)

page_urls_nav = []
for url in page_urls:
    page_urls_nav.append(urllib.parse.urljoin(purl, url))
# page_urls[1]


In [7]:
page_urls_nav

['https://en.wikipedia.org#bodyContent',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Wikipedia:Contents',
 'https://en.wikipedia.org/wiki/Portal:Current_events',
 'https://en.wikipedia.org/wiki/Special:Random',
 'https://en.wikipedia.org/wiki/Wikipedia:About',
 'https://en.wikipedia.org/wiki/Wikipedia:Contact_us',
 'https://en.wikipedia.org/wiki/Help:Contents',
 'https://en.wikipedia.org/wiki/Help:Introduction',
 'https://en.wikipedia.org/wiki/Wikipedia:Community_portal',
 'https://en.wikipedia.org/wiki/Special:RecentChanges',
 'https://en.wikipedia.org/wiki/Wikipedia:File_upload_wizard',
 'https://en.wikipedia.org/wiki/Main_Page',
 'https://en.wikipedia.org/wiki/Special:Search',
 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en',
 'https://en.wikipedia.org/w/index.php?title=Special:CreateAccount&returnto=Chaffey+College',
 'https://en.wikipedia.org/w/index.p

## Use chatGPT to convert the HTML string to structured data

In [10]:
# set API Key
au_config = au.ApiAuthentication()
os.environ["OPENAI_API_KEY"] = au_config.apis_configs["OPENAI_KEY"]
openai.api_key = os.environ["OPENAI_API_KEY"]

client = openai.OpenAI()

class HTMLExtraction(BaseModel):
    title: str
    urls: list[str]
    texts: list[str]

system_prompt = ("You are an expert at converting HTML code to structured data. "
                 "You will be given semi-structured HTML code in the form of a string. "
                 "You should extract structured data from it "
                 "and return the requested information. "
                 "Definitions of expected output: "
                 "urls can be found in the href field inside text blocks opened with <a and closed with </a>"
                 "texts can be found in inside text blocks opened with <p and closed with </p>")

html_code_string = str(soup)

completion = client.beta.chat.completions.parse(
    model="gpt-4o-2024-08-06",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": html_code_string}
    ],
    response_format=HTMLExtraction,
)

gpt_response = completion.choices[0].message.parsed

# get the html extraction model in a dictionary
hsd = gpt_response.model_dump()


In [11]:
print("chatGPT found {} URLs on this page representing {} unique URLs".format(len(hsd['urls']),
                                                                              len(set(hsd['urls']))))

# Compare techniques for extracting URLs
urls_in_ai_not_bs = [u for u in hsd['urls'] if u not in page_urls]
urls_in_bs_not_in_ai  = [u for u in page_urls if u not in hsd['urls']]
print("These URLs were found using chatGPT but not BeautifulSoup")
print(urls_in_ai_not_bs)
print()
print("These URLs were found by BeautifulSoup but not chatGPT")
print(urls_in_bs_not_in_ai)


chatGPT found 6 URLs on this page representing 6 unique URLs
These URLs were found using chatGPT but not BeautifulSoup
['https://upload.wikimedia.org/wikipedia/en/4/46/Chaffey-college-logo.png', 'https://en.m.wikipedia.org/wiki/Chaffey_College', 'https://creativecommons.org/licenses/by-sa/4.0/deed.en']

These URLs were found by BeautifulSoup but not chatGPT
['#bodyContent', '/wiki/Main_Page', '/wiki/Wikipedia:Contents', '/wiki/Portal:Current_events', '/wiki/Special:Random', '/wiki/Wikipedia:About', '//en.wikipedia.org/wiki/Wikipedia:Contact_us', '/wiki/Help:Contents', '/wiki/Help:Introduction', '/wiki/Wikipedia:Community_portal', '/wiki/Special:RecentChanges', '/wiki/Wikipedia:File_upload_wizard', '/wiki/Main_Page', '/wiki/Special:Search', 'https://donate.wikimedia.org/wiki/Special:FundraiserRedirector?utm_source=donate&utm_medium=sidebar&utm_campaign=C13_en.wikipedia.org&uselang=en', '/w/index.php?title=Special:CreateAccount&returnto=Chaffey+College', '/w/index.php?title=Special:UserL

In [12]:
print("chatGPT found {} texts on this page".format(len(hsd['texts'])))

chatGPT found 9 texts on this page


In [13]:
hsd['texts']
" ".join(hsd['texts'])

"Chaffey College is a public community college in Rancho Cucamonga, California. The college serves students in Chino, Chino Hills, Fontana, Montclair, Ontario, Rancho Cucamonga and Upland. It is the oldest community college in California. The school was established in Ontario, California, in 1883, when city founders and brothers George and William Chaffey donated land and established an endowment for a private college. The private school was founded as the Chaffey College of Agriculture through the University of Southern California; USC, also a private university, had been founded three years earlier in nearby Los Angeles. The cornerstone of the new school was laid on March 17, 1883, at Fourth and Euclid in Ontario; it opened on October 15, 1885. The original institution included a secondary school and was run by USC until 1901. During this period, Chaffey's football team had a 1–1 series with the young USC football team, winning 32–6 in 1893 and losing 38–0 in 1897. Financial troubles

In [14]:
# print(page_texts)
" ".join(page_texts)


"Chaffey College is a public community college in Rancho Cucamonga, California. The college serves students in Chino, Chino Hills, Fontana, Montclair, Ontario, Rancho Cucamonga and Upland. It is the oldest community college in California.[2]\n The school was established in Ontario, California, in 1883, when city founders and brothers George and William Chaffey donated land and established an endowment for a private college.  The private school was founded as the Chaffey College of Agriculture through the University of Southern California;[3] USC, also a private university, had been founded three years earlier in nearby Los Angeles.  The cornerstone of the new school was laid on March 17, 1883, at Fourth and Euclid in Ontario; it opened on October 15, 1885. The original institution included a secondary school and was run by USC until 1901.  During this period, Chaffey's football team had a 1–1 series with the young USC football team, winning 32–6 in 1893 and losing 38–0 in 1897.[4]\n Fi