## Brochure Website Scraper
A simple Python class to scrape text content from brochure-style websites using requests and BeautifulSoup.

In [1]:
from openai import OpenAI
from website_scraper import Website
from IPython.display import Markdown, display
import json

In [2]:
from dotenv import load_dotenv
import os

# Load API key from the environment variable `GEMINI_API_KEY`.
load_dotenv(override=True)  
api_key = os.getenv('GEMINI_API_KEY')

In [3]:
gemini = OpenAI(
    api_key=api_key,
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

In [4]:
url = "https://apple.com"

In [5]:
website = Website(url)
website.get_links()

['https://support.apple.com/?cid=gn-ols-home-hp-tab',
 'https://fitness.apple.com/fitness-plus/explore?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104',
 'https://fitness.apple.com/fitness-plus/explore?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104',
 'https://fitness.apple.com/subscribe?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104',
 'https://fitness.apple.com/subscribe?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104',
 'https://card.apple.com/apply/application?referrer=cid%3Dapy-200-10000036&start=false',
 'https://wallet.apple.com/apple-card/setup/feature/ccs?referrer=cid%3Dapy-200-10000036',
 'https://tv.apple.com/us/movie/f1-the-movie/umc.cmc.3t6dvnnr87zwd4wmvpdx5came?l=en-US?itscg=10000&itsct=atv-apl_hp-stream_now--220622',
 'https://tv.apple.com/us/show/pluribus/umc.cmc.37axgovs2yozlyh3c2cmwzlza?l=en-US?itscg=10000&itsct=atv-apl_hp-stream_now--220622',
 'https://tv.apple.com/us/show/hijack/umc.cmc.1dg08zn0g3zx52hs8npoj5qe3?l=en-US?itscg=10000&itsct=atv-apl_hp-stream_

In [6]:
link_system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [None]:
def get_links_user_prompt(url):
    user_prompt = f"""
Here is the list of links on the website {url} -
Please decide which of these are the top 10 relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

"""
    website = Website(url)
    links = website.get_links()
    user_prompt += "\n".join(links)
    return user_prompt

In [25]:
print(get_links_user_prompt(url))


Here is the list of links on the website https://apple.com -
Please decide which of these are relevant web links for a brochure about the company, 
respond with the full https URL in JSON format.
Do not include Terms of Service, Privacy, email links.

Links (some might be relative links):

https://support.apple.com/?cid=gn-ols-home-hp-tab
https://fitness.apple.com/fitness-plus/explore?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104
https://fitness.apple.com/fitness-plus/explore?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104
https://fitness.apple.com/subscribe?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104
https://fitness.apple.com/subscribe?itscg=10000&itsct=fit-hero-hp_tile-apl-ann-250104
https://card.apple.com/apply/application?referrer=cid%3Dapy-200-10000036&start=false
https://wallet.apple.com/apple-card/setup/feature/ccs?referrer=cid%3Dapy-200-10000036
https://tv.apple.com/us/movie/f1-the-movie/umc.cmc.3t6dvnnr87zwd4wmvpdx5came?l=en-US?itscg=10000&itsct=atv-apl_hp-stream_

In [26]:
def select_relevant_links(url):
    response = gemini.chat.completions.create(
        model="gemini-2.5-flash",
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)},
        ],
        temperature=0.2,  # low temperature for more deterministic output
        max_tokens=1500,
        response_format={"type": "json_object"}
    )
    result =  response.choices[0].message.content
    links = json.loads(result)
    return links
    

In [27]:
print(select_relevant_links(url))

{'links': [{'type': 'support page', 'url': 'https://support.apple.com/?cid=gn-ols-home-hp-tab'}, {'type': 'company store app', 'url': 'https://apps.apple.com/us/app/apple-store/id375380948'}, {'type': 'investor relations page', 'url': 'https://investor.apple.com/'}, {'type': 'store locator page', 'url': 'https://locate.apple.com/'}]}


### Extract text content from all links on the website

```python

In [30]:
def fetch_link_contents(url):
    website_contents = Website(url).text
    relevent_links = select_relevant_links(url)

    result = f"==> Landing Page:\n{website_contents}\n\n==> Relevant Links:\n"

    for link in relevent_links['links']:
        link_url = link['url']
        link_contents = Website(link_url).text
        result += f"### Link: {link['type']}\n\n{link_contents}\n\n"
    
    return result

In [31]:
print(fetch_link_contents(url))

==> Landing Page:
Apple
Apple
Store
Mac
iPad
iPhone
Watch
Vision
AirPods
TV & Home
Entertainment
Accessories
Support
0
+
Apple Fitness Plus
Ring in your resolutions.
Get up to 3 months on us.
1
Learn more
Learn more
Try it free
Try it free
Scan code with iPhone or iPad.
Use the Camera app to activate your free trial.
1
Apple Watch Series 11
Turn resolutions into routines. Quit quitting your fitness goals.
Learn more
Buy
iPhone
Say hello to the latest generation of iPhone.
Learn more
Shop iPhone
AirPods Pro 3
The world’s best in-ear Active Noise Cancellation.
Learn more
Buy
iPad Air
Now supercharged by the M3 chip.
Learn more
Buy
MacBook Pro 14”
Supercharged by M5.
Learn more
Buy
iPad
Now with the speed of the A16 chip and double the starting storage.
Learn more
Buy
Apple Trade In
Get up to $180–$670 in credit when you trade in iPhone 13 or higher.
2
Get your estimate
Apple Card
Get up to 3% Daily Cash back with every purchase.
Learn more
Apply now
Apply now
Endless entertainment.
Item 

In [32]:
brochure_system_prompt = """
You are an assistant that analyzes the contents of several relevant pages from a company website
and creates a short brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

In [33]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
You are looking at a company called: {company_name}
Here are the contents of its landing page and other relevant pages;
use this information to build a short brochure of the company in markdown without code blocks.\n\n
"""
    user_prompt += fetch_link_contents(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [34]:
get_brochure_user_prompt("Apple", "http://apple.com")

'\nYou are looking at a company called: Apple\nHere are the contents of its landing page and other relevant pages;\nuse this information to build a short brochure of the company in markdown without code blocks.\n\n\n==> Landing Page:\nApple\nApple\nStore\nMac\niPad\niPhone\nWatch\nVision\nAirPods\nTV & Home\nEntertainment\nAccessories\nSupport\n0\n+\nApple Fitness Plus\nRing in your resolutions.\nGet up to 3 months on us.\n1\nLearn more\nLearn more\nTry it free\nTry it free\nScan code with iPhone or iPad.\nUse the Camera app to activate your free trial.\n1\nApple Watch Series 11\nTurn resolutions into\xa0routines. Quit quitting your fitness\xa0goals.\nLearn more\nBuy\niPhone\nSay hello to the latest generation of iPhone.\nLearn more\nShop iPhone\nAirPods Pro 3\nThe world’s best in-ear Active Noise Cancellation.\nLearn more\nBuy\niPad Air\nNow supercharged by the M3 chip.\nLearn more\nBuy\nMacBook\xa0Pro 14”\nSupercharged by M5.\nLearn more\nBuy\niPad\nNow with the speed of the A16 chip

In [35]:
def create_brochure(company_name, url):
    response = gemini.chat.completions.create(
        model="gemini-2.5-flash-lite",
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ]
    )  
    result = response.choices[0].message.content
    display(Markdown(result))

In [36]:
create_brochure("Apple", "http://apple.com")

RateLimitError: Error code: 429 - [{'error': {'code': 429, 'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits. To monitor your current usage, head to: https://ai.dev/usage?tab=rate-limit. \n* Quota exceeded for metric: generativelanguage.googleapis.com/generate_content_free_tier_requests, limit: 20, model: gemini-2.5-flash-lite\nPlease retry in 7.314534687s.', 'status': 'RESOURCE_EXHAUSTED', 'details': [{'@type': 'type.googleapis.com/google.rpc.Help', 'links': [{'description': 'Learn more about Gemini API quotas', 'url': 'https://ai.google.dev/gemini-api/docs/rate-limits'}]}, {'@type': 'type.googleapis.com/google.rpc.QuotaFailure', 'violations': [{'quotaMetric': 'generativelanguage.googleapis.com/generate_content_free_tier_requests', 'quotaId': 'GenerateRequestsPerDayPerProjectPerModel-FreeTier', 'quotaDimensions': {'location': 'global', 'model': 'gemini-2.5-flash-lite'}, 'quotaValue': '20'}]}, {'@type': 'type.googleapis.com/google.rpc.RetryInfo', 'retryDelay': '7s'}]}}]