In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

## Using OpenAI models in this project requires an API key, which can be obtained from the OpenAI website.

In [4]:
# The .env file may have been uploaded under a name with characters before the '.' (e.g. 't.env').
# If so, rename it to '.env' as follows:

# import os
# os.rename('t.env', '.env')

In [5]:
# Read the .env file; override=True lets its values replace variables already set.
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Shape check only (prefix and minimum length) - it does not prove the key is accepted by the API.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Model used by every chat-completion call below, and the client those calls go through.
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [6]:
# Some websites only respond properly to requests that carry browser-like headers.
# To use your own browser's User-Agent, open https://www.whatismybrowser.com/ and copy
# the User-Agent string shown there; otherwise the value below can be used as is.

headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A scraped web page: its title, its visible text and the links it contains.

    Attributes:
        url:   the URL that was fetched.
        body:  the raw response body (bytes).
        title: the page title, or "No title found" when none can be read.
        text:  text of <body> with script/style/img/input elements removed,
               or "" when the page has no <body>.
        links: every non-empty href found on <a> tags, exactly as written in
               the page (relative links are NOT resolved).
    """

    def __init__(self, url, timeout=10):
        """
        Fetch and parse `url`.

        Args:
            url: address of the page to scrape.
            timeout: seconds passed to requests.get; without one, a request to an
                unresponsive server can block indefinitely.

        Raises:
            requests.exceptions.RequestException: propagated from requests.get
                (connection failure, invalid URL, timeout, ...).
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None when <title> is empty or wraps more than one child, so
        # fall back in that case too instead of storing None as the title.
        page_title = soup.title.string if soup.title else None
        self.title = page_title or "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Collect hrefs from every anchor, skipping anchors with no/empty href.
        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [9]:
# Scrape the Anthropic homepage as a quick check of the Website class.
anthropic = Website('https://www.anthropic.com/')
# anthropic.get_contents()           # check the content scraped
# anthropic.links                      # check the links scraped

## Make a call to gpt-4o-mini to read the links on a webpage and respond in structured JSON.
We will use "one shot prompting" where we provide an example of how it should replace a relative link such as "/about" with "https://company.com/about" in the prompt.

In [10]:
# System prompt for the link-selection call: describes the task, then gives a
# single JSON example of the expected response shape.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    "\n"
    "{\n"
    '    "links": [\n'
    '        {"type": "about page", "url": "https://full.url/goes/here/about"},\n'
    '        {"type": "careers page", "url": "https://another.full.url/careers"}\n'
    "    ]\n"
    "}\n"
)

In [12]:
# Print the assembled system prompt to check its final wording.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [13]:
def get_links_user_prompt(website):
    """
    Build the user prompt that asks the model to pick brochure-relevant links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of href strings).

    Returns:
        The instruction text followed by the page's links, one per line. Each
        distinct link is listed once, in the order it first appears.
    """
    # Navigation bars and footers repeat the same href many times; dict.fromkeys
    # drops the repeats while preserving first-seen order, so no prompt tokens
    # are spent on duplicates.
    unique_links = list(dict.fromkeys(website.links))

    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(unique_links)
    return user_prompt

In [15]:
# print(get_links_user_prompt(anthropic))              # Check the whole prompt

In [17]:
def get_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    The request sets response_format to "json_object" so the reply is a JSON
    document, which is parsed and returned.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The parsed JSON reply from the model.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    # `choices` is a list of candidate completions; the first one is used.
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [21]:
# Scrape the Hugging Face homepage and show the hrefs collected from it.
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 'inference/get-started',
 '/spaces',
 '/models',
 '/openai/gpt-oss-120b',
 '/openai/gpt-oss-20b',
 '/Qwen/Qwen-Image',
 '/tencent/Hunyuan-1.8B-Instruct',
 '/black-forest-labs/FLUX.1-Krea-dev',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Qwen/Qwen-Image',
 '/spaces/Qwen/Qwen3-Coder-WebDev',
 '/spaces/black-forest-labs/FLUX.1-Krea-dev',
 '/spaces/Wan-AI/Wan-2.2-5B',
 '/spaces',
 '/datasets/spatialverse/InteriorGS',
 '/datasets/nvidia/Nemotron-Post-Training-Dataset-v1',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/UCSC-VLAA/GPT-Image-Edit-1.5M',
 '/datasets/AI-MO/NuminaMath-LEAN',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer

In [22]:
# Ask the model which links on the Hugging Face homepage are relevant for a brochure.
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/about'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'docs page', 'url': 'https://huggingface.co/docs'},
  {'type': 'company page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'discussion forum', 'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

## Assemble all the details into another prompt for gpt-4o-mini to make the brochure!

In [24]:
def get_all_details(url):
    """
    Gather the text of the landing page and of every model-selected link.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: the landing page contents followed by a section per
        relevant link, each headed by the link's type.

    Raises:
        requests.exceptions.RequestException: if the landing page itself cannot
            be fetched. Failures on individual linked pages are reported and
            skipped instead of aborting the whole run.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)

    # The model's JSON is not schema-validated, so tolerate a missing or null
    # "links" key and entries without a "url".
    for link in links.get("links") or []:
        link_url = link.get("url")
        if not link_url:
            continue
        try:
            page = Website(link_url)
        except requests.RequestException as exc:
            # URLs come from the model and may be unreachable or invented; one
            # bad link should not discard everything gathered so far.
            print(f"Skipping {link_url}: {exc}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [27]:
# get_all_details("https://huggingface.co")         #check the function

In [28]:
# System prompt for the brochure-writing call.
# Built from adjacent string literals so each sentence keeps its trailing space;
# the earlier backslash-continued version ran two sentences together as
# "Respond in markdown.Include details ...".
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [31]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user prompt for the brochure-writing call.

    Args:
        company_name: name of the company, quoted in the prompt.
        url: landing page whose contents (and relevant linked pages) are appended.
        max_chars: maximum length of the returned prompt in characters. Defaults
            to 5,000; pass None to disable truncation.

    Returns:
        The instruction text followed by the scraped page contents, cut to at
        most `max_chars` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None keeps the whole string, so max_chars=None means "no limit".
    return user_prompt[:max_chars]

In [30]:
# Preview the user prompt that will be sent for the brochure (scraped contents, truncated).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'community page', 'url': 'https://discuss.huggingface.co'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nNEW\nGet started with Inference in seconds 🚀\nReachy Mini: The Open Robot for AI Builders\nWelcome Cohere on the Hub 🔥\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nopenai/gpt-oss-120b\nUpdated\nabout 3 hours ago\n•\n3.61k\n•\n1.47k\nopenai/gpt-oss-20b\nUpdated\nabout 3 hours ago\n•\n6.82k\n•\n1.15k\nQwen/Qwen-Image\nUpdated\nabout 22 hours ago\n•\n19k\n•\n956\ntencent/Hunyuan-1.8B-Instruct\nUpdated\nabout 19 hours ago\n•\n802\n•\n50

In [32]:
def create_brochure(company_name, url):
    """
    Generate a brochure for a company and render it as Markdown in the notebook.

    Args:
        company_name: name of the company, passed into the user prompt.
        url: landing page used as the source of the brochure contents.

    Returns:
        None; the brochure is displayed, not returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [33]:
# Generate the brochure in a single response and render it as Markdown.
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}]}


# Hugging Face Brochure

## About Us
**Hugging Face** is at the forefront of the AI revolution, dedicated to building a collaborative community focused on machine learning (ML). We invite individuals and organizations alike to explore, create, and innovate within our dynamic ecosystem. Our platform empowers users to collaborate on models, share datasets, and develop cutting-edge AI applications.

## Our Offerings
- **Models**: Access over 1 million machine learning models, including trending models such as openai/gpt-oss-120b and Qwen/Qwen-Image.
- **Datasets**: Browse through more than 250,000 datasets tailored for a variety of ML tasks.
- **Spaces**: Explore over 400,000 applications running on our platform in real-time, ranging from web development to image generation.
- **Enterprise Solutions**: We offer advanced compute and enterprise features with dedicated support, ensuring that organizations can harness AI with confidence.

## Community & Collaboration
Hugging Face fosters a vibrant community made up of over 50,000 organizations, including industry leaders like **Google**, **Microsoft**, and **Amazon**. Collaborate with experts, enthusiasts, and companies to accelerate innovation in AI.

## Company Culture
At Hugging Face, our culture is rooted in collaboration and inclusivity. We believe that by joining forces, we can build and reflect the future of AI. Our team thrives in an environment that encourages creativity and the sharing of knowledge. We promote an open-source ethos, helping to drive transparency and community engagement in AI development.

## Career Opportunities
Join our mission to democratize AI! We are always on the lookout for talented individuals excited about AI and machine learning. Whether you are a developer, researcher, or marketer, there’s a place for you at Hugging Face. 
Explore our **Jobs** section on our website to find positions that suit your skills and aspirations.

## Get Started
Ready to dive into the world of AI? Sign up today and unlock the potential of machine learning! 

- **Sign Up**: [Get Started Here](#) 
- **Explore**: [Explore AI Apps](#)

Join us in building the future of AI—together, we can make a difference! 

---

*For more information, visit our website: [Hugging Face](https://huggingface.co)*

## Streaming the output instead of a block output

In [34]:
def stream_brochure(company_name, url):
    """
    Generate a brochure and render it incrementally as the response streams in.

    Args:
        company_name: name of the company, passed into the user prompt.
        url: landing page used as the source of the brochure contents.

    Returns:
        None; the brochure is shown in a single display area that is refreshed
        on every received chunk.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    # Keep the unmodified text separately and derive the displayed text from it
    # on every update, so a fence split across chunks is still recognised once
    # complete.
    raw_response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Guard against chunks that carry no choices (e.g. a usage-only chunk).
        if not chunk.choices:
            continue
        raw_response += chunk.choices[0].delta.content or ''
        # Strip only code-fence markers. Removing every occurrence of the word
        # "markdown" would also delete it from ordinary brochure sentences.
        cleaned = raw_response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [35]:
# Generate the brochure again, this time rendered incrementally as the response streams in.
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company blog', 'url': 'https://huggingface.co/blog'}, {'type': 'company Twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'company LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face: The AI Community Building the Future

Welcome to Hugging Face, where innovation meets collaboration in the artificial intelligence landscape. Our mission is to empower individuals and organizations to cultivate and deploy machine learning models, datasets, and applications. Join us in shaping the future of AI!

## Our Offerings

- **Models**: Explore our extensive library of over 1 million models ranging from text and image to video AI applications. Our state-of-the-art `Transformers` and `Diffusers` libraries set the standard for machine learning excellence.
- **Datasets**: Access 250,000+ datasets tailored for every ML task. Our platform enables seamless collaboration through public model hosting and sharing.
- **Spaces**: Discover our collaborative environment where users can run applications and generate models collectively. With over 400K applications created by our community, creativity knows no bounds!

## We Support Your AI Journey

Hugging Face provides robust **Compute and Enterprise solutions** to accelerate your ML projects. Our offerings include:

- **Optimized Inference Endpoints** starting at $0.60/hour for GPU.
- **Enterprise Solutions** from $20/user/month, designed for teams requiring secure and dedicated support.

Join leading organizations like Amazon, Google, Microsoft, and Grammarly who trust Hugging Face for their AI and ML needs!

## Our Community

We are more than just a platform; we are a thriving **community of over 50,000 organizations**. Our commitment to collaboration and open source is evident through our active involvement in:

- Sharing the latest advancements in AI technology.
- Collaborating on diverse projects and sharing valuable insights through our forums and documentation.
- Hosting events that encourage dialogue among researchers, developers, and enthusiasts.

## Join Our Team

At Hugging Face, we foster an inclusive company culture where collaboration meets innovation. We believe in the power of diverse perspectives and experiences, and this fuels our creativity. 

### Career Opportunities

As we expand, we invite passionate individuals to explore careers in various fields within AI and machine learning. Join us to be a part of a forward-thinking team that values growth, collaboration, and the pursuit of knowledge.

- **Current Open Positions**: Visit our [Jobs page](https://huggingface.co/jobs) to see the latest opportunities and become part of our mission to democratize AI.

## Get Started with Hugging Face Today!

Whether you're an AI enthusiast, a developer, or a data scientist, Hugging Face provides the tools and community support to launch your ML initiatives. Create, collaborate, and contribute with us.

Visit us at [huggingface.co](https://huggingface.co) to learn more or sign up!

---

*Let’s build the future of AI together!* 🚀