In [1]:
import json
import os
import re
from typing import List
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Load settings from a local .env file via python-dotenv before any client is created.
load_dotenv()

True

In [3]:
# Shared API client used by every cell below.
# NOTE(review): constructed with no arguments, so credentials presumably come
# from the environment prepared by load_dotenv() - confirm.
openai = OpenAI()
# Chat model name passed to every completion request in this notebook.
MODEL = 'gpt-4o-mini'

In [4]:
# Browser-like request headers sent with every page download.
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the address that was requested.
        body:  the raw response bytes.
        title: text of the <title> tag, or "No title found".
        text:  visible body text with script/style/img/input elements removed
               ("" when the document has no <body>).
        links: every non-empty href found on an <a> tag, in document order,
               exactly as written in the page (relative links stay relative).
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and parse it.

        Args:
            url: address of the page to fetch.
            timeout: seconds to wait for the server before requests gives up;
                without it a stalled server would block the notebook forever.

        Raises:
            requests.RequestException: if the download fails or times out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None for an empty <title> or one with nested markup,
        # so fall back in that case too instead of storing None.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        """Return the page title and text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Smoke test: scrape the landing page and inspect the raw hrefs it exposes.
website = Website("https://huggingface.co")
website.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/blog/smolagents',
 '/deepseek-ai/DeepSeek-V3',
 '/PowerInfer/SmallThinker-3B-Preview',
 '/deepseek-ai/DeepSeek-V3-Base',
 '/black-forest-labs/FLUX.1-dev',
 '/hexgrad/Kokoro-82M',
 '/models',
 '/spaces/osanseviero/gemini-coder',
 '/spaces/JeffreyXiang/TRELLIS',
 '/spaces/lllyasviel/iclight-v2',
 '/spaces/Kwai-Kolors/Kolors-Virtual-Try-On',
 '/spaces/reach-vb/2024-ai-timeline',
 '/spaces',
 '/datasets/agibot-world/AgiBotWorld-Alpha',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/PowerInfer/QWQ-LONGCOT-500K',
 '/datasets/cfahlgren1/react-code-instructions',
 '/datasets/OpenLeecher/lmsys_chat_1m_clean',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft'

In [6]:
def get_links_prompt(website):
    """
    Build the chat messages that ask the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (list of href strings).

    Returns:
        A two-element list of chat messages: a system message containing the
        instructions plus a JSON example of the expected answer, and a user
        message listing the links found on the page.
    """
    link_system_prompt = "You are provided with a list of links found on a webpage. \
    You are able to decide which of the links would be most relevant to include in a brochure about the company, \
    such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    link_system_prompt += "You should respond in JSON as in this example:"

    # Serialise the example with json.dumps so the model is always shown valid
    # JSON, and show same-site URLs built from the real site address.
    site_root = website.url.rstrip("/")
    example = {
        "links": [
            {"type": "about page", "url": "https://full.url/goes/here/about"},
            {"type": "careers page", "url": "https://another.full.url/careers"},
            {"type": "github page", "url": "https://github.com/something"},
            {"type": "discord page", "url": "https://discord.com/something"},
            {"type": "organization page", "url": f"{site_root}/something"},
            {"type": "social media page", "url": f"{site_root}/social"},
        ]
    }
    link_system_prompt += "\n" + json.dumps(example, indent=4) + "\n"

    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)

    messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": user_prompt},
    ]

    return messages

In [7]:
def get_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    Returns the model's reply decoded from JSON; the request is made with
    `response_format={"type": "json_object"}`.
    """
    page = Website(url)
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=get_links_prompt(page),
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)


In [8]:
# Try the link selection on a real site; the result is the decoded JSON reply.
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'github page', 'url': 'https://github.com/huggingface'},
  {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

## Make the brochure!

In [9]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every link the model selected.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: the landing page contents followed by a section per
        selected link, each introduced by the link's `type` label.

    The link list comes from a model, so it is treated as untrusted: relative
    URLs are resolved against `url`, entries without a URL are ignored, and a
    page that cannot be downloaded is reported and skipped rather than
    aborting the whole brochure.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links.get("links", []):
        target = link.get("url")
        if not target:
            continue
        try:
            # urljoin leaves absolute URLs untouched and anchors relative ones.
            page = Website(urljoin(url, target))
        except requests.RequestException as exc:
            print(f"Skipping {target}: {exc}")
            continue
        result += f"\n\n{link.get('type', 'page')}\n"
        result += page.get_contents()
    return result

In [10]:
def get_brochure_prompt(company_name, url, max_chars=5_000):
    """
    Build the chat messages that ask the model to write the brochure.

    Args:
        company_name: name of the company, quoted in the user prompt.
        url: landing page whose scraped contents are appended to the prompt.
        max_chars: maximum length of the user prompt in characters; anything
            beyond it is cut off. Pass None to disable truncation.

    Returns:
        A two-element list of chat messages (system, then user).
    """
    system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
    and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
    Include details of company culture, customers and careers/jobs if you have the information."

    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Keep the request small; slicing with None leaves the text untouched.
    user_prompt = user_prompt[:max_chars]

    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]

In [11]:
def create_brochure(company_name, url):
    """Generate a brochure for `company_name` from `url` and render it as Markdown."""
    messages = get_brochure_prompt(company_name, url)
    completion = openai.chat.completions.create(model=MODEL, messages=messages)
    brochure = completion.choices[0].message.content
    display(Markdown(brochure))

In [12]:
# Generate and render a brochure in one shot (no streaming).
# NOTE(review): this uses huggingface.com while the other cells use
# huggingface.co - confirm the .com address is intended.
create_brochure("HuggingFace", "https://huggingface.com")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.com'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.com/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}]}


# Hugging Face Brochure

## Overview
**Hugging Face** is a leading AI community and collaboration platform dedicated to building the future of machine learning. Our platform empowers developers and researchers to create, discover, and collaborate on diverse machine learning models, datasets, and applications. With over 400,000 models and 100,000 datasets available, Hugging Face is at the forefront of the AI revolution.

## Our Platform
- **Models**: Access a vast range of state-of-the-art machine learning models for diverse applications.
- **Datasets**: Explore numerous datasets spanning various domains, enabling innovative research and development.
- **Spaces**: Collaborate and run custom applications within the Hugging Face ecosystem.

## Community
More than **50,000 organizations** utilize Hugging Face, including notable names like Google, Microsoft, Amazon Web Services, and Meta. Our community thrives on collaboration, sharing knowledge, and pushing the boundaries of artificial intelligence.

## Company Culture
At Hugging Face, we believe in the power of **open-source** collaboration. Our culture encourages innovation, inclusivity, and the constant pursuit of knowledge. We are committed to building a diverse and welcoming environment where every team member's unique perspective is valued.

## Careers at Hugging Face
Join a passionate team dedicated to shaping the future of machine learning. We are always on the lookout for talented individuals in various fields, from engineering to product management. **Explore exciting career opportunities**[here](#) and be a part of the AI community building tomorrow.

## Customers and Partnerships
Hugging Face partners with numerous organizations across different sectors, providing enterprise-grade security, access controls, and dedicated support. Our customers range from innovative startups to established enterprises.

## Get Involved
Ready to contribute to the AI community? 
- **Sign Up** for free to access our platform.
- **Explore** our extensive documentation and resources to enhance your knowledge and skills.
- **Join our community** on GitHub, Twitter, LinkedIn, and Discord to connect with fellow AI enthusiasts.

## Closing
Join **Hugging Face** on our mission to build the future of artificial intelligence. Together, we can accelerate innovation, foster collaboration, and make a lasting impact on the world through machine learning.

[Visit Our Website](https://huggingface.co) for more information!

## Enhancement
## Streaming the response (`stream=True`)

In [13]:
def _strip_markdown_fence(text):
    """Remove a code fence wrapped around the whole reply (e.g. ```markdown ... ```)."""
    opening = re.match(r"\s*```\w*[ \t]*\n?", text)
    if opening is None:
        # Not wrapped in a fence: leave the text, and any inner code blocks, alone.
        return text
    return re.sub(r"\n?```\s*$", "", text[opening.end():])


def stream_brochure(company_name, url):
    """
    Generate a brochure and render it incrementally while the reply streams in.

    Args:
        company_name: name of the company, forwarded to the prompt builder.
        url: landing page of the company, forwarded to the prompt builder.

    The raw reply is accumulated unchanged; only the rendered copy has a
    wrapping code fence removed, so ordinary uses of the word "markdown" in
    the brochure text survive.
    """
    stream = openai.chat.completions.create(
        model=MODEL, messages=get_brochure_prompt(company_name, url), stream=True
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        # Chunks with no choices carry no text; indexing them would raise.
        if not chunk.choices:
            continue
        response += chunk.choices[0].delta.content or ""
        update_display(
            Markdown(_strip_markdown_fence(response)),
            display_id=display_handle.display_id,
        )

In [14]:
# Same brochure as before, but rendered progressively as the reply streams in.
stream_brochure('HuggingFace', "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Company Brochure

---

## **Welcome to Hugging Face!**
### *The AI community building the future.*
  
At Hugging Face, we are dedicated to creating a collaborative platform that empowers the machine learning community to build, share, and innovate. Our mission is to make it easier for individuals and teams to connect with powerful models, datasets, and applications, thereby accelerating the pace of machine learning advancements.

---

## **Our Offerings**

### **Models**
We host an extensive library of over 400,000 models, ranging from natural language processing to computer vision. Notable models include:
- **deepseek-ai/DeepSeek-V3** (74.1k downloads)
- **PowerInfer/SmallThinker-3B-Preview** (7k downloads)

### **Datasets**
With more than 100,000 datasets, use our collection to fuel your projects. Some trending datasets include:
- **agibot-world/AgiBotWorld-Alpha**
- **fka/awesome-chatgpt-prompts**

### **Spaces**
Explore applications and tools in the AI ecosystem with 150,000+ applications available for collaboration and innovation.

---

## **Company Culture**

At Hugging Face, we foster a **supportive and inclusive environment** where creativity and collaboration thrive. We believe that the best outcomes arise from diverse perspectives and open dialogue. Our team is driven by a passion for AI and a commitment to community engagement, contributing to an open-source ecosystem that benefits everyone.

---

## **Our Customers**

Currently, we proudly serve over **50,000 organizations**, including:
- **Meta**
- **Amazon Web Services**
- **Google**
- **Microsoft**
- and many others!

These partnerships enable us to continuously refine our tools and offerings based on real-world needs.

---

## **Careers at Hugging Face**

Join us in shaping the future of AI! We are constantly on the lookout for talented individuals passionate about machine learning and technology. By working with us, you will contribute to transformative projects and be part of a team dedicated to pushing the boundaries of what AI can achieve.

### **Why Work Here?**
- **Innovative Environment**: Collaborate with industry-leading experts and pioneer exciting technology.
- **Impactful Work**: Engage in projects that make a significant difference in the AI community.
- **Flexible Culture**: Enjoy a remote-friendly work environment with a strong emphasis on work-life balance.

---

## **Get In Touch**

Explore our platform, join the community, and let’s build the future together!

- **Website**: [Hugging Face](https://huggingface.co)
- **Join Us**: Sign up for our platform to start collaborating and exploring AI advancements today!

### *Together, we can achieve great things in AI!*