In [101]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [102]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [112]:
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for garbage in soup.body(["img", "script", "input", "style"]):
                garbage.decompose()
                self.text = soup.body.get_text(separator="\n",strip=True)
        else:
            self.text = ""

        links = [link.get("href") for link in soup.find_all("a")]
        self.links = [link for link in links if link]

    def getContents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
    

In [113]:
anthropic = website("https://www.anthropic.com")
anthropic.links

['/legal/aup',
 'https://trust.anthropic.com/',
 '/research',
 'https://support.anthropic.com/',
 '/news/claude-for-enterprise',
 '/legal/commercial-terms',
 'https://status.anthropic.com/',
 '/company',
 '/news/3-5-models-and-computer-use',
 '/claude/haiku',
 'https://www.linkedin.com/company/anthropicresearch',
 '/team',
 '/pricing',
 '/customers',
 '/news/core-views-on-ai-safety',
 'https://www.anthropic.com/claude',
 'mailto:press@anthropic.com',
 '/',
 '/enterprise',
 '/jobs',
 '/responsible-disclosure-policy',
 '/claude/sonnet',
 '/research/constitutional-ai-harmlessness-from-ai-feedback',
 'https://claude.ai/',
 'https://twitter.com/AnthropicAI',
 '/news',
 '/legal/privacy',
 'https://www.anthropic.com/research#entry:8@1:url',
 '/supported-countries',
 '/claude',
 '/api',
 'https://www.youtube.com/@anthropic-ai',
 '/legal/consumer-terms',
 '/careers']

In [114]:
system_prompt = """
You are given a list of links from a company website. Your task is to decide which links are relevant to include in the company brochure. Review the links and select only those that best represent the company’s core content, such as the About page, Careers page, Products/Services page, and similar. Avoid links like contact forms, blog posts, legal pages, or support pages that do not directly contribute to a high-level company overview.

You should respond in JSON exactly in the format shown in the examples below. Do not include any extra text.

Example 1:
Input Links:
[
    "https://company.com/about",
    "https://company.com/contact",
    "https://company.com/products",
    "https://company.com/blog"
]
Expected JSON Output:
{
    "links": [
        {"type": "about page", "url": "https://company.com/about"},
        {"type": "products page", "url": "https://company.com/products"}
    ]
}

Example 2:
Input Links:
[
    "https://company.com/careers",
    "https://company.com/legal",
    "https://company.com/support"
]
Expected JSON Output:
{
    "links": [
        {"type": "careers page", "url": "https://company.com/careers"}
    ]
}

Example 3:
Input Links:
[
    "https://company.com/about-us",
    "https://company.com/team",
    "https://company.com/press",
    "https://company.com/privacy"
]
Expected JSON Output:
{
    "links": [
        {"type": "about page", "url": "https://company.com/about-us"},
        {"type": "team page", "url": "https://company.com/team"},
        {"type": "press page", "url": "https://company.com/press"}
    ]
}
"""
print(system_prompt)


You are given a list of links from a company website. Your task is to decide which links are relevant to include in the company brochure. Review the links and select only those that best represent the company’s core content, such as the About page, Careers page, Products/Services page, and similar. Avoid links like contact forms, blog posts, legal pages, or support pages that do not directly contribute to a high-level company overview.

You should respond in JSON exactly in the format shown in the examples below. Do not include any extra text.

Example 1:
Input Links:
[
    "https://company.com/about",
    "https://company.com/contact",
    "https://company.com/products",
    "https://company.com/blog"
]
Expected JSON Output:
{
    "links": [
        {"type": "about page", "url": "https://company.com/about"},
        {"type": "products page", "url": "https://company.com/products"}
    ]
}

Example 2:
Input Links:
[
    "https://company.com/careers",
    "https://company.com/legal",
  

In [122]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [123]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        response_format={"type": "json_object"}
    )
    
    result = json.loads(response.choices[0].message.content)
    
    # Ensure 'links' key exists
    if "links" not in result:
        print("Error: 'links' key missing in OpenAI response. Returning empty list.")
        return {"links": []}  # Prevent KeyError by returning an empty list
    
    return result


In [124]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).getContents()
    
    links = get_links(url)
    print("Found links:", links)
    
    if not links.get("links"):  # Check if "links" key exists and is not empty
        print("No relevant links found.")
        return result  # Return only the landing page contents
    
    for link in links["links"]:
        result += f"\n\n{link.get('type', 'Unknown type')}\n"
        result += Website(link["url"]).getContents()
    
    return result


In [125]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [126]:
def get_brochure_user_prompt(company_name, url):  
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short company brochure in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt
    

In [127]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [128]:
create_brochure("HuggingFace", "https://huggingface.co")

Error: 'links' key missing in OpenAI response. Returning empty list.
Found links: {'links': []}
No relevant links found.


# Welcome to Hugging Face: Where the Future Gets a Hug! 🤗

## Who Are We?
**Hugging Face** is not just a name; it’s a feeling! We are the quirky AI community buzzing with energy and ideas, building the future one model at a time. Here, we aren't just about algorithms and data; we believe in hugging it out - even with our machines! 

## Our Hug-otastic Offerings

### 🤖 Models
With over **1 million models** to choose from, there's something for everyone. From deep learning to small learning (yes, that's a thing!), we've got the state-of-the-art tools. Want to hang out with **Janus-Pro-7B** or have a heart-to-heart with **Kokoro TTS**? Join the fun!

### 📊 Datasets
Datasets are the bread and butter of AI, and we've baked over **250,000 of them**! Whether you're into dolphins, thoughts, or custom recipes, we’ve got the data for your cravings.

### 🌌 Spaces
“Spaces” isn’t just for astronauts; it’s where you can run cool applications like **Text-to-3D** and embrace your inner digital magician! Watch as your text transforms into spectrums of 3D awesomeness right before your eyes! 🎇

## Join Our Community of Innovators
We boast a family of over **50,000 organizations** that trust us for their ML adventures, including major players like Google, Amazon, and Microsoft. Seriously, if we had a family reunion, you’d want to bring a LOT of chairs! 🪑

## Hugging Culture
At Hugging Face, we foster a vibrant, collaborative culture where everyone is encouraged to share ideas, beliefs, and the occasional hug - virtually, of course! Our teams grow together, learn together, and probably consume a few too many donuts together. (Calories don’t count in the metaverse, right?)

## Careers: Let's Get Cozy Together!
Looking for a career that feels like a warm embrace? We’re on the hunt for creative minds and tech-savvy explorers. Come join us at Hugging Face for a job where everyday feels like a Saturday! Explore opportunities where you can build, innovate, and maybe even create your own space for *Text-to-Candy*! 🍬

**No suits required. Just bring your enthusiasm!**

## In Conclusion
Why pick Hugging Face? Because we blend laughter, collaboration, and groundbreaking technology into everything we do! So, whether you're a budding engineer, an investor searching for the next big thing, or someone looking for friendship with some laughs – we welcome you to the Hugging Community! 🤝

**Join us! Let’s build a future that even robots would be excited to hug!**

In [129]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [130]:
stream_brochure("HuggingFace", "https://huggingface.co")

Error: 'links' key missing in OpenAI response. Returning empty list.
Found links: {'links': []}
No relevant links found.



# Welcome to Hugging Face: Where AI Hugs Back!

## Who Are We?
At Hugging Face, we’re not just another tech company – we’re the friendly neighborhood AI community building the future! 🌍 Here, we don’t just throw algorithms at the wall and see if they stick. We create, collaborate, and caffeinate (with only the finest of coffees, of course!) on cutting-edge models, datasets, and applications.

## Company Culture
We believe in teamwork, transparency, and a sprinkle of fun! Our team is a mix of data wranglers, AI wizards, and model maestros who enjoy hugging (metaphorically) more than just machines. Join us if you like to sprinkle your workday with humor and hands-on collaboration! 

### What’s Cooking?
- **Flexible Work Hours**: So you can work when your brain is buzzing at max efficiency.
- **Team Outings**: Occasionally, we escape from our screens for some team bonding (serious business, like competitive Mario Kart).
- **Open Source Love**: We’re passionate about contributing to the open-source community. Our motto? Why keep greatness to ourselves?

## Our Customers
Over **50,000 organizations** (not just cool kids, but major companies like Google, Microsoft, and Amazon) are already on the Hugging Face bandwagon! 🚀 They trust us to help shape the future with our jaw-dropping models and datasets.

### The Who’s Who of Hugging Face:
- AI at Meta has 2,060 models (yes, they’re flexing).
- Grammarly graced us with 10 models – and you thought they only checked your grammar!
- And we even have a **non-profit** – Ai2 – joining the fun to ensure AI doesn’t just become the next superhero; it becomes a team player.

## Careers with Us
🤖 Looking for a job? Grab your cape! We’ll supply the challenges, and you bring your superhero skills!

- **Job Openings**: We’re always on the lookout for innovative thinkers who want to be part of the AI revolution. Check out our website for **super awesome** roles.
- **Learning & Growth**: We support your growth like a sturdy LEGO tower—no wobbly bricks here!

### Extras: 
- **Convert to AI-ntastic**! Whether you want to dabble in text-to-3D wizardry or perfect your coding skills, we got courses, resources, and all the studying snacks you can handle!

## Join Us!
So, whether you’re looking to build **AI wonders**, **join a spirited team**, or simply find a community that gives the best *virtual hugs* (trust us, AI can take it!), look no further! We’re ready to take on the world together—one model, one dataset, and one hug at a time!

---

### Hugging Face – Where Future Meets Fun! 
- **Sign Up** for collaboration 
- **Explore** 1M+ models and growing! 
- **Join** a community that hugs!

*P.S.: No actual hugging required... unless you want to! 😉*
