<a href="https://colab.research.google.com/github/SadeghMahmoudAbadi/Open-Source-LLM-on-Colab/blob/main/2-Brochure/Company_Brochure.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install ollama

In [2]:
from google.colab import userdata
from ollama import Client
import json
import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display

In [3]:
# Browser-like request headers used when fetching a website
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

In [4]:
def fetch_website_contents(url, timeout=30):
    """
    Return the title and visible body text of the website at the given url.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up. Passed to
            requests.get so that a stalled host cannot hang the notebook.

    Returns:
        A single string: the page title, a blank line, then the body text.
        The title is "No title found" when the page has no usable <title>,
        and the text is empty when the page has no <body>.

    Raises:
        requests.RequestException: if the page cannot be downloaded
            (connection failure, invalid URL, timeout, ...).
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    # soup.title.string is None for an empty <title> or one containing nested
    # tags; fall back so the concatenation below never receives None.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        # Remove elements that contribute no readable text before extracting it.
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return title + "\n\n" + text

In [5]:
def fetch_website_links(url, timeout=30):
    """
    Return the links on the website at the given url.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait on the server before giving up. Passed to
            requests.get so that a stalled host cannot hang the notebook.

    Returns:
        A list with the href value of every <a> tag on the page, in document
        order. Anchors with a missing or empty href are left out. The values
        are returned as written in the page, so they may be relative.

    Raises:
        requests.RequestException: if the page cannot be downloaded.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    return [href for href in hrefs if href]

In [None]:
# Try the helper on a real site; the bare `links` on the last line displays the list.
links = fetch_website_links("https://huggingface.co/")
links

In [7]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape ({"links": [{"type": ..., "url": ...}]}) the reply should follow.
link_system_prompt = """
You are provided with a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:

{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [8]:
def get_links_user_prompt(url):
    """
    Build the user message that asks the model to pick brochure-worthy links.

    The instruction text is followed by every link that
    fetch_website_links(url) returns, one per line.
    """
    instructions = f"""
    Here is the list of links on the website {url} -
    Please decide which of these are relevant web links for a brochure about the company,
    respond with the full https URL in JSON format.
    Do not include Terms of Service, Privacy, email links.

    Links (some might be relative links):

    """
    found_links = fetch_website_links(url)
    return instructions + "\n".join(found_links)

In [None]:
# Preview the exact user message that will be sent for link selection.
print(get_links_user_prompt("https://huggingface.co/"))

In [10]:
# Model tag passed to every ollama.chat call in this notebook.
MODEL = "gpt-oss:120b"
# Client for the hosted endpoint at https://ollama.com. The bearer token is
# read through google.colab's userdata under the name OLLAMA_API_KEY, so the
# key itself never appears in the notebook.
ollama = Client(
    host="https://ollama.com",
    headers={'Authorization': 'Bearer ' + userdata.get("OLLAMA_API_KEY")}
)

In [11]:
def select_relevant_links(url):
    """
    Ask the model which links found at ``url`` belong in a company brochure.

    Args:
        url: Page whose links are collected (via get_links_user_prompt) and
            sent to the model together with link_system_prompt.

    Returns:
        A dict with a "links" key holding a list. Following the example in
        link_system_prompt, each entry is expected to look like
        {"type": ..., "url": ...}, but the model's output is not validated
        beyond that. When the reply is valid JSON without a usable "links"
        list, the list is empty.

    Raises:
        json.JSONDecodeError: if the model's reply is not valid JSON.
    """
    print(f"Selecting relevant links for {url} by calling {MODEL}")
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        # Ollama's `format` takes the string "json" (or a JSON schema). The
        # OpenAI-style {"type": "json_object"} dict is not a valid schema.
        format="json",
    )
    result = response['message']['content']
    links = json.loads(result)
    # The reply shape is only requested, not guaranteed: normalise it so
    # callers can always index ["links"] and iterate over a list.
    if isinstance(links, list):
        links = {"links": links}
    elif not isinstance(links, dict):
        links = {"links": []}
    elif not isinstance(links.get("links"), list):
        links["links"] = []
    print(f"Found {len(links['links'])} relevant links")
    return links

In [None]:
# Demo call: sends the link list to the model and displays the parsed JSON reply.
select_relevant_links("https://huggingface.co/")

In [13]:
def fetch_page_and_all_relevant_links(url):
    """
    Collect the landing page text plus the text of every model-selected link.

    Args:
        url: Landing page of the company website.

    Returns:
        One string: a "## Landing Page:" section with the landing page
        contents, then a "## Relevant Links:" section holding one
        "### Link: <type>" sub-section per selected page that was fetched.

    Raises:
        requests.RequestException: only if the landing page itself cannot be
            downloaded. A selected link that fails to download is reported
            with a printed message and skipped.
    """
    from urllib.parse import urljoin

    contents = fetch_website_contents(url)
    relevant_links = select_relevant_links(url)
    sections = [f"## Landing Page:\n\n{contents}\n## Relevant Links:\n"]
    for link in relevant_links.get("links", []):
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue
        # The model is asked for full URLs but may echo a relative href;
        # urljoin leaves absolute URLs untouched and resolves relative ones.
        absolute_url = urljoin(url, target)
        try:
            page = fetch_website_contents(absolute_url)
        except requests.RequestException as exc:
            # One dead or hallucinated link should not abort the whole brochure.
            print(f"Skipping {absolute_url}: {exc}")
            continue
        sections.append(f"\n\n### Link: {link.get('type', 'page')}\n{page}")
    return "".join(sections)

In [None]:
# Print the combined text (landing page plus each selected link) for inspection.
print(fetch_page_and_all_relevant_links("https://huggingface.co"))

In [15]:
# System prompt for the brochure-writing calls: sets the tone (snarky, humorous)
# and asks for Markdown output without code blocks.
brochure_system_prompt = """
You are a snarky assistant that analyzes the contents of several relevant pages from a company website
and creates a short, humorous, entertaining, witty brochure about the company for prospective customers, investors and recruits.
Respond in markdown without code blocks.
Include details of company culture, customers and careers/jobs if you have the information.
"""

In [16]:
def get_brochure_user_prompt(company_name, url, max_chars=None):
    """
    Build the user message for the brochure-writing call.

    Args:
        company_name: Name of the company, inserted into the instructions.
        url: Landing page; its text and the text of the selected links are
            gathered by fetch_page_and_all_relevant_links and appended.
        max_chars: Optional non-negative cap on the length of the returned
            prompt. None (the default) keeps the full text; pass a number
            such as 5_000 to truncate oversized prompts.

    Returns:
        The instruction text followed by the gathered website contents,
        truncated to ``max_chars`` characters when a cap is given.
    """
    user_prompt = f"""
    You are looking at a company called: {company_name}
    Here are the contents of its landing page and other relevant pages;
    use this information to build a short brochure of the company in markdown without code blocks.\n\n
    """
    user_prompt += fetch_page_and_all_relevant_links(url)
    if max_chars is not None:
        user_prompt = user_prompt[:max_chars]
    return user_prompt

In [None]:
# Preview the assembled brochure prompt (this runs the link-selection model call too).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [18]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company in a single model call and render it.

    The complete reply is displayed as Markdown in the cell output; nothing
    is returned.
    """
    conversation = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    reply = ollama.chat(model=MODEL, messages=conversation)
    display(Markdown(reply['message']['content']))

In [None]:
# Generate and render the brochure in one shot (the output appears once the full reply arrives).
create_brochure("HuggingFace", "https://huggingface.co")

In [20]:
def stream_brochure(company_name, url):
    """
    Generate a brochure for the company and render it while it streams in.

    An empty Markdown display is created first and then refreshed after every
    streamed chunk with the text accumulated so far. Nothing is returned.
    """
    conversation = [
        {"role": "system", "content": brochure_system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    chunks = ollama.chat(model=MODEL, messages=conversation, stream=True)
    handle = display(Markdown(""), display_id=True)
    text_so_far = ""
    for piece in chunks:
        delta = piece['message']['content']
        # A chunk may carry no content; the display is still refreshed.
        if delta:
            text_so_far += delta
        update_display(Markdown(text_so_far), display_id=handle.display_id)

In [None]:
# Same brochure, rendered incrementally as the model streams its reply.
stream_brochure("HuggingFace", "https://huggingface.co")

Selecting relevant links for https://huggingface.co by calling gpt-oss:120b
Found 10 relevant links


# 🤗 **Hugging Face – The Friendly AI Playground**  

*“We’re the AI community building the future, one 🤗 emoji at a time.”*  

---  

## 🌍 Who (or what) is Hugging Face?  

- **A one‑stop‑shop** for **models, datasets, Spaces (apps)**, and a buzzing community that treats open‑source like it’s a potluck dinner – everyone brings something tasty.  
- **Founded 2016**, now a *51‑200* employee boutique perched in **Paris & NYC**, with a worldwide fan‑club that includes the likes of **Google, Meta, Amazon, Microsoft, Intel, NVIDIA, & the occasional non‑profit**.  
- **Mission:** Democratize AI so even your grandma can fine‑tune a language model (or at least **pretend** she can).  

---  

## 🧩 The Hugging Face Stack – “The Swiss‑Army Knife of AI”

| Toolbox | What It Does | Fun Fact |
|---------|--------------|----------|
| **🤗 Transformers** | State‑of‑the‑art model zoo (text, vision, audio, multimodal) | >150 k stars on GitHub – more than the average Hollywood star’s fan club |
| **🤗 Diffusers** | Diffusion models for images, video, audio | Turns “pixel art” into “pixel *art*” in seconds |
| **🤗 Datasets** | 250 k+ ready‑to‑use datasets (including the legendary `awesome‑chatgpt‑prompts`) | You’ll find more data here than on your ex’s Dropbox |
| **🤗 Tokenizers** | Lightning‑fast tokenizers for research & production | Faster than a coffee‑drunk intern on a deadline |
| **🤗 PEFT** | Parameter‑efficient fine‑tuning (big models, tiny budgets) | Because who wants to spend a bazillion dollars on GPUs? |
| **🤗 Accelerate / Optimum** | One‑click multi‑GPU/TPU scaling, mixed‑precision, FP8 | “Zero‑GPU” vs “Zero‑effort” – you pick the magic word |
| **🤗 Spaces** | Host & share live AI demos (no Docker required) | Deploy on a CPU, upgrade to GPU with a click, and watch the watchers applaud |

---  

## 🚀 “Trending This Week” – The Cool Kids on the Block  

- **Kimi‑K2‑Thinking** – 12.5 k likes, 626 forks.  
- **DeepSeek‑OCR** – 2.89 M downloads, because everyone still needs to read PDFs.  
- **MiniMax‑M2** – 854 k likes, proof that “mini” can still be massive.  

*(If you don’t see your favorite model, it’s probably sleeping in the “unlimited public repo” lounge.)*  

---  

## 🤝 Community & Culture – Where Nerds Meet Nice  

- **Open‑source first.** All core libraries live on GitHub under permissive licenses; PRs are welcomed faster than a coffee refill at a hackathon.  
- **Hackathons & MCP parties.** Over **$15k** in cash prizes, **$1.3 M** in API credits, and a virtual cake‑throwing ceremony.  
- **Slack‑ish Discord** where newbies ask “How do I pay my inference usage?” and veterans answer with a meme and a solution.  
- **Cheeky Blog & Courses** (LLM, Robotics, Diffusion, 3‑D…) that make you smarter *and* keep you humble enough to admit you still need Stack Overflow.  

> *“If you can’t find a solution, just ask the community. If they can’t, they’ll write a new library for you.”*  

---  

## 🏢 Enterprise Clients – Who’s Paying the Bills?  

| Customer | #Models Hosted | Followers |
|----------|----------------|-----------|
| **Meta (AI at Meta)** | 2.23 k | 8.64 k |
| **Google** | 1.05 k | 34.8 k |
| **Microsoft** | 426 | 16.3 k |
| **Amazon** | 21 | 3.53 k |
| **Intel** | 250 | 3.19 k |
| **NVIDIA** | 577 | 42.1 k |
| **Grammarly** | 11 | 187 |
| **Airbnb** | 102 | — |
| …and 50 000+ other orgs (including non‑profits like **AI2** & **AI for Good**).  

*Enterprise plans start at **$20 / user / month** with SSO, audit logs, private datasets, and a dedicated support team that actually reads your tickets.*  

---  

## 💼 Careers – Want to Join the Friendly AI Circus?  

- **Roles:** Machine‑Learning Engineer, Research Scientist, Software Engineer, Product Manager, Community Manager, *and* the ever‑mysterious “Special Projects” (rumoured to involve cats and lasers).  
- **Perks:**  
  - Unlimited public repos → bragging rights on GitHub.  
  - Access to **ZeroGPU** (make your code run on “nothing” – it’s a thing).  
  - Quarterly hackathon cash prizes (because nothing says “we value you” like a sudden $10k).  
  - Remote‑first culture – work from Paris, NYC, or your couch.  
- **Hiring page:** *Currently jam‑packed with 555 k+ open jobs across the AI universe. Pick one, or pick a few, or just RSVP to the next virtual party.*  

---  

## 📈 Why Investors Should Care  

1. **Network Effect:** Over **45 k+ models** accessible via a single API → stickiness that makes churn as rare as a bug‑free release.  
2. **Revenue Engine:** Enterprise tier, paid compute (GPU @ $0.60 / hour), and inference provider billing.  
3. **Open‑Source Credibility:** The go‑to libraries for transformers and diffusers – everyone from startups to Fortune‑500s depends on them.  
4. **Growth Momentum:** *50 k+ organizations* already on board, plus a pipeline of new models added daily.  

*In short: Hugging Face is the **Spotify of AI models** – a massive catalog, social discovery, and a subscription model that keeps the lights on.*  

---  

## 📞 Get In Touch  

- **Website:** https://huggingface.co  
- **Docs & Playground:** Docs → “Explore all modalities: Text, Image, Video, Audio, 3‑D.”  
- **Social:**  
  - X (Twitter) – where the team drops memes and API credits.  
  - LinkedIn – 551 employees, counting.  
  - Discord – the place where “index -1” errors are solved before coffee.  

**Ready to hug a model?** Press **Sign Up**, spin up a Space, and join the most *friendly* AI community on the planet.  

---  

*Disclaimer: Hugging Face does not guarantee that you will actually become a world‑renowned AI wizard. It does, however, guarantee a steady stream of emojis, community love, and the occasional “I built a robot that can write poetry” brag.*