In [1]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [2]:
load_dotenv(override=True)
MODEL ='llama3.2'

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
    (KHTML. like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/Wan-AI/Wan2.1-VACE-14B',
 '/nari-labs/Dia-1.6B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/lodestones/Chroma',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/smolagents/computer-agent',
 '/spaces/ByteDance/DreamO',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',


In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages.\n"

link_system_prompt += "you should respond only in JSON, without text, object as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print (link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
you should respond only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URL in clean JSON format \
    without text json on the beggining of the response. \
    Do not include Terms of Service, Privacy, email links. \n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for     a brochure about the company, respond with the full https URL in clean JSON format     without text json on the beggining of the response.     Do not include Terms of Service, Privacy, email links. 
Links (some might be relative links):
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/nvidia/parakeet-tdt-0.6b-v2
/Wan-AI/Wan2.1-VACE-14B
/nari-labs/Dia-1.6B
/multimodalart/isometric-skeumorphic-3d-bnb
/lodestones/Chroma
/models
/spaces/enzostvs/deepsite
/spaces/Lightricks/ltx-video-distilled
/spaces/smolagents/computer-agent
/spaces/ByteDance/DreamO
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces
/datasets/openbmb/Ultra-FineWeb
/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset
/datasets/nvidia/OpenCodeReasoning
/datasets/nvidia/OpenMathReasoning
/datasets/DMindAI/DMind_Benchmark
/datasets
/join
/pricing#endpoints
/pricing#sp

In [9]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
    messages=[
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(website)}
    ],
    options={"format": "json"}
    )
    result = response['message']['content']

    print(result)
    try:
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedz nie jest poprawnym JSON")

In [10]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/Wan-AI/Wan2.1-VACE-14B',
 '/nari-labs/Dia-1.6B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/lodestones/Chroma',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/smolagents/computer-agent',
 '/spaces/ByteDance/DreamO',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',


In [11]:
get_links("https://huggingface.co")

{
  "links": [
    "https://huggingface.co/team",
    "https://huggingface.co/mission",
    "https://huggingface.co/blog",
    "https://status.huggingface.co/",
    "https://discuss.huggingface.co",
    "https://github.com/huggingface",
    "https://twitter.com/huggingface",
    "https://www.linkedin.com/company/huggingface/",
    "https://apply.workable.com/huggingface/"
  ]
}


{'links': ['https://huggingface.co/team',
  'https://huggingface.co/mission',
  'https://huggingface.co/blog',
  'https://status.huggingface.co/',
  'https://discuss.huggingface.co',
  'https://github.com/huggingface',
  'https://twitter.com/huggingface',
  'https://www.linkedin.com/company/huggingface/',
  'https://apply.workable.com/huggingface/']}

In [14]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    
    links = get_links(url)
    if links and "links" in links:
        for link in links["links"]:
            try:
                result += f"\n\nLink:\n{link}\n"
                result += Website(link).get_contents()
            except socket.gaierror as e:
                print(f"DNS resolution failed: {e}")
            except NameResolutionError as e:
                print(f"Name resolution error: {e}")
            except MaxRetryError as e:
                print(f"Max retries exceeded: {e}")
            except ConnectionError as e:
                print(f"Connection error: {e}")
            except MissingSchema as e:
                print(f"Invalid URL schema: {e}")
            except InvalidSchema as e:
                print(f"Omitted unsupported URL (InvalidSchema): {e}")
    else:
        print("No links found or failed to retrieve links.")
    
    return result


print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Research tasks", "url": "https://github.com/huggingface"}
    ]}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://huggingface.co/brand'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Careers page', 'url': 'https://apply.workable.com/huggingface/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Research tasks', 'url': 'https://github.com/huggingface'}"
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage C

In [15]:
print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Company page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "Blog", "url": "https://blog.huggingface.co"},
        {"type": "GitHub", "url": "https://github.com/huggingface"},
        {"type": "Twitter", "url": "https://twitter.com/huggingface"}
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://www.linkedin.com/company/huggingface/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were

In [16]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
include details of company culture, customers and careers/jobs if you have the information."

In [17]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages: \
    use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt

In [18]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Enterprise page", "url": "https://endpoints.huggingface.co"}
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://huggingface.co/brand'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Enterprise page', 'url': 'https://endpoints.huggingface.co'}"


"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages:     use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnvidia/parakeet-tdt-0.6b-v2\nUpdated\n4 days ago\n•\n56.6k\n•\n984\nWan-AI/Wan2.1-VACE-14B\nUpdated\n1 day ago\n•\n8.8k\n•\n201\nnari-labs/Dia-1.6B\nUpdated\n6 days ago\n•\n143k\n•\n2.27k\nmultimodalart/isometric-skeumorphic-3d-bnb\nUpdated\n5 days ago\n•\n550\n•\n196\nlodestones/Chroma\nUpdated\n2 days ago\n•\n620\nBrowse 1M+ models\nSpaces\nRunning\n6.8k\n6.8k\nDeepSite\n🐳\nGenerate any 

In [19]:
def create_brochure(company_name, url):
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response['message']['content']

    display(Markdown(result))

In [20]:
create_brochure("HuggingFace", "https://huggingface.co")

{
    "links": [
        {"type": "Company page", "url": "https://huggingface.co/"},
        {"type": "About page", "url": "https://huggingface.co/about"}
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://huggingface.co/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co/about'}"


# Hugging Face Brochure

[Cover Image: A logo of a neural network, symbolizing the intersection of AI and human connection]

**Welcome to Hugging Face**

The AI Community Building the Future

At Hugging Face, we're on a mission to empower the world's most talented individuals in machine learning (ML) to build innovative applications that shape the future. Our platform provides a collaborative environment for ML enthusiasts, researchers, and professionals to create, discover, and deploy cutting-edge models.

**Our Story**

We believe that AI should be accessible to everyone, not just a select few. That's why we've built an open-source platform that allows anyone to create, share, and use machine learning models. Our community-driven approach has led to the creation of over 1 million ML models, datasets, and applications.

**What We Do**

* **Model Hub**: Browse or contribute to our vast library of pre-trained models for text, image, video, audio, and more.
* **Dataset Hub**: Access and share high-quality datasets for various ML tasks.
* **Spaces**: Collaborate with others on unlimited public models, datasets, and applications.
* **Compute**: Deploy models on optimized inference endpoints or update your Spaces applications to a GPU in a few clicks.
* **Enterprise**: Unlock advanced platform features with enterprise-grade security, access controls, and dedicated support.

**Our Community**

* **Over 50,000 Organizations**: Trust Hugging Face as part of their ML strategy.
* **Leading Companies**: Join the ranks of AI2, Meta, Amazon, Google, Intel, Microsoft, Grammarly, Writer, and more.
* **Influential Researchers**: Discover cutting-edge research in our open-source libraries, including Transformers, Diffusers, Safetensors, and more.

**Join Our Community**

Create your account today and start building your ML portfolio. Share your work with the world and contribute to the growth of our vibrant community.

[Call-to-Action Button: Sign Up]

**Stay Connected**

Follow us on social media:

* GitHub
* Twitter
* LinkedIn
* Discord

Visit our blog for the latest news, updates, and insights on AI and ML.

In [21]:
def stream_brochure(company_name, url):
    stream=ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream = True
    )
    result = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or ''
        result = result.replace("```", "").replace("markdown", "")
        update_display(Markdown(result), display_id=display_handle.display_id)

In [22]:
stream_brochure("HuggingFace", "https://huggingface.co")

{
  "links": [
    {"type": "About page", "url": "https://huggingface.co"},
    {"type": "Company page", "url": "https://huggingface.co/brand"},
    {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
    {"type": "Blog page", "url": "https://blog.huggingface.co"},
    {"type": "GitHub page", "url": "https://github.com/huggingface"},
    {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
    {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"}
  ]
Odpowiedz nie jest poprawnym JSON
No links found or failed to retrieve links.


**Hugging Face Brochure**
==========================

**Introduction**
---------------

Welcome to Hugging Face, the leading platform for building and collaborating on AI models. Our mission is to empower the machine learning community to create innovative solutions that shape the future.

**About Us**
-------------

At Hugging Face, we are dedicated to providing a collaboration platform where researchers, developers, and businesses can come together to build, share, and apply AI models. With our open-source stack, you can explore all modalities of AI, from text to image, video, audio, and even 3D.

**Our Community**
----------------

We have over 50,000 organizations using Hugging Face, including leading companies like Meta, Amazon, Google, Intel, Microsoft, and Grammarly. Our community is passionate about advancing the field of AI, and we are committed to providing the tools and resources needed to make it happen.

**What We Offer**
-----------------

* **Unlimited Public Models**: Host and collaborate on unlimited public models, datasets, and applications.
* **Compute**: Deploy on optimized inference endpoints or update your Spaces applications to a GPU in a few clicks.
* **Enterprise Solutions**: Provide paid compute and enterprise solutions with enterprise-grade security, access controls, and dedicated support.

**Popular Models**
------------------

Browse over 1 million+ models, including:

* `nvidia/parakeet-tdt-0.6b-v2`
* `Wan-AI/Wan2.1-VACE-14B`
* `multimodalart/isometric-skeumorphic-3d-bnb`

**Our Open Source**
------------------

We are building the foundation of ML tooling with the community. Explore our open-source projects, including:

* **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, and JAX
* **Diffusers**: State-of-the-art Diffusion models in PyTorch
* **Tokenizers**: Fast tokenizers optimized for research & production

**Join Our Community**
---------------------

Sign up to accelerate your ML journey with Hugging Face. Join our community of innovators, researchers, and businesses who are shaping the future of AI.

**Get Started**
---------------

Visit our website: [https://huggingface.co/](https://huggingface.co/)

Follow us on social media:

* Twitter: [@HuggingFace](https://twitter.com/HuggingFace)
* LinkedIn: [https://www.linkedin.com/company/hugging-face](https://www.linkedin.com/company/hugging-face)
* GitHub: [https://github.com/huggingface](https://github.com/huggingface)