In [96]:
import json
import os
from urllib.parse import urljoin, urlparse

import ollama
import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display


In [97]:
# Ollama model tag for this notebook.
MODEL = "llama3.2"

In [98]:
# A class to represent a Webpage

# Some websites only respond properly when the request carries browser-like
# headers, so every fetch sends this desktop Chrome User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes:
        url: The address that was fetched.
        body: Raw bytes of the HTTP response.
        title: Text of the <title> tag, or "No title found".
        text: Text of <body> with script/style/img/input elements removed,
            fragments separated by newlines ("" when the page has no <body>).
        links: Non-empty href values of every <a> tag, in document order
            (may contain relative URLs and duplicates).
    """

    def __init__(self, url, timeout=30):
        """
        Download `url` and extract its title, text and links.

        Args:
            url: Page to fetch.
            timeout: Seconds to wait on the server before giving up.

        Raises:
            requests.exceptions.RequestException: on connection errors or timeout.
        """
        self.url = url
        # Without a timeout, requests waits indefinitely on an unresponsive host.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # .string is None for an empty <title> or one with nested tags, so fall
        # back to the placeholder in that case as well as when <title> is absent.
        title_text = soup.title.string if soup.title else None
        self.title = title_text or "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Keep only anchors that actually carry a non-empty href.
        self.links = [href for href in (a.get('href') for a in soup.find_all('a')) if href]

    def get_contents(self):
        """Return the title and text formatted as a plain-text section for a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [99]:
# Scrape the page (live HTTP request) and display the hrefs collected from its <a> tags.
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [100]:
# System prompt for the link-selection call. The embedded example must itself
# be valid JSON, otherwise the model is being shown a malformed target format.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [101]:
# print() rather than a bare expression so the embedded newlines render as line breaks.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [102]:
def get_links_user_prompt(website):
    """
    Build the user prompt that lists a page's links for the model to filter.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str).

    Returns:
        str: Instructions followed by the links, one per line.
    """
    pieces = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company in very detail, "
        "respond with the full https URL in JSON format. ",
        "Do not include Terms of Service, Privacy, email links, social media information.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(pieces)

In [103]:
# Preview the prompt built from the links of the page scraped into `ed`.
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company in very detail, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links, social media information.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resour

In [104]:


def get_links(url):
    """
    Ask the model which links found on `url` belong in a company brochure.

    The page is scraped, its links are listed in the user prompt, and the
    model is asked to answer with a JSON object.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        dict: {"links": [{"type": ..., "url": ...}, ...]}. The list is empty
        when the reply is not valid JSON or has no usable "links" list.

    Raises:
        requests.exceptions.RequestException: if the page cannot be fetched.
    """
    website = Website(url)

    # Each literal ends with a space or newline so adjacent pieces do not fuse.
    user_prompt = (
        f"Extract all important links from the following website: {website.url}. "
        "Please decide which of these are relevant web links for a brochure about the company in very detail. "
        "I need including links to the following pages but not limited to if they exist: "
        "Team Page, Pricing Page, Careers Page, Blog Page, News Page. "
        "Make sure all other important links are included also. "
        "Only use links from the list below, written as full https URLs, and leave out pages that are not in the list. "
        "Return the result strictly as a JSON object in this format: "
        '{"links": [{"type": "string", "url": "string"}]}. '
        "Ensure no extra text, explanations, or disclaimers in the response.\n"
        "Links (some might be relative links):\n"
    )
    # The model can only choose real URLs if it is shown them; dict.fromkeys
    # drops repeated hrefs while keeping document order.
    user_prompt += "\n".join(dict.fromkeys(website.links))

    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        format="json",  # ask Ollama to constrain the reply to JSON
    )

    result = response['message']['content'].strip()

    try:
        parsed = json.loads(result)
    except json.JSONDecodeError:
        print("❌ Invalid JSON received. Raw content:", result)
        return {"links": []}  # Return empty list to avoid crashes

    # Valid JSON can still have the wrong shape (a bare list, a missing key, ...).
    entries = parsed.get("links") if isinstance(parsed, dict) else None
    if not isinstance(entries, list):
        print("❌ JSON reply has no 'links' list. Raw content:", result)
        return {"links": []}

    parsed["links"] = [
        entry for entry in entries
        if isinstance(entry, dict) and isinstance(entry.get("url"), str) and entry["url"].strip()
    ]
    return parsed




In [105]:
# Try the link selection on a live site (performs an HTTP fetch and an ollama.chat call).
get_links("https://huggingface.co")

{'links': [{'type': 'About Us', 'url': 'https://huggingface.co/team'},
  {'type': 'Careers', 'url': 'https://huggingface.co/careers'},
  {'type': 'Blog', 'url': 'https://blog.huggingface.co/'},
  {'type': 'News', 'url': 'https://huggingface.co/newsletter'},
  {'type': 'Pricing', 'url': 'Not Found'},
  {'type': 'Support', 'url': 'https://huggingface.co/support'},
  {'type': 'Terms of Service',
   'url': 'https://huggingface.co/terms-of-service'}]}

In [108]:
import requests

def get_all_details(url):
    """
    Collect the text of the landing page and of every page selected by get_links.

    Fetch failures are recorded inline as "[ERROR] ..." lines instead of
    raising, so one bad page does not abort the whole collection.

    Args:
        url: Landing page of the company website.

    Returns:
        str: "Landing page:" section followed by one section per linked page.
    """
    result = "Landing page:\n"
    try:
        result += Website(url).get_contents()
    except requests.exceptions.RequestException as e:
        result += f"\n[ERROR] Failed to fetch {url}: {e}\n"

    # get_links scrapes the landing page itself, so it can fail the same way.
    try:
        links = get_links(url)
    except requests.exceptions.RequestException as e:
        result += f"\n[ERROR] Failed to collect links for {url}: {e}\n"
        return result
    print("Found links:", links)

    entries = links.get("links") if isinstance(links, dict) else None
    # The landing page is already included; trailing slashes are ignored when
    # comparing so ".../" and "..." count as the same page.
    seen = {url.rstrip("/")}
    for link in entries or []:
        if not isinstance(link, dict):
            continue
        raw_url = str(link.get("url") or "").strip()
        # Skip placeholders such as "Not Found" that the model may emit.
        if not raw_url or any(ch.isspace() for ch in raw_url):
            continue
        target = urljoin(url, raw_url)  # resolves relative links against the site
        if urlparse(target).scheme not in ("http", "https"):
            continue  # e.g. mailto: or javascript: links
        key = target.rstrip("/")
        if key in seen:
            continue
        seen.add(key)

        result += f"\n\n{link.get('type', 'Page')}\n"
        try:
            result += Website(target).get_contents()
        except requests.exceptions.RequestException as e:
            result += f"\n[ERROR] Failed to fetch {target}: {e}\n"

    return result


In [109]:
# Show the combined landing-page and linked-page text that the brochure prompt is built from.
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'Home Page', 'url': 'https://huggingface.co'}, {'type': 'Blog Page', 'url': 'https://huggingface.co/blog/'}, {'type': 'About Us', 'url': 'https://huggingface.co/team/'}, {'type': 'News Page', 'url': 'https://huggingface.co/news/'}, {'type': 'Pricing Page', 'url': 'https://huggingface.co/pricing'}, {'type': 'Careers Page', 'url': 'https://huggingface.co/careers/'}, {'type': 'Company Page', 'url': 'https://huggingface.co/about'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
Qwen/Qwen2.5-Omni-7B
Updated
2 days ago
•
53k
•
1.02k
deepseek-ai/DeepSeek-V3-0324
Updated
6 days ago
•
86.6k
•
2.19k
manycore-research/SpatialLM-L

In [110]:
# System prompt for the brochure-writing call. Each piece ends with a space so
# the sentences do not run together when the literals are concatenated.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = (
#     "You are an assistant that analyzes the contents of several relevant pages from a company website "
#     "and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. "
#     "Respond in markdown. "
#     "Include details of company culture, customers and careers/jobs if you have the information."
# )


In [111]:
def get_brochure_user_prompt(company_name, url):
    """
    Build the user prompt for the brochure call.

    Args:
        company_name: Name of the company, inserted into the prompt verbatim.
        url: Landing page passed on to get_all_details.

    Returns:
        str: Introduction plus the scraped page contents, cut to at most
        5,000 characters.
    """
    intro = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    full_prompt = intro + get_all_details(url)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

In [112]:
# Inspect the (truncated) user prompt that will be sent for the brochure.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'About Page', 'url': 'https://huggingface.co/team'}, {'type': 'Blog Page', 'url': 'https://huggingface.co/blog'}, {'type': 'Careers Page', 'url': 'https://huggingface.co/careers'}, {'type': 'News Page', 'url': 'https://huggingface.co/newsletter'}, {'type': 'Pricing Page', 'url': 'Not Found'}, {'type': 'FAQ Page', 'url': 'https://huggingface.co/faq'}, {'type': 'Support Page', 'url': 'https://huggingface.co/support'}, {'type': 'Contact Page', 'url': 'https://huggingface.co/contact'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nQwen/Qwen2.5-Omni-7B\nUpdated\n2 days ago\n•\n53k\n•\n1.02k\ndeepseek-ai/DeepSeek-V3-0324\nUpdated\n6 days ago\n•\n86.6k\n•\n2.19k\nmanycore-research/SpatialLM-Llama-1B\nUpdated\n12 days ago\n•\n12.6k\n•\n853\nds4sd/SmolDocling-256M-preview\nUpdated\n10 days ago\n•\n57.8k\n•\n1.1k\nByteDance/InfiniteYou\nUpdated\n1 minute ago\n•\n516\nBrowse 1M+ models\nSpaces\nRunning\n1.28k\n1.28k\nDeepSite\

In [125]:
from IPython.display import display, Markdown
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as Markdown in the notebook.

    Args:
        company_name: Name used in the prompt.
        url: Landing page of the company website.

    Returns:
        None. The brochure is shown via IPython's display().
    """
    response = ollama.chat(
        model=MODEL,  # shared notebook-level model name instead of a repeated literal
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response["message"]["content"]
    display(Markdown(result))

In [126]:
# Generate and render a brochure in one shot (the output appears only once the full reply is back).
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'team page', 'url': 'https://huggingface.co/team'}, {'type': 'careers page', 'url': 'https://huggingface.co/careers'}, {'type': 'blog page', 'url': 'https://blog.huggingface.co'}, {'type': 'news page', 'url': 'https://huggingface.co/news'}, {'type': 'about page', 'url': 'https://huggingface.co/about'}, {'type': 'company page', 'url': 'https://huggingface.co/'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}]}


# Hugging Face: Revolutionizing AI Collaboration and Innovation

## About Us

Hugging Face is a pioneering platform that empowers the machine learning community to collaborate, discover, and build upon cutting-edge models, datasets, and applications. Our mission is to accelerate innovation in AI and make it accessible to everyone.

## Company Culture

At Hugging Face, we value collaboration, creativity, and inclusivity. We believe that AI should be a force for good and should be used to drive positive change. Our community-driven approach allows developers from around the world to share knowledge, resources, and ideas, accelerating progress in machine learning.

## Customers

We have a diverse range of customers across various industries, including:

* **Meta**: Utilizing our platform for their AI initiatives
* **Amazon**: Leveraging our models and datasets for their business needs
* **Google**: Collaborating with us on cutting-edge AI research projects
* **Intel**: Working together to develop innovative AI solutions
* **Microsoft**: Using our platform for their AI development efforts

## Careers and Opportunities

Join our team of passionate developers, researchers, and innovators who are shaping the future of machine learning. We offer a range of roles across various disciplines, including:

* **Research and Development**: Help us advance the state-of-the-art in machine learning
* **Engineering**: Contribute to the development of our platform and tools
* **Sales and Business Development**: Join our team and help us bring Hugging Face to customers worldwide

## Our Open Source Stack

We're committed to building an open source foundation for machine learning tooling. Explore our various projects, including:

* **Transformers**: State-of-the-art ML models for PyTorch, TensorFlow, and JAX
* **Diffusers**: Cutting-edge diffusion models in PyTorch
* **Safetensors**: A safe way to store and distribute neural network weights

## Features and Benefits

Our platform offers a range of features and benefits, including:

* **Unlimited public models, datasets, and applications**: Collaborate with others and build upon existing work
* **GPU acceleration**: Deploy models on optimized inference endpoints or update Spaces applications in a few clicks
* **Enterprise solutions**: Get access to our paid Compute and Enterprise offerings for secure and scalable AI development

## Pricing

Our pricing plans cater to various needs, including:

* **Compute**: Starting at $0.60/hour for GPU
* **Enterprise**: Customized solutions for large-scale AI initiatives

## Stay Connected

Follow us on social media to stay up-to-date on the latest news, updates, and industry insights.

* **Twitter**: @HuggingFace
* **LinkedIn**: Hugging Face
* **Discord**: Join our community server

## Learn More

Explore our documentation, blog, and resources to learn more about Hugging Face and our mission to revolutionize AI collaboration and innovation.

In [132]:
def _strip_markdown_fence(text):
    """Remove a ```/```markdown fence wrapping the whole text; leave everything else untouched."""
    first_line, newline, rest = text.lstrip().partition("\n")
    # Only treat the text as fenced once the complete opening line has arrived.
    if not newline or first_line.strip().lower() not in ("```", "```markdown", "```md"):
        return text
    body = rest.rstrip()
    if body.endswith("```"):
        body = body[:-3]
    return body


def stream_brochure(company_name, url):
    """
    Generate a brochure and render it incrementally as the model streams its reply.

    Args:
        company_name: Name used in the prompt.
        url: Landing page of the company website.

    Returns:
        None. A single display area is updated in place as chunks arrive.
    """
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True,
    )

    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        if "message" in chunk and "content" in chunk["message"]:
            response += chunk["message"]["content"] or ''
            # Strip only a wrapping fence: blanket replace() calls would also
            # delete the word "markdown" and code fences inside the brochure.
            display_handle.update(Markdown(_strip_markdown_fence(response)))

In [133]:
# Same brochure generation, but rendered progressively as chunks stream in.
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'home page', 'url': 'https://huggingface.co/'}, {'type': 'team page', 'url': 'https://huggingface.co/team'}, {'type': 'careers page', 'url': 'https://jobs.smith.ai/'}, {'type': 'pricing page', 'url': 'https://support.huggingface.co/hc/en-us/articles/360020315310-Pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'news page', 'url': 'https://huggingface.co/news'}, {'type': 'about page', 'url': 'https://huggingface.co/about'}]}


# Hugging Face: Building the Future of AI Together

## About Us

Hugging Face is a platform dedicated to building the future of artificial intelligence (AI) by fostering collaboration among machine learning practitioners. Our mission is to provide a comprehensive suite of tools and resources that enable researchers, developers, and businesses to create, discover, and deploy AI applications.

## Our Community

Our community is built on the principles of open-source collaboration, innovation, and diversity. With over 50,000 organizations using our platform, we have created a vibrant ecosystem where individuals from around the world come together to share knowledge, resources, and ideas.

## Models and Datasets

We offer a vast repository of pre-trained models (1 million+), datasets (250k+), and applications, which can be easily browsed, shared, and adapted. Our models are designed to support various AI modalities, including text, image, video, audio, and 3D.

## Spaces

Our Spaces feature allows users to host, collaborate on, and deploy unlimited public models, datasets, and applications. This platform enables users to move faster, build their portfolio, and share their work with the world.

## Enterprise Solutions

We provide enterprise-grade solutions for businesses seeking advanced AI capabilities, including paid Compute, Enterprise features, and dedicated support. Our solutions are designed to meet the needs of large-scale organizations and ensure the security, access controls, and compliance required in these environments.

### Features

*   **GPU-based Inference Endpoints**: Deploy models on optimized inference endpoints or update Spaces applications to GPU in a few clicks.
*   **Enterprise-grade Security**: Access controls, dedicated support, and audit logs for maximum security and compliance.
*   **Single Sign-On**: Easy integration with popular single sign-on solutions.

### Pricing

*   **Compute**: Starting at $0.60/hour for GPU-based inference endpoints.
*   **Enterprise**: Starting at $20/user/month for premium features and support.

## Our Open Source Initiatives

We are committed to building the foundation of machine learning (ML) tooling with our community. Some of our notable open-source initiatives include:

*   **Transformers**: State-of-the-art ML for PyTorch, TensorFlow, JAX
*   **Diffusers**: State-of-the-art Diffusion models in PyTorch
*   **Safetensors**: Safe way to store/distribute neural network weights

## Join the Hugging Face Community

Ready to be part of our AI-powered community? Explore our platform, contribute to our open-source initiatives, and connect with like-minded individuals who share your passion for machine learning.

### Resources

*   [Documentation](https://huggingface.co/docs)
*   [Blog](https://huggingface.co/blog)
*   [GitHub](https://github.com/huggingface/transformers)
*   [Twitter](https://twitter.com/huggingface)
*   [LinkedIn](https://www.linkedin.com/company/hugging-face)

### Get Started

Sign up for our platform today and start exploring the vast world of AI together!

[Sign Up Now](https://huggingface.co/login)