In [3]:
import os 
import re 
import requests 
import socket 
from requests.exceptions import ConnectionError 
from requests.exceptions import MissingSchema 
from requests.exceptions import InvalidSchema 
from urllib3.exceptions import MaxRetryError, NameResolutionError 
import json 
from typing import List 
from dotenv import load_dotenv 
from bs4 import BeautifulSoup 
from IPython.display import Markdown, display, update_display 
import ollama

In [4]:
load_dotenv(override=True)
MODEL = 'llama3.2'

In [7]:
headers = { 
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36" 
}

class Website: 
    def __init__(self, url): 
        self.url = url 
        response = requests.get(url, headers=headers) 
        self.body = response.content 
        soup = BeautifulSoup(self.body, 'html.parser') 
        self.title = soup.title.string if soup.title else "No title found" 
        if soup.body: 
            for irrelevant in soup.body (["script", "style", "img", "input"]): 
                irrelevant.decompose() 
            self.text = soup.body.get_text(separator="\n", strip=True) 
        else: 
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')] 
        self.links = [link for link in links if link] 
    def get_contents(self): 
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [8]:
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/Lightricks/LTX-Video',
 '/nari-labs/Dia-1.6B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/smolagents/computer-agent',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/DreamO',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',


In [9]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages.\n" 
link_system_prompt += "You should respond only in JSON, without text, object as in this example:" 
link_system_prompt += """
{ 
    "links": [ 
        {"type": "about page", "url": "https://full.url/goes/here/about"}, 
        {"type": "careers page": "url": "https://another.full.url/careers"} 
    ]
}
"""

In [10]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond only in JSON, without text, object as in this example:
{ 
    "links": [ 
        {"type": "about page", "url": "https://full.url/goes/here/about"}, 
        {"type": "careers page": "url": "https://another.full.url/careers"} 
    ]
}



In [11]:
def get_links_user_prompt(website): 
    user_prompt = f"Here is the list of links on the website of {website.url} - " 
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URL in clean JSON format \
    wihout text json on the beginning of the response. \
    Do not include Terms of Service, Privacy, email links.\n" 
    user_prompt += "Links (some might be relative links):\n" 
    user_prompt += "\n".join(website.links) 
    return user_prompt

In [12]:
print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for     a brochure about the company, respond with the full https URL in clean JSON format     wihout text json on the beginning of the response.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/nvidia/parakeet-tdt-0.6b-v2
/ACE-Step/ACE-Step-v1-3.5B
/Lightricks/LTX-Video
/nari-labs/Dia-1.6B
/lodestones/Chroma
/models
/spaces/smolagents/computer-agent
/spaces/enzostvs/deepsite
/spaces/ByteDance/DreamO
/spaces/ACE-Step/ACE-Step
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces
/datasets/DMindAI/DMind_Benchmark
/datasets/nvidia/OpenCodeReasoning
/datasets/nvidia/OpenMathReasoning
/datasets/nvidia/Nemotron-CrossThink
/datasets/openbmb/Ultra-FineWeb
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enterp

In [15]:
def get_links(url): 
    website = Website(url) 
    response = ollama.chat( 
        model=MODEL, 
        messages=[
            {"role": "system", "content": link_system_prompt}, 
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        options={"format": "json"} 
    ) 
    result = response['message'] ['content'] 
    #odkomentuj w razie potrzeby: 
    #result re.sub(r'<think>.*?</think>', '', result, flags re.DOTALL) 
    #result re.sub(r"^```json|```$", "", result.strip(), flags=re.MULTILINE).strip() 
    #result = result.strip() 
    print(result) 
    try: 
        content_json = json.loads(result) 
        return content_json 
    except json.JSONDecodeError: 
        print("Odpowiedź nie jest poprawnym JSON")

In [16]:
huggingface = Website("https://huggingface.co") 
huggingface.links 

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/Lightricks/LTX-Video',
 '/nari-labs/Dia-1.6B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/smolagents/computer-agent',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/DreamO',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',


In [17]:
get_links("https://huggingface.co")

{
    "links": [
        {"type": "company page", "url": "https://huggingface.co/"},
        {"type": "brand page", "url": "https://huggingface.co/brand"},
        {"type": "about page", "url": "https://huggingface.co/"}, 
        {"type": "facebook", "url": "https://www.facebook.com/huggingface/"},
        {"type": "twitter", "url": "https://twitter.com/huggingface"},
        {"type": "linkedin", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "github", "url": "https://github.com/huggingface"},
        {"type": "blog", "url": "https://huggingface.co/blog"}
    ]
}


{'links': [{'type': 'company page', 'url': 'https://huggingface.co/'},
  {'type': 'brand page', 'url': 'https://huggingface.co/brand'},
  {'type': 'about page', 'url': 'https://huggingface.co/'},
  {'type': 'facebook', 'url': 'https://www.facebook.com/huggingface/'},
  {'type': 'twitter', 'url': 'https://twitter.com/huggingface'},
  {'type': 'linkedin', 'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'github', 'url': 'https://github.com/huggingface'},
  {'type': 'blog', 'url': 'https://huggingface.co/blog'}]}

In [21]:
def get_all_details(url): 
    result = "Landing page: \n" 
    result += Website(url).get_contents() 
    links = get_links(url) 
    for link in links["links"]: 
        try: 
            result += f"\n\n {link['type']}\n" 
            result += Website (link["url"]).get_contents() 
        except socket.gaierror as e: 
            print(f"DNS resolution failed: (e)") 
        except NameResolutionError as e: 
            print(f"Name resolution error: (e)") 
        except MaxRetryError as e: 
            print (f"Max retries exceeded: (e)") 
        except ConnectionError as e: 
            print(f"Connection error: (e)") 
        except MissingSchema as e: 
            print(f"Invalid URL schema: (e)") 
        except InvalidSchema as e: 
            print(f"Omitted unsupported URL (InvalidSchema): (e)") 
    return result

In [22]:
print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Blog", "url": "https://blog.huggingface.co"},
        {"type": "Discussions forum", "url": "https://discuss.huggingface.co"},
        {"type": "GitHub repository", "url": "https://github.com/huggingface"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn company page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "Jobs/Jobs page", "url": "https://apply.workable.com/huggingface/"}
    ]
}
Connection error: (e)
Landing page: 
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore 

In [24]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

In [25]:
def get_brochure_user_prompt(company_name, url): 
    user_prompt = f"You are looking at a company called: [company_name]\n" 
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
    use this information to build a short brochure of the company in markdown.\n" 
    user_prompt += get_all_details(url) 
    user_prompt = user_prompt[:5_000]
    return user_prompt

In [26]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

{ 
    "links": [ 
        {"type": "Company page", "url": "https://huggingface.co/"},
        {"type": "About page", "url": "https://brand.huggingface.co/"},
        {"type": "Blog page", "url": "https://blog.huggingface.co/"},
        {"type": "Discussions", "url": "https://discuss.huggingface.co/"},
        {"type": "GitHub", "url": "https://github.com/huggingface"},
        {"type": "Twitter", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "All Teams", "url": "https://huggingface.co/all-teams"}
    ]
}
Connection error: (e)
Connection error: (e)


'You are looking at a company called: [company_name]\nHere are the contents of its landing page and other relevant pages;     use this information to build a short brochure of the company in markdown.\nLanding page: \nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnvidia/parakeet-tdt-0.6b-v2\nUpdated\n13 days ago\n•\n109k\n•\n794\nACE-Step/ACE-Step-v1-3.5B\nUpdated\nabout 23 hours ago\n•\n396\nLightricks/LTX-Video\nUpdated\n8 days ago\n•\n250k\n•\n1.45k\nnari-labs/Dia-1.6B\nUpdated\nabout 9 hours ago\n•\n159k\n•\n2.11k\nlodestones/Chroma\nUpdated\nabout 5 hours ago\n•\n477\nBrowse 1M+ models\nSpaces\nRunning\non\nCPU Upgrade\n541\n541\nComputer 

In [28]:
def create_brochure (company_name, url): 
    response = ollama.chat( 
        model=MODEL, 
        messages=[ 
        {"role": "system", "content": system_prompt}, 
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
         ], 
    ) 
    result = response['message']['content'] 
    #w razie czego usuwamy tekst wnioskowania z modeli typu reasoning 
    #odkomentuj w razie potrzeby: 
    #result = re.sub(r'<think>.*?</think>', '', result, flags re. DOTALL) 
    #result = result.strip() 
    display (Markdown(result))

In [29]:
create_brochure("Hugging Face", "https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "Blog page", "url": "https://discuss.huggingface.co/"},
        {"type": "Status page", "url": "https://status.huggingface.co/"},
        {"type": "Docs/Transformers page", "url": "docs/transformers"}
    ]
}
Invalid URL schema: (e)


# Hugging Face Brochure

[Cover Image: A group of people from diverse backgrounds collaborating on a laptop, with a cityscape in the background]

## About Us

Hugging Face is the AI community building the future. We are a platform where machine learning enthusiasts collaborate on models, datasets, and applications. Our mission is to accelerate your ML journey by providing a collaborative space for developers, researchers, and businesses.

## Our Values

* **Community-driven**: We believe in the power of collaboration and open-source development.
* **Innovation**: We strive to push the boundaries of AI research and innovation.
* **Accessibility**: We make high-quality AI models and datasets accessible to everyone.

## Models and Datasets

We offer over 1 million pre-trained models and datasets for various AI applications, including:

* Natural Language Processing (NLP)
* Computer Vision
* Speech Recognition
* Reinforcement Learning

Some of our popular models include:

* NVIDIA/Parakeet-tdt-0.6b-v2
* ACE-Step/ACE-Step-v1-3.5B
* Lightricks/LTX-Video

## Spaces

Our platform allows users to host and collaborate on unlimited public models, datasets, and applications. With our AI-powered computer agent, you can run tasks using a powerful computer infrastructure.

## Careers and Community

Join our community of over 50,000 organizations and become part of the future of AI research and innovation. Check out our job openings and career development resources.

* **Jobs**: [link to jobs page]
* **Career Development**: [link to career development page]

## Partnerships and Collaborations

We partner with leading companies like:

* Meta
* Amazon
* Google
* Intel
* Microsoft
* Grammarly

These partnerships enable us to provide high-quality AI models and datasets to a broader audience.

## Get Started

Sign up for our platform today and start exploring the world of AI!

[Call-to-Action Button: Sign Up Now]

Note: This brochure is a summary of the information available on the Hugging Face website. It may not be comprehensive or up-to-date, but it provides an overview of the company's mission, values, and offerings.

In [31]:
def stream_brochure(company_name, url): 
    stream = ollama.chat( 
        model=MODEL, 
        messages=[ 
            {"role": "system", "content": system_prompt}, 
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ], 
        stream=True 
    ) 
    result = ""
    display_handle = display(Markdown(""), display_id=True) 
    for chunk in stream: 
        result += chunk['message']['content'] or ''
        #usuwamy tekst wnioskowania z modeli typu reasoning 
        #odkomentuj w razie potrzeby: 
        #result = re.sub(r'<think>.*?</think>', '', result, flags re.DOTALL) 
        #result = result.strip() 
        result = result.replace("```","").replace("markdown", "") 
        update_display (Markdown(result), display_id=display_handle.display_id) 

In [None]:
stream_brochure("Hugging Face", "https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://brand.huggingface.co/"},
        {"type": "Blog", "url": "https://blog.huggingface.co/"},
        {"type": "GitHub repository", "url": "https://github.com/huggingface"},
        {"type": "Twitter handle", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"},
        {"type": "Discord server", "url": "https://join.discord.huggingface.co/"},
        {"type": "Docs", "url": "https://docs.huggingface.co/"},
        {"type": "Learn page", "url": "https://learn.huggingface.co/"},
        {"type": "FAQs", "url": "https://support.huggingface.co/"}
    ]
}
Connection error: (e)
Connection error: (e)
Connection error: (e)
Connection error: (e)
Connection error: (e)
Connection error: (e)


