In [30]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama


In [31]:
load_dotenv(override=True)
MODEL = 'llama3.2'

In [32]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self,url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [33]:
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/Wan-AI/Wan2.1-VACE-14B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/nari-labs/Dia-1.6B',
 '/stabilityai/stable-audio-open-small',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces/ByteDance/DreamO',
 '/spaces/smolagents/computer-agent',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/disco-eth/EuroSpeech',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Write

In [34]:
link_system_prompt = "You are provided with a list of links found on a webpage .\
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages. \n"
link_system_prompt += "You should respond only in JSON, without text, object as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [35]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URL in clean JSON format \
    without text json on the beginning of the response. \
    Do not include Terms of Service, Privacy, email links. \n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [36]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        options={"format": "json"}
    )
    result = response['message']['content']

    print(result)
    try: 
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym json")

In [37]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/Wan-AI/Wan2.1-VACE-14B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/nari-labs/Dia-1.6B',
 '/stabilityai/stable-audio-open-small',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces/ByteDance/DreamO',
 '/spaces/smolagents/computer-agent',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/disco-eth/EuroSpeech',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Write

In [38]:
get_links("https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co"},
        {"type": "Company page", "url": "https://huggingface.co/team"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Blog", "url": "https://blog.huggingface.co"},
        {"type": "Documentation", "url": "https://huggingface.co/docs"},
        {"type": "GitHub repository", "url": "https://github.com/huggingface"},
        {"type": "Twitter profile", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn company page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
Odpowiedź nie jest poprawnym json


In [56]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()

    links = get_links(url)
    if links and "links" in links:
        for link in links["links"]:
            try:
                result += f"\n\n{link['type']}\n"
                result += Website(link['url']).get_contents()
            except socket.gaierror as e:
                print(f"DNS resolution failed: {e}")
            except NameResolutionError as e:
                print(f"Name resolution error: {e}")
            except MaxRetryError as e:
                print(f"Max retries exceeded: {e}")
            except ConnectionError as e:
                print(f"Connection error: {e}")
            except MissingSchema as e:
                print(f"Invalid URL schema: {e}")
            except InvalidSchema as e:
                print(f"Omitted unsupported URL (InvalidSchema): {e}")
    else:
        print("No valid links returned by get_links()")

    return result


In [57]:
print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "About page", "url": "https://discuss.huggingface.co"},
        {"type": "Status page", "url": "https://status.huggingface.co/"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "Twitter handle", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn company profile", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Brow

In [58]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [59]:
def get_brochure_user_prompt(company_name,url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
    use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt

In [60]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

{
    "links": [
        "https://huggingface.co/docs",
        "https://huggingface.co/models",
        "https://huggingface.co/spaces",
        "https://huggingface.co/pricing",
        "https://huggingface.co/enterprise",
        "https://huggingface.co/allenai",
        "https://huggingface.co/facebook",
        "https://huggingface.co/amazon",
        "https://huggingface.co/google",
        "https://huggingface.co/Intel",
        "https://huggingface.co/microsoft",
        "https://huggingface.co/grammarly",
        "https://huggingface.co/Writer"
    ]
}


TypeError: string indices must be integers, not 'str'

In [None]:
def create_brochure(company_name, url):
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response['message']['content']
    display(Markdown(result))

In [None]:
create_brochure("HuggingFace", "https://huggingface.co")

In [None]:
def stream_brochure(company_name, url):
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )
    result = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or ''
        result = result.replace("```","").replace("markdown","")
        update_display(Markdown(result),display_id=display_handle.display_id)

In [None]:
stream_brochure("HuggingFace", "https://huggingface.co")