In [12]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

In [13]:
# Read settings from a local .env file; override=True lets .env values replace
# variables that are already set in the environment.
load_dotenv(override=True)
# Model name handed to ollama.chat() by the functions defined later in this notebook.
MODEL = 'llama3.2'

In [14]:
# Browser-like request headers sent with every page fetch made by Website.
# The value is built from adjacent string literals: a backslash continuation
# inside a single literal would copy the next line's indentation into the
# User-Agent string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
    )
}
class Website:
    """Scraped snapshot of a single web page.

    Attributes set by the constructor:
        url:   the address that was requested.
        body:  raw bytes of the HTTP response.
        title: text of the page's title tag, or "No title found".
        text:  visible text of the page body, one fragment per line
               ("" when the document has no body).
        links: every non-empty href value found on the page's anchor tags.
    """

    def __init__(self, url):
        """Download ``url`` with the module-level ``headers`` and parse it.

        Exceptions raised by ``requests.get`` propagate to the caller.
        """
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        # Tag.string is None when the title tag is empty or wraps other tags;
        # fall back to the placeholder instead of storing None.
        title = soup.title.string if soup.title else None
        self.title = title or "No title found"
        if soup.body:
            # Strip elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        # Keep only anchors that actually carry a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and visible text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [15]:
# Fetch the Hugging Face home page (requires network access) and display the
# links the scraper collected from it.
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/Lightricks/LTX-Video',
 '/nari-labs/Dia-1.6B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/smolagents/computer-agent',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/DreamO',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',


In [16]:
# System prompt for the link-selection request: the model's role, the
# JSON-only output rule, and a sample of the expected response shape.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in "
    "a brochure about the company, such as links to an About page, or a Company page, "
    "or Careers/Jobs pages. "
    "You should respond only in JSON, without text, object as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [17]:
# Print the assembled system prompt to check its wording and the embedded JSON example.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages. You should respond only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [18]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of str),
            for example a ``Website`` instance.

    Returns:
        str: the instruction text followed by the links, one per line.
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    # Adjacent literals instead of backslash continuations inside one literal:
    # the continuation lines were indented, which leaked runs of spaces into
    # the middle of the prompt text.
    user_prompt += (
        "please decide which of these are relevant web links for "
        "a brochure about the company, respond with the full https URL in clean JSON format "
        "without text json on the beginning of the response. "
        "Do not include Terms of Service, Privacy, email links. \n"
    )
    user_prompt += "Links (some might be relative links): \n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [19]:
# Print the user prompt built from the page scraped earlier to inspect what the model will receive.
print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co - please decide which of these are relevant web links for     a brochure about the company, respond with the full https URL in clean JSON format     without text json on the beginning of the response.     Do not include Terms of Service, Privacy, email links. 
Links (some might be relative links): 
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/nvidia/parakeet-tdt-0.6b-v2
/ACE-Step/ACE-Step-v1-3.5B
/Lightricks/LTX-Video
/nari-labs/Dia-1.6B
/lodestones/Chroma
/models
/spaces/smolagents/computer-agent
/spaces/enzostvs/deepsite
/spaces/ByteDance/DreamO
/spaces/ACE-Step/ACE-Step
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces
/datasets/DMindAI/DMind_Benchmark
/datasets/nvidia/OpenCodeReasoning
/datasets/nvidia/OpenMathReasoning
/datasets/nvidia/Nemotron-CrossThink
/datasets/openbmb/Ultra-FineWeb
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/ent

In [20]:
def get_links(url):
    """Ask the model which links found on ``url`` belong in a company brochure.

    The page is scraped with ``Website``, its links are sent to the model
    together with ``link_system_prompt``, and the raw reply is printed.

    Args:
        url: address of the page whose links are offered to the model.

    Returns:
        The parsed JSON reply; the system prompt asks for the shape
        ``{"links": [{"type": ..., "url": ...}, ...]}``. When the reply is not
        valid JSON a message is printed and ``{"links": []}`` is returned, so
        callers that index ``["links"]`` keep working.
    """
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        # JSON mode is a top-level argument of chat(); "options" is reserved
        # for model parameters, so passing it there did not request JSON mode.
        format="json",
    )
    result = response['message']['content']
    print(result)
    try:
        return json.loads(result)
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym JSON")
        # Empty but well-formed result instead of an implicit None.
        return {"links": []}

In [21]:
# Scrape the Hugging Face home page again (requires network access) and display its links.
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/Lightricks/LTX-Video',
 '/nari-labs/Dia-1.6B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/smolagents/computer-agent',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/DreamO',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers',


In [22]:
# Ask the model to select brochure-relevant links for the Hugging Face home page;
# get_links() also prints the raw model reply.
get_links("https://huggingface.co")

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/allenai"},
        {"type": "Company page", "url": "https://huggingface.co/amazon"},
        {"type": "Company page", "url": "https://huggingface.co/facebook"},
        {"type": "Company page", "url": "https://huggingface.co/google"},
        {"type": "Company page", "url": "https://huggingface.co/Intel"},
        {"type": "Company page", "url": "https://huggingface.co/microsoft"},
        {"type": "Company page", "url": "https://huggingface.co/grammarly"},
        {"type": "Company page", "url": "https://huggingface.co/Writer"},
        {"type": "About page", "url": "https://discuss.huggingface.co/"},
        {"type": "Blog page", "url": "https://blog.huggingface.co/"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "Linke

In [23]:
def get_all_details(url):
    """Collect the text of the landing page and of every link the model selected.

    The landing page is scraped here and once more inside ``get_links``.
    Links that cannot be fetched are reported with ``print`` and skipped.

    Args:
        url: address of the company's landing page.

    Returns:
        str: "Landing page:" followed by the ``get_contents()`` block of the
        landing page and of each selected page that could be fetched.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    # The model reply is not guaranteed to be a dict with a "links" list
    # (get_links may also return None), so fall back to "no links".
    candidates = links.get("links") if isinstance(links, dict) else None
    for link in candidates or []:
        # Skip entries that are not objects or that lack a usable "url".
        target = link.get("url") if isinstance(link, dict) else None
        if not target:
            continue
        try:
            result += Website(target).get_contents()
        except socket.gaierror as e:
            print(f"DNS resolution failed: {e}")
        except NameResolutionError as e:
            print(f"Name resolution error: {e}")
        except MaxRetryError as e:
            print(f"Max retries exceeded: {e}")
        except ConnectionError as e:
            print(f"Connection error: {e}")
        except MissingSchema as e:
            print(f"Invalid URL schema: {e}")
        except InvalidSchema as e:
            print(f"Omitted unsupported URL (InvalidSchema): {e}")
    return result

In [24]:
# Print the combined landing-page and linked-page text for Hugging Face
# (requires network access and a running model).
print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/allenai"},
        {"type": "Company page", "url": "https://huggingface.co/facebook"},
        {"type": "Company page", "url": "https://huggingface.co/amazon"},
        {"type": "Company page", "url": "https://huggingface.co/google"},
        {"type": "Company page", "url": "https://huggingface.co/Intel"},
        {"type": "Company page", "url": "https://huggingface.co/microsoft"},
        {"type": "Company page", "url": "https://huggingface.co/grammarly"},
        {"type": "Company page", "url": "https://huggingface.co/Writer"},
        {"type": "Documentation", "url": "https://docs.huggingface.co/"}
    ]}
Connection error: HTTPSConnectionPool(host='docs.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x000002629F6E2390>: Failed to resolve 'docs.hugg

In [25]:
# System prompt for the brochure-writing requests: the assistant's role, the
# target audience, the output format and the topics to cover.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages "
    "from a company website and creates a short brochure about the company for "
    "prospective customers, investors and recruits. Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have "
    "the information."
)

In [26]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """Build the user message for the brochure request.

    Args:
        company_name: name of the company shown to the model.
        url: landing page passed to ``get_all_details`` to gather page text.
        max_chars: maximum length of the returned prompt in characters;
            ``None`` disables truncation. Defaults to 5000.

    Returns:
        str: the instructions followed by the scraped page contents, cut to
        at most ``max_chars`` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    # Adjacent literals instead of a backslash continuation inside the string,
    # which leaked the next line's indentation into the prompt.
    user_prompt += (
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown. \n"
    )
    user_prompt += get_all_details(url)
    # Truncate so the prompt stays within the requested size.
    return user_prompt[:max_chars]

In [None]:
# Display the full brochure prompt for Hugging Face (scrapes the site and queries the model for links).
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [None]:
def create_brochure(company_name, url):
    """Generate a brochure in a single model request and render it as Markdown.

    Args:
        company_name: name of the company shown to the model.
        url: landing page used to gather the brochure source text.

    Returns:
        None. The brochure is shown via ``display``.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    reply = ollama.chat(model=MODEL, messages=conversation)
    display(Markdown(reply['message']['content']))

In [None]:
# Render a brochure for Hugging Face (requires network access and a running model).
create_brochure("HuggingFace", "https://huggingface.co")

In [None]:
def stream_brochure(company_name, url):
    """Generate a brochure with a streaming request, refreshing one Markdown output as text arrives.

    Args:
        company_name: name of the company shown to the model.
        url: landing page used to gather the brochure source text.

    Returns:
        None. The brochure is shown via ``display`` / ``update_display``.
    """
    chunks = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True,
    )
    # Placeholder output whose id is reused for every refresh below.
    handle = display(Markdown(""), display_id=True)
    text = ""
    for piece in chunks:
        fragment = piece['message']['content']
        text = text + (fragment or "")
        # Replace code fences and the word "markdown" in everything received
        # so far; the cleaned text is what later chunks are appended to.
        text = text.replace("```", " ")
        text = text.replace("markdown", " ")
        update_display(Markdown(text), display_id=handle.display_id)

In [None]:
# Stream a brochure for Hugging Face into the cell output (requires network access and a running model).
stream_brochure("HuggingFace", "https://huggingface.co")

In [8]:
import os
import re
import requests
import socket
from requests.exceptions import ConnectionError, MissingSchema, InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama

# Reload settings from .env; override=True lets .env values replace variables
# already present in the environment.
load_dotenv(override=True)
# Model used by the ollama.chat() calls in this cell (the earlier
# configuration cell sets MODEL to 'llama3.2').
MODEL = 'gemma3'

# Browser-like request headers sent with every page fetch made by Website.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """Best-effort scraped snapshot of a single web page.

    Any failure while fetching or parsing is reported with ``print`` and
    leaves the instance in an "error" state (title "Error", empty text and
    links) instead of raising.

    Attributes:
        url:   the address that was requested.
        body:  raw response bytes (b"" when the request failed).
        title: text of the page's title tag, "No title found", or "Error".
        text:  visible text of the page body, one fragment per line.
        links: every non-empty href value found on the page's anchor tags.
    """

    def __init__(self, url):
        """Download ``url`` (10 second timeout) and parse it."""
        self.url = url
        # Defined up front so the attribute exists even when the request fails.
        self.body = b""
        try:
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            self.body = response.content
            soup = BeautifulSoup(self.body, 'html.parser')
            # Tag.string is None when the title tag is empty or wraps other
            # tags; fall back to the placeholder instead of storing None.
            title = soup.title.string if soup.title else None
            self.title = title or "No title found"
            if soup.body:
                # Strip elements that carry no readable text before extracting it.
                for irrelevant in soup.body(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                self.text = soup.body.get_text(separator="\n", strip=True)
            else:
                self.text = ""
            hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
            self.links = [href for href in hrefs if href]
        except Exception as e:
            # Deliberate best-effort: report the problem and keep the notebook running.
            print(f"Błąd podczas pobierania strony {url}: {e}")
            self.title = "Error"
            self.text = ""
            self.links = []

    def get_contents(self):
        """Return the title and visible text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# System prompt for the link-selection request: the model's role, the
# JSON-only output rule, and a sample of the expected response shape.
link_system_prompt = """You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages. You should respond only in JSON, without text, object as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        str: the instruction text followed by the links, one per line.
    """
    header = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in clean JSON format without text json on the beginning of the response. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    return header + "\n".join(website.links)

def get_links(url):
    """Ask the model which links found on ``url`` belong in a company brochure.

    The page is scraped with ``Website``, its links are sent to the model
    together with ``link_system_prompt``, and the raw reply is printed.

    Args:
        url: address of the page whose links are offered to the model.

    Returns:
        The parsed JSON reply, or ``None`` when the model call fails or the
        reply is not valid JSON (a message is printed in both cases).
    """
    website = Website(url)
    try:
        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": link_system_prompt},
                {"role": "user", "content": get_links_user_prompt(website)}
            ],
            # JSON mode is a top-level argument of chat(); "options" is
            # reserved for model parameters, so passing it there did not
            # request JSON mode.
            format="json",
        )
        result = response['message']['content']
        print("Odpowiedź modelu:", result)
        try:
            content_json = json.loads(result)
            return content_json
        except json.JSONDecodeError:
            print("Odpowiedź nie jest poprawnym JSON")
            return None
    except Exception as e:
        # Best-effort: any failure while talking to the model yields None.
        print(f"Błąd podczas komunikacji z modelem: {e}")
        return None

def get_all_details(url):
    """Collect the text of the landing page and of every link the model selected.

    The landing page is scraped here and once more inside ``get_links``.
    Problems are reported with ``print``; whatever text was gathered up to
    that point is still returned.

    Args:
        url: address of the company's landing page.

    Returns:
        str: "Landing page:" followed by the ``get_contents()`` block of the
        landing page and of each selected page.
    """
    result = "Landing page:\n"
    try:
        result += Website(url).get_contents()
        links = get_links(url)
        if links and "links" in links:
            for link in links["links"] or []:
                # Skip entries that are not objects or that lack a usable
                # "url": previously one such entry raised into the outer
                # handler and silently dropped every remaining link.
                link_url = link.get("url") if isinstance(link, dict) else None
                if not link_url:
                    continue
                try:
                    result += Website(link_url).get_contents()
                except (socket.gaierror, NameResolutionError, MaxRetryError, ConnectionError, MissingSchema, InvalidSchema) as e:
                    print(f"Błąd dla linku {link_url}: {e}")
    except Exception as e:
        print(f"Błąd podczas pobierania strony {url}: {e}")
    return result

def get_brochure_user_prompt(company_name, url):
    """Build the user message for the brochure request.

    Args:
        company_name: name of the company shown to the model.
        url: landing page passed to ``get_all_details`` to gather page text.

    Returns:
        str: the instructions followed by the scraped page contents, cut to
        at most 5000 characters.
    """
    preamble = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    # Limit the prompt to 5000 characters.
    return (preamble + get_all_details(url))[:5000]

def create_brochure(company_name, url, prompt_type="standard"):
    """Generate a brochure in a single model request and render it as Markdown.

    Args:
        company_name: name of the company shown to the model.
        url: landing page used to gather the brochure source text.
        prompt_type: "standard" selects the module-level ``system_prompt``;
            any other value selects ``system_prompt_humorous``.

    Returns:
        None. The brochure is shown via ``display``; failures are reported
        with ``print`` instead of being raised.
    """
    try:
        # Use a separate local name: assigning to `system_prompt` inside the
        # function made that name local, so reading it on the right-hand side
        # raised UnboundLocalError on every call.
        # NOTE(review): `system_prompt_humorous` is not defined in the visible
        # cells - confirm it exists before calling with prompt_type="humorous".
        selected_prompt = system_prompt if prompt_type == "standard" else system_prompt_humorous
        response = ollama.chat(
            model=MODEL,
            messages=[
                {"role": "system", "content": selected_prompt},
                {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
            ],
        )
        result = response['message']['content']
        display(Markdown(result))
    except Exception as e:
        print(f"Błąd podczas generowania broszury: {e}")

# Test for wyspagier.pl: generate one brochure per prompt type; any exception
# escaping create_brochure is printed instead of stopping the cell.
try:
    print("Test 1: Standardowy prompt")
    create_brochure("Wyspa Gier", "https://wyspagier.pl", prompt_type="standard")
    print("\nTest 2: Humorystyczny prompt")
    create_brochure("Wyspa Gier", "https://wyspagier.pl", prompt_type="humorous")
except Exception as e:
    print(f"Ogólny błąd: {e}")

SyntaxError: invalid syntax (2675785182.py, line 31)