In [26]:
import os
import re
import requests
import socket
import json
from requests.exceptions import ConnectionError, MissingSchema, InvalidSchema
from urllib3.exceptions import MaxRetryError, NewConnectionError  # Removed NameResolutionError
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import ollama
from typing import List

In [27]:
load_dotenv(override = True)
MODEL = 'llama3.2'

In [28]:


# Define headers (make sure it is a dictionary)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# Klasa reprezentująca zescrapowaną stronę internetową z linkami
class Website:
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)  # Fix the header and request line
        self.body = response.content  # Properly assign the response content to 'self.body'
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        
        # Check if there is a body and clean the page content
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()  # Remove irrelevant tags
            self.text = soup.body.get_text(separator="\n", strip=True)  # Clean text from body
        else:
            self.text = "No body found"

        # Extract links from <a> tags
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]  # Filter out None values

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\nLinks:\n{', '.join(self.links)}"

# Example usage
url = "https://example.com"
website = Website(url)
print(website.get_contents())


Webpage Title:
Example Domain
Webpage Contents:
Example Domain
This domain is for use in illustrative examples in documents. You may use this
    domain in literature without prior coordination or asking for permission.
More information...

Links:
https://www.iana.org/domains/example


In [29]:

page=Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/nari-labs/Dia-1.6B',
 '/Lightricks/LTX-Video',
 '/ACE-Step/ACE-Step-v1-3.5B',
 '/lodestones/Chroma',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/smolagents/computer-agent',
 '/spaces/ByteDance/DreamO',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces/ACE-Step/ACE-Step',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/d

In [30]:
link_system_prompt = "You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond only in JSON, without text, object as in this example:\n"
link_system_prompt += '''
{
  "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
  ]
}
'''

print(link_system_prompt)


You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond only in JSON, without text, object as in this example:

{
  "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
  ]
}



In [31]:
import requests
import json
import re

# Assuming Website class and required imports are already defined

def get_links(url):
    # Fetch webpage content using the Website class
    website = Website(url)  # Create a Website object for the given URL
    links = website.links  # Assuming this gives the list of links

    # Construct the user prompt with the list of links
    get_links_user_prompt = json.dumps({"links": links})

    # Send to the model (here using Ollama as an example)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt}
        ],
        options={"format": "json"}
    )

    # Process the response to extract the relevant content
    result = response['message']['content']
    
    # Clean unnecessary parts of the response (e.g., reasoning or extra text)
    result = re.sub(r'<think>.*?</think>', '', result, flags=re.DOTALL)  # Example: removing specific tags
    result = result.strip()  # Strip any leading/trailing whitespaces

    # Attempt to parse the cleaned result as JSON
    try:
        content_json = json.loads(result)  # Parse the cleaned string as JSON
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym JSON")  # Print an error if JSON is invalid
        return None

# Example usage
huggingface_url = "https://huggingface.co"
huggingface_website = Website(huggingface_url)
huggingface_links = get_links(huggingface_url)
print(huggingface_links)


{'links': [{'type': 'about page', 'url': 'https://ui.endpoints.huggingface.co'}, {'type': 'blog', 'url': 'https://blog.huggingface.co'}]}


In [32]:
def get_all_details(url):
    """Fetch the content of a webpage and all linked pages."""
    result = "Landing page:\n"
    
    try:
        response = requests.get(url)
        response.raise_for_status()  # Check for HTTP errors
        result += response.text  # Add main page content

        # Get the links on the main page
        links = get_links(url)

        # Iterate through the links on the main page and fetch their content
        for link in links["links"]:
            try:
                # Ensure the URL is absolute by joining with the base URL if it's relative
                full_url = link["url"]
                if not full_url.startswith("http://") and not full_url.startswith("https://"):
                    full_url = urljoin(url, full_url)  # Combine base URL with relative URL
                
                # Fetch content for each linked page and append it to result
                result += f"\n\n{link['type']}\n"
                response = requests.get(full_url)
                response.raise_for_status()  # Check for HTTP errors
                result += response.text

            except requests.exceptions.RequestException as e:
                print(f"Error fetching content from {full_url}: {e}")
    
    except requests.exceptions.RequestException as e:
        print(f"Error fetching main page {url}: {e}")
    
    return result

# Test with a URL
print(get_all_details("https://huggingface.co"))

Error fetching content from https://huggingface.co/join: 403 Client Error: Forbidden for url: https://huggingface.co/join
Landing page:
<!doctype html>
<html class="">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no" />
		<meta name="description" content="We’re on a journey to advance and democratize artificial intelligence through open source and open science." />
		<meta property="fb:app_id" content="1321688464574422" />
		<meta name="twitter:card" content="summary_large_image" />
		<meta name="twitter:site" content="@huggingface" />
		<meta name="twitter:image" content="https://huggingface.co/front/thumbnails/v2-2.png" />
		<meta property="og:title" content="Hugging Face – The AI community building the future." />
		<meta property="og:type" content="website" />
		<meta property="og:url" content="https://huggingface.co/" />
		<meta property="og:image" content="https://huggingface.co/front/thumbnails/v2-2.png"

In [33]:
print(get_all_details("https://huggingface.co"))

Landing page:
<!doctype html>
<html class="">
	<head>
		<meta charset="utf-8" />
		<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no" />
		<meta name="description" content="We’re on a journey to advance and democratize artificial intelligence through open source and open science." />
		<meta property="fb:app_id" content="1321688464574422" />
		<meta name="twitter:card" content="summary_large_image" />
		<meta name="twitter:site" content="@huggingface" />
		<meta name="twitter:image" content="https://huggingface.co/front/thumbnails/v2-2.png" />
		<meta property="og:title" content="Hugging Face – The AI community building the future." />
		<meta property="og:type" content="website" />
		<meta property="og:url" content="https://huggingface.co/" />
		<meta property="og:image" content="https://huggingface.co/front/thumbnails/v2-2.png" />

		<link rel="stylesheet" href="/front/build/kube-24f8b19/style.css" />

		<link rel="preconnect" href="https://fonts.

In [34]:
# Define the system prompt with a humorous tone
system_prompt = """You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. \
Respond in markdown. Include details of company culture, customers and careers/jobs if you have the information."""
# Define the formal system prompt
system_prompt = """You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. \
Respond in markdown. Include details of company culture, customers and careers/jobs if you have the information."""


In [38]:

def get_brochure_user_prompt (company_name, url):
user_prompt = "You are looking at a company called: [company_name}\n"
user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
user_prompt += get_all_details(url)
user_prompt = user_prompt[:5_000] # Obcięcie, jeśli więcej niż 5000 znaków
return user_prompt
get_brochure_user_prompt ("HuggingFace", "https://huggingface.co")

IndentationError: expected an indented block (2544533241.py, line 2)

In [36]:
def get_brochure_user_prompt(company_name, url):
    # Construct the user prompt based on the company name and URL
    return f"Create a brochure for the company {company_name} based on information from the website {url}."

# Function to create the brochure
def create_brochure(company_name, url):
    # Assuming 'ollama.chat' is your model interface
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ]
    )

    # Extract the result content
    result = response['message']['content']
    
    # If needed, remove thinking/reasoning content
    result = re.sub(r'<think>.*?</think>', '', result, flags=re.DOTALL)
    
    # Display the result as Markdown
    display(Markdown(result))

# Call the function to generate a brochure
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Brochure
==========================

[Cover Image: A logo with a playful "Hugging Face" mascot]

Welcome to Hugging Face
------------------------

We're a community-driven AI research organization that's passionate about making it easy for everyone to use artificial intelligence (AI). Our mission is to accelerate the development of AI by providing high-quality, pre-trained models and tools.

[Image: A team photo of Hugging Face employees from various backgrounds]

Our Team
-----------

At Hugging Face, we're a diverse group of researchers, engineers, and enthusiasts who share a common goal: to democratize access to AI. Our team is spread across the globe, with offices in Europe, North America, and Asia.

[Image: A map showing Hugging Face's global presence]

Our Technology
----------------

We're known for our popular library, Transformers, which provides pre-trained models and tools for natural language processing (NLP) tasks such as text classification, sentiment analysis, and language translation. Our models are trained on large datasets from various sources, including books, articles, and websites.

[Image: A screenshot of the Hugging Face website's model repository]

Our Customers
--------------

We have a growing community of customers across industries, including:

*   **Research Institutions**: Universities and research centers that use our models for various AI-related projects.
*   **Technology Companies**: Startups and established companies that integrate our models into their products and services.
*   **Data Scientists**: Researchers and analysts who use our tools to build AI-powered applications.

[Image: A logo of a company using Hugging Face technology]

Our Values
------------

At Hugging Face, we're guided by the following values:

*   **Community-Driven**: We believe that open-source projects like ours should be accessible to everyone.
*   **Innovation**: We're committed to pushing the boundaries of AI research and development.
*   **Collaboration**: We work closely with researchers, developers, and industry partners to advance the field.

[Image: A diagram illustrating our values]

Careers
---------

Join us at Hugging Face and contribute to shaping the future of AI!

We offer various job openings across different departments, including:

*   **Research**: Work on developing new models and algorithms for NLP tasks.
*   **Engineering**: Contribute to the development of our libraries and tools.
*   **Sales**: Help businesses integrate our models into their products and services.

[Image: A call-to-action button with a link to Hugging Face's careers page]

Stay in Touch
-------------

Follow us on social media to stay up-to-date with our latest news, releases, and community events!

[Image: Social media logos]

Thank you for considering Hugging Face as your partner in AI research and development.

Best regards,
The Hugging Face Team

In [40]:
import re
from IPython.display import display, update_display, Markdown  # Use IPython's Markdown

def stream_brochure(company_name, url):
    # Start the chat stream with the ollama model
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True  
    )
    
    result = ""  # Initialize the result string
    # Display an empty Markdown cell initially, to be updated as the stream proceeds
    display_handle = display(Markdown(""), display_id=True)
    
    # Loop over the chunks from the stream
    for chunk in stream:
        result += chunk['message']['content'] or ''

        
        result = result.replace("```", "").replace("markdown", "")
        
    
        update_display(Markdown(result), display_id=display_handle.display_id)

# Call the function to stream and display the brochure content
stream_brochure("HuggingFace", "https://huggingface.co")

# **Hugging Face: Revolutionizing Natural Language Processing**

[Image: A team of diverse individuals working together]

Welcome to Hugging Face, the leading open-source software company dedicated to advancing natural language processing (NLP) and making it more accessible to everyone.

## Our Mission

At Hugging Face, we are passionate about empowering developers, researchers, and businesses to unlock the full potential of NLP. We believe that language is a fundamental part of human interaction, and that access to effective NLP tools can revolutionize industries such as healthcare, education, customer service, and more.

## Our Culture

We are a diverse team of innovators, engineers, and enthusiasts who share a common passion for NLP and open-source software. Our company culture is built around the principles of collaboration, inclusivity, and continuous learning.

* **Collaborative Environment**: We believe that working together is key to innovation. Our team is comprised of individuals from diverse backgrounds and expertise, which fosters a rich exchange of ideas and perspectives.
* **Inclusive Community**: We are committed to creating an inclusive environment where everyone can contribute, learn, and grow. Our community-driven approach ensures that our software is developed with the needs of all users in mind.

## Customers

We serve a wide range of customers across industries, including:

* **Academic Researchers**: Hugging Face provides researchers with access to cutting-edge NLP tools, enabling them to advance their research and make new discoveries.
* **Businesses**: Our software helps businesses to improve customer service, automate processes, and gain insights from large datasets.
* **Developers**: We empower developers to build scalable and efficient NLP applications using our popular libraries such as Transformers and Hugging Face Datasets.

## Careers

If you're passionate about NLP, open-source software, and collaboration, we want to hear from you!

* **Software Engineers**: Join our team of talented engineers who develop innovative solutions for NLP challenges.
* **Research Scientists**: Collaborate with our research scientists on cutting-edge projects that advance the field of NLP.
* **Community Managers**: Help us build and grow our vibrant community of developers, researchers, and enthusiasts.

## Get Involved

Join our journey to revolutionize natural language processing. Visit our website to explore our software, contribute to open-source projects, or learn about career opportunities:

[Image: Hugging Face logo]

# Hugging Face
[Contact Information]