In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Constants

OLLAMA_API = "http://localhost:11434/api/chat"
HEADERS = {"Content-Type": "application/json"}
MODEL = "llama3.2"

In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/

In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/
https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/
https://edwarddonner.com/2024/11/13/llm-engineering-resources/
https://edwarddonner.com/2024/11/13/ll

In [12]:
!ollama pull llama3.2

[?25lpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest â § [?25h[?25l[2K[1Gpulling manifest â ‡ [?25h[?25l[2K[1Gpulling manifest â � [?25h[?25l[2K[1Gpulling manifest â ‹ [?25h[?25l[2K[1Gpulling manifest â ™ [?25h[?25l[2K[1Gpulling manifest â ¹ [?25h[?25l[2K[1Gpulling manifest â ¸ [?25h[?25l[2K[1Gpulling manifest â ¼ [?25h[?25l[2K[1Gpulling manifest â ´ [?25h[?25l[2K[1Gpulling manifest â ¦ [?25h[?25l[2K[1Gpulling manifest 
pulling dde5aa3fc5ff... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 2.0 GB                         
pulling 966de95ca8a6... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–� 1.4 KB                         
pulling fcc5a6bec9da... 100% â–•â–ˆâ–ˆâ–ˆâ–ˆâ–ˆ

In [16]:
from openai import OpenAI
ollama_via_openai = OpenAI(base_url='http://localhost:11434/v1', api_key='ollama')

In [17]:
def get_links(url):
    website = Website(url)
    response = ollama_via_openai.chat.completions.create(
    model=MODEL,
    messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [51]:
huggingface = Website("https://edwarddonner.com")
huggingface.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/2024/10/16/from-software-engineer-to-ai-data-scientist-resources/',
 'https://edwarddonner.com/

In [52]:
get_links("https://edwarddonner.com")

{'links': [{'type': 'about page',
   'url': 'https://edwarddonner.com/about-me-and-about-nebula/'},
  {'type': 'company website', 'url': 'https://edwarddonner.com/'},
  {'type': 'careers page', 'url': 'https://www.linkedin.com/in/eddonner/'}]}

In [53]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [54]:
print(get_all_details("https://edwarddonner.com"))

Found links: {'links': [{'type': 'About page', 'url': 'https://edwarddonner.com/about-me-and-about-nebula/'}, {'type': 'Company page', 'url': 'https://edwarddonner.com/outsmart/'}, {'type': 'LinkedIn profile', 'url': 'https://www.linkedin.com/in/eddonner/'}, {'type': 'Twitter profile', 'url': 'https://twitter.com/edwarddonner'}, {'type': 'About page alternative', 'url': 'https://edwarddonner.com/'}]}
Landing page:
Webpage Title:
Home - Edward Donner
Webpage Contents:
Home
Outsmart
An arena that pits LLMs against each other in a battle of diplomacy and deviousness
About
Posts
Well, hi there.
I’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (
very
amateur) and losing myself in
Hacker News
, nodding my head sagely to things I only half understand.
I’m the co-founder and CTO of
Nebula.io
. We’re applying AI to a field where it can make a massive, pos

In [55]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [58]:
import requests

def validate_url(url):
    if not url.startswith(("http://", "https://")):
        url = "https://" + url
    return url

def get_all_details(url):
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an HTTPError for bad responses (4xx and 5xx)
        return response.text  # Assuming you want the raw HTML content
    except requests.exceptions.RequestException as e:
        print(f"Error fetching details from URL: {e}")
        return "Error fetching content. Please check the URL."

def get_brochure_user_prompt(company_name, url):
    url = validate_url(url)  # Ensure the URL is valid
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    details = get_all_details(url)
    user_prompt += details[:5_000]  # Truncate if more than 5,000 characters
    return user_prompt

# Example usage
print(get_brochure_user_prompt("Edward", "edwarddonner.com"))


You are looking at a company called: Edward
Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.
<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1' />
	<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
	
	<!-- This site is optimized with the Yoast SEO plugin v24.2 - https://yoast.com/wordpress/plugins/seo/ -->
	<link rel="canonical" href="https://edwarddonner.com/" />
	<meta property="og:locale" content="en_US" />
	<meta property="og:type" content="website" />
	<meta property="og:title" content="Home - Edward Donner" />
	<meta property="og:description" content="Homepage &#8211; this is a test of what happens to text on the homepage page" />
	<meta prope

In [59]:
print(get_brochure_user_prompt("Edward", "edwarddonner.com"))

You are looking at a company called: Edward
Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.
<!DOCTYPE html>
<html lang="en-US">
<head>
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name='robots' content='index, follow, max-image-preview:large, max-snippet:-1, max-video-preview:-1' />
	<style>img:is([sizes="auto" i], [sizes^="auto," i]) { contain-intrinsic-size: 3000px 1500px }</style>
	
	<!-- This site is optimized with the Yoast SEO plugin v24.2 - https://yoast.com/wordpress/plugins/seo/ -->
	<link rel="canonical" href="https://edwarddonner.com/" />
	<meta property="og:locale" content="en_US" />
	<meta property="og:type" content="website" />
	<meta property="og:title" content="Home - Edward Donner" />
	<meta property="og:description" content="Homepage &#8211; this is a test of what happens to text on the homepage page" />
	<meta prope

In [60]:
def create_brochure(company_name, url):
    response = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [61]:
create_brochure("Tekworks", "https://edwarddonner.com")

# Tekworks Brochure
=====================================

**About Us**
-------------

Tekworks is a company that specializes in [insert technology/field] and has been leading the way since its inception. With a team of passionate engineers, data scientists, and innovators, we are committed to delivering cutting-edge solutions that drive results.

### Company Culture

At Tekworks, we value creativity, collaboration, and innovation. Our culture is built around empowering our employees to take ownership of their work and strive for excellence in everything they do. We believe in fostering a supportive environment where ideas flow freely and growth is encouraged.

### Customer Focus

Our customers are at the heart of everything we do. We pride ourselves on delivering exceptional service, support, and results that meet and exceed our clients' expectations. Our goal is to build long-lasting relationships with our customers and help them achieve their goals.

### Careers and Opportunities

We're always looking for talented individuals who share our passion for innovation and excellence. If you're passionate about [insert technology/field], we want to hear from you! Check out our job listings at [insert website URL].

**Meet Our Team**
-----------------

While there is no information on the team members, a brief note: Tekworks has an experienced management team that drives the company's vision and strategy. Meet Edward Donner, our CEO, who brings with him years of experience in tech leadership and entrepreneurship.

### Contact Us

Ready to learn more about how Tekworks can help you achieve your goals? Fill out the contact form at [insert website URL] or reach out to us on social media:

* Twitter: [insert Twitter handle]
* LinkedIn: [insert LinkedIn profile]
* Facebook: [insert Facebook page]

**Let's Connect**
----------------

Join our community and stay up-to-date with the latest news, trends, and insights from Tekworks.

[Insert Call-to-Action button, e.g., "Connect" or "Learn More"]

In [62]:
def stream_brochure(company_name, url):
    stream = ollama_via_openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [64]:
stream_brochure("Edward", "https://edwarddonner.com")

# Edward Donner
## About Us

Edward Donner is a tech leader and entrepreneur with a passion for coding and data science. With a strong focus on innovation and growth, our company strives to deliver cutting-edge solutions that make a real impact.

### Mission Statement
Empowering individuals and organizations to harness the power of technology and drive positive change in the world.

## Our Values

* **Innovation**: Embracing new ideas and perspectives to stay ahead of the curve.
* **Collaboration**: Fostering open communication, trust, and mutual respect among team members.
* **Continuous Learning**: Committing to ongoing education and skill-building to deliver exceptional results.
* **Customer-Centricity**: Prioritizing the needs and satisfaction of our clients.

## Our Customers
At Edward Donner, we serve a diverse range of clients across various industries, including technology, healthcare, finance, and more. We take pride in delivering tailored solutions that meet the unique needs and objectives of each project.

## Work with Us

Are you passionate about innovation and making a difference? We're committed to building a team that reflects our values and is dedicated to driving success.

### Careers/Jobs

* **Tech Leaders**: Join our community of experienced professionals who are shaping the future of technology.
* **Data Scientists**: Collaborate with us on exciting projects that harness the power of data science to drive insights and impact.
* **Entrepreneurs**: Leverage our expertise and resources to turn your ideas into reality.

## Stay Connected

* **Contact Us**: Reach out to us at [info@edwarddonner.com](mailto:info@edwarddonner.com) or visit our website for more information.
* **Follow Us**: Join the conversation on Twitter, LinkedIn, and other social media platforms.

---

![logo](https://i0.wp.com/edwarddonner.com/wp-content/uploads/2023/12/cropped-edworkprofile2.png?fit=1128%2C1128&ssl=1)