In [6]:
import json
import os
from typing import List
from urllib.parse import urljoin

import gradio as gr
import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [7]:
load_dotenv(override=True)
api_key = os.getenv("OPENAI_API_KEY")

# Local sanity check only: the key must be present, start with 'sk-proj-' and
# be longer than 10 characters. Nothing is verified against the API here.
if not api_key or not api_key.startswith("sk-proj-") or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Chat model name shared by the cells that reference MODEL.
MODEL = "gpt-4o-mini"
# Client built with no explicit arguments; presumably it picks the key up from
# the environment populated by load_dotenv above.
openai = OpenAI()

API key looks good so far


In [8]:
# Request headers carrying a desktop-browser User-Agent; some websites only
# answer properly when one is supplied.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A fetched web page reduced to its title, visible text and link targets.

    Attributes:
        url: The address that was requested.
        body: Raw bytes of the HTTP response.
        title: Contents of the <title> tag, or "No title found" when absent.
        text: Text of <body> after img/script/input/style elements have been
            removed; the empty string when the document has no <body>.
        links: Non-empty href values of every <a> element, in document order.
    """

    def __init__(self, url):
        """Download `url` and parse the response with BeautifulSoup's html.parser.

        Args:
            url: Address of the page to fetch.
        """
        self.url = url
        # Pass the headers by keyword: the second positional parameter of
        # requests.get is `params`, which would send the User-Agent as a query
        # string instead of as an HTTP header.
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, "html.parser")
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for garbage in soup.body(["img", "script", "input", "style"]):
                garbage.decompose()
            # Extract the text once, after the loop, so `text` is always set --
            # including for pages that contain none of the stripped tags.
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
        self.links = [href for href in hrefs if href]

    def getContents(self):
        """Return the title and text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

    # snake_case spelling used by stream_brochure; both names return the same text.
    get_contents = getContents
    
    

In [55]:
# anthropic = Website("https://www.anthropic.com")
# anthropic.links

In [54]:
# System prompt for the link-selection request. It tells the model to keep only
# brochure-relevant links and to answer with a JSON object of the shape
# {"links": [{"type": ..., "url": ...}]}, illustrated by three worked examples.
# get_links looks up the global name `system_prompt` when it is called.
system_prompt = """
You are given a list of links from a company website. Your task is to decide which links are relevant to include in the company brochure. Review the links and select only those that best represent the company’s core content, such as the About page, Careers page, Products/Services page, and similar. Avoid links like contact forms, blog posts, legal pages, or support pages that do not directly contribute to a high-level company overview.

You should respond in JSON exactly in the format shown in the examples below. Do not include any extra text.

Example 1:
Input Links:
[
    "https://company.com/about",
    "https://company.com/contact",
    "https://company.com/products",
    "https://company.com/blog"
]
Expected JSON Output:
{
    "links": [
        {"type": "about page", "url": "https://company.com/about"},
        {"type": "products page", "url": "https://company.com/products"}
    ]
}

Example 2:
Input Links:
[
    "https://company.com/careers",
    "https://company.com/legal",
    "https://company.com/support"
]
Expected JSON Output:
{
    "links": [
        {"type": "careers page", "url": "https://company.com/careers"}
    ]
}

Example 3:
Input Links:
[
    "https://company.com/about-us",
    "https://company.com/team",
    "https://company.com/press",
    "https://company.com/privacy"
]
Expected JSON Output:
{
    "links": [
        {"type": "about page", "url": "https://company.com/about-us"},
        {"type": "team page", "url": "https://company.com/team"},
        {"type": "press page", "url": "https://company.com/press"}
    ]
}
"""

In [9]:
def get_links_user_prompt(website):
    """Build the user message that asks the model to pick brochure-worthy links.

    Args:
        website: Object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The instruction text followed by one link per line.
    """
    sections = [
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.",
        "Links (some might be relative links):",
        "\n".join(website.links),
    ]
    return "\n".join(sections)

In [10]:
def get_links(url):
    """Ask the model which links on the page at `url` belong in a brochure.

    Args:
        url: Address of the page whose links are offered to the model.

    Returns:
        The parsed JSON reply when it contains a "links" key, otherwise
        {"links": []}.
    """
    site = Website(url)
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_links_user_prompt(site)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    parsed = json.loads(completion.choices[0].message.content)

    if "links" in parsed:
        return parsed

    # Fall back to an empty list so callers can index "links" without a KeyError.
    print("Error: 'links' key missing in OpenAI response. Returning empty list.")
    return {"links": []}



In [11]:
def get_all_details(url):
    """Concatenate the landing page text with the text of each selected link.

    Args:
        url: Address of the company's landing page.

    Returns:
        A string that starts with the landing page contents and is followed by
        one titled section per link returned by get_links. Entries without a
        "url" are skipped.
    """
    result = "Landing page:\n"
    result += Website(url).getContents()

    links = get_links(url)
    print("Found links:", links)

    if not links.get("links"):  # Missing or empty list: nothing more to fetch
        print("No relevant links found.")
        return result  # Return only the landing page contents

    for link in links["links"]:
        target = link.get("url")
        if not target:
            # 'type' is already read defensively; do the same for 'url' rather
            # than letting one malformed entry abort the whole brochure.
            print("Skipping link without a url:", link)
            continue
        result += f"\n\n{link.get('type', 'Unknown type')}\n"
        # The page may list relative links, so resolve against the landing
        # page; urljoin returns absolute URLs unchanged.
        result += Website(urljoin(url, target)).getContents()

    return result



In [125]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
If you have the information, include details of company culture, customers and careers/jobs."

In [126]:
def get_brochure_user_prompt(company_name, url):  
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short company brochure in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]
    return user_prompt
    

In [127]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [40]:
# A class to represent a Webpage

class Website:
    """A fetched web page reduced to its title, visible text and link targets.

    This definition replaces any earlier `Website` once its cell runs, so it
    provides everything the notebook's callers use: `links` and `getContents`
    (get_links / get_all_details) as well as `get_contents` (stream_brochure).
    """
    url: str
    title: str
    text: str
    links: List[str]

    def __init__(self, url):
        """Download `url` and parse the response with BeautifulSoup's html.parser.

        Args:
            url: Address of the page to fetch.
        """
        self.url = url
        # Send the notebook's browser-like headers, passed by keyword so they
        # are treated as HTTP headers.
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            # Documents without a <body> yield empty text instead of raising.
            self.text = ""

        hrefs = [anchor.get("href") for anchor in soup.find_all("a")]
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a block for an LLM prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

    # camelCase spelling used by get_all_details; both names return the same text.
    getContents = get_contents

In [51]:
system_message = "You are a helpful assistant that responds in markdown"
def stream_gpt(prompt):
    """Stream a chat completion for `prompt`, yielding the text received so far.

    Args:
        prompt: User message sent together with `system_message`.

    Yields:
        The accumulated reply after each streamed chunk, so every yielded value
        is the full text so far rather than only the newest fragment.
    """
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": prompt}
      ]
    stream = openai.chat.completions.create(
        # Use the notebook-wide MODEL constant instead of repeating the model
        # name, so changing MODEL affects this call too.
        model=MODEL,
        messages=messages,
        stream=True
    )
    result = ""
    for chunk in stream:
        # `or ""` covers chunks whose delta carries no content.
        result += chunk.choices[0].delta.content or ""
        yield result

In [52]:
# Build a brochure prompt from a company's web page and stream the model's reply.
def stream_brochure(company_name, url):
    """Stream a brochure for `company_name` based on the page at `url`.

    Args:
        company_name: Name of the company, quoted in the prompt.
        url: Address of the page whose contents are appended to the prompt.

    Yields:
        Whatever stream_gpt yields for the assembled prompt.

    Raises:
        ValueError: If the global MODEL is falsy.
    """
    instructions = "".join([
        f"You are an assistant that analyzes the contents of several relevant pages from {company_name} ",
        f"(located at {url}) and creates a short humorous, entertaining, jokey brochure about the company ",
        "for prospective customers, investors, and recruits. Respond in markdown. ",
        "If you have the information, include details of company culture, customers, and careers/jobs.",
    ])
    # The page is fetched before the MODEL check below.
    page_text = Website(url).get_contents()
    prompt = instructions + page_text

    # MODEL only gates the call: any truthy value routes to stream_gpt.
    if not MODEL:
        raise ValueError("Unknown model")
    yield from stream_gpt(prompt)

In [56]:
# Two text inputs (company name and landing page URL) feed stream_brochure;
# its streamed output is rendered in a Markdown component.
brochure_inputs = [
    gr.Textbox(label="Company name:"),
    gr.Textbox(label="Landing page URL including http:// or https://"),
]
brochure_outputs = [gr.Markdown(label="Brochure:")]

view = gr.Interface(
    fn=stream_brochure,
    inputs=brochure_inputs,
    outputs=brochure_outputs,
    flagging_mode="never",
)
view.launch()