## Imports

In [1]:
import os
import requests
import json
import ollama
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import gradio as gr

## Initialization

In [2]:
# Load variables from a local .env file; override=True is passed so values
# from the file are applied even when the variable is already set.
load_dotenv(override=True)

openai_api_key = os.getenv('OPENAI_API_KEY')
if not openai_api_key:
    print("OpenAI API Key not set")
else:
    # Only the first 8 characters are echoed, never the whole secret.
    print(f"OpenAI API Key exists and begins {openai_api_key[:8]}")


OpenAI API Key exists and begins sk-proj-


## Models

In [3]:
# Identifiers of the chat models this notebook can talk to.
MODEL_GPT_4O = "gpt-4o"
MODEL_GPT_4O_MINI = "gpt-4o-mini"  # lightweight model; intended for the link-selection step
MODEL_GPT_4_TURBO = "gpt-4-turbo"
MODEL_GPT_35_TURBO = "gpt-3.5-turbo"
MODEL_LLAMA = 'llama3.2'

# Shared OpenAI client. No key is passed explicitly, so the client is expected
# to pick up OPENAI_API_KEY from the environment populated above.
openai = OpenAI()

## A class to represent a Webpage
Some websites need you to use proper headers when fetching them:

In [4]:
# Browser-like request headers sent with every page fetch.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the address that was requested
        body:  raw response bytes
        title: contents of the <title> tag, or "No title found"
        text:  text of <body> with script/style/img/input elements removed
        links: every non-empty href found on an <a> tag (may be relative)
    """

    def __init__(self, url, timeout=10):
        """
        Download `url` and extract its title, readable text and links.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds passed to `requests.get` so a stalled server
                cannot block the notebook indefinitely.

        Raises:
            Whatever `requests.get` raises on connection failure or timeout.
            HTTP error statuses are not raised here; the body is parsed as-is.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` can be None even when a <title> tag exists, so fall back
        # to the placeholder in that case too instead of storing None.
        raw_title = soup.title.string if soup.title else None
        self.title = raw_title or "No title found"

        if soup.body:
            # Strip elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the page title and text formatted as a prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

## System prompt for links

In [5]:
# System prompt for the link-selection call: describes the task and shows the
# JSON shape the reply should follow.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "contact page", "url": "https://example.com/contact"}
    ]
}
"""
)

## User prompt

In [6]:
def get_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    parts = [
        f"Here is the list of links on the website of {website.url} - ",
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n",
        "Links (some might be relative links):\n",
        "\n".join(website.links),
    ]
    return "".join(parts)

## OpenAI links

In [7]:
def get_links(url, openai_model):
    """
    Ask an OpenAI chat model which links on `url` belong in a brochure.

    Args:
        url: Page whose links are to be classified.
        openai_model: Name of the OpenAI model to query.

    Returns:
        The model's JSON reply decoded into Python objects.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=openai_model,
        messages=conversation,
        # Request a JSON object so the reply can be fed straight to json.loads.
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

## All Details

In [8]:
def get_all_details(url, link_model="gpt-4o-mini"):
    """
    Collect the landing page plus every model-selected sub-page as one text blob.

    Args:
        url: Landing page of the company website.
        link_model: OpenAI model used to choose the relevant links. Passed as
            a literal default so the function does not depend on a
            module-level constant being defined.

    Returns:
        "Landing page:" followed by the landing page contents, then for each
        selected link its type on its own line and that page's contents.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_links(url, link_model)
    # The model's JSON is not guaranteed to contain "links"; treat a missing
    # key as "no extra pages" rather than failing with KeyError.
    for link in links.get("links", []):
        sections.append(f"\n\n{link['type']}\n")
        sections.append(Website(link["url"]).get_contents())
    return "".join(sections)

## System prompt for brochure gen

In [9]:
def system_prompt(lang):
    """
    Build the system prompt for brochure generation.

    Args:
        lang: Language the brochure should be written in. Empty, blank or
            None falls back to "English", the default shown in the UI.

    Returns:
        A single-paragraph prompt with single spaces between sentences (the
        source indentation is no longer embedded in the text).
    """
    target_language = (str(lang).strip() if lang else "") or "English"
    return (
        "You are an assistant that analyzes the contents of several relevant pages from a company website "
        "and creates a short brochure about the company for prospective customers, investors and recruits. "
        "Respond in markdown. "
        "Include details of company culture, customers and careers/jobs if you have the information. "
        f"And very important, generate it in {target_language}"
    )

## User prompt for brochure gen

In [10]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message containing the scraped site contents.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page of the company website.
        max_chars: Upper bound on the length of the returned prompt; longer
            text is cut off at this many characters. Pass None to disable
            truncation.

    Returns:
        The instruction header followed by the output of `get_all_details`,
        truncated to `max_chars` characters.
    """
    user_prompt = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
        + get_all_details(url)
    )
    return user_prompt[:max_chars]

## Create Brochure

In [11]:
def create_brochure(company_name, url, lang, model):
    """
    Generate a markdown brochure for a company with the chosen model.

    Args:
        company_name: Name of the company.
        url: Landing page of the company website.
        lang: Language the brochure should be written in.
        model: Model name; MODEL_LLAMA is served through ollama, anything
            else through the OpenAI client.

    Returns:
        The text content of the model's reply.
    """
    conversation = [
        {"role": "system", "content": system_prompt(lang)},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]

    if model == MODEL_LLAMA:
        reply = ollama.chat(model=model, messages=conversation)
        return reply["message"]["content"]

    completion = openai.chat.completions.create(model=model, messages=conversation)
    return completion.choices[0].message.content

## Gradio UI

In [12]:
# Models offered in the dropdown.
MODEL_OPTIONS = [MODEL_GPT_4O, MODEL_GPT_4_TURBO, MODEL_GPT_35_TURBO, MODEL_LLAMA]


def _clear_brochure():
    """Return an empty string, used to blank the brochure output."""
    return ""


with gr.Blocks() as ui:

    # First row: which company and which site to scrape.
    with gr.Row():
        company_name = gr.Textbox(label="Company Name", placeholder="e.g., OpenAI")
        url = gr.Textbox(label="Website URL", placeholder="https://example.com")

    # Second row: output language and model choice.
    with gr.Row():
        lang = gr.Textbox(label="Language", value="English")
        model = gr.Dropdown(choices=MODEL_OPTIONS, value=MODEL_GPT_4O, label="Model")

    # The generated brochure is rendered as markdown.
    brochure_output = gr.Markdown()

    with gr.Row():
        generate_btn = gr.Button("Generate Brochure")
        clear_btn = gr.Button("Clear")

    # Input order must match create_brochure(company_name, url, lang, model).
    generate_btn.click(
        fn=create_brochure,
        inputs=[company_name, url, lang, model],
        outputs=brochure_output,
    )

    clear_btn.click(fn=_clear_brochure, inputs=None, outputs=brochure_output, queue=False)

ui.launch(inbrowser=True)

* Running on local URL:  http://127.0.0.1:7907
* To create a public link, set `share=True` in `launch()`.


