# Brochure Generator LLM

## 1. WebScraper

In [85]:
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv
import os
import json

In [74]:
# Request headers passed to requests.get by the scraper; the User-Agent is a
# desktop Chrome string.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class webscraper:
    """Fetch a web page and expose its title, visible text and links.

    Everything is downloaded and parsed eagerly in ``__init__``.

    Attributes:
        url:   The address that was requested.
        body:  Raw response bytes.
        title: Page title as plain text, or ``'No Title'`` when the page has
               no (or an empty) ``<title>``.
        text:  Text of ``<body>`` with script/style/img/input elements
               removed, one fragment per line; ``""`` when there is no body.
        links: Non-empty ``href`` values of every ``<a>`` tag, in document
               order. Values are kept as written, so they may be relative.
    """

    def __init__(self, url, timeout=30):
        """Download and parse ``url``.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds to wait for the server before giving up
                (forwarded to ``requests.get``).

        Raises:
            requests.exceptions.RequestException: If the request fails or
                times out.
        """
        self.url = url
        # Without a timeout an unresponsive server would block the kernel forever.
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # Keep the title's text rather than the Tag object; otherwise
        # get_contents() would emit the raw "<title>...</title>" markup.
        title_text = soup.title.get_text(strip=True) if soup.title else ''
        self.title = title_text or 'No Title'

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for tobedeleted in soup.body(["script", "style", "img", "input"]):
                tobedeleted.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Anchors without an href (or with an empty one) are skipped.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and page text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [75]:
# Scrape a sample page, then preview the first ten hrefs it collected.
gela = webscraper(url='https://en.wikipedia.org/wiki/Muhammadu_Buhari')
gela.links[0:10]

['#bodyContent',
 '/wiki/Main_Page',
 '/wiki/Wikipedia:Contents',
 '/wiki/Portal:Current_events',
 '/wiki/Special:Random',
 '/wiki/Wikipedia:About',
 '//en.wikipedia.org/wiki/Wikipedia:Contact_us',
 '/wiki/Help:Contents',
 '/wiki/Help:Introduction',
 '/wiki/Wikipedia:Community_portal']

## 2. Prompting

In [76]:
# System prompt for the link-selection request made in get_links().
# The example below must itself be valid JSON, since the model is asked to
# mirror its shape: each entry is an object with a "type" and a "url" key.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
    "You should respond in JSON as in this example:"
    """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""
)

In [77]:
# Print the assembled system prompt so its final wording can be checked by eye.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [78]:
def get_links_user_prompt(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        The instruction text followed by every link on its own line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    listing_header = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return "".join((intro, instructions, listing_header, link_lines))

In [79]:
# Preview the user prompt generated for the page scraped into `gela`.
get_links_user_prompt(gela)

"Here is the list of links on the website of https://en.wikipedia.org/wiki/Muhammadu_Buhari - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\nLinks (some might be relative links):\n#bodyContent\n/wiki/Main_Page\n/wiki/Wikipedia:Contents\n/wiki/Portal:Current_events\n/wiki/Special:Random\n/wiki/Wikipedia:About\n//en.wikipedia.org/wiki/Wikipedia:Contact_us\n/wiki/Help:Contents\n/wiki/Help:Introduction\n/wiki/Wikipedia:Community_portal\n/wiki/Special:RecentChanges\n/wiki/Wikipedia:File_upload_wizard\n/wiki/Special:SpecialPages\n/wiki/Main_Page\n/wiki/Special:Search\nhttps://donate.wikimedia.org/?wmf_source=donate&wmf_medium=sidebar&wmf_campaign=en.wikipedia.org&uselang=en\n/w/index.php?title=Special:CreateAccount&returnto=Muhammadu+Buhari\n/w/index.php?title=Special:UserLogin&returnto=Muhammadu+Buhari\nhttps://donate.wikimedia.org/?wmf_source=donate

## 3. Call LLM

In [80]:
# Chat model used for every completion request in this notebook.
MODEL = 'gpt-4o-mini'

# Load variables from a .env file, letting them override any already set
# in the process environment.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')

# Client instance used by get_links(); constructed with default settings.
openai = OpenAI()

In [83]:
def get_links(url):
    """Scrape ``url`` and ask the chat model which of its links suit a brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply decoded from JSON into Python objects.
    """
    page = webscraper(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        # Request a JSON object so the reply can be fed to json.loads.
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

In [86]:
# Run the full pipeline on the sample page: scrape it, then have the model
# select the links to keep (get_links returns the parsed JSON reply).
important_links = get_links('https://en.wikipedia.org/wiki/Muhammadu_Buhari')

In [87]:
# Display the parsed reply returned by get_links.
important_links

{'links': [{'type': 'about page',
   'url': 'https://www.wikidata.org/wiki/Q361567'},
  {'type': 'wikipedia page',
   'url': 'https://en.wikipedia.org/wiki/Muhammadu_Buhari'},
  {'type': 'official site', 'url': 'https://thisisbuhari.com'},
  {'type': 'news about company',
   'url': 'https://www.theguardian.com/world/2015/mar/31/muhammadu-buhari-military-dictator-nigeria-new-democratic-president'},
  {'type': 'career page',
   'url': 'https://www.bbc.com/news/world-africa-32139858'}]}