In [1]:
import os
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display

In [2]:
import ollama

In [3]:
class Website:
    """A fetched web page reduced to its title, body text and raw link targets.

    Attributes:
        url: The URL that was requested.
        links: ``href`` value of every ``<a>`` tag that has one, exactly as
            written in the page (so entries may be relative).
        text: Text of the ``<body>`` element, or '' when the page has none.
        title: Text of the ``<title>`` element, or '' when the page has none.
    """

    def __init__(self, url, timeout=10):
        """Download ``url`` and extract its links, body text and title.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds handed to ``requests.get`` so that an
                unresponsive server cannot block the notebook indefinitely.

        Exceptions raised by ``requests.get`` are not caught here.
        """
        self.url = url
        self.links = []
        self.text = ''
        self.title = ''
        res = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(res.text, 'html.parser')
        # href=True keeps only anchors that actually carry an href attribute,
        # which is the case the old try/except KeyError was filtering for.
        self.links = [anchor['href'] for anchor in soup.find_all('a', href=True)]
        # A page may lack <body> or <title>; leave the '' defaults in that case
        # instead of hiding every possible error behind a bare except.
        if soup.body is not None:
            self.text = soup.body.get_text()
        if soup.title is not None:
            self.title = soup.title.get_text()

    def get_contents(self):
        """Return the title and body text formatted as one prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
# System message for the link-selection call: tells the model to answer with
# nothing but a comma-separated list of brochure-relevant links.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant "
    "to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages. "
    "Respond with a comma seperated list of the links and no other text"
)

In [5]:
def get_links_user_prompt(url):
    """Build the user message that asks the model to pick brochure links.

    Fetches ``url`` through ``Website`` and appends every link found on that
    page, one per line, beneath the selection instructions.
    """
    page = Website(url)
    instructions = (
        f"Here is the list of links on the website of {page.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with a comma seperated list of the full https URL. "
        "Do not include Terms of Service, Privacy, email links.\n"
        "Links (some might be relative links):\n"
    )
    link_lines = "\n".join(page.links)
    return instructions + link_lines

In [6]:
def get_final_links(url):
    """Ask the llama3.2 model which links on ``url`` belong in a brochure.

    Sends ``link_system_prompt`` plus the user prompt built for ``url`` to
    ``ollama.chat`` and returns the reply's message content unchanged.
    """
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(url)},
    ]
    reply = ollama.chat(model='llama3.2', messages=conversation)
    return reply['message']['content']

In [7]:
def _clean_link(item, base_url):
    """Turn one comma-separated item of the model reply into an absolute URL, or None."""
    # The model sometimes wraps a link in chatter ("Here are the links: https://...");
    # pull the URL out rather than discarding or fetching the whole item.
    found = re.search(r"https?://\S+", item)
    if found:
        return found.group(0).rstrip('.;)')
    candidate = item.strip()
    if not candidate or any(ch.isspace() for ch in candidate):
        return None  # empty item, or prose that contains no link
    # A bare path such as "/careers" is resolved against the landing page.
    return urljoin(base_url, candidate)


def all_details(url):
    """Collect the contents of the landing page and of every selected link.

    The landing page at ``url`` is fetched first. The reply of
    ``get_final_links(url)`` is then split on commas; each item is cleaned,
    resolved to an absolute URL and fetched, and its contents are appended
    after a newline.

    Previously the first item of the reply was always dropped (``[1::]``),
    which lost a relevant page whenever the model answered with a plain list.
    Every item is now considered; duplicates and links that point back to the
    landing page are skipped instead, and a link that cannot be fetched is
    reported and skipped rather than aborting the whole run.

    Args:
        url: Landing page of the company website.

    Returns:
        The concatenated ``get_contents()`` text of all fetched pages.
    """
    result = Website(url).get_contents()
    # Compare URLs without a trailing slash so "https://x" and "https://x/" match.
    seen = {url.strip().rstrip('/')}
    for item in get_final_links(url).split(','):
        link = _clean_link(item, url)
        if link is None:
            continue
        key = link.rstrip('/')
        if key in seen:
            continue
        seen.add(key)
        try:
            page = Website(link)
        except requests.RequestException as err:
            print(f"Skipping {link}: {err}")
            continue
        result += '\n'
        result += page.get_contents()
    return result

In [8]:
# System message for the brochure-writing call.
# Built from adjacent literals so each sentence keeps its separating space;
# the earlier backslash continuation fused "markdown." and "Include".
system_prompt_brochure = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)

In [9]:
def get_brochure_user_prompt(company_name, url):
    """Assemble the user message for the brochure-writing call.

    The message names the company, states the task, and appends the page
    contents gathered by ``all_details(url)``. Only the first 5,000
    characters of the assembled message are returned.
    """
    sections = [
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        all_details(url),
    ]
    full_prompt = "".join(sections)
    # Truncate if more than 5,000 characters
    return full_prompt[:5000]

In [10]:
def _strip_markdown_fence(text):
    """Remove a ``` / ```markdown fence that wraps the whole reply, keeping the content."""
    # Opening fence: only at the very start, optionally tagged markdown/md.
    unwrapped, opened = re.subn(r"\A\s*```(?:markdown|md)?[ \t]*(?:\n|\Z)", "", text)
    if opened:
        # Drop the matching closing fence only when it is the last thing in the text.
        unwrapped = re.sub(r"\n?```\s*\Z", "", unwrapped)
    return unwrapped


def stream_brochure(company_name, url):
    """Generate a brochure for the company and render it live as Markdown.

    Sends the brochure system prompt and the user prompt built for
    ``company_name`` / ``url`` to ``ollama.chat`` with ``stream=True``, then
    re-renders the accumulated reply in a single display area after each chunk.

    Only a code fence wrapping the entire reply is removed before rendering.
    The earlier blanket ``replace`` calls also deleted every inner code fence
    and every occurrence of the word "markdown" from the brochure text.

    Args:
        company_name: Name of the company, inserted into the user prompt.
        url: Landing page of the company website.

    Returns:
        None. The brochure is shown through the display handle.
    """
    response = ollama.chat(
        model='llama3.2',
        messages=[
            {"role": "system", "content": system_prompt_brochure},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
        stream=True
    )
    raw_reply = ''
    display_handle = display(Markdown(""), display_id=True)
    for chunk in response:
        # A chunk's content may be empty/None; treat that as no new text.
        raw_reply += chunk['message']['content'] or ''
        update_display(
            Markdown(_strip_markdown_fence(raw_reply)),
            display_id=display_handle.display_id,
        )

In [None]:
# Example usage: stream_brochure("Company Name", "https://www.example.com")