In [1]:
import json

import requests
from bs4 import BeautifulSoup
from IPython.display import display, Markdown

## An AI-powered company brochure by Ollama

In [3]:
# Request headers carrying a desktop-Chrome User-Agent string; Website passes
# them to requests.get when it downloads a page.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """A downloaded web page reduced to its title, text and outgoing links.

    Attributes (all set by ``__init__``):
        url: the address that was requested.
        title: string of the ``<title>`` tag, or "No title found".
        text: text of ``<body>`` after script/img/style/input elements are
            removed, one fragment per line; "" when the page has no body.
        links: every non-empty ``href`` found on ``<a>`` tags, exactly as
            written in the page (they may be relative).
    """

    def __init__(self, url):
        """Download ``url`` and parse it with BeautifulSoup's html.parser.

        Args:
            url: address of the page to fetch.
        """
        self.url = url
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')

        # .string can be None even when a <title> tag exists (e.g. it is
        # empty), so fall back in that case as well as when the tag is missing.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        # Non-HTML responses and fragments have no <body>; calling soup.body(...)
        # on None would raise, so treat them as pages without text.
        if soup.body is not None:
            for irrelevant in soup.body(['script', 'img', 'style', 'input']):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator='\n', strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as one block for an LLM prompt."""
        return f"\nWebsite Title: {self.title} \nWebsite Content: \n{self.text}\n\n"

In [4]:
# System prompt for the link-selection call: prose instructions first, then a
# literal JSON sample showing the reply shape the model should imitate.
system_prompt_get_links = (
    "You are provided with a list of links found on a webpage. You are able "
    "to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or a Careers/Jobs page.\n"
    "You should respond in JSON as in the following example: "
    "\n"
    "{\n"
    ' "links": [\n'
    '     {"type": "about page", "url": "https://baseurl/about"},\n'
    '     {"type": "careers page", "url": "https://another/url/careers"}\n'
    " ]   \n"
    "}\n"
)

In [5]:
def get_user_prompt_get_links(website):
    """Build the user message asking the model to pick brochure-worthy links.

    Args:
        website: object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        str: one instruction sentence block ending in a newline, followed by
        the links joined one per line.
    """
    # Adjacent literals instead of backslash continuations inside one literal:
    # the latter embeds the continuation lines' indentation into the prompt.
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the "
        "company. Respond with full https URLs in JSON format. Do not include Terms of Service, "
        "Privacy, email links, pdf links.\n"
    )
    return instructions + "\n".join(website.links)

In [6]:
def messages_for_get_links(website):
    """Assemble the two-turn chat transcript for the link-selection request.

    The system turn carries the module-level ``system_prompt_get_links``; the
    user turn is produced from ``website`` by ``get_user_prompt_get_links``.

    Returns:
        list[dict]: ``[system_message, user_message]`` with "role"/"content" keys.
    """
    system_turn = {"role": "system", "content": system_prompt_get_links}
    user_turn = {"role": "user", "content": get_user_prompt_get_links(website)}
    return [system_turn, user_turn]

In [7]:
# Connection settings shared by the requests.post calls in this notebook:
# the chat endpoint URL, the request headers, and the model name.
MODEL = "llama3.2"
HEADERS = {
    "Content-Type": "application/json",
}
OLLAMA_API = (
    "http://localhost:11434"  # host and port
    "/api/chat"               # endpoint path
)

In [8]:
def get_links(url):
    website = Website(url)
    payload = {"messages": messages_for_get_links(website), "model": MODEL, "stream": False}
    response = requests.post(OLLAMA_API, headers = HEADERS, json = payload)
    print(response.json()["message"]["content"])

In [9]:
company = "https://investindia.gov.in"

In [10]:
def get_all_details(url):
    details = "\nLanding Page\n"
    details += Website(company).get_contents()
    details += "\nFound links\n"
    links = get_links(company)
    for link in links or "":
        if link != null:
            details += f"\n{link["type"]}\n"
            details += Website(link["url"]).get_contents()
    return details

In [11]:
# System prompt for the brochure-writing call.
system_prompt_get_brochure = (
    "You are an assistant that analyzes the contents of several relevant "
    "pages from a company website and creates a short brochure of the company for prospective customers, "
    "investors and recruits. Respond in markdown. Include details of customers, growth and results, "
    "careers/jobs, policies if you have the information"
)

def get_user_prompt_get_brochure(company_name, url):
    """Build the user message for the brochure-writing request.

    Args:
        company_name: name of the company, inserted into the first line.
        url: landing page address, passed to ``get_all_details``.

    Returns:
        str: two instruction lines followed by the text that
        ``get_all_details(url)`` returns.
    """
    # Adjacent literals instead of a backslash continuation inside one literal:
    # the latter embeds the continuation line's indentation into the prompt.
    intro = (
        f"You are looking at a company called {company_name}\n"
        "Here are the contents of its landing page and other relevant pages; use this "
        "information to build a short brochure of the company in markdown.\n"
    )
    return intro + get_all_details(url)

def messages_for_get_brochure(company_name, url):
    """Assemble the two-turn chat transcript for the brochure request.

    The system turn carries the module-level ``system_prompt_get_brochure``;
    the user turn comes from ``get_user_prompt_get_brochure(company_name, url)``.

    Returns:
        list[dict]: ``[system_message, user_message]`` with "role"/"content" keys.
    """
    system_turn = {"role": "system", "content": system_prompt_get_brochure}
    user_turn = {
        "role": "user",
        "content": get_user_prompt_get_brochure(company_name, url),
    }
    return [system_turn, user_turn]

In [12]:
def get_brochure(company_name, url):
    """Generate the brochure text for a company.

    Posts the brochure chat messages to ``OLLAMA_API`` as a non-streaming
    request for ``MODEL`` and returns the ``message.content`` string taken
    from the JSON reply.
    """
    request_body = {
        "messages": messages_for_get_brochure(company_name, url),
        "model": MODEL,
        "stream": False,
    }
    reply = requests.post(OLLAMA_API, headers=HEADERS, json=request_body).json()
    return reply["message"]["content"]

In [13]:
# The model is asked to respond in markdown, so render the brochure as rich
# output instead of printing the raw markup.
display(Markdown(get_brochure("Invest India", company)))

{
  "links": [
    {
      "type": "About page",
      "url": "https://www.investindia.gov.in/"
    },
    {
      "type": "Company Overview",
      "url": "https://static.investindia.gov.in/s3fs-public/2025-01/fdi_policy_consolidated.pdf"
    },
    {
      "type": "Invest India initiative",
      "url": "https://www.investindia.gov.in/india-opportunity"
    },
    {
      "type": "Sectors and Industries",
      "url": "https://www.investindia.gov.in/sectors/list"
    },
    {
      "type": "States list",
      "url": "https://www.investindia.gov.in/states/list"
    },
    {
      "type": "Contact Us",
      "url": "https://www.investindia.gov.in/contact-us"
    },
    {
      "type": "Invest India Feedback",
      "url": "https://invest-india-feedback.gov.in/"
    }
  ]
}
This appears to be a website for Invest India, the National Investment Promotion and Facilitation Agency of India. The website provides information on various sectors in India that are open to investment, as well as