## 1. First step 
## Brochure for website

In [1]:
from pprint import pprint
from agent_tools.webscraper_tools import WebScraper
from IPython.display import Markdown, display
from openai import OpenAI
import json
from dotenv import load_dotenv
import os
import gradio as gr
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [13]:
MODEL = "gpt-4o-mini"
API_KEY = os.getenv("OPENAI_API_KEY")

In [3]:
link_system_prompt = f"You are provided with a list of links found on a webpage. Strictly follow the rules, Do not change anything\n"
link_system_prompt += f"You need to return all the links in JSON format as follows:\n"
link_system_prompt += """{
        links: [
            {"type": "about page", "url":"https://example.com/about"},
            {"type": "careers", "url":"https://example.com/careers"}
    ] 
}"""

In [4]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the webpage: {website.url}\n"
    user_prompt += "Please decide which of these are relevant web links for a brochure about company, respond with the full https URL: Do not include Terms of Service, Privacy Policy, Cookie Policy, or any other legal links such as linkedIn, youtube, X, twitter, etc. Please only include links for given URL from available URLs only\n"
    user_prompt += f"Links: (some might be relative links):\n"
    user_prompt += f"\n".join(website.get_all_links())
    return user_prompt

In [5]:
client = OpenAI(api_key=API_KEY)

In [6]:
def get_links(url):
    website = WebScraper(url)
    response = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)},
        ],
        response_format={"type": "json_object"},
    )
    result = response.choices[0].message.content
    if result is None:
        return {"links": []}
    cleaned_result = result.strip("```json").strip("```")
    display(Markdown(result))
    return json.loads(cleaned_result)

In [7]:
def get_all_details(url):
    result = "Landing page:\n"
    
    landing_content = WebScraper(url).get_body()
    result += landing_content if landing_content else "[Failed to fetch landing page]\n"

    links_dict = get_links(url)

    for link in links_dict['links']:
        result += f"\n\n{link['type'].capitalize()}:\n"
        link_content = WebScraper(link['url']).get_body()
        result += link_content if link_content else f"[Failed to fetch {link['url']}]\n"

    return result

In [8]:
brochure_system_prompt = "You are an assistant thaat analyzes the content of several relevant pages from copany website and creates a short brochre about the company for prospective customers, \
    investors. Respond in markdown format. Include details of comppany culture, custumers and careers/jobs if you have the information."

In [9]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here is the content from several relevant pages on their website: {url}\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:20_000] # Truncate to first 10,000 characters
    return user_prompt

In [10]:
def create_brochure_sync(company_name, url):
    stream = client.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
        ],
        stream=True
    )
    response = ""
    for chunk in stream:
        response += chunk.choices[0].delta.content or ""
        yield response

In [None]:
gr.Interface(
    fn=create_brochure_sync,
    inputs=[
        gr.Textbox(label="🧾 Company Name", placeholder="e.g. Acme Corp", lines=1),
        gr.Textbox(label="🌐 Company Website URL", placeholder="e.g. https://www.acme.com", lines=1)
    ],
    outputs=gr.Markdown(label="📄 Generated Brochure"),
    title="✨ AI-Powered Brochure Generator",
    description=(
        "Provide your company's name and website URL, and let the AI craft a professional brochure "
        "based on your site's content. Ideal for marketing, presentations, or quick overviews."
    ),
    theme="soft",  # Optional: adds a modern, rounded aesthetic
    allow_flagging="never"  # Optional: disables user flagging if not needed
).launch()



* Running on local URL:  http://0.0.0.0:7860
* To create a public link, set `share=True` in `launch()`.




[INFO] Initialized WebScraper for URL: https://www.google.com
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.com
[INFO] Successfully fetched content from https://www.google.com
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: https://www.google.com
[INFO] Extracting all hyperlinks...
[INFO] Fetching content from https://www.google.com
[INFO] Successfully fetched content from https://www.google.com
[INFO] Found 27 hyperlinks.


{
    "links": [
        {"type": "about page", "url": "https://www.google.co.in/intl/en/about/products?tab=wh"},
        {"type": "about page", "url": "http://www.google.co.in/services/"},
        {"type": "about page", "url": "https://www.google.co.in/intl/en/about.html"}
    ]
}

[INFO] Initialized WebScraper for URL: https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully fetched content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: http://www.google.co.in/services/
[INFO] Extracting body content...
[INFO] Fetching content from http://www.google.co.in/services/
[INFO] Successfully fetched content from http://www.google.co.in/services/
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: https://www.google.co.in/intl/en/about.html
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.co.in/intl/en/about.html
[INFO] Successfully fetched content from https://www.google.co.in/intl/en/about.html
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: h

{
    "links": [
        {"type": "about page", "url": "https://www.google.co.in/intl/en/about/products?tab=wh"},
        {"type": "careers", "url": "http://www.google.co.in/services/"}
    ]
}

[INFO] Initialized WebScraper for URL: https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully fetched content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: http://www.google.co.in/services/
[INFO] Extracting body content...
[INFO] Fetching content from http://www.google.co.in/services/
[INFO] Successfully fetched content from http://www.google.co.in/services/
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: 
[INFO] Extracting body content...
[INFO] Fetching content from 
[ERROR] Exception while fetching : Invalid URL '': No scheme supplied. Perhaps you meant https://?
[ERROR] Failed to retrieve content for body extraction.
[INFO] Initialized WebScraper for URL: 
[INFO] Extracting all hyperlinks...
[INFO] Fetching content 

{
    "links": []
}

[INFO] Initialized WebScraper for URL: https://www.google.com
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.com
[INFO] Successfully fetched content from https://www.google.com
[INFO] Successfully extracted body content.
[INFO] Initialized WebScraper for URL: https://www.google.com
[INFO] Extracting all hyperlinks...
[INFO] Fetching content from https://www.google.com
[INFO] Successfully fetched content from https://www.google.com
[INFO] Found 27 hyperlinks.


{
    "links": [
        {"type": "about page", "url":"https://www.google.co.in/intl/en/about/products?tab=wh"}
    ] 
}

[INFO] Initialized WebScraper for URL: https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Extracting body content...
[INFO] Fetching content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully fetched content from https://www.google.co.in/intl/en/about/products?tab=wh
[INFO] Successfully extracted body content.
