In [1]:
import os
import requests
from bs4 import BeautifulSoup
from typing import List
from dotenv import load_dotenv
from openai import OpenAI
import google.generativeai
import anthropic
import gradio as gr
import json
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display

In [2]:
load_dotenv(override=True)
google_api_key = os.getenv('GOOGLE_API_KEY')


In [3]:
MODEL = "gemini-2.0-flash-exp"
gemini = OpenAI(
    api_key=google_api_key, 
    base_url="https://generativelanguage.googleapis.com/v1beta/openai/"
)

In [4]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
web = Website("https://huggingface.co/")
web.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/mistralai/Devstral-Small-2505',
 '/google/gemma-3n-E4B-it-litert-preview',
 '/ByteDance-Seed/BAGEL-7B-MoT',
 '/nari-labs/Dia-1.6B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces/ByteDance/DreamO',
 '/spaces/smolagents/computer-agent',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/disco-eth/EuroSpeech',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',

In [6]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [7]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [8]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [9]:
print(get_links_user_prompt(web))

Here is the list of links on the website of https://huggingface.co/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/mistralai/Devstral-Small-2505
/google/gemma-3n-E4B-it-litert-preview
/ByteDance-Seed/BAGEL-7B-MoT
/nari-labs/Dia-1.6B
/multimodalart/isometric-skeumorphic-3d-bnb
/models
/spaces/enzostvs/deepsite
/spaces/Lightricks/ltx-video-distilled
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces/ByteDance/DreamO
/spaces/smolagents/computer-agent
/spaces
/datasets/openbmb/Ultra-FineWeb
/datasets/disco-eth/EuroSpeech
/datasets/nvidia/OpenCodeReasoning
/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset
/datasets/nvidia/OpenMathReasoning
/datasets
/join
/pricing#endpoints
/pricing#spaces
/pricing
/enterprise
/enterprise
/enterprise
/e

In [10]:
HEADERS = {"Content-Type": "application/json"}
def get_links(url):
    website = Website(url)
    response = gemini.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [11]:
get_links("https://huggingface.co")

{'links': [{'type': 'About', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'Enterprise', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'Pricing', 'url': 'https://huggingface.co/pricing'},
  {'type': 'Careers', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'Blog', 'url': 'https://huggingface.co/blog'},
  {'type': 'Documentation', 'url': 'https://huggingface.co/docs'},
  {'type': 'Brand', 'url': 'https://huggingface.co/brand'},
  {'type': 'Join', 'url': 'https://huggingface.co/join'}]}

In [12]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url) 
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [13]:
print(get_all_details("https://huggingface.co/"))

Found links: {'links': [{'type': 'models', 'url': 'https://huggingface.co/models'}, {'type': 'datasets', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces', 'url': 'https://huggingface.co/spaces'}, {'type': 'docs', 'url': 'https://huggingface.co/docs'}, {'type': 'enterprise', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing', 'url': 'https://huggingface.co/pricing'}, {'type': 'join', 'url': 'https://huggingface.co/join'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'brand', 'url': 'https://huggingface.co/brand'}, {'type': 'huggingface', 'url': 'https://huggingface.co/huggingface'}, {'type': 'About', 'url': 'https://huggingface.co/huggingface'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
Th

In [14]:
system_prompt_eng = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# For Thai version
system_prompt_thai = "You are an Thai specific assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information. And you will always answer in Thai Language"


In [15]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [16]:
def create_brochure(company_name, url):
    response = gemini.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt_eng},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))
    #return result

In [17]:
create_brochure("Hugging Face", "https://huggingface.co/")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing', 'url': 'https://huggingface.co/pricing'}, {'type': 'brand', 'url': 'https://huggingface.co/brand'}, {'type': 'join', 'url': 'https://huggingface.co/join'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}]}


```markdown
# Hugging Face: The AI Community Building the Future

## Welcome to the Home of Machine Learning

Hugging Face is the leading platform where the machine learning community collaborates on models, datasets, and applications. We empower developers, researchers, and organizations to build, share, and deploy cutting-edge AI technologies.

## Key Highlights

*   **1M+ Models:** Explore a vast repository of pre-trained models for various tasks, including natural language processing, computer vision, and more.
*   **400k+ Applications (Spaces):** Discover and interact with a wide range of AI-powered applications created by the community.
*   **250k+ Datasets:** Access a diverse collection of datasets to fuel your machine learning projects.

## Why Choose Hugging Face?

*   **Collaboration:** Join a vibrant community of AI enthusiasts and experts, fostering collaboration and knowledge sharing.
*   **Open Source:** Leverage our open-source stack, including Transformers, Diffusers, and Datasets, to accelerate your development process.
*   **Modality Support:** Work with various data types, including text, image, video, audio, and even 3D.
*   **Portfolio Building:** Showcase your work, build your ML profile, and gain recognition within the community.

## Products & Services

*   **Models, Datasets, Spaces:** A central hub for discovering, sharing, and collaborating on AI resources.
*   **Inference Endpoints:** Deploy and scale your models with optimized inference endpoints, starting at $0.60/hour for GPU.
*   **Enterprise Solutions:** Equip your team with enterprise-grade security, access controls, dedicated support, and more, starting at $20/user/month.

    *   Single Sign-On
    *   Regions
    *   Priority Support
    *   Audit Logs
    *   Resource Groups
    *   Private Datasets Viewer

## Trusted by Leading Organizations

More than 50,000 organizations use Hugging Face, including:

*   AI2
*   AI at Meta
*   Amazon
*   Google
*   Intel
*   Microsoft
*   Grammarly
*   Writer

## Open Source Foundation

We are committed to building the foundation of ML tooling with the community:

*   **Transformers:** State-of-the-art ML for PyTorch, TensorFlow, JAX
*   **Diffusers:** State-of-the-art Diffusion models in PyTorch
*   **Safetensors:** Safe way to store/distribute neural network weights
*   **Hub Python Library:** Python client to interact with the Hugging Face Hub
*   **Tokenizers:** Fast tokenizers optimized for research & production
*   **TRL:** Train transformers LMs with reinforcement learning
*   **Transformers.js:** State-of-the-art ML running directly in your browser
*   **smolagents:** Smol library to build great agents in Python
*   **PEFT:** Parameter-efficient finetuning for large language models
*   **Datasets:** Access & share datasets for any ML tasks
*   **Text Generation Inference:** Serve language models with TGI optimized toolkit
*   **Accelerate:** Train PyTorch models with multi-GPU, TPU, mixed precision

## Community & Culture

Hugging Face is driven by a team of 200+ passionate individuals dedicated to democratizing AI and building a collaborative community. Our mission is to demo.

## Get Started

*   **Sign Up:** Join the Hugging Face community and start building!
*   **Explore AI Apps:** Discover innovative applications created by the community.
*   **Browse Models, Datasets, and Spaces:** Dive into our extensive collection of AI resources.

## Connect With Us

*   **Website:** huggingface.co
*   **GitHub:** github.com/huggingface
*   **Twitter:** twitter.com/huggingface
*   **LinkedIn:** linkedin.com/company/hugging-face
*   **Discord:** discord.gg/huggingface

## Join Our Team

Visit our website to explore exciting career opportunities and become part of the Hugging Face team!
```

In [18]:
def stream_gemini(system_prompt,user_prompt):
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
      ]
    stream_response = gemini.chat.completions.create(
        model=MODEL,
        messages=messages,
        stream=True
    )
    response = ""
    for chunk in stream_response:
        response += chunk.choices[0].delta.content or ''
        yield response

In [19]:
def stream_brochure(company_name, url, version):
    user_prompt_content = f"Please generate a company brochure for {company_name}. Here is their landing page:\n"
    user_prompt_content += Website(url).get_contents()

    active_system_prompt = ""
    if version == "Thai":
        active_system_prompt = system_prompt_thai 
    else:
        active_system_prompt = system_prompt_eng  
    result = stream_gemini(active_system_prompt,user_prompt_content) 
    yield from result

In [20]:
view = gr.Interface(
    fn=stream_brochure, 
    inputs=[
        gr.Textbox(label="Company name:"),
        gr.Textbox(label="Landing page URL including http:// or https://"),
        gr.Dropdown(["English", "Thai"], label="Select version"),
    ],  
    outputs=[gr.Markdown(label="Brochure:")],  
    flagging_mode="never"
)
view.launch()

* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




In [21]:
with gr.Blocks(theme=gr.themes.Glass(), css="footer {display: none !important}") as view: # Added a theme and basic CSS to hide default footer
    gr.Markdown(
        """
        # 🚀 AI Brochure Generator
        Enter the company details below to generate a personalized brochure.
        """
    )

    with gr.Row():
        with gr.Column(scale=1):
            company_name_input = gr.Textbox(
                label="Company Name:",
                placeholder="e.g., Innovatech Solutions",
                info="The official name of the company."
            )
            landing_page_url_input = gr.Textbox(
                label="Landing Page URL:",
                placeholder="e.g., https://www.innovatech.com",
                info="Must include http:// or https://"
            )
            version_input = gr.Dropdown(
                ["English", "Thai"],
                label="Select Language Version",
                value="English",
                info="Choose the language for the brochure."
            )
            submit_button = gr.Button("Generate Brochure", variant="primary")

        with gr.Column(scale=2): 
            brochure_output = gr.Markdown(label="Generated Brochure:")

    submit_button.click(
        fn=stream_brochure,
        inputs=[company_name_input, landing_page_url_input, version_input],
        outputs=[brochure_output]
    )

view.launch(share=True,inbrowser=True)

* Running on local URL:  http://127.0.0.1:7863

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


