In [2]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [3]:
# Initialize and constants

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [4]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

##### First step: Have GPT-4o-mini figure out which links are relevant
##### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON.
It should decide which links are relevant, and replace relative links such as "/about" with "https://company.com/about".
We will use "one shot prompting" in which we provide an example of how it should respond in the prompt.

In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [16]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/microsoft/VibeVoice-1.5B',
 '/xai-org/grok-2',
 '/openbmb/MiniCPM-V-4_5',
 '/Qwen/Qwen-Image-Edit',
 '/deepseek-ai/DeepSeek-V3.1',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/syncora/synthetic-generation',
 '/spaces/multimodalart/Qwen-Image-Edit-Fast',
 '/spaces/Qwen/Qwen-Image-Edit',
 '/spaces',
 '/datasets/syncora/developer-productivity-simulated-behavioral-data',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/syncora/synthetic-healthcare-admissions',
 '/datasets/openai/healthbench',
 '/datasets/nvidia/Nemotron-Post-Training-Dataset-v2',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/m

In [17]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'GitHub page', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

##### Second step: Make the brochure

Assemble all the details into another gtp4-o prompt

In [9]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [10]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."

In [11]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [22]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nmicrosoft/VibeVoice-1.5B\nUpdated\n3 days ago\n•\n67.6k\n•\n1.07k\nxai-org/grok-2\nUpdated\n7 days ago\n•\n3.99k\n•\n862\nopenbmb/MiniCPM-V-4_5\nUpdated\nabout 13 hours ago\n•\n8.46k\n•\n725\nQwen/Qwen-Image-Edit\nUpdated\n6 days ago\n•\n72.3k\n•\n1.53k\ndeepseek-ai/DeepSeek-V3.1\nUpdated\n4 days ago\n•\n72.4k\n•\n660\nBrowse 1M+ models\nSpaces\nRunning\n12.5k\n12.5k\nDeepSite v2\n🐳\nGener

In [12]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [13]:
get_brochure = create_brochure("HuggingFace", "https://huggingface.co")
get_brochure

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'company social media page', 'url': 'https://twitter.com/huggingface'}, {'type': 'company social media page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face: The Future of AI

Welcome to Hugging Face, the vibrant AI community dedicated to building the future of artificial intelligence. Our collaborative platform is designed for machine learning enthusiasts, businesses, and researchers alike. 

## Our Offerings

### **Explore Models and Datasets**
- **Models**: With over 1 million state-of-the-art models available, including options from major companies like Microsoft, Google, and Amazon, you can find exactly what you need to power your applications.
- **Datasets**: Access around 250,000 datasets tailored for various machine learning tasks, facilitating innovation and research.

### **Collaborative Spaces**
Our platform hosts over 400,000 applications. Collaborate in real-time in one of our many spaces, or use our tools to create your own.

### **Enterprise Solutions**
For teams and enterprises, we offer GPU compute resources and enterprise-grade security, ensuring powerful and secure AI model deployment, starting from just $20/user/month.

## Our Community 
With over 50,000 organizations leveraging Hugging Face, we foster a rich community of users who share and collaborate on ML models and datasets. Gaming giants like Meta and multi-nationals like Google and Intel turn to our platform for their advanced AI solutions.

### **Open Source Commitment**
At Hugging Face, we believe in open-source collaboration. Our essential libraries for machine learning—including Transformers, Diffusers, and Tokenizers—provide researchers and practitioners with the tools to innovate.

## Company Culture
Hugging Face champions a progressive, inclusive, and community-driven workplace culture. We prioritize collaboration and empowerment, enabling our team and users to push boundaries and explore new frontiers in AI. We support a diverse range of voices and ideas, believing they are crucial for breakthroughs in technology.

## Join Us
### **Careers at Hugging Face**
We are constantly on the lookout for passionate individuals ready to make a difference. If you are interested in working with cutting-edge technology and being part of a team that values growth, innovation, and community, check out our careers page for current job openings.

---

For more detailed information, don't hesitate to visit our [website](https://huggingface.co) and explore the exciting world of AI and machine learning with Hugging Face!

### Brochure Translator to Bengali 

In [1]:
system_prompt = """
You are a Bengali translator who can translate English text to Bengali. You will be given a brochure of a company and you need to translate it into Bengali. Make sure 
to keep the formatting and layout of the original brochure intact.
"""

In [14]:
def user_prompt_translate_brochure(company_name, brochure):
    user_prompt = f"You are given the brochure for the company {company_name}."
    user_prompt += f"Please translate the following brochure to Bengali: {brochure}"
    return user_prompt


In [17]:
print(get_brochure)

None
