In [1]:
import re
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from google import genai

In [2]:
# Load variables from a .env file (if present) and read the Gemini key
# from the environment.
load_dotenv()
api_key = os.getenv('GEMINI_API_KEY')
print(
    "API Key looks good."
    if api_key
    else "There might be some problem with your API Key. Please check."
)

# Note: the legacy `google.generativeai` SDK (configure / GenerativeModel)
# is not used here; the notebook talks to the `google.genai` client instead.

# Model id and client reused by the functions defined in later cells.
MODEL = "gemini-2.0-flash-lite"
Gemini = genai.Client(api_key=api_key)

# One-off smoke test: send a tiny prompt and print the reply text.
Content = "Explain how AI works in a few words"
response = Gemini.models.generate_content(model=MODEL, contents=Content)
print(response.text)

API Key looks good.
AI learns from data to make decisions.



In [5]:
def Chrome_Version():
    r"""Return the locally installed Chrome version, or "Not found".

    Runs the Windows ``reg`` tool to read the ``version`` value under
    ``HKEY_CURRENT_USER\Software\Google\Chrome\BLBeacon`` and extracts the
    first dotted four-part number (e.g. ``117.0.5938.92``) from its output.

    Returns:
        str: the version string, or ``"Not found"`` when the command cannot
        be run (for example on a machine without ``reg``) or its output
        contains no version number.
    """
    # Function-scope import keeps this cell runnable on its own; subprocess
    # replaces the IPython-only `!reg query ...` shell magic so the function
    # is plain Python.
    import subprocess

    command = [
        "reg", "query",
        r"HKEY_CURRENT_USER\Software\Google\Chrome\BLBeacon",
        "/v", "version",
    ]
    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            errors="replace",  # never fail on oddly-encoded console output
            check=False,       # a non-zero exit just means "no version found"
        )
    except (OSError, subprocess.SubprocessError):
        # `reg` is missing or could not be started.
        return "Not found"

    # Extract the version number
    version = re.search(r'\d+\.\d+\.\d+\.\d+', completed.stdout or "")
    return version.group() if version else "Not found"


latestversion = Chrome_Version()

In [9]:
# `browser` is currently unused: the User-Agent below pins Chrome/117.0.0.0
# rather than appending the detected version (see the trailing `# + browser`).
browser = f"Chrome/{latestversion}"

# Request headers shared by every Website fetch.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )  # + browser
}

class Website:
    '''Scraped snapshot of a single web page: title, visible text and links.'''

    def __init__(self, url, timeout=30):
        '''Fetch `url` and parse it with BeautifulSoup.

        Args:
            url: address of the page to download.
            timeout: seconds passed to `requests.get` so a stalled server
                cannot hang the notebook indefinitely.

        Sets:
            self.url   - the URL that was requested.
            self.body  - raw response bytes.
            self.title - text of <title>, or "No title found.".
            self.text  - text of <body> with script/style/img/input removed.
            self.links - every non-empty href found on <a> tags, unmodified
                         (so relative links stay relative).
        '''
        self.url = url
        response = requests.get(url=url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None when <title> is empty or wraps nested tags, so
        # fall back to the placeholder in that case too, not only when the
        # tag is missing altogether.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found."

        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        '''Return the title and page text formatted as one prompt section.'''
        return f"Webpage Title: \n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [10]:
# Fetch the page and show the non-empty hrefs collected from its <a> tags.
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-

In [11]:
# System prompt for the link-selection request: describes the task and shows
# the JSON shape the reply should follow.
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
# The example must itself be valid JSON, since it is the template the model is
# asked to imitate (the second entry previously had ':' where ',' belongs).
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [12]:
# Inspect the assembled link-selection system prompt.
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [13]:
def get_links_user_prompt(website):
    """Build the user message that lists a page's links for the model.

    Args:
        website: object exposing `.url` (str) and `.links` (iterable of str),
            such as a `Website` instance.

    Returns:
        str: instructions followed by one link per line, in page order.
    """
    instructions = (
        f"Here is the list of links on the website of {website.url} - "
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    link_lines = "\n".join(website.links)
    return f"{instructions}Links (some might be relative links):\n{link_lines}"

In [14]:
# Preview the user prompt built from the links scraped into `ed`.
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/01/23/ll

In [15]:
def get_links(url):
    """Ask the model which links on `url` belong in a company brochure.

    Scrapes the page, sends the link list together with `link_system_prompt`
    as one text prompt, and parses the first-'{'-to-last-'}' span of the reply
    as JSON.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The decoded JSON value (expected shape: {"links": [{"type": ...,
        "url": ...}, ...]}), or None when the reply holds no JSON object or
        the JSON cannot be decoded. Both failure cases print the raw reply.
    """
    website = Website(url)

    # System and user parts are concatenated into a single prompt string.
    full_prompt = f"""
    System: {link_system_prompt}

    User: {get_links_user_prompt(website)}

    Please respond in valid JSON format.
    """
    response = Gemini.models.generate_content(
        model=MODEL,
        contents=full_prompt
    )
    # `.text` can be None when the reply carries no text part; treat that as
    # an empty reply instead of failing on `.strip()`.
    raw_text = (response.text or "").strip()

    # Extract JSON using regex: the model may wrap it in prose or code fences.
    json_match = re.search(r'\{.*\}', raw_text, re.DOTALL)
    if not json_match:
        print("No valid JSON found in the response.")
        print("Raw response:", raw_text)
        return None

    try:
        return json.loads(json_match.group())
    except json.JSONDecodeError:
        print("Error decoding JSON response")
        print("Raw response:", response.text)
        return None

In [16]:
# Scrape a second site; its hrefs are mostly relative paths (see output).
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nari-labs/Dia-1.6B',
 '/Qwen/Qwen3-235B-A22B',
 '/deepseek-ai/DeepSeek-Prover-V2-671B',
 '/Qwen/Qwen3-30B-A3B',
 '/moonshotai/Kimi-Audio-7B-Instruct',
 '/models',
 '/spaces/nari-labs/Dia-1.6B',
 '/spaces/enzostvs/deepsite',
 '/spaces/Qwen/Qwen3-Demo',
 '/spaces/stepfun-ai/Step1X-Edit',
 '/spaces/nvidia/describe-anything-model-demo',
 '/spaces',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/OpenGVLab/InternVL-Data',
 '/datasets/Eureka-Lab/PHYBench',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/Anthropic/values-in-the-wild',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',

In [17]:
# Let the model pick the brochure-relevant links and show the parsed JSON.
get_links("https://huggingface.co")

{'links': [{'type': 'models page', 'url': 'https://huggingface.co/models'},
  {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'},
  {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'docs page', 'url': 'https://huggingface.co/docs'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'}]}

In [18]:
# Make the Brochure

def get_all_details(url):
    """Collect the landing page text plus the text of each model-selected link.

    Args:
        url: address of the company's landing page.

    Returns:
        str: "Landing page:" followed by the page contents, then one section
        per link that could be fetched, headed by the link's "type". Links
        that fail to download or lack the expected keys are reported and
        skipped, so one bad link no longer discards the ones after it.
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)

    # get_links returns None when the model reply could not be parsed.
    if not isinstance(links, dict):
        print("None type object")
        return result

    for link in links.get("links") or []:
        try:
            # Build the whole section first so a failed fetch cannot leave a
            # dangling heading in the result.
            section = f"\n\n{link['type']}\n" + Website(link["url"]).get_contents()
        except Exception as exc:
            print(f"Skipping link {link!r}: {exc}")
            continue
        result += section
    return result

In [19]:
# Scrape the landing page and every selected link, then print the combined text.
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'documentation page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'discord page', 'url': 'https://huggingface.co/join/discord'}]}
L

In [20]:
# System prompt for the brochure-writing request. Each continued line ends
# with a space before the backslash so sentences do not run together.
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
# Include details of company culture, customers and careers/jobs if you have the information."


In [22]:
def get_brochure_user_prompt(company_name, url, max_chars=5000):
    """Build the user message for the brochure request.

    Args:
        company_name: name to present the company under.
        url: landing page address; its contents and those of the selected
            links are gathered with `get_all_details`.
        max_chars: maximum length of the returned prompt; text beyond it is
            cut off. Defaults to 5000. Pass None for no truncation.

    Returns:
        str: the instruction lines followed by the scraped page contents,
        truncated to `max_chars` characters.
    """
    user_prompt = f"You are looking at a company: {company_name}\n"
    # Adjacent literals instead of a backslash continuation: the continuation
    # used to pull the next line's indentation into the prompt text.
    user_prompt += (
        "Here are the contents of its landing page and other relevant pages; "
        "use this information to build a short brochure of the company in markdown.\n"
    )
    user_prompt += get_all_details(url)
    return user_prompt[:max_chars]

In [24]:
# Preview the truncated user prompt; building it scrapes the site and asks the model to pick links.
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'documentation', 'url': 'https://huggingface.co/docs'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'github', 'url': 'https://github.com/huggingface'}, {'type': 'twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'learn page', 'url': 'https://huggingface.co/learn'}]}


'You are looking at a company: HuggingFace\nHere are the contents of its landing page and other relevant pages     use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title: \nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnari-labs/Dia-1.6B\nUpdated\n4 days ago\n•\n97.3k\n•\n1.61k\nQwen/Qwen3-235B-A22B\nUpdated\nabout 4 hours ago\n•\n15.2k\n•\n557\ndeepseek-ai/DeepSeek-Prover-V2-671B\nUpdated\nabout 13 hours ago\n•\n545\n•\n518\nQwen/Qwen3-30B-A3B\nUpdated\n1 day ago\n•\n22.4k\n•\n357\nmoonshotai/Kimi-Audio-7B-Instruct\nUpdated\n3 days ago\n•\n2.89k\n•\n244\nBrowse 1M+ models\nSpaces\nRunning\non\nZero\n996\n996\n

In [35]:
def create_brochure(company_name, url):
    """Generate a brochure for the company and render it as Markdown.

    Args:
        company_name: name to present the company under.
        url: landing page address used to gather the source material.

    Returns:
        None. The brochure is shown with `display(Markdown(...))`; if the
        model request fails or yields no text, a message is printed instead.
    """
    full_prompt = f"""
    System: {system_prompt}

    User: {get_brochure_user_prompt(company_name,url)}

    """
    try:
        response = Gemini.models.generate_content(
            model=MODEL,
            contents=full_prompt
        )
    except Exception as exc:
        # Report what actually went wrong with the request; Exception (not a
        # bare except) lets KeyboardInterrupt still stop the cell.
        print(f"Error generating brochure: {exc}")
        return None

    result = response.text
    if not result:
        # No text came back, so there is nothing to render.
        print("The model returned no text for the brochure.")
        return None
    display(Markdown(result))

In [36]:
# Generate and render the brochure in one shot.
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'about/docs page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'careers/jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'learn page', 'url': 'https://huggingface.co/learn'}, {'type': 'company info/brand page', 'url': 'https://huggingface.co/brand'}, {'type': 'about/tasks page', 'url': 'https://huggingface.co/tasks'}, {'type': 'social media - linkedin', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'social media - twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'social media - github', 'url': 'https://gith

## Hugging Face: The AI Community Building the Future

**Welcome to Hugging Face, the leading platform where the machine learning community collaborates on models, datasets, and applications.**

### **What We Do**

We are building the future of AI by providing the tools and platform for creating, discovering, and collaborating on machine learning. Our mission is to make AI more accessible and accelerate its progress.

*   **Collaboration:** Host and collaborate on unlimited public models, datasets, and applications.
*   **Innovation:** Explore AI apps and browse over 1 million models.
*   **Community:** Engage with a vibrant community of AI enthusiasts, researchers, and developers.
*   **Open Source:** Building the foundation of ML tooling with the community.

### **Our Key Offerings**

*   **Models:** A vast library of state-of-the-art machine learning models.
*   **Datasets:** Access and share datasets for any ML tasks.
*   **Spaces:** Run and share AI applications with ease.
*   **Open Source Tools:**
    *   Transformers
    *   Diffusers
    *   Safetensors
    *   Hub Python Library
    *   Tokenizers
    *   TRL
    *   Transformers.js
    *   PEFT
    *   Datasets
    *   Text Generation Inference
    *   Accelerate

### **Who We Serve**

*   **AI Researchers:** Create, share, and discover cutting-edge models and datasets.
*   **Developers:** Build and deploy AI applications using our open-source stack.
*   **Enterprises:** Leverage our Enterprise solutions for secure, scalable, and collaborative AI development.

### **Enterprise Solutions**

Give your team the most advanced platform to build AI with enterprise-grade security, access controls and dedicated support.

*   Single Sign-On
*   Regions
*   Priority Support
*   Audit Logs
*   Resource Groups
*   Private Datasets Viewer

### **Customers**

More than 50,000 organizations are using Hugging Face

*   Ai2 (non-profit)
*   AI at Meta
*   Amazon
*   Google
*   Intel
*   Microsoft
*   Grammarly
*   Writer

### **Join Our Community**

*   **Sign up:**  Get started today to explore our platform and connect with the AI community.
*   **Explore:** Browse our models, datasets, and Spaces.
*   **Contribute:** Help us build the future of AI by contributing to our open-source projects.

### **Careers at Hugging Face**

Check out our website for current job openings in various roles.

### **Contact Us**

*   [Website](https://huggingface.co/)
*   [Twitter](https://twitter.com/HuggingFace)
*   [LinkedIn](https://www.linkedin.com/company/hugging-face/)
*   [Discord](https://discord.com/invite/huggingface)


In [42]:
def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally as chunks arrive.

    Args:
        company_name: name to present the company under.
        url: landing page address used to gather the source material.

    Returns:
        None. Output is shown through an updating Markdown display; if the
        request or the stream fails, a message is printed and whatever was
        rendered so far stays on screen.
    """
    full_prompt = f"""
    System: {system_prompt}

    User: {get_brochure_user_prompt(company_name,url)}
    """

    raw_response = ""
    try:
        stream = Gemini.models.generate_content_stream(
            model=MODEL,
            contents=full_prompt
        )
        display_handle = display(Markdown(""), display_id=True)
        # The loop sits inside the try because a streaming request can also
        # fail while chunks are being consumed, not only when it is created.
        for chunk in stream:
            raw_response += chunk.text or ''
            # Strip code-fence markers (``` or ```markdown) from the full
            # accumulated text. Cleaning the raw text each time handles a
            # fence split across chunks and leaves the ordinary word
            # "markdown" in the brochure body untouched.
            cleaned = re.sub(r"```(?:markdown)?", "", raw_response)
            update_display(Markdown(cleaned), display_id=display_handle.display_id)
    except Exception as exc:
        print(f"Error while streaming brochure: {exc}")
        return None

In [43]:
# Same brochure, rendered progressively while the model streams its reply.
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'docs page', 'url': 'https://huggingface.co/docs'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'learn page', 'url': 'https://huggingface.co/learn'}, {'type': 'github page', 'url': 'https://github.com/huggingface'}, {'type': 'twitter page', 'url': 'https://twitter.com/huggingface'}, {'type': 'linkedin page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}



# Hugging Face: The AI Community Building the Future

**Welcome to Hugging Face, the platform where the machine learning community collaborates on models, datasets, and applications.**

## About Us

We are building the future of AI by providing the tools and platform for the machine learning community to create, discover, and collaborate. We are committed to open source, empowering researchers and developers to accelerate their ML projects. Our platform hosts a vast collection of models, datasets, and applications, fostering innovation across various modalities, including text, image, video, and audio.

## Our Mission

To democratize AI through open-source tools and community collaboration, making cutting-edge machine learning accessible to everyone.

## What We Offer

*   **Models:** Explore and utilize a vast library of over 1 million pre-trained models.
*   **Datasets:** Access and share over 250,000 datasets for various ML tasks.
*   **Spaces:**  Run and showcase AI applications, allowing users to generate realistic dialogue, create applications, and more.
*   **Open Source:** Contribute to and leverage our open-source tools, including Transformers, Diffusers, and more.

## Key Features

*   **Collaboration:**  Host and collaborate on public models, datasets, and applications.
*   **Community:**  Join a thriving community of over 50,000 organizations, including leading AI companies and research institutions.
*   **Accelerated ML:** Utilize our open-source stack for faster development.
*   **Multiple Modalities:**  Work with text, image, video, audio, and 3D data.

## Solutions for Everyone

*   **Individual Developers:**  Share your work, build your ML profile, and access powerful tools.
*   **Enterprises:** Benefit from enterprise-grade security, access controls, dedicated support, and private datasets.
*   **Compute:** Deploy your models on optimized Inference Endpoints and utilize GPUs.

## Customers

We are trusted by over 50,000 organizations, including:

*   AI2 (Enterprise, non-profit)
*   AI at Meta (Enterprise, company)
*   Amazon (company)
*   Google (company)
*   Intel (company)
*   Microsoft (company)
*   Grammarly (Enterprise, company)
*   Writer (Enterprise, company)

## Open Source Projects

We are building the foundation of ML tooling with the community through projects like:

*   Transformers
*   Diffusers
*   Safetensors
*   Hub Python Library
*   Tokenizers
*   TRL
*   Transformers.js
*   smolagents
*   PEFT
*   Datasets
*   Text Generation Inference
*   Accelerate

## Careers at Hugging Face

We don't have specific job postings, but Hugging Face has a Jobs page to apply for available positions.

## Contact Us

*   **Website:** [https://huggingface.co/](https://huggingface.co/)
*   **Social Media:** GitHub, Twitter, LinkedIn, Discord

**Join us and be part of the future of AI!**

