In [None]:
import re
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from google import genai



In [None]:
load_dotenv()
api_key = os.getenv('GEMINI_API_KEY')
if api_key:
    print("API Key looks good.")
else:
    print("There might be some problem with your API Key. Please check.")

# import google.generativeai as genai
# genai.configure(api_key=api_key)
# MODEL = "gemini-2.0-flash-lite"
# Content = "Explain how AI works in a few words"
# Gemini = genai.GenerativeModel(MODEL)

# response = Gemini.generate_content(
#     contents=Content
# )

Gemini = genai.Client(api_key=api_key)
MODEL = "gemini-2.0-flash-lite"
Content = "Explain how AI works in a few words"
response = Gemini.models.generate_content(
    model=MODEL, contents=Content
)
print(response.text)

API Key looks good.
AI works by learning and reasoning like humans, but with code.



In [53]:
def Chrome_Version():
    ChromeVer = !reg query "HKEY_CURRENT_USER\Software\Google\Chrome\BLBeacon" /v version
    text = '\n'.join(ChromeVer)

    # Extract the version number
    version = re.search(r'\d+\.\d+\.\d+\.\d+', text)
    ChromeVer = version.group() if version else "Not found"
    #print("Chrome version:", version.group() if version else "Not found")
    return ChromeVer
latestversion = Chrome_Version()

In [54]:
browser = "Chrome/"+latestversion
headers = {
    "User-Agent": browser + " Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    '''Scaping website'''

    def __init__(self, url):
        self.url = url
        response = requests.get(url=url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found."

        if soup.body:
            for irrelavant in soup.body(["script", "style", "img", "input"]):
                irrelavant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        #print(self.text)

        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]
        #print(self.links)
    
    def get_contents(self):
        return f"Webpage Title: \n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [55]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-

In [56]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [57]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [58]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [59]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/
https://edwarddonner.com/2025/01/23/ll

In [109]:
def get_links(url):
    website = Website(url)
    messages = []
    # if link_system_prompt:
    #     messages.append({"role": "system", "content": link_system_prompt})
    # messages.append({"role": "user", "content": get_links_user_prompt(website)})
    # system_instruction = {"role": "system", "content": link_system_prompt}
    # prompt = {"role": "user", "content": get_links_user_prompt(website)}
   
    #generation_config = {"response_mime_type": "application/json"}
    # message = [
    #         {"role": "system", "content": link_system_prompt},
    #         {"role": "user", "content": get_links_user_prompt(website)}
    #   ]

    full_prompt = f"""
    System: {link_system_prompt}

    User: {get_links_user_prompt(website)}

    Please respond in valid JSON format.
    """
    try:
        response = Gemini.models.generate_content(
            model=MODEL,
            contents=full_prompt
            # model=MODEL,
            # messages,
            # generation_config = {"response_mime_type": "application/json"}
            # response_format={"type": "json_object"}
        )
        raw_text = response.text.strip()

        # Extract JSON using regex
        json_match = re.search(r'\{.*\}', raw_text, re.DOTALL)
        if json_match:
            json_data = json.loads(json_match.group())
            return json_data
        else:
            print("No valid JSON found in the response.")
            print("Raw response:", raw_text)
            return None

    except json.JSONDecodeError:
        print("Error decoding JSON response")
        print("Raw response:", response.text)
        return None

    # result = response.text
    # return json.loads(result)

In [110]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nari-labs/Dia-1.6B',
 '/Qwen/Qwen3-235B-A22B',
 '/deepseek-ai/DeepSeek-Prover-V2-671B',
 '/Qwen/Qwen3-30B-A3B',
 '/sand-ai/MAGI-1',
 '/models',
 '/spaces/nari-labs/Dia-1.6B',
 '/spaces/enzostvs/deepsite',
 '/spaces/Qwen/Qwen3-Demo',
 '/spaces/stepfun-ai/Step1X-Edit',
 '/spaces/nvidia/describe-anything-model-demo',
 '/spaces',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/OpenGVLab/InternVL-Data',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/Eureka-Lab/PHYBench',
 '/datasets/Anthropic/values-in-the-wild',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transformers',
 '/docs/diffusers'

In [111]:
get_links("https://huggingface.co")

{'links': [{'type': 'models page', 'url': 'https://huggingface.co/models'},
  {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'},
  {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'},
  {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'},
  {'type': 'pricing page (endpoints)',
   'url': 'https://huggingface.co/pricing#endpoints'},
  {'type': 'pricing page (spaces)',
   'url': 'https://huggingface.co/pricing#spaces'},
  {'type': 'docs page', 'url': 'https://huggingface.co/docs'},
  {'type': 'careers/jobs page',
   'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog page', 'url': 'https://huggingface.co/blog'},
  {'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'brand page', 'url': 'https://huggingface.co/brand'},
  {'type': 'learn page', 'url': 'https://huggingface.co/learn'},
  {'type': 'LinkedIn page',
   'url': 'https://

In [112]:
# Make the Brochure

def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    try:
        for link in links["links"]:
            result += f"\n\n{link['type']}\n"
            result += Website(link["url"]).get_contents()
    except:
        print("None type object")
    return result

In [113]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': '/huggingface'}, {'type': 'models page', 'url': '/models'}, {'type': 'datasets page', 'url': '/datasets'}, {'type': 'spaces page', 'url': '/spaces'}, {'type': 'enterprise page', 'url': '/enterprise'}, {'type': 'pricing page', 'url': '/pricing'}, {'type': 'Careers/Jobs page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'Learn page', 'url': '/learn'}, {'type': 'Blog page', 'url': '/blog'}, {'type': 'Documentation page', 'url': '/docs'}, {'type': 'Community Forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn', 'url': 'https://www.linkedin.com/company/huggingface/'}]}
None type object
Landing page:
Webpage Title: 
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community bu