In [3]:
import os
import json
from dotenv import load_dotenv
from IPython.display import Markdown, display, update_display

import google.generativeai as genai
from scraper import fetch_website_links, fetch_website_contents

# Load environment variables (load_dotenv is told to override already-set values),
# then read the Gemini key from the environment.
load_dotenv(override=True)
api_key = os.getenv("GEMINI_API_KEY")

# Report whether a key was found; the notebook continues either way.
print("Gemini API key loaded" if api_key else "API key missing!")

# Hand the key to the Gemini SDK and name the model used by the calls below.
genai.configure(api_key=api_key)
MODEL = "gemini-flash-lite-latest"


Gemini API key loaded


In [2]:
pip install python-dotenv google-genai openai langchain fastapi uvicorn


Collecting openai
  Using cached openai-2.14.0-py3-none-any.whl.metadata (29 kB)
Collecting langchain
  Downloading langchain-1.2.3-py3-none-any.whl.metadata (4.9 kB)
Collecting fastapi
  Downloading fastapi-0.128.0-py3-none-any.whl.metadata (30 kB)
Collecting uvicorn
  Downloading uvicorn-0.40.0-py3-none-any.whl.metadata (6.7 kB)
Collecting jiter<1,>=0.10.0 (from openai)
  Downloading jiter-0.12.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting langchain-core<2.0.0,>=1.2.1 (from langchain)
  Using cached langchain_core-1.2.6-py3-none-any.whl.metadata (3.7 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Using cached langgraph-1.0.5-py3-none-any.whl.metadata (7.4 kB)
Collecting jsonpatch<2.0.0,>=1.33.0 (from langchain-core<2.0.0,>=1.2.1->langchain)
  Using cached jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting langsmith<1.0.0,>=0.3.45 (from langchain-core<2.0.0,>=1.2.1->langchain)
  Downloading langsmith-0.6.2-py3-none-a

In [4]:
# Instruction text for the link-selection request: asks the model to pick the
# brochure-relevant links and to answer only with a JSON object that holds a
# "links" list of {"type", "url"} entries.
link_system_prompt = """
You are provided with a list of links found on a webpage.
Decide which links are relevant for a company brochure
(About, Company, Careers, Jobs, Blog).

Respond ONLY in JSON format like:

{
  "links": [
    {"type": "about page", "url": "https://example.com/about"}
  ]
}
"""


In [5]:
def get_links_user_prompt(url):
    """Build the user prompt that lists the links scraped from ``url``.

    Calls ``fetch_website_links(url)`` and places the returned links, one per
    line, beneath instructions asking for only brochure-relevant links.

    Args:
        url: Address of the website whose links are listed.

    Returns:
        The prompt text as a single string.
    """
    scraped_links = fetch_website_links(url)
    # Joined outside the f-string so no escape sequence is needed inside the braces.
    link_lines = "\n".join(scraped_links)
    return f"""
Website: {url}

Below is a list of links found on the site.
Select only relevant links for a company brochure.
Ignore privacy, terms, mailto links.

Links:
{link_lines}
"""


In [6]:
def select_relevant_links(url):
    """Ask the model which links on ``url`` belong in a company brochure.

    Sends ``link_system_prompt`` plus the prompt from ``get_links_user_prompt(url)``
    to the model named by ``MODEL``, requesting a JSON response, and parses it.

    The model's reply is not trusted: unparseable text, a missing or non-list
    "links" value, and entries without a usable "url" are dropped instead of
    raising, so callers can always iterate ``result["links"]`` and read
    ``entry["type"]`` / ``entry["url"]`` as strings.

    Args:
        url: Address of the website whose links are evaluated.

    Returns:
        A dict of the form ``{"links": [{"type": str, "url": str, ...}, ...]}``;
        the list is empty when nothing usable came back.
    """
    print(f"Selecting relevant links for {url}")

    model = genai.GenerativeModel(
        MODEL,
        generation_config={
            "response_mime_type": "application/json",
            "temperature": 0.2,
        },
    )

    response = model.generate_content([
        link_system_prompt,
        get_links_user_prompt(url),
    ])

    try:
        parsed = json.loads(response.text)
    except ValueError as exc:
        # json.JSONDecodeError is a ValueError subclass.
        # NOTE(review): the SDK's `.text` accessor is also expected to raise
        # ValueError when the response carries no text part - confirm for the
        # installed google-generativeai version.
        print(f"Could not read link selection as JSON ({exc}); using no links")
        parsed = {}

    candidates = parsed.get("links") if isinstance(parsed, dict) else None
    if not isinstance(candidates, list):
        candidates = []

    selected = []
    for entry in candidates:
        if not isinstance(entry, dict):
            continue
        target = entry.get("url")
        if not isinstance(target, str) or not target:
            continue
        # Keep any extra keys the model returned, but guarantee a string "type"
        # because callers call .title() on it.
        selected.append({**entry, "type": str(entry.get("type") or "page")})

    links = {"links": selected}
    print(f"Found {len(links['links'])} relevant links")
    return links


In [7]:
def fetch_page_and_all_relevant_links(url):
    """Collect the landing page text plus the text of every selected link.

    Fetches the contents of ``url``, asks ``select_relevant_links`` which links
    matter, then fetches each of those pages in the order returned.

    Args:
        url: Address of the landing page.

    Returns:
        One Markdown-formatted string: a "Landing Page" section followed by a
        "Relevant Pages" section with one sub-heading per selected link.
    """
    landing_text = fetch_website_contents(url)
    selection = select_relevant_links(url)

    sections = [f"## Landing Page\n\n{landing_text}\n\n## Relevant Pages\n"]
    for entry in selection["links"]:
        # Heading first, then the page body, mirroring the order they appear in the output.
        sections.append(f"\n\n### {entry['type'].title()}\n")
        sections.append(fetch_website_contents(entry["url"]))

    return "".join(sections)


In [8]:
# Instruction text for the brochure-writing request: sets the audience, asks for
# Markdown without code blocks, and lists the topics to cover when available.
brochure_system_prompt = """
You analyze website content and create a concise professional brochure.
Target audience: customers, investors, job seekers.

Respond in MARKDOWN.
Do not use code blocks.
Mention culture, products, customers, and careers if available.
"""


In [9]:
def get_brochure_user_prompt(company_name, url, max_chars=5000):
    """Build the user prompt that carries the scraped site content.

    Gathers the landing page and relevant pages via
    ``fetch_page_and_all_relevant_links(url)`` and embeds at most ``max_chars``
    characters of that text beneath the company name.

    Args:
        company_name: Name shown on the "Company Name" line of the prompt.
        url: Address of the company's landing page.
        max_chars: Maximum number of characters of site content to include.
            Defaults to 5000 (the previously hard-coded limit); pass ``None``
            to include everything.

    Returns:
        The prompt text as a single string.
    """
    content = fetch_page_and_all_relevant_links(url)
    # Slicing with None keeps the whole string, so max_chars=None disables truncation.
    excerpt = content[:max_chars]
    return f"""
Company Name: {company_name}

Below is website content. Create a short brochure.

{excerpt}
"""


In [10]:
def stream_brochure(company_name, url):
    """Generate a brochure and render it incrementally as Markdown.

    Sends ``brochure_system_prompt`` and the prompt from
    ``get_brochure_user_prompt(company_name, url)`` to the model named by
    ``MODEL`` with streaming enabled, and refreshes a single display area with
    the text accumulated so far each time a non-empty text part arrives.

    Args:
        company_name: Company name passed through to the user prompt.
        url: Address of the company's landing page.

    Returns:
        None. The brochure is only rendered, not returned.
    """
    llm = genai.GenerativeModel(MODEL)

    prompt_parts = [
        brochure_system_prompt,
        get_brochure_user_prompt(company_name, url),
    ]
    chunks = llm.generate_content(prompt_parts, stream=True)

    rendered = ""
    # Start with an empty Markdown output whose id lets later updates replace it in place.
    handle = display(Markdown(""), display_id=True)

    for chunk in chunks:
        # A chunk may arrive without candidates, content, or parts; only chunks
        # that carry all three contribute text.
        if chunk.candidates:
            body = chunk.candidates[0].content
            if body and body.parts:
                for piece in body.parts:
                    text = getattr(piece, "text", None)
                    if text:
                        rendered += text
                        update_display(
                            Markdown(rendered),
                            display_id=handle.display_id,
                        )


In [11]:
# Demo run: stream a brochure for Microsoft's en-in site into this cell's output.
stream_brochure("Microsoft", "https://www.microsoft.com/en-in/")


Selecting relevant links for https://www.microsoft.com/en-in/
Found 2 relevant links


# Microsoft: Empowering Achievement

## Transform Your World. Achieve More.

**Microsoft** is dedicated to empowering every person and every organization on the planet to achieve more. We are at the forefront of technological advancement, driving innovation that shapes the future across every sector of society.

***

### Our Products & Solutions

Harness the power of our leading technology platforms:

*   **Productivity & Cloud:** Microsoft 365, Teams, Copilot
*   **Operating Systems & Devices:** Windows, Surface hardware
*   **Entertainment:** Xbox gaming ecosystem
*   **Enterprise Solutions:** Azure, Dynamics 365, Power Platform

***

### Our Commitment to Customers

We focus on creating a future that benefits everyone. Through cutting-edge **AI and Innovation**, including Microsoft Copilot and the Microsoft Cloud, we provide the tools necessary for organizations and individuals to reach unprecedented levels of productivity and creativity.

***

### Culture & Careers

**Culture:** Our culture is rooted in a **growth mindset**, driving continuous learning and adaptation. We are united by our mission and committed to **Diversity and Inclusion**, fostering an environment where everyone can thrive together.

**Careers:** Join a mission-driven company where your work matters. Explore world-class benefits and growth opportunities as you help us achieve our core objective.

**Find Your Next Opportunity at Microsoft Careers.**