# Robots and Sitemap

In [None]:
import os
import requests
import urllib.robotparser as robotparser
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from datetime import datetime

def ensure_folder_exists(folder_path):
    """Create ``folder_path`` (including missing parents) if it is absent.

    Uses ``exist_ok=True`` so the call is idempotent and free of the
    check-then-create race: another process creating the directory between
    an ``os.path.exists`` check and ``os.makedirs`` can no longer make this
    function fail.

    Raises:
        FileExistsError: if ``folder_path`` exists but is not a directory.
    """
    os.makedirs(folder_path, exist_ok=True)

def fetch_file(url):
    """Download ``url`` and return the response body as text.

    The request carries a fixed crawler User-Agent and a 10 second timeout.
    Any ``requests`` failure, including an HTTP error status, is printed and
    reported to the caller as ``None``.
    """
    request_headers = {'User-Agent': 'Mozilla/5.0 (compatible; URLGrabber/1.0)'}
    try:
        reply = requests.get(url, headers=request_headers, timeout=10)
        reply.raise_for_status()
        body = reply.text
    except requests.RequestException as err:
        print(f"Error fetching {url}: {err}")
        body = None
    return body

def save_file(content, file_path):
    """Write ``content`` to ``file_path`` as UTF-8 text.

    Best effort: any exception raised while opening or writing is printed
    rather than propagated, and the function returns ``None`` either way.
    """
    try:
        handle = open(file_path, 'w', encoding='utf-8')
        with handle:
            handle.write(content)
    except Exception as err:
        print(f"Error saving file {file_path}: {err}")

def parse_robots_txt(robots_url, base_url):
    """Fetch robots.txt, archive it, and collect the sitemap URLs it declares.

    Args:
        robots_url: URL of the robots.txt file to download.
        base_url: Site URL; its network location names the folder under
            ``sites/`` where the downloaded robots.txt is saved.

    Returns:
        An ``(allowed_urls, sitemap_urls)`` tuple of sets. ``allowed_urls``
        is always empty: robots.txt carries rules rather than a page list,
        so callers obtain the actual URLs from the sitemaps. Both sets are
        empty when robots.txt cannot be fetched or parsed.
    """
    allowed_urls = set()
    sitemap_urls = set()

    robots_content = fetch_file(robots_url)
    if not robots_content:
        return allowed_urls, sitemap_urls

    # Save robots.txt
    domain = urlparse(base_url).netloc
    folder_path = os.path.join('sites', domain)
    ensure_folder_exists(folder_path)
    save_file(robots_content, os.path.join(folder_path, 'robots.txt'))

    lines = robots_content.splitlines()

    # Parse robots.txt. The parsed rules are not consulted below; the parse
    # only acts as a sanity check that aborts on malformed content.
    rp = robotparser.RobotFileParser()
    rp.set_url(robots_url)
    try:
        rp.parse(lines)
    except Exception as e:
        print(f"Error parsing robots.txt: {e}")
        return allowed_urls, sitemap_urls

    # Extract sitemap URLs
    for line in lines:
        # Drop inline comments, as urllib.robotparser does, so a trailing
        # "# note" never ends up inside the URL.
        line = line.split('#', 1)[0].strip()
        field, separator, value = line.partition(':')
        if not separator or field.strip().lower() != 'sitemap':
            continue
        value = value.strip()
        if not value:
            # "Sitemap:" with no value would otherwise be fetched as ''.
            continue
        # Absolute URLs pass through unchanged; relative ones are resolved
        # against the location of robots.txt.
        sitemap_urls.add(urljoin(robots_url, value))

    # Since robotparser doesn't directly provide allowed URLs, we assume all URLs are allowed
    # unless disallowed. We'll rely on sitemap for actual URLs.
    return allowed_urls, sitemap_urls

def parse_sitemap(sitemap_url, base_url, _visited=None):
    """Parse a sitemap (URL set or sitemap index) and return the page URLs.

    Args:
        sitemap_url: URL of the sitemap to download.
        base_url: Site URL; used to name the ``sites/<netloc>`` folder the
            sitemap is saved in and to resolve non-absolute ``<loc>`` values.
        _visited: Internal set of sitemap URLs already processed during this
            traversal. Callers should leave it unset.

    Returns:
        A set of URLs. Empty when the sitemap cannot be fetched, is not
        well-formed XML, or was already visited in this traversal.
    """
    if _visited is None:
        _visited = set()

    urls = set()

    # A sitemap index that (directly or indirectly) lists itself would
    # otherwise recurse until the interpreter's recursion limit.
    if sitemap_url in _visited:
        return urls
    _visited.add(sitemap_url)

    sitemap_content = fetch_file(sitemap_url)
    if not sitemap_content:
        return urls

    # Save sitemap
    domain = urlparse(base_url).netloc
    folder_path = os.path.join('sites', domain)
    ensure_folder_exists(folder_path)
    sitemap_filename = sitemap_url.split('/')[-1] or f"sitemap_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xml"
    save_file(sitemap_content, os.path.join(folder_path, sitemap_filename))

    try:
        root = ET.fromstring(sitemap_content)
    except ET.ParseError as e:
        print(f"Error parsing sitemap {sitemap_url}: {e}")
        return urls

    namespace = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    # Check if it's a sitemap index
    if root.tag.endswith('sitemapindex'):
        for loc in root.findall('ns:sitemap/ns:loc', namespace):
            # An empty <loc/> has text None; treat it as blank and skip it.
            sub_sitemap_url = (loc.text or '').strip()
            if sub_sitemap_url:
                urls.update(parse_sitemap(sub_sitemap_url, base_url, _visited))
    else:
        # Regular sitemap
        for loc in root.findall('ns:url/ns:loc', namespace):
            full_url = (loc.text or '').strip()
            if not full_url:
                continue
            if not full_url.startswith(('http://', 'https://')):
                full_url = urljoin(base_url, full_url)
            urls.add(full_url)

    return urls

def process_site(site_url):
    """Collect a site's sitemap URLs, print them, and write them to disk.

    ``site_url`` gets an ``https://`` prefix when it has no scheme and loses
    any trailing slashes. The sorted URLs are written one per line to
    ``sites/<netloc>/allowed_urls.txt`` and echoed to stdout.
    """
    # Normalise the user-supplied address.
    has_scheme = site_url.startswith(('http://', 'https://'))
    normalized = (site_url if has_scheme else 'https://' + site_url).rstrip('/')

    # robots.txt tells us where the sitemaps live.
    robots_location = urljoin(normalized, '/robots.txt')
    robots_allowed, sitemap_locations = parse_robots_txt(robots_location, normalized)

    collected = set()
    collected.update(robots_allowed)
    for sitemap_location in sitemap_locations:
        collected.update(parse_sitemap(sitemap_location, normalized))

    # Persist the result next to the downloaded robots.txt / sitemaps.
    target_dir = os.path.join('sites', urlparse(normalized).netloc)
    ensure_folder_exists(target_dir)
    output_file = os.path.join(target_dir, 'allowed_urls.txt')

    with open(output_file, 'w', encoding='utf-8') as out:
        for entry in sorted(collected):
            out.write(entry + '\n')
            print(entry)

    print(f"\nSaved {len(collected)} URLs to {output_file}")

# Entry point: when run as the main module (not on import), prompt for a
# site address and process it.
if __name__ == "__main__":
    site_url = input("Enter the site URL (e.g., example.com): ")
    process_site(site_url)

Enter the site URL (e.g., example.com): openai.com
https://openai.com/
https://openai.com/12-days/
https://openai.com/about/
https://openai.com/api-scale-tier/
https://openai.com/api/
https://openai.com/api/pricing/
https://openai.com/approach-to-patents/
https://openai.com/brand-old/
https://openai.com/brand/
https://openai.com/building-dynamic-teams/
https://openai.com/business/
https://openai.com/business/enabling-a-data-driven-workforce-webinar/
https://openai.com/business/fine-tuning-gpt-4o-webinar/
https://openai.com/business/guides-and-resources/
https://openai.com/business/new-in-chatgpt-for-business-april-updates-2025/
https://openai.com/business/new-in-chatgpt-for-work-march-updates-2025/
https://openai.com/business/put-ai-to-work-automate-and-scale-financial-operations/
https://openai.com/business/put-ai-to-work-for-marketing-teams/
https://openai.com/business/put-ai-to-work-lessons-from-hundreds-of-successful-deployments/
https://openai.com/business/solving-complex-problems

In [None]:
# Install the third-party packages for the Selenium cells below.
# NOTE(review): webdriver-manager is installed here, but none of the code
# visible in this notebook imports it — confirm it is still needed.
!pip install selenium pandas webdriver-manager

Collecting webdriver-manager
  Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl.metadata (12 kB)
Collecting python-dotenv (from webdriver-manager)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB)
Downloading webdriver_manager-4.0.2-py2.py3-none-any.whl (27 kB)
Downloading python_dotenv-1.1.0-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv, webdriver-manager
Successfully installed python-dotenv-1.1.0 webdriver-manager-4.0.2


In [None]:
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd

# Configure Selenium for Colab
options = Options()
options.add_argument('--headless')  # Run in headless mode
options.add_argument('--no-sandbox')  # Required for Colab
options.add_argument('--disable-dev-shm-usage')  # Avoids memory issues
driver = webdriver.Chrome(options=options)  # Use chromium-chromedriver

# Target SPA URL
url = "https://svelte.dev/blog"

data = []
try:
    driver.get(url)

    # Wait for dynamic content to load
    try:
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "article"))
        )
    except TimeoutException:
        # Only a genuine wait timeout is reported as one; other failures
        # propagate with their own message instead of being mislabelled.
        print("Timeout waiting for page to load")
        # Stop this script/cell without calling exit(), which is meant for
        # interactive shells. The finally clause still closes the browser.
        raise SystemExit(1) from None

    # Extract data
    articles = driver.find_elements(By.TAG_NAME, "article")
    for article in articles:
        try:
            title = article.find_element(By.TAG_NAME, "h2").text
            summary = article.find_element(By.TAG_NAME, "p").text
        except NoSuchElementException:
            continue  # Skip articles with missing elements
        data.append({"title": title, "summary": summary})
finally:
    # Cleanup: always release the browser, even if scraping failed midway.
    driver.quit()

# Save to CSV
df = pd.DataFrame(data)
df.to_csv("svelte_blog_selenium.csv", index=False)

print("Data scraped and saved to svelte_blog_selenium.csv")

Data scraped and saved to svelte_blog_selenium.csv


In [None]:
# Download and run the install script from ollama.com, start `ollama serve`
# in the background (stdout and stderr redirected to output.log), then pull
# the phi4 model.
# NOTE(review): piping a downloaded script straight into sh executes it
# without inspection — confirm this is acceptable for the target environment.
!curl -fsSL https://ollama.com/install.sh | sh
!nohup ollama serve > output.log 2>&1 &
!ollama pull phi4

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to render group...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.
[?2026h[?25l[1Gpulling manifest ⠋ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠙ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠹ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠸ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠼ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠴ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠦ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠧ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠇ [K[?25h[?2026l[?2026h[?25l[1Gpulling manifest ⠏ [K[?25h[?2026l[

In [None]:
import uuid
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import requests
from IPython.display import display

# Module-level state for the agent.
agent_id = str(uuid.uuid4())  # random identifier generated once per run
scraped_data = []  # in-memory storage for scraped data

# Selenium Scraper
def scrape_svelte_blog():
    """Scrape article titles and summaries from https://svelte.dev/blog.

    Loads the page in headless Chrome, waits up to 15 seconds for an
    ``<article>`` element, collects the ``<h2>`` and ``<p>`` text of every
    article, writes the rows to ``svelte_blog_selenium.csv`` and stores them
    in the module-level ``scraped_data`` list.

    Returns:
        ``{"status": "success", "articles_scraped": int, "message": str}``
        on success, or ``{"status": "error", "message": str}`` if anything in
        the browser session or CSV export fails. ``scraped_data`` is only
        replaced on success.
    """
    global scraped_data
    # Imported here so the block does not depend on an import elsewhere.
    from selenium.common.exceptions import NoSuchElementException

    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')

    driver = None
    try:
        driver = webdriver.Chrome(options=options)
        url = "https://svelte.dev/blog"
        driver.get(url)

        # Wait for dynamic content
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.TAG_NAME, "article"))
        )

        # Extract data
        data = []
        for article in driver.find_elements(By.TAG_NAME, "article"):
            try:
                title = article.find_element(By.TAG_NAME, "h2").text
                summary = article.find_element(By.TAG_NAME, "p").text
            except NoSuchElementException:
                # Skip articles without a heading or paragraph; any other
                # failure is reported through the outer handler.
                continue
            data.append({"title": title, "summary": summary})

        # Save to CSV
        pd.DataFrame(data).to_csv("svelte_blog_selenium.csv", index=False)

        scraped_data = data
        return {"status": "success", "articles_scraped": len(data), "message": "Data scraped and saved to svelte_blog_selenium.csv"}
    except Exception as e:
        # Boundary of the agent action: report the failure, don't raise.
        return {"status": "error", "message": str(e)}
    finally:
        # Close the browser on every path; driver stays None if Chrome
        # never started.
        if driver is not None:
            driver.quit()

# Zippopotam API
def get_zipcode_info(zipcode):
    """Look up a US zip code with the Zippopotam API.

    Args:
        zipcode: Zip code as typed by the user. Surrounding whitespace is
            removed and the value is percent-encoded before it is placed in
            the request path.

    Returns:
        ``{"status": "success", "data": <decoded JSON>}`` for an HTTP 200
        response, otherwise ``{"status": "error", "message": str}``. The
        function does not raise for request or decoding failures.
    """
    from urllib.parse import quote

    # safe="" also encodes "/", so the input cannot alter the URL path.
    code = quote(str(zipcode).strip(), safe="")
    try:
        # The timeout keeps the interactive menu from hanging on a stalled
        # connection; it matches the 10 seconds used by fetch_file.
        response = requests.get(f"https://api.zippopotam.us/us/{code}", timeout=10)
        if response.status_code == 200:
            return {"status": "success", "data": response.json()}
        return {"status": "error", "message": "Invalid zip code or API error"}
    except Exception as e:
        return {"status": "error", "message": str(e)}

# Simulated Phi4 reasoning (processes queries over scraped data)
def process_query(query):
    """Return the scraped articles whose title or summary contains ``query``.

    Matching is a case-insensitive substring test against the module-level
    ``scraped_data`` list. The result is a ``"success"`` dict with the
    matching articles, or a ``"no_results"`` dict when nothing matches.
    """
    needle = query.lower()
    matches = [
        entry
        for entry in scraped_data
        if needle in entry["title"].lower() or needle in entry["summary"].lower()
    ]

    if not matches:
        return {"status": "no_results", "message": "No articles match your query"}
    return {"status": "success", "results": matches}

# Interactive Agent Interface
def run_agent():
    """Run the interactive text menu until the user chooses to exit.

    Each pass prints the menu, reads a choice from stdin and dispatches to
    the scraper, the article query or the zip code lookup. Result dicts are
    printed, and successful results are also displayed as a DataFrame.
    """
    print(f"AI Agent ID: {agent_id}")
    print("Welcome to the AI Agent (Svelte Blog Scraper + Zip Code Info)")

    menu_lines = (
        "\nOptions:",
        "1. Scrape Svelte Blog",
        "2. Query Scraped Articles",
        "3. Get Zip Code Info",
        "4. Exit",
    )

    while True:
        for menu_line in menu_lines:
            print(menu_line)

        selection = input("Enter your choice (1-4): ")

        if selection == "4":
            print("Exiting AI Agent.")
            return

        if selection == "1":
            outcome = scrape_svelte_blog()
            print(outcome)
            if outcome["status"] == "success":
                display(pd.DataFrame(scraped_data))
            continue

        if selection == "2":
            # Nothing to search until a scrape has populated the store.
            if not scraped_data:
                print("No data available. Please scrape the blog first (Option 1).")
                continue
            search_text = input("Enter your query (e.g., 'sveltekit'): ")
            outcome = process_query(search_text)
            print(outcome)
            if outcome["status"] == "success":
                display(pd.DataFrame(outcome["results"]))
            continue

        if selection == "3":
            entered_zip = input("Enter a US zip code (e.g., 90210): ")
            outcome = get_zipcode_info(entered_zip)
            print(outcome)
            if outcome["status"] == "success":
                display(pd.DataFrame([outcome["data"]]))
            continue

        print("Invalid choice. Please select 1-4.")

# Run the agent: start the interactive menu only when this code executes as
# the main module, not when it is imported.
if __name__ == "__main__":
    run_agent()

AI Agent ID: 8c17489d-bd59-4159-a79d-d04d5f5f1510
Welcome to the AI Agent (Svelte Blog Scraper + Zip Code Info)

Options:
1. Scrape Svelte Blog
2. Query Scraped Articles
3. Get Zip Code Info
4. Exit
Enter your choice (1-4): 1
{'status': 'success', 'articles_scraped': 84, 'message': 'Data scraped and saved to svelte_blog_selenium.csv'}


Unnamed: 0,title,summary
0,What’s new in Svelte: May 2025,"Svelte Summit soon! Plus, await in components"
1,What’s new in Svelte: April 2025,"Writable $derived statements, async reroute an..."
2,What’s new in Svelte: March 2025,Congrats to the SvelteHack winners! Plus impro...
3,What’s new in Svelte: February 2025,"New types, pnpm 10 support and better syntax h..."
4,What’s new in Svelte: January 2025,"Svelte 5 just keeps getting better. Plus, an i..."
...,...,...
79,Using CSS-in-JS with Svelte,"You don’t need to, but you can"
80,Svelte v2 is out!,Here’s what you need to know
81,Sapper: Towards the ideal web app framework,Taking the next-plus-one step
82,The zen of Just Writing CSS,"I would say this is the future, but we’re alre..."



Options:
1. Scrape Svelte Blog
2. Query Scraped Articles
3. Get Zip Code Info
4. Exit
Enter your choice (1-4): 2
Enter your query (e.g., 'sveltekit'): sveltekit
{'status': 'success', 'results': [{'title': 'What’s new in Svelte: March 2025', 'summary': 'Congrats to the SvelteHack winners! Plus improved SSR in Svelte and SvelteKit'}, {'title': 'What’s new in Svelte: January 2024', 'summary': 'SvelteKit 2 and a much-improved $state rune'}, {'title': 'Announcing SvelteKit 2', 'summary': 'A special SvelteKit anniversary release'}, {'title': 'What’s new in Svelte: October 2023', 'summary': 'Reactions to Runes and SvelteKit +server fallbacks'}, {'title': 'Hacktoberfest 2023 with SvelteKit', 'summary': 'SvelteKit joins in the Hacktoberfest event in 2023'}, {'title': 'What’s new in Svelte: September 2023', 'summary': 'New parameters in SvelteKit’s redirect and an onNavigate lifecycle function come to life'}, {'title': 'Unlocking view transitions in SvelteKit 1.24', 'summary': 'Streamlined page

Unnamed: 0,title,summary
0,What’s new in Svelte: March 2025,Congrats to the SvelteHack winners! Plus impro...
1,What’s new in Svelte: January 2024,SvelteKit 2 and a much-improved $state rune
2,Announcing SvelteKit 2,A special SvelteKit anniversary release
3,What’s new in Svelte: October 2023,Reactions to Runes and SvelteKit +server fallb...
4,Hacktoberfest 2023 with SvelteKit,SvelteKit joins in the Hacktoberfest event in ...
5,What’s new in Svelte: September 2023,New parameters in SvelteKit’s redirect and an ...
6,Unlocking view transitions in SvelteKit 1.24,Streamlined page transitions with onNavigate
7,What’s new in Svelte: June 2023,"SvelteHack winners, lots of new bindings, Svel..."
8,What’s new in Svelte: March 2023,"SvelteHack, post-1.0 SvelteKit improvements an..."
9,"Streaming, snapshots, and other new features s...",Exciting improvements in the latest version of...



Options:
1. Scrape Svelte Blog
2. Query Scraped Articles
3. Get Zip Code Info
4. Exit
Enter your choice (1-4): 2
Enter your query (e.g., 'sveltekit'): November 2020
{'status': 'success', 'results': [{'title': 'What’s new in Svelte: November 2020', 'summary': 'Slot forwarding fixes, SvelteKit for faster local development, and more from Svelte Summit'}]}


Unnamed: 0,title,summary
0,What’s new in Svelte: November 2020,"Slot forwarding fixes, SvelteKit for faster lo..."



Options:
1. Scrape Svelte Blog
2. Query Scraped Articles
3. Get Zip Code Info
4. Exit
Enter your choice (1-4): 3
Enter a US zip code (e.g., 90210): 90210
{'status': 'success', 'data': {'post code': '90210', 'country': 'United States', 'country abbreviation': 'US', 'places': [{'place name': 'Beverly Hills', 'longitude': '-118.4065', 'state': 'California', 'state abbreviation': 'CA', 'latitude': '34.0901'}]}}


Unnamed: 0,post code,country,country abbreviation,places
0,90210,United States,US,"[{'place name': 'Beverly Hills', 'longitude': ..."



Options:
1. Scrape Svelte Blog
2. Query Scraped Articles
3. Get Zip Code Info
4. Exit


KeyboardInterrupt: Interrupted by user

Helpful list of free, open APIs: https://apipheny.io/free-api/