In [1]:
!pip install requests beautifulsoup4 pandas

import requests
import time
import random
import json
import pandas as pd
import urllib.robotparser
from bs4 import BeautifulSoup
from datetime import datetime



1. Logging Setup

In [2]:
def log(message, level="INFO"):
    """Print a single log line to stdout.

    The line has the form ``[YYYY-MM-DD HH:MM:SS] [LEVEL] message``, where the
    timestamp is the current local time at the moment of the call.

    Args:
        message: Text (or any printable object) to emit.
        level: Severity tag placed in the second bracket; defaults to "INFO".
    """
    stamp = format(datetime.now(), "%Y-%m-%d %H:%M:%S")
    print("[{}] [{}] {}".format(stamp, level, message))

2. Robots.txt Validation

In [3]:
def is_allowed(url, user_agent="CompliantScraperBot"):
    """Return True when the site's robots.txt permits ``user_agent`` to fetch ``url``.

    The robots.txt location is built from the scheme and host of ``url``.
    The check fails closed: if permission cannot be determined (URL without a
    scheme or host, network error, unreadable robots.txt) the result is False.

    Args:
        url: Absolute URL of the page that is about to be fetched.
        user_agent: Crawler name matched against the ``User-agent`` groups
            in robots.txt.

    Returns:
        bool: True if fetching is permitted, False otherwise.
    """
    # Local import keeps this cell self-contained; the notebook's import cell
    # only brings in ``urllib.robotparser``.
    from urllib.parse import urlsplit

    try:
        parts = urlsplit(url)
        # Without both scheme and host there is no site whose robots.txt could
        # be consulted, so refuse instead of querying a nonsense location.
        if not (parts.scheme and parts.netloc):
            return False

        # Use only scheme + host so a query string or fragment on ``url``
        # cannot leak into the robots.txt address.
        robots_url = f"{parts.scheme}://{parts.netloc}/robots.txt"

        rp = urllib.robotparser.RobotFileParser()
        rp.set_url(robots_url)
        rp.read()
        return rp.can_fetch(user_agent, url)
    except Exception:
        # ``Exception`` (not a bare ``except``) so that KeyboardInterrupt and
        # SystemExit still propagate when the user interrupts the cell.
        return False

3. Fetch with Exponential Backoff

In [4]:
def fetch_with_retry(url, retries=3):
    """GET ``url``, retrying transient failures with exponential backoff.

    A request is retried when it raises a ``requests`` transport error or
    returns one of the statuses 429, 500, 502 or 503. The wait between
    attempts starts at 2 seconds and doubles each time. Any other non-200
    status is treated as permanent and ends the attempt loop immediately, so
    the server is not hit repeatedly for a page that will not succeed.

    Args:
        url: Address to fetch.
        retries: Maximum number of requests to send.

    Returns:
        The ``requests.Response`` for a 200 reply, or None if every attempt
        failed or a non-retryable status was received.
    """
    delay = 2
    headers = {"User-Agent": "CompliantScraperBot"}
    retryable_statuses = {429, 500, 502, 503}

    for attempt in range(retries):
        try:
            response = requests.get(url, headers=headers, timeout=10)
        except requests.RequestException as e:
            # Connection problems, timeouts, etc. are worth another try.
            log(f"Error: {str(e)}", "ERROR")
        else:
            if response.status_code == 200:
                return response

            if response.status_code not in retryable_statuses:
                # e.g. 403/404: repeating the request cannot help.
                log(
                    f"Giving up on {url}: non-retryable status {response.status_code}",
                    "ERROR",
                )
                return None

            log(f"Retry {attempt+1} due to {response.status_code}")

        # Back off only when another attempt will actually follow.
        if attempt < retries - 1:
            time.sleep(delay)
            delay *= 2

    return None

4. CAPTCHA / Block Detection

In [5]:
def detect_block(text):
    """Report whether ``text`` contains a phrase typical of a block/CAPTCHA page.

    The match is a case-insensitive substring search for any of:
    "captcha", "blocked", "access denied", "unusual traffic".

    Args:
        text: Page body to inspect.

    Returns:
        bool: True if at least one phrase occurs in ``text``, else False.
    """
    haystack = text.lower()
    markers = ("captcha", "blocked", "access denied", "unusual traffic")
    return any(marker in haystack for marker in markers)

5. HTML Parser

In [6]:
def _iter_jsonld_nodes(data):
    """Yield each dict found at the top level, in a list, or under ``@graph`` of a JSON-LD payload."""
    if isinstance(data, list):
        for item in data:
            yield from _iter_jsonld_nodes(item)
    elif isinstance(data, dict):
        yield data
        graph = data.get("@graph")
        if isinstance(graph, list):
            yield from _iter_jsonld_nodes(graph)


def parse_html(html, url):
    """Extract summary fields from an HTML page.

    Args:
        html: Page markup as a string.
        url: Address the markup was fetched from; copied into the result.

    Returns:
        dict with the keys:
            url: the ``url`` argument.
            title: text of ``<title>``, or None if the tag is absent.
            h1: text of the first ``<h1>`` in document order, or None.
            meta_description: ``content`` of ``<meta name="description">``,
                or None if the tag or the attribute is absent.
            rating / review_count: ``ratingValue`` / ``reviewCount`` of an
                ``aggregateRating`` object found in JSON-LD, or None. When
                several JSON-LD scripts carry one, the last script wins.
            pricing: always None.
            ctas: list of ``{"text", "link"}`` dicts, one per ``<a>`` that has
                both non-empty text and an ``href``.
            status: always "ok".
    """
    soup = BeautifulSoup(html, "html.parser")

    # Title: get_text() copes with an empty <title> or one containing nested
    # markup, where ``.string`` is None.
    title_tag = soup.title
    title = title_tag.get_text(strip=True) if title_tag else None

    # H1: first one in document order.
    h1_tag = soup.find("h1")
    h1 = h1_tag.get_text(strip=True) if h1_tag else None

    # Meta description: the tag may exist without a ``content`` attribute.
    meta_tag = soup.find("meta", attrs={"name": "description"})
    meta_description = meta_tag.get("content") if meta_tag else None

    # Rating from JSON-LD
    rating = None
    review_count = None

    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            continue
        try:
            data = json.loads(raw)
        except ValueError:
            # Malformed JSON-LD is common in the wild; skip just that script.
            continue

        for node in _iter_jsonld_nodes(data):
            aggregate = node.get("aggregateRating")
            if isinstance(aggregate, dict):
                rating = aggregate.get("ratingValue")
                review_count = aggregate.get("reviewCount")
                # First match within a script wins, so a top-level
                # aggregateRating takes precedence over one under @graph.
                break

    # CTA candidates: every anchor with visible text and a target.
    ctas = []
    for a in soup.find_all("a"):
        text = a.get_text(strip=True)
        link = a.get("href")
        if text and link:
            ctas.append({"text": text, "link": link})

    return {
        "url": url,
        "title": title,
        "h1": h1,
        "meta_description": meta_description,
        "rating": rating,
        "review_count": review_count,
        "pricing": None,
        "ctas": ctas,
        "status": "ok"
    }

6. Main Execution Cell

In [7]:
# Pages to scrape and the list that collects one record per page.
urls = ["https://theresanaiforthat.com/"]
results = []

for url in urls:
    log(f"Processing {url}")

    if is_allowed(url):
        # Random pause before each request so the target is not hit in bursts.
        time.sleep(random.uniform(2, 5))

        response = fetch_with_retry(url)

        if not response:
            # Every attempt failed.
            results.append({"url": url, "status": "failed"})
        elif detect_block(response.text):
            # The body looks like a CAPTCHA / block page; do not parse it.
            log("Blocked detected!", "ERROR")
            results.append({"url": url, "status": "blocked"})
        else:
            parsed_data = parse_html(response.text, url)
            results.append(parsed_data)
    else:
        # robots.txt forbids the fetch (or could not be read).
        log("Skipped due to robots.txt")
        results.append({"url": url, "status": "skipped"})

log("Scraping completed.")

[2026-02-23 23:02:08] [INFO] Processing https://theresanaiforthat.com/
[2026-02-23 23:02:15] [ERROR] Blocked detected!
[2026-02-23 23:02:15] [INFO] Scraping completed.


7. Save JSON + CSV

In [8]:
# Write the collected records as pretty-printed JSON.
with open("results.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(results, indent=4))

# Flatten the records into a table and export it as CSV without the index column.
df = pd.json_normalize(results)
df.to_csv(path_or_buf="results.csv", index=False)

log("Results saved successfully.")

[2026-02-23 23:02:37] [INFO] Results saved successfully.
