### MODULES TO IMPORT

In [27]:
import os
import time
import logging
import traceback
from typing import Optional, List
from urllib.parse import urlparse
import requests
from requests.adapters import HTTPAdapter, Retry

from bs4 import BeautifulSoup
from readability import Document
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

import ollama
from openai import OpenAI

from dotenv import load_dotenv
load_dotenv(override=True)

True

### CONFIGURATION AND LOGGING

In [28]:
# Logging: configure the root logger at INFO and expose a named logger for this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("web_summarizer")

# Browser-like User-Agent string, used for HTTP requests and for headless Chrome.
DEFAULT_USER_AGENT = " ".join((
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "AppleWebKit/537.36 (KHTML, like Gecko)",
    "Chrome/120.0.0.0 Safari/537.36",
))

# Tunables read by the helpers defined further down.
REQUEST_TIMEOUT = 10  # passed as `timeout=` when fetching a page with requests
MAX_CHARS = 15000     # default maximum chunk length for chunk_text

# Environment-driven settings (a .env file is loaded by the imports cell).
MODEL_OLLAMA = os.environ.get("MODEL_OLLAMA", "llama3.2")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
# USE_OLLAMA is treated as true for "1", "true" or "yes" (case-insensitive); default is "true".
OLLAMA_ENABLED = os.environ.get("USE_OLLAMA", "true").lower() in {"1", "true", "yes"}

### HTTP SESSION AND ROBOTS CHECK HELPERS

In [29]:
def make_session(user_agent: str = DEFAULT_USER_AGENT) -> requests.Session:
    """Create a ``requests.Session`` with a fixed User-Agent and retrying adapters.

    Args:
        user_agent: Value sent in the ``User-Agent`` header of every request
            made through the returned session.

    Returns:
        A session whose ``https://`` and ``http://`` adapters share a retry
        policy of up to 3 retries, a 0.5 backoff factor, and retry-on-status
        for 429, 500, 502, 503 and 504.
    """
    retry_policy = Retry(
        total=3,
        backoff_factor=0.5,
        status_forcelist=[429, 500, 502, 503, 504],
    )
    session = requests.Session()
    session.headers.update({"User-Agent": user_agent})
    # One adapter per scheme, both configured with the same retry policy.
    for prefix in ("https://", "http://"):
        session.mount(prefix, HTTPAdapter(max_retries=retry_policy))
    return session

In [30]:
def allowed_by_robots(url: str, user_agent: str = "*") -> bool:
    """Return whether the site's robots.txt allows ``user_agent`` to fetch ``url``.

    The robots.txt at the root of ``url``'s host is downloaded and evaluated
    with the standard-library ``urllib.robotparser``, so the decision depends
    on the actual path of ``url`` and on the rules that apply to
    ``user_agent``. (A plain substring search for ``"disallow: /"`` would
    match every ``Disallow: /some/path`` line and wrongly block almost any
    site that has a robots.txt.)

    Args:
        url: Absolute URL of the page that is about to be fetched.
        user_agent: Agent name to evaluate the rules for; ``"*"`` uses the
            site's default rules.

    Returns:
        ``False`` only when robots.txt was retrieved with status 200 and its
        rules disallow the URL. ``True`` in every other case, including a
        missing robots.txt or any error while fetching or parsing it.
    """
    # Imported locally so this cell does not depend on an edit to the imports cell.
    from urllib.robotparser import RobotFileParser

    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        r = requests.get(robots_url, timeout=5)
        if r.status_code != 200:
            # No usable robots.txt: treat the site as unrestricted.
            return True
        rules = RobotFileParser()
        rules.parse(r.text.splitlines())
        return rules.can_fetch(user_agent, url)
    except Exception:
        # Deliberately fail open: a robots.txt that cannot be fetched or
        # parsed must not prevent the page fetch.
        return True

### EXTRACTION AND SELENIUM FETCH HELPERS

In [31]:
def extract_with_readability(html: str) -> str:
    """Reduce an HTML document to newline-separated text lines.

    The HTML is passed through readability's ``Document.summary()``, the
    result is parsed with BeautifulSoup, and script, style, img, nav, footer,
    header and form elements are removed before the text is collected.

    Args:
        html: Raw HTML of the page.

    Returns:
        The remaining text, one stripped line per row, keeping only lines
        longer than two characters.
    """
    main_html = Document(html).summary()
    soup = BeautifulSoup(main_html, "html.parser")

    # Drop elements that carry no article text.
    noise_tags = ["script", "style", "img", "nav", "footer", "header", "form"]
    for node in soup.find_all(noise_tags):
        node.decompose()

    kept_lines = []
    for raw_line in soup.get_text(separator="\n", strip=True).splitlines():
        line = raw_line.strip()
        # Lines of one or two characters are discarded.
        if len(line) > 2:
            kept_lines.append(line)
    return "\n".join(kept_lines)

In [32]:
def fetch_with_selenium(url: str, headless: bool = True, wait_seconds: int = 5) -> str:
    """Load ``url`` in Chrome through Selenium and return the page source.

    Args:
        url: Page to open.
        headless: When true, Chrome is started with ``--headless=new``.
        wait_seconds: Fixed pause after navigation, before the page source
            is read, to give client-side scripts time to run.

    Returns:
        ``driver.page_source`` as captured after the waits.

    Raises:
        Whatever driver installation, browser start-up or navigation raises
        (including a page-load timeout after 30 seconds). The browser is
        always quit once it has been started.
    """
    options = Options()
    if headless:
        options.add_argument("--headless=new")
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--window-size=1920,1080")
    options.add_argument(f"--user-agent={DEFAULT_USER_AGENT}")

    # Candidate browser binaries. This list used to be the literal
    # placeholder ``[...]`` (a list containing Ellipsis), which made
    # os.path.exists raise TypeError before Chrome was ever started.
    # NOTE(review): these are common default install locations; adjust for
    # your machine. When none exists, Selenium's own lookup is used.
    chrome_paths = [
        "/usr/bin/google-chrome",
        "/usr/bin/google-chrome-stable",
        "/usr/bin/chromium",
        "/usr/bin/chromium-browser",
        "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
        r"C:\Program Files\Google\Chrome\Application\chrome.exe",
        r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe",
    ]
    for p in chrome_paths:
        if os.path.exists(p):
            options.binary_location = p
            break

    service = Service(ChromeDriverManager().install())
    driver = webdriver.Chrome(service=service, options=options)
    try:
        driver.set_page_load_timeout(30)
        driver.get(url)
        time.sleep(wait_seconds)
        try:
            WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.TAG_NAME, "main")))
        except Exception:
            # Best effort: many pages have no <main> element, so a timeout
            # here is not an error and the page is used as-is.
            pass
        html = driver.page_source
    finally:
        # Always release the browser process, even when navigation fails.
        driver.quit()
    return html

### HIGH LEVEL FETCH FLOW

In [33]:
def fetch_text(url: str, use_selenium_if_needed: bool = True, force_selenium: bool = False) -> Optional[str]:
    """Fetch ``url`` and return its readable text, or ``None`` on failure.

    Flow:
      1. Return ``None`` when robots.txt disallows the URL.
      2. Download the page with a retrying requests session and extract the
         text with ``extract_with_readability``.
      3. When fewer than 200 characters were extracted (and
         ``use_selenium_if_needed`` is true), or ``force_selenium`` is true,
         re-fetch the page with Selenium and extract again.
      4. When the text is still shorter than 200 characters, fall back to
         joining the h1/h2/h3/p elements whose text is longer than 20
         characters (at most 200 of them).
      5. When the requests download itself fails, try Selenium once if
         ``use_selenium_if_needed`` is true.

    Side effects:
        Writes ``debug_raw.html`` (and ``debug_selenium.html`` when Selenium
        is used in step 3) to the current working directory.

    Args:
        url: Page to fetch.
        use_selenium_if_needed: Allow the Selenium fallbacks.
        force_selenium: Always re-fetch with Selenium after the plain request.

    Returns:
        The extracted text, or ``None`` when the URL is blocked or every
        allowed fetch strategy failed.
    """
    if not allowed_by_robots(url):
        logger.warning("Blocked by robots.txt: %s", url)
        return None

    # Tracks whether Selenium has already been launched, so that a Selenium
    # failure is neither reported as a requests failure nor retried.
    selenium_attempted = False
    try:
        # The context manager closes the session's pooled connections.
        with make_session() as session:
            r = session.get(url, timeout=REQUEST_TIMEOUT)
            r.raise_for_status()
            html = r.text
        with open("debug_raw.html", "w", encoding="utf-8") as f:
            f.write(html)
        text = extract_with_readability(html)
        logger.info("Readability extracted %d characters", len(text))
        if (len(text) < 200 and use_selenium_if_needed) or force_selenium:
            logger.info("Trying Selenium fallback (force=%s)", force_selenium)
            selenium_attempted = True
            html = fetch_with_selenium(url, wait_seconds=10)
            with open("debug_selenium.html", "w", encoding="utf-8") as f:
                f.write(html)
            text = extract_with_readability(html)
        # fallback: extract headings/paragraphs if still small
        if len(text) < 200:
            soup = BeautifulSoup(html, "html.parser")
            parts = [t.get_text(strip=True) for t in soup.find_all(["h1","h2","h3","p"]) if len(t.get_text(strip=True))>20]
            text = "\n\n".join(parts[:200])
        return text
    except Exception as e:
        if selenium_attempted:
            # Selenium already ran and failed; launching it again would only
            # repeat the failure.
            logger.error("Selenium fallback failed: %s", e)
            return None
        logger.warning("Requests fetch failed: %s; trying Selenium if allowed", e)
        if not use_selenium_if_needed:
            return None
        try:
            html = fetch_with_selenium(url)
            return extract_with_readability(html)
        except Exception as se:
            logger.error("Selenium fallback failed: %s", se)
            return None

### CHUNKING AND SUMMARIZATION

In [34]:
def chunk_text(text: str, max_chars: int = MAX_CHARS) -> List[str]:
    """Split ``text`` into chunks of at most ``max_chars`` characters.

    Each cut is placed at the last newline inside the window; when there is
    none, at the last space; when there is neither, the text is cut hard at
    ``max_chars``. Chunks are stripped of surrounding whitespace, and chunks
    that are empty after stripping are dropped.

    Args:
        text: Text to split.
        max_chars: Maximum length of a chunk before stripping.

    Returns:
        ``[text]`` unchanged when it already fits in one chunk, otherwise the
        list of non-empty stripped chunks in order.

    Raises:
        ValueError: If the text has to be split and ``max_chars`` is not
            positive (the cut position could never advance).
    """
    if len(text) <= max_chars:
        return [text]
    if max_chars <= 0:
        raise ValueError("max_chars must be a positive integer")

    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = start + max_chars
        if end < total:
            cut = text.rfind("\n", start, end)
            # `cut <= start` covers both "not found" (-1) and a separator
            # sitting exactly at `start`, which would not advance the cursor.
            if cut <= start:
                cut = text.rfind(" ", start, end)
            if cut <= start:
                cut = end
        else:
            cut = total
        piece = text[start:cut].strip()
        # Skip whitespace-only slices so no empty chunk is produced.
        if piece:
            chunks.append(piece)
        start = cut
    return chunks

In [35]:
def summarize_with_ollama(chunks: List[str], system_prompt: str, model: Optional[str] = None) -> str:
    """Summarize ``chunks`` with a local Ollama model.

    Every chunk is summarized on its own. When there is more than one chunk,
    the partial summaries are merged by one additional "combine" request.
    A single chunk needs no merge step, so its summary is returned directly.

    Args:
        chunks: Text chunks to summarize, in document order.
        system_prompt: System message sent with every request.
        model: Ollama model name; defaults to ``MODEL_OLLAMA`` when omitted.

    Returns:
        The stripped summary text, or ``""`` when ``chunks`` is empty.
    """
    model_name = model or MODEL_OLLAMA

    def _chat(user_content: str) -> str:
        # Send one system+user exchange and return the reply text.
        resp = ollama.chat(model=model_name, messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ])
        # The reply is read from either a dict or an object with `.message`.
        if isinstance(resp, dict):
            content = resp.get("message", {}).get("content", "")
        elif hasattr(resp, "message"):
            content = getattr(resp.message, "content", "")
        else:
            content = str(resp)
        # Guard against a None content before stripping.
        return (content or "").strip()

    summaries = [_chat(chunk) for chunk in chunks]
    if not summaries:
        return ""
    if len(summaries) == 1:
        # Nothing to merge: skip the redundant second model call.
        return summaries[0]
    combined = "\n\n".join(summaries)
    return _chat("Combine and shorten the following summaries into a single short markdown summary:\n\n" + combined)

In [36]:
def summarize_with_openai(chunks: List[str], system_prompt: str, model: str = "gpt-4o-mini") -> str:
    """Summarize ``chunks`` with the OpenAI chat completions API.

    Every chunk is summarized on its own. When there is more than one chunk,
    the partial summaries are merged by one additional "combine" request.
    A single chunk needs no merge step, so its summary is returned directly.

    Args:
        chunks: Text chunks to summarize, in document order.
        system_prompt: System message sent with every request.
        model: Chat model name; defaults to the previously hard-coded
            ``"gpt-4o-mini"``.

    Returns:
        The stripped summary text, or ``""`` when ``chunks`` is empty.
    """
    if not chunks:
        # Nothing to summarize: avoid creating a client or calling the API.
        return ""

    client = OpenAI(api_key=OPENAI_API_KEY)

    def _complete(user_content: str) -> str:
        # Send one system+user exchange and return the reply text.
        resp = client.chat.completions.create(model=model, messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content}
        ])
        content = resp.choices[0].message.content if hasattr(resp, "choices") else str(resp)
        # `message.content` can be None; normalise it before stripping.
        return (content or "").strip()

    summaries = [_complete(chunk) for chunk in chunks]
    if len(summaries) == 1:
        # Nothing to merge: skip the redundant second API call.
        return summaries[0]
    combined = "\n\n".join(summaries)
    return _complete("Combine and shorten the following summaries into a single short markdown summary:\n\n" + combined)

### HIGH-LEVEL ORCHESTRATOR AND EXAMPLE USAGE

In [37]:
# System message passed to whichever summarization backend summarize_url selects.
SYSTEM_PROMPT = (
    "You are an assistant that analyzes the contents of a website "
    "and provides a short summary in markdown, "
    "ignoring navigation and unrelated UI text. "
    "Be concise."
)

def summarize_url(url: str, use_ollama: bool = OLLAMA_ENABLED) -> Optional[str]:
    """Fetch a page and return a short markdown summary of it.

    Args:
        url: Page to summarize.
        use_ollama: When true the text is summarized with
            ``summarize_with_ollama``; otherwise with ``summarize_with_openai``.

    Returns:
        The summary produced by the selected backend, or ``None`` when fewer
        than 100 characters of text were extracted or any step raised.
    """
    try:
        page_text = fetch_text(url)
        has_content = bool(page_text) and len(page_text) >= 100
        if not has_content:
            logger.warning("No meaningful text extracted from %s", url)
            return None

        pieces = chunk_text(page_text)
        backend = summarize_with_ollama if use_ollama else summarize_with_openai
        return backend(pieces, SYSTEM_PROMPT)
    except Exception as exc:
        # Errors are reported at ERROR level; the traceback only at DEBUG.
        logger.error("Summarization failed: %s", exc)
        logger.debug(traceback.format_exc())
        return None

# Example usage: summarize one page and print the result, or a fixed notice
# when no summary could be produced.
if __name__ == "__main__":
    url = "https://openai.com"
    summary = summarize_url(url)
    print(summary if summary else "Could not produce summary.")



Could not produce summary.
