# In [None]:
"""
Prototype: Generative AI–Powered Web Data Summarization

Objective:
- Scrape an EU energy project / proposal web page.
- Extract main text content.
- Use an OpenAI GPT model to summarize it into a structured format:
    - project_name
    - location
    - duration
    - start_date
    - end_date (optional)
    - budget
    - funding_program (if any)
    - key_objectives
    - key_activities
    - website_url

Stack:
    - Python
    - requests
    - beautifulsoup4
    - openai (new client: from openai import OpenAI)

Install:
    pip install requests beautifulsoup4 openai

Env:
    export OPENAI_API_KEY="your_api_key_here"

Usage:
    python web_energy_summarizer.py "https://example.com/project-page"
"""

import json
import os
import re
import sys
import textwrap
from dataclasses import dataclass, asdict

import requests
from bs4 import BeautifulSoup
from openai import OpenAI


# -------------------------------------------------------------------
# Data model for structured project summary
# -------------------------------------------------------------------
@dataclass
class ProjectSummary:
    project_name: str | None = None
    location: str | None = None
    duration: str | None = None
    start_date: str | None = None
    end_date: str | None = None
    budget: str | None = None
    funding_program: str | None = None
    key_objectives: str | None = None
    key_activities: str | None = None
    website_url: str | None = None


# -------------------------------------------------------------------
# 1. Fetch HTML from URL
# -------------------------------------------------------------------
def fetch_page(url: str) -> str:
    """Download a URL and return its body decoded as text.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status
            (via ``raise_for_status``).
        requests.RequestException: For connection problems or when the
            20-second timeout expires.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GenAI-Summarizer/1.0)"
    }
    resp = requests.get(url, headers=headers, timeout=20)
    resp.raise_for_status()

    # When a text/* response declares no charset, requests falls back to
    # ISO-8859-1, which garbles UTF-8 pages (accents, euro signs). In that
    # case trust the encoding detected from the body instead.
    content_type = resp.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        resp.encoding = resp.apparent_encoding

    return resp.text


# -------------------------------------------------------------------
# 2. Extract main text from HTML (simple heuristic)
# -------------------------------------------------------------------
def extract_main_text(html: str, max_chars: int | None = 8000) -> str:
    """Extract the visible text of an HTML document.

    A deliberately simple heuristic for a prototype:

    - drop ``<script>``, ``<style>`` and ``<noscript>`` elements,
    - take the text of ``<body>`` (or of the whole document when there is
      no ``<body>``),
    - strip each line and discard the empty ones,
    - truncate the result to ``max_chars`` characters.

    Args:
        html: Raw HTML markup.
        max_chars: Maximum number of characters to return, used to keep
            token usage reasonable. ``None`` disables truncation.

    Returns:
        The cleaned text, one non-empty line per ``\\n``-separated row.

    Raises:
        ValueError: If ``max_chars`` is negative.
    """
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be non-negative or None")

    soup = BeautifulSoup(html, "html.parser")

    # Remove elements whose content is never rendered as page text.
    for tag in soup(["script", "style", "noscript"]):
        tag.decompose()

    # Fragments without a <body> fall back to the whole parsed document.
    root = soup.body if soup.body is not None else soup
    raw_text = root.get_text(separator="\n")

    # Normalize whitespace: trim every line, then drop the blank ones.
    stripped_lines = (line.strip() for line in raw_text.splitlines())
    text = "\n".join(line for line in stripped_lines if line)

    if max_chars is not None:
        text = text[:max_chars]

    return text


# -------------------------------------------------------------------
# 3. Call OpenAI GPT to create structured summary
# -------------------------------------------------------------------
# Matches a Markdown code fence (optionally tagged "json") and captures its body.
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def _parse_llm_json(content: str | None) -> dict:
    """Decode the model reply into a JSON object, tolerating a code fence.

    Args:
        content: Raw message content returned by the model (may be ``None``).

    Returns:
        The decoded JSON object as a ``dict``.

    Raises:
        ValueError: If the reply is empty or decodes to something other than
            a JSON object.
        json.JSONDecodeError: If the reply is not valid JSON, even after
            unwrapping a Markdown code fence.
    """
    if content is None or not content.strip():
        raise ValueError("OpenAI returned an empty response; expected a JSON object.")

    text = content.strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Models sometimes wrap the JSON in ```json ... ```. Unwrap only the
        # fence itself so that "json" inside field values is left untouched.
        fenced = _CODE_FENCE_RE.search(text)
        if fenced is None:
            raise
        data = json.loads(fenced.group(1))

    if not isinstance(data, dict):
        raise ValueError(
            f"Expected a JSON object from the model, got {type(data).__name__}."
        )
    return data


def summarize_project_with_llm(page_text: str, url: str) -> ProjectSummary:
    """Ask an OpenAI chat model to extract a structured project summary.

    Args:
        page_text: Plain text extracted from the project web page.
        url: Address of the page; included in the prompt and used as the
            ``website_url`` fallback when the model does not return one.

    Returns:
        A ``ProjectSummary`` filled from the JSON object returned by the
        model. Keys missing from the reply are left as ``None``; values are
        passed through as returned (bullet fields may be lists).

    Raises:
        ValueError: If the model reply is empty or is not a JSON object
            (``json.JSONDecodeError`` is a subclass and is raised for
            malformed JSON).

    Exceptions raised by the OpenAI client itself are not caught here.
    """
    client = OpenAI()  # uses OPENAI_API_KEY from env

    system_prompt = textwrap.dedent(
        """
        You are an expert analyst of European energy project proposals.
        Your task is to read the project web page content and extract
        a structured summary with the following fields:

        - project_name
        - location (country/region, or 'Unknown' if not clear)
        - duration (e.g. "36 months", or a clear textual description)
        - start_date (DD/MM/YYYY or Month YYYY if available, otherwise 'Unknown')
        - end_date (if available, otherwise 'Unknown')
        - budget (total budget or funding amount, including currency)
        - funding_program (e.g. Horizon Europe, CEF, etc., or 'Unknown')
        - key_objectives (2–4 bullet points of main goals)
        - key_activities (2–4 bullet points of main activities/work packages)
        - website_url (the input URL)

        Important:
        - If information is missing, set the value to "Unknown".
        - Return ONLY valid JSON, no extra text, no explanation.
        """
    ).strip()

    # Dedent the template BEFORE inserting the page text: the page text has
    # unindented lines, so dedenting afterwards finds no common indentation
    # and leaves the template lines indented.
    user_template = textwrap.dedent(
        """
        Below is the raw text extracted from an energy project proposal web page.

        WEBSITE URL:
        {url}

        PAGE CONTENT:
        \"\"\"
        {page_text}
        \"\"\"

        Now extract the structured summary in JSON with keys:
        project_name, location, duration, start_date, end_date,
        budget, funding_program, key_objectives, key_activities, website_url.
        """
    ).strip()
    user_prompt = user_template.format(url=url, page_text=page_text)

    response = client.chat.completions.create(
        model="gpt-4o-mini",  # or "gpt-4.1-mini" / "gpt-3.5-turbo"
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=0.1,
        # JSON mode: ask the API to return a single JSON object. The prompts
        # mention "JSON", which this mode requires.
        response_format={"type": "json_object"},
    )

    data = _parse_llm_json(response.choices[0].message.content)

    return ProjectSummary(
        project_name=data.get("project_name"),
        location=data.get("location"),
        duration=data.get("duration"),
        start_date=data.get("start_date"),
        end_date=data.get("end_date"),
        budget=data.get("budget"),
        funding_program=data.get("funding_program"),
        key_objectives=data.get("key_objectives"),
        key_activities=data.get("key_activities"),
        # Fall back to the known URL when the key is missing, null or empty.
        website_url=data.get("website_url") or url,
    )


# -------------------------------------------------------------------
# 4. End-to-end pipeline for a single URL
# -------------------------------------------------------------------
def process_project_url(url: str) -> ProjectSummary:
    """Run the full pipeline for one URL: fetch, extract text, summarize.

    Progress is reported on stdout with ``[INFO]`` lines before each step.

    Args:
        url: Address of the project page to process.

    Returns:
        The structured summary produced by ``summarize_project_with_llm``.
    """
    print(f"[INFO] Fetching page: {url}")
    raw_html = fetch_page(url)

    print("[INFO] Extracting main text content...")
    extracted = extract_main_text(raw_html)

    print(f"[INFO] Extracted {len(extracted)} characters of text.")
    print("[INFO] Calling OpenAI for structured summarization...")
    return summarize_project_with_llm(extracted, url)


# -------------------------------------------------------------------
# CLI entry point
# -------------------------------------------------------------------
def main():
    """Command-line entry point.

    Expects the project URL as the first argument. Without it, prints a
    usage line and exits with status 1. Otherwise runs the pipeline and
    prints the resulting summary as indented JSON.
    """
    import json

    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python web_energy_summarizer.py <project_url>")
        sys.exit(1)

    project_url = cli_args[0]
    result = process_project_url(project_url)

    print("\n=== Structured Project Summary ===")
    rendered = json.dumps(asdict(result), indent=4, ensure_ascii=False)
    print(rendered)


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
