# In [10]:
import requests
import json
import os
import gzip
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from PIL import Image
from io import BytesIO
from PyPDF2 import PdfReader
import hashlib

# Load the configuration values.
with open("config.json", "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

# Settings read from the configuration file.
KEYWORDS = config["keywords"]
MAIN_DOMAINS = config["domains"]
MAX_PDF_PAGES = config["max_pdf_pages"]
MAX_IMAGE_SIZE = tuple(config["max_image_size"])
STORAGE_PATHS = config["storage_paths"]

# Make sure every configured storage directory exists.
for directory in STORAGE_PATHS.values():
    os.makedirs(directory, exist_ok=True)

# Load the URLs produced by the previous scraping stage; when that file is
# missing, fall back to the start page of every configured domain.
if not os.path.exists("extrahierte_urls.json"):
    extracted_urls = {domain: [f"https://{domain}"] for domain in MAIN_DOMAINS}
else:
    with open("extrahierte_urls.json", "r", encoding="utf-8") as urls_file:
        extracted_urls = json.load(urls_file)

def get_hash(content):
    """Return the MD5 hex digest of a file's bytes or of a text.

    Intended for spotting duplicates. A ``str`` is encoded with the default
    codec (UTF-8) before hashing; any other value is handed to
    ``hashlib.md5`` unchanged.
    """
    raw = content.encode() if isinstance(content, str) else content
    return hashlib.md5(raw).hexdigest()

def extract_main_text(html):
    """Extract only the main content text of a web page.

    Parses ``html``, removes every script, style, header, footer, nav and
    aside element, and returns the remaining text with leading and trailing
    whitespace stripped.
    """
    soup = BeautifulSoup(html, "html.parser")
    boilerplate_tags = ["script", "style", "header", "footer", "nav", "aside"]
    # One lookup for all unwanted tag names; each match is detached from the tree.
    for node in soup.find_all(boilerplate_tags):
        node.extract()
    remaining_text = soup.get_text()
    return remaining_text.strip()

def save_text(url, text):
    """Store the extracted text of a page as a GZIP-compressed UTF-8 file.

    The file name combines the URL's host with a short hash of the complete
    URL, so that several pages of the same domain end up in separate files
    instead of overwriting each other.

    Args:
        url: Address of the page the text was extracted from.
        text: The text to write.
    """
    # A port separator (":") is not a valid file-name character everywhere.
    host = urlparse(url).netloc.replace(":", "_")
    url_digest = hashlib.md5(url.encode("utf-8")).hexdigest()[:12]
    # os.path.join works whether or not the configured directory ends in a slash.
    filename = os.path.join(STORAGE_PATHS["text"], f"{host}_{url_digest}.txt.gz")
    with gzip.open(filename, "wt", encoding="utf-8") as f:
        f.write(text)
    print(f"Gespeicherter Text: {filename}")

def download_pdf_if_relevant(url):
    """Download a PDF and keep it only if it mentions one of the keywords.

    The first ``MAX_PDF_PAGES`` pages are searched case-insensitively for any
    entry of ``KEYWORDS``. On the first hit the whole PDF is written to the
    ``pdfs`` storage directory.

    Args:
        url: Address of the PDF file.

    Returns:
        True if the PDF was saved, False if no keyword was found.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one unresponsive server from blocking the whole run.
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    pdf_content = response.content
    pdf_reader = PdfReader(BytesIO(pdf_content))

    # Lower-case the keywords once instead of once per page.
    keywords = [keyword.lower() for keyword in KEYWORDS]

    for page in pdf_reader.pages[:MAX_PDF_PAGES]:  # only the leading pages are checked
        text = page.extract_text()
        if not text:
            continue
        text = text.lower()
        if any(keyword in text for keyword in keywords):
            # Name the file after the URL path so query strings or fragments
            # never end up in the file name.
            name = os.path.basename(urlparse(url).path) or "download.pdf"
            filename = os.path.join(STORAGE_PATHS["pdfs"], name)
            with open(filename, "wb") as f:
                f.write(pdf_content)
            print(f"PDF gespeichert: {filename}")
            return True
    print(f"PDF verworfen: {url}")
    return False

def download_and_compress_image(url):
    """Download an image, shrink it, and store it as a compressed WebP file.

    The image is scaled down in place to fit within ``MAX_IMAGE_SIZE`` and
    written to the ``images`` storage directory with WebP quality 80.

    Args:
        url: Address of the image file.

    Raises:
        requests.RequestException: If the download fails, times out, or the
            server answers with an HTTP error status.
    """
    # A timeout keeps one unresponsive server from blocking the whole run.
    response = requests.get(url, timeout=5)
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    image.thumbnail(MAX_IMAGE_SIZE)

    # Take the name from the URL path (ignoring any query string) and drop
    # only the final extension, so "my.photo.jpg" becomes "my.photo.webp".
    stem = os.path.splitext(os.path.basename(urlparse(url).path))[0] or "image"
    filename = os.path.join(STORAGE_PATHS["images"], f"{stem}.webp")
    image.save(filename, "WEBP", quality=80)
    print(f"Bild gespeichert: {filename}")

def scrape_page(url):
    """Scrape one web page and store its relevant content (text, PDFs, images).

    The page text is saved when it contains one of ``KEYWORDS``. Every linked
    PDF and every JPG/PNG image is passed to the matching download helper.
    A failure on a single PDF or image is reported and skipped, so the
    remaining assets of the page are still processed. Any other error is
    reported and ends the processing of this page; nothing is raised.

    Args:
        url: Address of the page to scrape.
    """
    try:
        response = requests.get(url, timeout=5)
        # Do not treat error pages (404, 500, ...) as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")

        # 1) Extract and store the main text
        main_text = extract_main_text(response.text)
        lowered_text = main_text.lower()  # lower-case once, not per keyword
        if any(keyword.lower() in lowered_text for keyword in KEYWORDS):
            save_text(url, main_text)

        # 2) Extract and store PDFs; dict.fromkeys drops repeated links
        #    while keeping the order in which they appear on the page.
        link_urls = dict.fromkeys(
            urljoin(url, link["href"]) for link in soup.find_all("a", href=True)
        )
        for file_url in link_urls:
            if not file_url.lower().endswith(".pdf"):
                continue
            try:
                download_pdf_if_relevant(file_url)
            except Exception as e:
                # Best effort: one broken PDF must not abort the page.
                print(f"Fehler beim Herunterladen von {file_url}: {e}")

        # 3) Extract and store images, again without duplicates
        image_urls = dict.fromkeys(
            urljoin(url, img["src"]) for img in soup.find_all("img", src=True)
        )
        for img_url in image_urls:
            if not any(ext in img_url.lower() for ext in [".jpg", ".jpeg", ".png"]):
                continue
            try:
                download_and_compress_image(img_url)
            except Exception as e:
                # Best effort: one broken image must not abort the page.
                print(f"Fehler beim Herunterladen von {img_url}: {e}")

    except Exception as e:
        print(f"Fehler beim Scraping von {url}: {e}")

# Run the scraper for every discovered URL. Only the URL lists are needed,
# so iterate the dictionary's values rather than its (key, value) pairs.
for urls in extracted_urls.values():
    for url in urls:
        scrape_page(url)


# NameError: name 'scrape_page' is not defined