In [8]:
import requests
from bs4 import BeautifulSoup, NavigableString
import os
import json
import uuid

In [7]:
def scrape_webpage_with_image_positions(webpage_url, save_dir):
    """
    Scrape a webpage, download images, and create a text representation
    with markers indicating image positions.
    
    Args:
        webpage_url (str): URL of the webpage to scrape
        save_dir (str): Directory to save images and output
        
    Returns:
        dict: Content with text and image metadata
    """
    try:
        # Send a GET request to the webpage
        response = requests.get(webpage_url)
        response.raise_for_status()  # Check for HTTP issues
        
        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Create directory if not exists
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        
        # Initialize content collection
        content = {
            "text": "",
            "images": []
        }
        
        # Process the body of the document
        body = soup.body or soup
        process_element(body, content, webpage_url, save_dir)
        
        # Save the content structure to a JSON file
        output_path = os.path.join(save_dir, "content.json")
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(content, f, indent=2)
            
        # Save the text with image markers to a separate file
        text_path = os.path.join(save_dir, "content.txt")
        with open(text_path, 'w', encoding='utf-8') as f:
            f.write(content["text"])
        
        print(f"Scraped content saved to {output_path}")
        print(f"Text with image markers saved to {text_path}")
        return content
        
    except requests.exceptions.RequestException as e:
        print(f"Failed to scrape webpage: {e}")
        return None

In [17]:
def process_element(element, content, base_url, save_dir):
    """
    Recursively process an HTML element and its children,
    extracting text and images in the order they appear.
    
    Args:
        element: BeautifulSoup element
        content: Dictionary to store content
        base_url: Base URL for resolving relative URLs
        save_dir: Directory to save images
    """
    # Skip script, style, and other non-content elements
    if element.name in ['script', 'style', 'meta', 'link', 'head']:
        return
    
    # Process all children of this element
    for child in element.children:
        if isinstance(child, NavigableString):
            # Handle text content
            text = child.strip()
            if text:
                content["text"] += text + " "
        elif child.name == 'img':
            # Handle image element
            process_image(child, content, base_url, save_dir)
        elif child.name in ['p', 'div', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6']:
            # Process block elements and add appropriate spacing
            process_element(child, content, base_url, save_dir)
            content["text"] += "\n\n"
        elif child.name == 'br':
            # Handle line breaks
            content["text"] += "\n"
        elif child.name == 'a':
            # For links, get the text but also track that it was a link
            link_text = child.get_text().strip()
            if link_text:
                content["text"] += link_text + " "
            # Recursively process any nested elements in the link
            process_element(child, content, base_url, save_dir)
        else:
            # Process other elements recursively
            process_element(child, content, base_url, save_dir)

In [18]:
def process_image(img_tag, content, base_url, save_dir):
    """
    Process an image element, download it, and insert a marker in the text.
    
    Args:
        img_tag: BeautifulSoup img element
        content: Dictionary to store content
        base_url: Base URL for resolving relative URLs
        save_dir: Directory to save images
    """
    img_url = img_tag.get('src')
    if not img_url:
        return
    
    # Handle relative URLs
    if not img_url.startswith(('http:', 'https:')):
        img_url = requests.compat.urljoin(base_url, img_url)
    
    # Generate a unique ID for this image
    img_id = str(uuid.uuid4())[:8]
    
    # Get original filename or generate one
    original_filename = img_url.split('/')[-1]
    filename = f"{img_id}_{original_filename}"
    local_path = os.path.join(save_dir, filename)
    
    # Get image alt text if available
    alt_text = img_tag.get('alt', '')
    
    # Create a marker for this image position
    marker = f"[IMAGE:{img_id}]"
    
    # Insert the marker at the current position in the text
    content["text"] += marker + " "
    
    # Record image metadata
    image_info = {
        "id": img_id,
        "url": img_url,
        "local_path": local_path,
        "alt_text": alt_text,
        "marker": marker
    }
    content["images"].append(image_info)
    
    # Download the image
    try:
        response = requests.get(img_url, stream=True)
        response.raise_for_status()
        with open(local_path, 'wb') as file:
            for chunk in response.iter_content(1024):
                file.write(chunk)
        print(f"Downloaded: {local_path}")
    except requests.exceptions.RequestException as e:
        print(f"Failed to download {img_url}: {e}")
        image_info["download_error"] = str(e)

In [19]:
def reconstruct_document(content_path, output_path, image_format="html"):
    """
    Reconstruct a document from the content.json file,
    replacing image markers with actual images.
    
    Args:
        content_path (str): Path to the content.json file
        output_path (str): Path to save the reconstructed document
        image_format (str): Format for image inclusion ("html" or "markdown")
    """
    with open(content_path, 'r', encoding='utf-8') as f:
        content = json.load(f)
    
    text = content["text"]
    images = content["images"]
    
    # Replace each image marker with the appropriate image tag
    for img in images:
        marker = img["marker"]
        if image_format == "html":
            img_tag = f'<img src="{img["local_path"]}" alt="{img["alt_text"]}" />'
        else:  # markdown
            img_tag = f'![{img["alt_text"]}]({img["local_path"]})'
        
        text = text.replace(marker, img_tag)
    
    # Save the reconstructed document
    if image_format == "html":
        text = f"<html><body>{text}</body></html>"
    
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(text)
    
    print(f"Reconstructed document saved to {output_path}")



In [20]:
url1 = "https://zw01f.github.io/malware%20analysis/auto-color/"

In [21]:
url2 = "https://gootloader.wordpress.com/2025/03/31/gootloader-returns-malware-hidden-in-google-ads-for-legal-documents/"

In [22]:
path = "/home/bartek/Kod/PD/praca_dyplomowa/notebooki/images"

In [23]:
# Example usage
if __name__ == "__main__":
    url = url1
    save_directory = "scraped_content"
    
    # Scrape the webpage with image positions
    content = scrape_webpage_with_image_positions(url, path)
    
    # Reconstruct the document (optional)
    if content:
        reconstruct_document(
            os.path.join(save_directory, "content.json"),
            os.path.join(save_directory, "reconstructed.html"),
            image_format="html"
        )

Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/3120c4c2_logo.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/51aab4d1_avatar.jpg
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/f57202cb_VT.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/9cd1735e_flow.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/9ec1eaad_str_dec.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/9f8735e8_installation.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/ff6d72fd_lock_file.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/a2751c8e_fork.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/74844779_enc_config.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/b7ca948d_config_decryption.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/notebooki/images/2d87406a_checksum.png
Downloaded: /home/bartek/Kod/PD/praca_dyplomowa/

FileNotFoundError: [Errno 2] No such file or directory: 'scraped_content/content.json'