In [1]:
import requests
from bs4 import BeautifulSoup
from pathlib import Path
import json
from typing import Dict, List, Tuple, Any
from trafilatura import fetch_url, extract

In [None]:
def fetch_webpage(url: str, timeout: float = 30.0) -> str:
    """Fetch webpage content from URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. The
            requests library applies no timeout unless one is passed, so
            without it an unresponsive host would block the notebook
            indefinitely.

    Returns:
        The response body as text (``response.text``).

    Raises:
        requests.HTTPError: For 4xx/5xx responses.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()  # Raise exception for 4xx/5xx responses
    return response.text

def parse_html(html_content: str) -> BeautifulSoup:
    """Build a BeautifulSoup tree from an HTML string.

    The markup is handed to BeautifulSoup together with the 'html.parser'
    backend and the resulting object is returned unchanged.
    """
    document = BeautifulSoup(html_content, 'html.parser')
    return document



def extract_image_metadata(soup: BeautifulSoup) -> List[Dict[str, Any]]:
    """Extract metadata for all images in the HTML.

    Args:
        soup: Parsed document. Every element returned by
            ``soup.find_all('img')`` produces one entry, in that order.

    Returns:
        One dict per image with the keys:
        - 'id': 1-based position of the image, as a string.
        - 'filename': last path component of ``src`` with any ``?query`` or
          ``#fragment`` removed, or 'empty' when the tag has no ``src``.
        - 'src': the raw ``src`` attribute ('' when missing).
        - 'alt': the raw ``alt`` attribute ('' when missing).
        - 'element': the original tag object.
    """
    images = []
    for position, img in enumerate(soup.find_all('img'), start=1):
        src = img.get('src')
        if src is None:
            filename = 'empty'
        else:
            # The query string and fragment of a URL are not part of the file
            # name: "pic.png?v=3#top" must yield "pic.png".
            path_only = src.split('#', 1)[0].split('?', 1)[0]
            filename = Path(path_only).name
        images.append({
            'id': f"{position}",
            'filename': filename,
            'src': '' if src is None else src,
            'alt': img.get('alt', ''),
            'element': img  # Store reference to original element
        })
    return images

def get_text_with_image_markers(soup: BeautifulSoup, images: List[Dict[str, Any]]) -> str:
    """
    Extract text from HTML with image markers inserted at appropriate positions.

    Args:
        soup: Parsed document. It is not modified; the replacement happens
            on a copy re-parsed from ``str(soup)``.
        images: Image metadata dicts providing 'id', 'src' and 'alt'. Each
            entry is matched against the first <img> still present in the
            copy with the same src and alt; entries without a match are
            skipped.

    Returns:
        The document text with runs of whitespace collapsed to single
        spaces and every matched image replaced by a standalone
        "[IMG:<id>]" token.
    """
    # Make a copy of the soup to avoid modifying the original
    soup_copy = BeautifulSoup(str(soup), 'html.parser')

    # Replace each image with a marker
    for img_data in images:
        img_element = soup_copy.find('img', src=img_data['src'], alt=img_data['alt'])
        if img_element is not None:
            # Pad with spaces so the marker cannot fuse with adjacent text
            # ("word<img>" would otherwise become "word[IMG:1]"); the
            # whitespace normalization below collapses any surplus.
            marker = soup_copy.new_string(f" [IMG:{img_data['id']}] ")
            img_element.replace_with(marker)

    # Extract text, removing excessive whitespace
    text = ' '.join(soup_copy.get_text().split())
    return text

def save_metadata(images: List[Dict[str, Any]], output_path: Path) -> None:
    """Write the image metadata to ``output_path`` as indented JSON.

    The 'element' entry of each record (the BeautifulSoup tag reference) is
    left out of the file; the dicts passed in are not modified.
    """
    serializable = [
        {key: value for key, value in record.items() if key != 'element'}
        for record in images
    ]

    with open(output_path, 'w') as handle:
        json.dump(serializable, handle, indent=2)

def scrape_webpage_with_image_markers(url: str, metadata_output: str = 'image_metadata.json') -> Tuple[str, List[Dict[str, Any]]]:
    """
    Download a page and produce its text with image markers plus image metadata.

    Args:
        url: Address of the page to scrape.
        metadata_output: Where to write the image metadata JSON; a falsy
            value (e.g. '' or None) skips writing the file.

    Returns:
        A 2-tuple of:
        - the page text containing image markers
        - the list of image metadata dictionaries
    """
    document = parse_html(fetch_webpage(url))

    # Collect the images first, then build the marked-up text from them.
    image_records = extract_image_metadata(document)
    marked_text = get_text_with_image_markers(document, image_records)

    # Nothing to persist when the caller disabled the metadata file.
    if not metadata_output:
        return marked_text, image_records

    save_metadata(image_records, Path(metadata_output))
    return marked_text, image_records

In [4]:
# Example usage: scrape one article and print the marked-up text.
# The call uses the default metadata_output, so it also writes
# image_metadata.json. `text` is reused by a later cell that tokenizes it.
if __name__ == "__main__":
    url = "https://zw01f.github.io/malware%20analysis/auto-color/"
    text, images = scrape_webpage_with_image_markers(url)
    print(text)  # whole-page text, navigation and sidebar included
    print(f"Found {len(images)} images")

Auto-color - Linux backdoor - ZW01f Skip links Skip to primary navigation Skip to content Skip to footer [IMG:1] ZW01f Malware Analysis CTF Writeups All Categories Toggle search Toggle menu [IMG:2] Mohamed Ezat Malware Analysis and RE Follow Email Twitter LinkedIn GitHub Auto-color - Linux backdoor 17 minute read On this page Meet Auto-color Technical in Points First look String Decryption Malware Installation Running in the Background - Demonstration Auto-color ’s C2 Functionality Config extraction Connecting to C2 Server Executing C2 Commands Analysis of libcext.so.2 Protecting /etc/ld.preload Persistance Hiding Network Activity YARA Rule Python Automated Configuration Extraction References Meet Auto-color Auto-color is a Linux backdoor that has been seen in cyberattacks targeting government organizations and universities in North America and Asia. It was first observed between November and December 2024 and is designed to avoid detection while remaining hidden in systems for a long 

In [5]:
# Download the same page a second time, this time through trafilatura's
# fetch_url, so its extraction can be compared with the BeautifulSoup text.
url = "https://zw01f.github.io/malware%20analysis/auto-color/"
downloaded = fetch_url(url)

In [6]:
# Run trafilatura's extract on the download, requesting "txt" output.
# NOTE(review): presumably this returns only the main article text (the diff
# further down relies on it lacking the navigation tokens) — confirm.
result = extract(downloaded, output_format="txt")

In [7]:
from difflib import *

In [8]:
# Tokenize the trafilatura text on any run of whitespace, the same way `text`
# is normalized with str.split() inside get_text_with_image_markers.
# Splitting each line on a literal " " would emit empty-string tokens for
# blank lines and repeated spaces, and would not split on tabs, which adds
# spurious entries to the token diff.
reslist = result.split()

In [9]:
# Tokenize the BeautifulSoup text. Splitting on a single " " is enough here
# because get_text_with_image_markers already joined its tokens with exactly
# one space.
splitted_text = text.split(" ")

In [14]:
# Token-level diff: the first sequence is the trafilatura tokens, the second
# the BeautifulSoup tokens (which contain the image markers). `ndiff` is
# provided by the wildcard difflib import. In the output, "  " prefixes
# tokens common to both and "+ " prefixes tokens only in `splitted_text`.
diff = ndiff(reslist, splitted_text)

In [15]:
# ndiff yields its lines lazily; materialize them so the result can be
# iterated more than once.
ldiff = list(diff)

In [18]:
import re

In [34]:
# Matches a whole ndiff line that adds exactly one image marker, e.g.
# "+ [IMG:2]". `\d+` accepts ids of any length; the previous `\d{1,2}`
# rejected markers for pages with 100 or more images.
pattern = r"\+\s\[IMG:\d+\]"

In [40]:
# Sample diff line used to sanity-check `pattern`.
# NOTE(review): the name `i` is reused as the loop variable in the keep/trash
# cell, so this value is overwritten once that cell runs.
i = "+ [IMG:2]"

In [41]:
# fullmatch succeeds only if the entire string matches the pattern;
# it returns None otherwise.
x = re.fullmatch(pattern, i)

In [42]:
# Show the match object (None would mean the sample line did not match).
print(x)

<re.Match object; span=(0, 9), match='+ [IMG:2]'>


In [38]:
# Partition the diff lines:
#   keep  - lines that do not start with "+" (tokens common to both texts or
#           only in the trafilatura text), plus "+" lines that are exactly an
#           image marker ("+ [IMG:n]")
#   trash - every other "+" line, i.e. tokens found only in the
#           BeautifulSoup text
# "? " lines are ndiff's intraline hint rows, not tokens from either input,
# so they are skipped instead of being appended to `keep`.
keep = []
trash = []
for entry in ldiff:
    if entry.startswith("?"):
        continue
    if not entry.startswith("+") or re.fullmatch(pattern, entry):
        keep.append(entry)
    else:
        trash.append(entry)

In [39]:
# Display the kept diff lines (the whole list is rendered).
keep

['  Auto-color',
 '  -',
 '  Linux',
 '  backdoor',
 '+ [IMG:1]',
 '+ [IMG:2]',
 '  Meet',
 '  Auto-color',
 '  Auto-color',
 '  is',
 '  a',
 '  Linux',
 '  backdoor',
 '  that',
 '  has',
 '  been',
 '  seen',
 '  in',
 '  cyberattacks',
 '  targeting',
 '  government',
 '  organizations',
 '  and',
 '  universities',
 '  in',
 '  North',
 '  America',
 '  and',
 '  Asia.',
 '  It',
 '  was',
 '  first',
 '  observed',
 '  between',
 '  November',
 '  and',
 '  December',
 '  2024',
 '  and',
 '  is',
 '  designed',
 '  to',
 '  avoid',
 '  detection',
 '  while',
 '  remaining',
 '  hidden',
 '  in',
 '  systems',
 '  for',
 '  a',
 '  long',
 '  time.',
 '  The',
 '  malware',
 '  acts',
 '  as',
 '  be',
 '  benign',
 '  color-enhancement',
 '  tool',
 '  and',
 '  uses',
 '  common',
 '  file',
 '  names',
 '  like',
 '  “door,”',
 '  “egg,”',
 '  and',
 '  “log”',
 '  to',
 '  disguise',
 '  itself.',
 '  Auto-color',
 '  gets',
 '  its',
 '  name',
 '  from',
 '  the',
 '  file

In [43]:
# Display the discarded diff lines (the whole list is rendered).
trash

['+ -',
 '+ ZW01f',
 '+ Skip',
 '+ links',
 '+ Skip',
 '+ to',
 '+ primary',
 '+ navigation',
 '+ Skip',
 '+ to',
 '+ content',
 '+ Skip',
 '+ to',
 '+ footer',
 '+ ZW01f',
 '+ Malware',
 '+ Analysis',
 '+ CTF',
 '+ Writeups',
 '+ All',
 '+ Categories',
 '+ Toggle',
 '+ search',
 '+ Toggle',
 '+ menu',
 '+ Mohamed',
 '+ Ezat',
 '+ Malware',
 '+ Analysis',
 '+ and',
 '+ RE',
 '+ Follow',
 '+ Email',
 '+ Twitter',
 '+ LinkedIn',
 '+ GitHub',
 '+ Auto-color',
 '+ -',
 '+ Linux',
 '+ backdoor',
 '+ 17',
 '+ minute',
 '+ read',
 '+ On',
 '+ this',
 '+ page',
 '+ Meet',
 '+ Auto-color',
 '+ Technical',
 '+ in',
 '+ Points',
 '+ First',
 '+ look',
 '+ String',
 '+ Decryption',
 '+ Malware',
 '+ Installation',
 '+ Running',
 '+ in',
 '+ the',
 '+ Background',
 '+ -',
 '+ Demonstration',
 '+ Auto-color',
 '+ ’s',
 '+ C2',
 '+ Functionality',
 '+ Config',
 '+ extraction',
 '+ Connecting',
 '+ to',
 '+ C2',
 '+ Server',
 '+ Executing',
 '+ C2',
 '+ Commands',
 '+ Analysis',
 '+ of',
 '+ libcext.s

In [None]:
# NOTE(review): not used by any cell visible here; presumably substrings for
# filtering out non-content images such as the site logo and author avatar.
# Confirm against the code that consumes it.
blacklist = ["avatar", "logo"]