<a href="https://colab.research.google.com/github/SS-2005/MedPic_Detector/blob/main/ImageExtractor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4 pymupdf Pillow

Collecting pymupdf
  Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.26.3-cp39-abi3-manylinux_2_28_x86_64.whl (24.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pymupdf
Successfully installed pymupdf-1.26.3


In [None]:
from google.colab import files
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from PIL import Image
from io import BytesIO
import fitz
import re
import argparse

# Browser-like request headers (desktop Chrome on Windows) sent with each
# page and image request so the scraper looks like an ordinary visitor.
HEADERS = dict([
    (
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/125.0.0.0 Safari/537.36',
    ),
    (
        'Accept',
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8',
    ),
    ('Accept-Language', 'en-US,en;q=0.9'),
    ('Referer', 'https://www.google.com/'),
    ('Connection', 'keep-alive'),
    ('Upgrade-Insecure-Requests', '1'),
])

def _fetch_page(url):
    """Fetch ``url`` and return the response, or ``None`` if both attempts fail.

    The first attempt sends the full browser-like HEADERS. If it fails for any
    reason, one retry is made that sends only a Safari User-Agent.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        return response
    except Exception as e:
        print(f"Error processing URL: {str(e)}")

    try:
        print("Trying alternative approach...")
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.4 Safari/605.1.15'
            },
            timeout=10,
        )
        response.raise_for_status()
        return response
    except Exception as alt_e:
        print(f"Alternative approach failed: {str(alt_e)}")
        return None


def _resolve_image_url(img_tag, page_url):
    """Return the first absolute http(s) URL from ``src`` / ``data-src``, else ``None``."""
    for attr in ('src', 'data-src'):
        raw = img_tag.get(attr)
        if not raw:
            continue
        absolute = urljoin(page_url, raw.strip())
        # Inline "data:" placeholders (and other non-http schemes) cannot be
        # fetched with requests; fall through to the next attribute instead.
        if absolute.lower().startswith(('http://', 'https://')):
            return absolute
    return None


def _image_extension(content_type, img_url):
    """Derive a short, filesystem-safe extension from the MIME type or the URL."""
    ext = ''
    if 'image/' in content_type:
        # "image/svg+xml; charset=utf-8" -> "svg"
        ext = content_type.split('/')[-1].split(';')[0].split('+')[0]
    else:
        # Drop the query string first so dots inside it are not mistaken
        # for the extension separator.
        last_segment = img_url.split('/')[-1].split('?')[0]
        if '.' in last_segment:
            ext = last_segment.split('.')[-1]

    ext = re.sub(r'[^a-z0-9]', '', ext.lower())[:5]
    return ext or 'jpg'


def _download_image(img_url, output_dir, index):
    """Stream one image to ``output_dir``; return its path, or ``None`` if not an image."""
    # The context manager releases the streamed connection on every exit path,
    # including the early return for non-image responses.
    with requests.get(img_url, headers=HEADERS, stream=True, timeout=10) as img_response:
        img_response.raise_for_status()

        content_type = img_response.headers.get('content-type', '').lower()
        if 'image' not in content_type:
            return None

        ext = _image_extension(content_type, img_url)
        img_path = os.path.join(output_dir, f"url_image_{index}.{ext}")

        with open(img_path, 'wb') as f:
            for chunk in img_response.iter_content(1024):
                f.write(chunk)

    return img_path


def extract_images_from_url(url, output_dir="extracted_images"):
    """
    Download every image referenced by ``<img>`` tags on a webpage.

    Args:
        url: Address of the HTML page to scan.
        output_dir: Directory the images are written to (created if missing).

    Returns:
        List of file paths of the images that were saved, in page order.
        Files are named ``url_image_<n>.<ext>`` where ``n`` is the 1-based
        position of the ``<img>`` tag on the page.

    Errors are reported with ``print`` rather than raised: a page that cannot
    be fetched yields an empty list, and an image that fails to download is
    skipped.
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []

    response = _fetch_page(url)
    if response is None:
        return image_paths

    try:
        soup = BeautifulSoup(response.content, 'html.parser')
        img_tags = soup.find_all('img')
    except Exception as e:
        print(f"Error processing URL: {str(e)}")
        return image_paths

    for i, img_tag in enumerate(img_tags):
        img_url = _resolve_image_url(img_tag, url)
        if not img_url:
            continue

        try:
            img_path = _download_image(img_url, output_dir, i + 1)
        except Exception as e:
            print(f"Error downloading {img_url}: {str(e)}")
            continue

        if img_path:
            image_paths.append(img_path)
            print(f"Saved: {img_path}")

    return image_paths

def extract_images_from_pdf(pdf_path, output_dir="extracted_images"):
    """
    Extract the embedded images of a PDF and write them to ``output_dir``.

    Args:
        pdf_path: Local file path, or an ``http://`` / ``https://`` URL that is
            downloaded into memory first.
        output_dir: Directory the images are written to (created if missing).

    Returns:
        List of file paths of the images that were saved. Files are named
        ``pdf_page<P>_img<I>.<ext>`` with 1-based page and image numbers and
        the extension reported by the PDF library for each image.

    Errors are reported with ``print`` rather than raised. An image that cannot
    be extracted is skipped; a document that cannot be opened yields whatever
    was saved so far (normally an empty list).
    """
    os.makedirs(output_dir, exist_ok=True)
    image_paths = []
    # Same scheme test as main(): a local file whose name merely starts with
    # "http" must not be treated as a URL.
    is_url = re.match(r'https?://', pdf_path, re.I) is not None
    doc = None

    try:
        # Handle PDF from URL
        if is_url:
            response = requests.get(pdf_path, headers=HEADERS, timeout=10)
            response.raise_for_status()
            pdf_data = BytesIO(response.content)
            doc = fitz.open(stream=pdf_data, filetype="pdf")
        # Handle local PDF
        else:
            doc = fitz.open(pdf_path)

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            for img_index, img in enumerate(page.get_images(full=True), start=1):
                xref = img[0]  # first entry of each image record is its xref number

                # Isolate failures per image so one bad object does not
                # discard the rest of the document.
                try:
                    base_image = doc.extract_image(xref)
                    image_bytes = base_image["image"]
                    ext = base_image["ext"]

                    filename = f"pdf_page{page_num+1}_img{img_index}.{ext}"
                    img_path = os.path.join(output_dir, filename)

                    with open(img_path, "wb") as f:
                        f.write(image_bytes)
                except Exception as e:
                    print(f"Error extracting image {img_index} on page {page_num+1}: {str(e)}")
                    continue

                image_paths.append(img_path)
                print(f"Saved: {img_path}")

    except Exception as e:
        print(f"Error processing PDF: {str(e)}")
    finally:
        # Always release the document, including when extraction failed midway.
        if doc is not None:
            doc.close()

    return image_paths

def main(input_source, output_dir="extracted_images"):
    """
    Dispatch an input to the matching extractor and return the saved paths.

    Routing rules:
      * http(s) URL whose path ends in ``.pdf`` -> ``extract_images_from_pdf``
      * any other http(s) URL                   -> ``extract_images_from_url``
      * non-URL string ending in ``.pdf``       -> ``extract_images_from_pdf``

    Args:
        input_source: Webpage URL, PDF URL, or local PDF path.
        output_dir: Directory the images are written to.

    Returns:
        The list of image paths returned by the chosen extractor.

    Raises:
        ValueError: If ``input_source`` is neither a URL nor a ``.pdf`` path.
    """
    # Imported here because the notebook's import cell only brings in urljoin.
    from urllib.parse import urlparse

    if re.match(r'https?://', input_source, re.I):
        # Inspect only the path component so a query string or fragment after
        # ".pdf" does not hide the extension.
        if urlparse(input_source).path.lower().endswith('.pdf'):
            print(f"Processing PDF: {input_source}")
            return extract_images_from_pdf(input_source, output_dir)
        print(f"Processing URL: {input_source}")
        return extract_images_from_url(input_source, output_dir)

    if input_source.lower().endswith('.pdf'):
        print(f"Processing PDF: {input_source}")
        return extract_images_from_pdf(input_source, output_dir)

    raise ValueError("Input must be a valid URL or PDF file path")

In [None]:
# Demo: scan this webpage for <img> tags and save the images into the default
# "extracted_images" folder. The returned list of paths is the cell's output.
main("https://radiologyassistant.nl/chest/chest-x-ray/lung-disease")

Processing URL: https://radiologyassistant.nl/chest/chest-x-ray/lung-disease
Saved: extracted_images/url_image_1.png
Saved: extracted_images/url_image_2.jpeg
Saved: extracted_images/url_image_3.jpeg
Saved: extracted_images/url_image_4.png
Saved: extracted_images/url_image_5.jpeg
Saved: extracted_images/url_image_6.png
Saved: extracted_images/url_image_7.png
Saved: extracted_images/url_image_8.png
Saved: extracted_images/url_image_9.jpeg
Saved: extracted_images/url_image_10.jpeg
Saved: extracted_images/url_image_11.jpeg
Saved: extracted_images/url_image_12.jpeg


KeyboardInterrupt: 

In [None]:
# Demo: extract the embedded images of a local PDF into "extracted_images".
# NOTE(review): assumes a file already exists at /content/pdf.pdf (presumably
# uploaded to the Colab session beforehand) - confirm before running.
main("/content/pdf.pdf")

Processing PDF: /content/pdf.pdf
Saved: extracted_images/pdf_page1_img1.png
Saved: extracted_images/pdf_page1_img2.png
Saved: extracted_images/pdf_page2_img1.png
Saved: extracted_images/pdf_page3_img1.png
Saved: extracted_images/pdf_page3_img2.png


['extracted_images/pdf_page1_img1.png',
 'extracted_images/pdf_page1_img2.png',
 'extracted_images/pdf_page2_img1.png',
 'extracted_images/pdf_page3_img1.png',
 'extracted_images/pdf_page3_img2.png']