In [2]:
import fitz  # PyMuPDF
import os

def extract_images(pdf_path, output_folder="extracted_images"):
    """Write every embedded image of a PDF to ``output_folder``.

    Images are visited page by page and saved as ``image_<n>.<ext>``, where
    ``n`` is a running counter over all pages (starting at 1) and ``ext`` is
    the extension reported by ``Document.extract_image``. An image that is
    referenced from several pages is written once per reference.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.
        output_folder: Directory that receives the image files; it is created
            if it does not exist yet.

    Returns:
        None. One ``Saved <path>`` line is printed per written file.
    """
    os.makedirs(output_folder, exist_ok=True)
    pdf = fitz.open(pdf_path)

    img_num = 0
    try:
        for page_index in range(len(pdf)):
            page = pdf[page_index]

            for img in page.get_images(full=True):
                xref = img[0]  # first tuple entry is the image's xref number
                base_image = pdf.extract_image(xref)
                image_bytes = base_image["image"]
                image_ext = base_image["ext"]
                img_num += 1

                image_path = os.path.join(output_folder, f"image_{img_num}.{image_ext}")
                with open(image_path, "wb") as f:
                    f.write(image_bytes)

                print(f"Saved {image_path}")
    finally:
        # Close the document even when extraction or writing fails, so the
        # file handle is not leaked inside a long-lived notebook kernel.
        pdf.close()

# Example: dump the embedded images of one downloaded paper into the default
# "extracted_images" folder (relative to the current working directory).
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this cell elsewhere.
extract_images("/home/smayan/Desktop/Research-Paper-Analyst/downloads/1906.00744v5.pdf")


Saved extracted_images/image_1.png
Saved extracted_images/image_2.png
Saved extracted_images/image_3.png
Saved extracted_images/image_4.png
Saved extracted_images/image_5.png
Saved extracted_images/image_6.png
Saved extracted_images/image_7.png
Saved extracted_images/image_8.png
Saved extracted_images/image_9.png
Saved extracted_images/image_10.png
Saved extracted_images/image_11.png
Saved extracted_images/image_12.png
Saved extracted_images/image_13.png
Saved extracted_images/image_14.png
Saved extracted_images/image_15.png
Saved extracted_images/image_16.png
Saved extracted_images/image_17.png
Saved extracted_images/image_18.png
Saved extracted_images/image_19.png
Saved extracted_images/image_20.png
Saved extracted_images/image_21.png
Saved extracted_images/image_22.png
Saved extracted_images/image_23.png
Saved extracted_images/image_24.png
Saved extracted_images/image_25.png
Saved extracted_images/image_26.png
Saved extracted_images/image_27.png
Saved extracted_images/image_28.png
S

In [5]:
from pdf2image import convert_from_path
import os

def extract_page_images(pdf_path, output_folder="pages_as_images"):
    """Render each page of a PDF and store it as ``page_<n>.png``.

    Args:
        pdf_path: Path of the PDF handed to ``pdf2image.convert_from_path``.
        output_folder: Directory for the PNG files; created when missing.

    Returns:
        None. Prints one ``Saved <path>`` line per page; numbering starts at 1.
    """
    os.makedirs(output_folder, exist_ok=True)

    rendered_pages = convert_from_path(pdf_path)
    for page_number, rendered in enumerate(rendered_pages, start=1):
        target = os.path.join(output_folder, f"page_{page_number}.png")
        rendered.save(target, "PNG")
        print(f"Saved {target}")

# Example: rasterise every page of one downloaded paper into the default
# "pages_as_images" folder (relative to the current working directory).
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this cell elsewhere.
extract_page_images("/home/smayan/Desktop/Research-Paper-Analyst/downloads/1906.00744v5.pdf")


Saved pages_as_images/page_1.png
Saved pages_as_images/page_2.png
Saved pages_as_images/page_3.png
Saved pages_as_images/page_4.png
Saved pages_as_images/page_5.png
Saved pages_as_images/page_6.png
Saved pages_as_images/page_7.png
Saved pages_as_images/page_8.png
Saved pages_as_images/page_9.png
Saved pages_as_images/page_10.png
Saved pages_as_images/page_11.png
Saved pages_as_images/page_12.png
Saved pages_as_images/page_13.png
Saved pages_as_images/page_14.png
Saved pages_as_images/page_15.png
Saved pages_as_images/page_16.png
Saved pages_as_images/page_17.png
Saved pages_as_images/page_18.png
Saved pages_as_images/page_19.png


In [11]:
import os
import fitz  # PyMuPDF

def safe_pixmap_from_bytes(image_bytes):
    """Build a ``fitz.Pixmap`` from encoded image bytes, or return ``None``.

    Two strategies are tried in order:

    1. Let PyMuPDF decode the bytes directly.
    2. Decode the bytes with Pillow, re-encode them as PNG and let PyMuPDF
       decode that PNG instead.

    Args:
        image_bytes: Encoded image data (e.g. as returned under the
            ``"image"`` key of ``Document.extract_image``).

    Returns:
        A ``fitz.Pixmap`` on success, ``None`` when neither strategy works
        (including when Pillow is not installed).
    """
    # Only ``Exception`` is caught so that Ctrl-C / kernel interrupts and
    # ``SystemExit`` still propagate instead of being silently swallowed.
    try:
        return fitz.Pixmap(image_bytes)
    except Exception:
        pass

    # The former second attempt repeated the exact call above and could
    # therefore never succeed after it had failed; it has been dropped.

    # Final fallback: re-wrap the data into a PNG container via Pillow.
    try:
        # Imported lazily so Pillow stays an optional dependency.
        from PIL import Image
        import io

        img = Image.open(io.BytesIO(image_bytes))
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return fitz.Pixmap(buf.getvalue())
    except Exception:
        return None


def recover_image(doc, img):
    """Return the bytes of one PDF image, trying several recovery strategies.

    Strategies, in order:

    1. If the image has a soft mask (``img[1] > 0``), combine base image and
       mask into one pixmap with transparency.
    2. If the image object carries a ``/ColorSpace`` entry, render it and
       convert it to RGB PNG.
    3. Plain ``doc.extract_image``.
    4. Rebuild a pixmap from the raw xref stream.

    Args:
        doc: Open PyMuPDF document the image belongs to.
        img: Image entry as listed by ``get_page_images``/``get_images``;
            ``img[0]`` is the image xref and ``img[1]`` the soft-mask xref
            (0 when there is none).

    Returns:
        A dict with at least the keys ``"ext"`` and ``"image"`` (bytes), or
        ``None`` when the image cannot be recovered.
    """
    xref = img[0]
    smask = img[1]

    # ---- CASE 1: /SMask soft-mask ----
    if smask > 0:
        try:
            base = doc.extract_image(xref)["image"]
            mask = doc.extract_image(smask)["image"]

            pix_base = safe_pixmap_from_bytes(base)
            pix_mask = safe_pixmap_from_bytes(mask)

            if pix_base is None:
                return None

            # Pixmap(source, mask) expects a source without alpha channel,
            # so strip an irregular alpha channel before applying the mask.
            if pix_base.alpha:
                pix_base = fitz.Pixmap(pix_base, 0)

            if pix_mask is None:
                # Mask could not be decoded: keep the unmasked base image.
                pix = pix_base
            else:
                try:
                    pix = fitz.Pixmap(pix_base, pix_mask)
                except Exception:
                    pix = pix_base  # fallback

            # PNG cannot hold more than three colour components (e.g. CMYK),
            # so those images are written as PAM instead.
            ext = "png" if pix_base.n <= 3 else "pam"
            return {"ext": ext, "image": pix.tobytes(ext)}

        except Exception:
            pass  # best effort: fall through to the strategies below

    # ---- CASE 2: /ColorSpace conversion ----
    try:
        if "/ColorSpace" in doc.xref_object(xref, compressed=True):
            pix = fitz.Pixmap(doc, xref)
            pix = fitz.Pixmap(fitz.csRGB, pix)
            return {"ext": "png", "image": pix.tobytes("png")}
    except Exception:
        pass

    # ---- CASE 3: Normal extraction ----
    try:
        return doc.extract_image(xref)
    except Exception:
        pass

    # ---- CASE 4: *Last resort* fallback using Pixmap reconstruction ----
    try:
        raw = doc.xref_stream(xref)
        if raw:
            pix = safe_pixmap_from_bytes(raw)
            if pix:
                return {"ext": "png", "image": pix.tobytes("png")}
    except Exception:
        pass

    return None  # fully unrecoverable


def extract_images(pdf_path, output_dir="output"):
    """Extract all embedded images from the provided PDF path (very robust).

    Every image xref is processed at most once, even when several pages
    reference it. Each image is obtained through ``recover_image`` and
    written to ``<output_dir>/img<xref, 5 digits>.<ext>``.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.
        output_dir: Directory that receives the image files; created when
            missing.

    Returns:
        None. Progress and a final summary are printed.
    """
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(output_dir, exist_ok=True)

    print(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)

    seen = set()
    count = 0

    try:
        print(f"Total pages: {doc.page_count}")

        for page_index in range(doc.page_count):
            print(f"\n--- Page {page_index+1}/{doc.page_count} ---")
            images = doc.get_page_images(page_index, full=True)

            for img in images:
                xref = img[0]
                if xref in seen:
                    continue
                # Mark the xref before attempting recovery so an unrecoverable
                # image is reported once, not again on every page that uses it.
                seen.add(xref)

                data = recover_image(doc, img)
                if not data:
                    print(f"Skipping xref {xref} (unrecoverable image)")
                    continue

                ext = data["ext"]
                img_bytes = data["image"]

                filename = os.path.join(output_dir, f"img{xref:05d}.{ext}")
                try:
                    with open(filename, "wb") as f:
                        f.write(img_bytes)
                    print(f"Saved: {filename}")
                    count += 1
                except OSError:
                    # Best effort: a single unwritable file must not abort
                    # the whole run, but only I/O errors are tolerated.
                    print(f"Failed saving xref {xref}")
    finally:
        # Release the document even if a page or an image raised.
        doc.close()

    print("\n======================================")
    print(f"DONE. Extracted {count} images")
    print(f"Saved to folder: {output_dir}")
    print("======================================")

# Run the extractor defined in this cell (it replaces any earlier
# `extract_images` definition in the kernel) with the default "output" folder.
# NOTE(review): the path is absolute and machine-specific; adjust it before
# running this cell elsewhere.
extract_images("/home/smayan/Desktop/Research-Paper-Analyst/downloads/2006.05474v2.pdf")


Loading PDF: /home/smayan/Desktop/Research-Paper-Analyst/downloads/2006.05474v2.pdf
Total pages: 5

--- Page 1/5 ---

--- Page 2/5 ---

--- Page 3/5 ---

--- Page 4/5 ---

--- Page 5/5 ---

DONE. Extracted 0 images
Saved to folder: output


In [15]:
# STEP 1
# import libraries
import fitz  # PyMuPDF
import io
from PIL import Image

# STEP 2
# Path of the PDF whose embedded images are extracted.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
file = "/home/smayan/Desktop/Research-Paper-Analyst/downloads/1906.00744v5.pdf"

# Open the document with PyMuPDF.
# NOTE(review): `pdf_file` is not closed anywhere in the visible part of this cell.
pdf_file = fitz.open(file)

# STEP 3
# Walk over all pages by their 0-based index.
for page_index in range(len(pdf_file)):

    # Load the page and list the images it references.
    page = pdf_file.load_page(page_index)  # load the page
    image_list = page.get_images(full=True)  # get images on the page

    # Report how many images this page has.
    # NOTE(review): these messages show the 0-based page_index, whereas the
    # file names below use page_index+1.
    if image_list:
        print(f"[+] Found a total of {len(image_list)} images on page {page_index}")
    else:
        print("[!] No images found on page", page_index)
    
    # image_index counts the images of the current page, starting at 1.
    for image_index, img in enumerate(image_list, start=1):
        # The first entry of each image tuple is the image's xref.
        xref = img[0]

        # Extract the image; the result dict holds the bytes and the extension.
        base_image = pdf_file.extract_image(xref)
        image_bytes = base_image["image"]

        # File extension reported for the extracted image.
        image_ext = base_image["ext"]

        # Save as image<page number>_<image number>.<ext>; the name has no
        # directory part, so the file lands in the current working directory.
        image_name = f"image{page_index+1}_{image_index}.{image_ext}"
        with open(image_name, "wb") as image_file:
            image_file.write(image_bytes)
            print(f"[+] Image saved as {image_name}")

[!] No images found on page 0
[+] Found a total of 2 images on page 1
[+] Image saved as image2_1.png
[+] Image saved as image2_2.png
[!] No images found on page 2
[+] Found a total of 15 images on page 3
[+] Image saved as image4_1.png
[+] Image saved as image4_2.png
[+] Image saved as image4_3.png
[+] Image saved as image4_4.png
[+] Image saved as image4_5.png
[+] Image saved as image4_6.png
[+] Image saved as image4_7.png
[+] Image saved as image4_8.png
[+] Image saved as image4_9.png
[+] Image saved as image4_10.png
[+] Image saved as image4_11.png
[+] Image saved as image4_12.png
[+] Image saved as image4_13.png
[+] Image saved as image4_14.png
[+] Image saved as image4_15.png
[+] Found a total of 38 images on page 4
[+] Image saved as image5_1.png
[+] Image saved as image5_2.png
[+] Image saved as image5_3.png
[+] Image saved as image5_4.png
[+] Image saved as image5_5.png
[+] Image saved as image5_6.png
[+] Image saved as image5_7.png
[+] Image saved as image5_8.png
[+] Image sa