### Google Drive Mount

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the paths
# configured in the "File Paths" cell all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### File Paths

In [None]:
# Source PDF that is opened with PyMuPDF and converted to LaTeX.
BOOK_PATH = "/content/drive/My Drive/pdf2latex/data-science.pdf"
# NOTE(review): IMG_DIR is rooted at "Datasets/pdf2latex/..." while the other two
# paths are rooted at "pdf2latex/" — confirm the different root is intentional.
IMG_DIR = "/content/drive/My Drive/Datasets/pdf2latex/data-science-latex/images"  # found images are stored in this subfolder
# LaTeX output; the main pipeline truncates this file and then appends one response per page.
OUTPUT_TEX_FILE = "/content/drive/My Drive/pdf2latex/output.tex"

### Installs

In [None]:
# Install the OpenAI library
!pip install openai --upgrade
!pip install sentence-transformers

!pip install --upgrade pymupdf
!pip install python-Levenshtein

Collecting openai
  Downloading openai-1.51.2-py3-none-any.whl.metadata (24 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai)
  Downloading h11-0.14.0-py3-none-any.whl.metadata (8.2 kB)
Downloading openai-1.51.2-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.7/383.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpx-0.27.2-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading httpcore-1.0.6-py3-none-any.whl (78 kB)
[2K   [90m━━

### Imports

In [None]:
import Levenshtein
import pymupdf
import json
from tqdm import tqdm
from openai import OpenAI
import copy

import io
import os
import sys
import time

### PyMuPDF Setup

In [None]:
# Open the source PDF once; the resulting `doc` is a notebook-level global that
# the text-extraction and image-extraction cells below reuse.
doc = pymupdf.open(BOOK_PATH)
# Bare last expression so the notebook displays the number of pages.
doc.page_count

456

### Prompt Setup

### OpenAI Setup

In [None]:
# Read the OpenAI API key from the first line of apikey.txt so the secret is
# never written into the notebook itself.
with open("apikey.txt", "r") as file:
    # A line read from a file keeps its trailing newline; strip it, otherwise
    # the newline becomes part of the key that is sent to the API.
    key = file.readline().strip()
if not key:
    raise ValueError("apikey.txt is empty; expected the OpenAI API key on its first line")
OPENAI_API_KEY = key
client = OpenAI(api_key=OPENAI_API_KEY)

### PyMuPDF Get Raw Data

In [None]:
def get_page_text_data(page_number, span_counter, doc):
    """Serialise one page's text layout (blocks -> lines -> spans) as JSON.

    NOTE: a later cell defines another ``get_page_text_data`` with a different
    signature; once that cell has run, this definition is shadowed.

    Args:
        page_number: Index used to look the page up in ``doc``.
        span_counter: Running span total; increased by one for every span
            visited on this page.
        doc: Object indexable by page number whose pages provide
            ``get_text("dict", flags=0)``.

    Returns:
        tuple: ``(page_json, span_counter)`` where ``page_json`` is the page
        structure dumped with ``indent=2`` and ``span_counter`` is the updated
        total.
    """

    def span_record(number, span):
        # Build the entry key by key so the JSON key order stays fixed.
        record = {
            "span_number": number,
            "text": span["text"],
            "font": span["font"],
            "size": span["size"],
            "bbox": {
                corner: span["bbox"][position]
                for position, corner in enumerate(("x0", "y0", "x1", "y1"))
            },
        }
        style_words = flags_decomposer(span["flags"])
        record["style"] = {
            "is_italic": "italic" in style_words,
            "is_bold": "bold" in style_words,
            "is_superscript": "superscript" in style_words,
        }
        return record

    raw_blocks = doc[page_number].get_text("dict", flags=0)["blocks"]

    page_blocks = []
    # The first block (header) is left out; numbering still starts at 1.
    for block_number, block in enumerate(raw_blocks[1:], start=1):
        block_lines = []
        for line_number, line in enumerate(block["lines"], start=1):
            spans = [
                span_record(number, span)
                for number, span in enumerate(line["spans"], start=1)
            ]
            span_counter += len(spans)
            block_lines.append({"line_number": line_number, "spans": spans})
        page_blocks.append({"block_number": block_number, "lines": block_lines})

    page_data = {"page_number": page_number, "blocks": page_blocks}
    return json.dumps(page_data, indent=2), span_counter

def flags_decomposer(flags):
    """Translate a span's font-flag bitmask into a comma-separated description.

    Bits 0, 1 and 4 add "superscript", "italic" and "bold" only when set.
    Bit 2 chooses between "serifed" and "sans", and bit 3 between "monospaced"
    and "proportional", so those two positions always contribute a word.
    """
    # (bit mask, word when the bit is set, word when it is clear / None to omit)
    bit_words = (
        (1, "superscript", None),
        (2, "italic", None),
        (4, "serifed", "sans"),
        (8, "monospaced", "proportional"),
        (16, "bold", None),
    )
    words = []
    for mask, when_set, when_clear in bit_words:
        word = when_set if flags & mask else when_clear
        if word is not None:
            words.append(word)
    return ", ".join(words)

In [None]:
def get_page_text_data(page_number, span_counter, text_data, doc, verbose=False):
    """Append one annotated record per text span of a page to ``text_data``.

    Each record is a deep copy of the span dictionary found in
    ``page.get_text("dict", flags=0)`` with extra keys added: the page and
    block numbers, the span's position within its line, block and page, the
    overall span number, the unpacked bounding box (``indent_left``,
    ``indent_top``, ``x1``, ``y1``) and three style booleans derived from the
    span's ``flags``.

    Args:
        page_number: Index of the page in ``doc``.
        span_counter: Running span count across pages; stored on each record
            as ``span_number_overall`` and increased once per span.
        text_data: List the span records are appended to (mutated in place).
        doc: Opened document, indexable by page number.
        verbose: When True, print the page object and each block's raw
            ``lines`` list. Off by default because that dump is very large.

    Returns:
        tuple: ``(text_data, span_counter)`` — the same list object and the
        updated running count.
    """
    page = doc[page_number]
    if verbose:
        print(page)

    # flags=0 clears every optional text-extraction flag.
    # NOTE(review): with no flags set, image blocks should not be returned, so
    # every block is expected to carry "lines" — confirm against the PyMuPDF docs.
    blocks = page.get_text("dict", flags=0)["blocks"]
    span_number_in_page = 0

    for block_number, block in enumerate(blocks):
        span_number_in_block = 0  # restarts for every block
        if verbose:
            print(block["lines"])

        for line in block["lines"]:
            for span_number_in_line, span in enumerate(line["spans"]):
                # Deep copy so the span dictionary from get_text is left untouched.
                span_data = copy.deepcopy(span)

                # Positional bookkeeping
                span_data["page_number"] = page_number
                span_data["span_number_overall"] = span_counter
                span_data["span_number_in_line"] = span_number_in_line
                span_data["span_number_in_block"] = span_number_in_block
                span_data["span_number_in_page"] = span_number_in_page
                span_data["block_number"] = block_number

                # Bounding box, unpacked into named keys
                x0, y0, x1, y1 = span_data["bbox"]
                span_data["indent_left"] = x0
                span_data["indent_top"] = y0
                span_data["x1"] = x1
                span_data["y1"] = y1

                # Font style booleans derived from the flag bitmask
                decomposed_flags = flags_decomposer(span_data["flags"])
                span_data["is_italic"] = "italic" in decomposed_flags
                span_data["is_bold"] = "bold" in decomposed_flags
                span_data["is_superscript"] = "superscript" in decomposed_flags

                text_data.append(span_data)

                span_counter += 1
                span_number_in_block += 1
                span_number_in_page += 1

    return text_data, span_counter

def flags_decomposer(flags):
    """Describe a font-flag bitmask as a comma-separated string.

    Same contract as the ``flags_decomposer`` defined in the previous cell:
    "superscript" (bit 0), "italic" (bit 1) and "bold" (bit 4) appear only when
    their bit is set, while bit 2 ("serifed"/"sans") and bit 3
    ("monospaced"/"proportional") always contribute one word each.
    """
    parts = []
    if flags & 1:
        parts.append("superscript")
    if flags & 2:
        parts.append("italic")
    parts.append("serifed" if flags & 4 else "sans")
    parts.append("monospaced" if flags & 8 else "proportional")
    if flags & 16:
        parts.append("bold")
    return ", ".join(parts)

# print(text_data)

# Hierarchy: doc -> page -> block -> line -> span (a span is a run of text that shares the same font, font size, and styling).

In [None]:
# Variant of get_page_text_data that skips the page's entire first block (the
# header), because the LaTeX book document class generates headers itself.
def get_page_text_data_2(page_number, span_counter, text_data, doc, verbose=False):
    """Append annotated span records for a page, leaving out its first block.

    Every block after the first contributes one record per span: a deep copy
    of the span dictionary found in ``page.get_text("dict", flags=0)`` with
    positional counters, the unpacked bounding box and three style booleans
    added. Block numbers start at 1, so they match the block's index in the
    unfiltered block list.

    Args:
        page_number: Index of the page in ``doc``.
        span_counter: Running span count across pages; stored on each record
            as ``span_number_overall`` and increased once per span.
        text_data: List the span records are appended to (mutated in place).
        doc: Opened document, indexable by page number.
        verbose: When True, print each block's raw ``lines`` list. Off by
            default because that dump is very large.

    Returns:
        tuple: ``(text_data, span_counter)`` — the same list object and the
        updated running count.
    """
    page = doc[page_number]
    blocks = page.get_text("dict", flags=0)["blocks"]
    span_number_in_page = 0

    # Skip the first block (header)
    for block_number, block in enumerate(blocks[1:], start=1):
        span_number_in_block = 0  # restarts for every block
        if verbose:
            print(block["lines"])

        for line in block["lines"]:
            for span_number_in_line, span in enumerate(line["spans"]):
                # Deep copy so the span dictionary from get_text is left untouched.
                span_data = copy.deepcopy(span)

                # Positional bookkeeping
                span_data["page_number"] = page_number
                span_data["span_number_overall"] = span_counter
                span_data["span_number_in_line"] = span_number_in_line
                span_data["span_number_in_block"] = span_number_in_block
                span_data["span_number_in_page"] = span_number_in_page
                span_data["block_number"] = block_number

                # Bounding box, unpacked into named keys
                x0, y0, x1, y1 = span_data["bbox"]
                span_data["indent_left"] = x0
                span_data["indent_top"] = y0
                span_data["x1"] = x1
                span_data["y1"] = y1

                # Font style booleans derived from the flag bitmask
                decomposed_flags = flags_decomposer(span_data["flags"])
                span_data["is_italic"] = "italic" in decomposed_flags
                span_data["is_bold"] = "bold" in decomposed_flags
                span_data["is_superscript"] = "superscript" in decomposed_flags

                text_data.append(span_data)

                span_counter += 1
                span_number_in_block += 1
                span_number_in_page += 1

    return text_data, span_counter


In [None]:
# Smoke test: extract the spans of a single page (index 443) and inspect them.
text_data = []
span_counter = 0
for i in range(443, 444):
  # Four-argument version: appends span records to text_data and returns it.
  text_data, span_counter = get_page_text_data(i, span_counter, text_data, doc)
  # text_data, span_counter = get_page_text_data(i, span_counter, doc)

# Number of span records collected, followed by the full list of records.
print(len(text_data))
print(text_data)

page 443 of /content/drive/My Drive/pdf2latex/data-science.pdf
[{'spans': [{'size': 9.962599754333496, 'flags': 4, 'font': 'XgmrvdCfsxnjCMR10', 'color': 0, 'ascender': 0.75, 'descender': -0.25, 'text': '432', 'origin': (79.87958526611328, 43.7943115234375), 'bbox': (79.87958526611328, 36.32236099243164, 94.82347106933594, 46.28496170043945)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (79.87958526611328, 36.32236099243164, 94.82347106933594, 46.28496170043945)}, {'spans': [{'size': 9.962599754333496, 'flags': 6, 'font': 'CdrjfvKwjwncCMSL10', 'color': 0, 'ascender': 0.75, 'descender': -0.25, 'text': 'CHAPTER 14. BIBLIOGRAPHY', 'origin': (269.9305725097656, 43.7943115234375), 'bbox': (269.9305725097656, 36.32236099243164, 423.5339660644531, 46.28496170043945)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (269.9305725097656, 36.32236099243164, 423.5339660644531, 46.28496170043945)}]
[{'spans': [{'size': 8.966400146484375, 'flags': 4, 'font': 'SprqrjDdwwxxCMR9', 'color': 0, 'ascender': 0.75, 'des

In [None]:
# first_page_command = "Convert the above text information to LaTeX, preserving all formatting, structure, and image references. The data extracted from PyMuPDF contains details on fonts, styles, formatting, and text positions within the document. It may include headings, subheadings, paragraphs, and images. Provide only the LaTeX code, incorporating any necessary packages and modules for a complete document. Include commands for chapters, sections, and subsections, while maintaining macros for inserting images. Don't end the document. Add page number as comment at the end of each page."
# Instruction text for the first converted page: the main pipeline selects it
# while no previous response exists, and it is placed after the page data in the prompt.
first_page_command = "Convert the above structured information to LaTeX, preserving all formatting, structure, and image references. The data extracted from PyMuPDF for one page of a pdf and contains details on fonts, styles, formatting, and text positions within the document. It may include headings, subheadings, paragraphs, and images. The information given to you is a JSON for a page which contains the following hierarchy: page -> blocks -> lines -> spans. It contains detailed information for each span. Your task is to convert this information to the LaTeX code, incorporating any necessary packages and modules for a complete document. Include commands for chapters, sections, and subsections, while maintaining macros for inserting images. Don't end the document. Add page number as comment at the end of each page. Use a clearpage latex tag at the end of each page. Ignore the first line of the first block which is the header information of the page as it is automatically handled by the document class book. Similarly, if you find any spans towards the end of the page with a single number they are mostly page numbers and can be ignored as these are handled automatically."

In [None]:
# default_page_command = "Continue Converting the above text information to LaTeX, preserving formatting and image references. The data extracted from PyMuPDF contains details on fonts, styles, formatting, and text positions within the document. It may include headings, subheadings, paragraphs, and images. Provide only the LaTeX code. Include commands for chapters, sections, and subsections. Only continue the LaTeX code for the new page based on the provided latex code of previous page. Don't end the document until i say it's the last page. Add page number as a comment at the end of each page. Include latex code for figueres whenever necessary. The images are in a images directory. Their namings follow the following format: page{pno:03d}_img{image_count:03d}.png. Don't start chapter unless it is really the start of a chapter."
# Instruction text for every page after the first. This is a plain (non-f) string,
# so {pno:03d} and {image_count:03d} are sent verbatim to describe the image file naming.
# NOTE(review): the prompt text contains the typo "figueres" — left unchanged here.
default_page_command = "Continue Converting the above text information to LaTeX, preserving formatting and image references. The data extracted from PyMuPDF contains details on fonts, styles, formatting, and text positions within the document. It may include headings, subheadings, paragraphs, and images. The information given to you is a JSON for a page which contains the following hierarchy: page -> blocks -> lines -> spans. It contains detailed information for each span. Your task is to convert this information to the LaTeX code. Include commands for chapters, sections, and subsections. Only continue the LaTeX code for the new page based on the provided latex code of previous page. Don't end the document until i say it's the last page. Add page number as a comment at the end of each page. Include latex code for figueres whenever necessary. The images are in a images directory. Their namings follow the following format: page{pno:03d}_img{image_count:03d}.png. Don't start chapter unless it is really the start of a chapter. Use a clearpage latex tag at the end of each page. Ignore the first line of the first block which is the header information of the page as it is automatically handled by the documentclass book. Similarly, if you find any spans towards the end of the page with a single number they are mostly page numbers and can be ignored as these are handled automatically."

In [None]:
def generate_response(command, data, prev_response, temperature=1, model="gpt-4o"):
    """Ask the chat model to convert one page of extracted data to LaTeX.

    Uses the module-level ``client`` created in the OpenAI setup cell.

    Args:
        command: Instruction text placed after the page data.
        data: Extracted page data; interpolated into the prompt via ``str()``.
        prev_response: LaTeX returned for the previous page, or ``""`` for the
            first page. When non-empty it is included in the prompt so the
            model can continue from it.
        temperature: Sampling temperature forwarded to the API.
        model: Name of the chat model to call.

    Returns:
        str: Message content of the first choice in the API response.
    """
    if prev_response == "":
        # First page: there is no earlier LaTeX to continue from.
        prompt_content = f"{data} \n {command}"
    else:
        prompt_content = f"""\
  LaTeX code for previous page: {prev_response} \n

  Data for the new latex page:
{data}

{command}
"""
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are a helpful assistant. You convert PDF documents to LaTeX."},
            # Send the prompt selected above. The first-page form used to be
            # sent unconditionally, so prev_response never reached the model.
            {"role": "user", "content": prompt_content},
        ],
        temperature=temperature,
    )
    return response.choices[0].message.content

### Main Pipeline

In [None]:
# Empty string means "no page converted yet"; it selects the first-page command below.
prev_response = ""

# Initialize the .tex file (optional, to clear previous content)
with open(OUTPUT_TEX_FILE, 'w') as f:
    f.write("")

# Convert the selected page range (currently only page index 11), one API call per page.
for i in range(11, 12):
  print(f"Page number {i}")
  # The span list and counter are reset for every page, so each request carries one page only.
  text_data = []
  span_counter = 0
  command = first_page_command if prev_response == "" else default_page_command
  # NOTE(review): this calls the four-argument get_page_text_data, which yields a flat
  # list of span dicts, while both prompts describe a page -> blocks -> lines -> spans
  # JSON — confirm which representation is intended.
  text_data, span_counter = get_page_text_data(i, span_counter, text_data, doc)
  response = generate_response(command, text_data, prev_response)
  # Keep this page's LaTeX; it is passed as prev_response for the next page.
  prev_response = response
  print(response)

  # Append the response to the .tex file
  with open(OUTPUT_TEX_FILE, 'a') as f:
      f.write(response + "\n")


Page number 11
page 11 of /content/drive/My Drive/pdf2latex/data-science.pdf
[{'spans': [{'size': 24.787099838256836, 'flags': 20, 'font': 'RfrbdtDgcwxnCMBX12', 'color': 0, 'ascender': 0.75, 'descender': -0.25099998712539673, 'text': 'Contents', 'origin': (79.93990325927734, 106.197998046875), 'bbox': (79.93990325927734, 87.60767364501953, 188.80487060546875, 112.41956329345703)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (79.93990325927734, 87.60767364501953, 188.80487060546875, 112.41956329345703)}]
[{'spans': [{'size': 9.962599754333496, 'flags': 20, 'font': 'YxbpwhRkkdnhCMBX10', 'color': 0, 'ascender': 0.75, 'descender': -0.25, 'text': '1', 'origin': (79.93990325927734, 168.155029296875), 'bbox': (79.93990325927734, 160.68307495117188, 85.66839599609375, 170.6456756591797)}], 'wmode': 0, 'dir': (1.0, 0.0), 'bbox': (79.93990325927734, 160.68307495117188, 85.66839599609375, 170.6456756591797)}, {'spans': [{'size': 9.962599754333496, 'flags': 20, 'font': 'YxbpwhRkkdnhCMBX10', 'color': 0

RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

### Extract Images using PyMuPDF

In [None]:
# Guard: stop early when the installed PyMuPDF is older than 1.18.18.
if not tuple(map(int, pymupdf.version[0].split("."))) >= (1, 18, 18):
    raise SystemExit("require PyMuPDF v1.18.18+")

# Filters applied by get_images_from_page. The active value 0 effectively turns
# each filter off; the number in the comment is the suggested threshold.
dimlimit = 0  # 100  # each image side must be greater than this
relsize = 0  # 0.05  # stored bytes : (width * height * colorspace) ratio must be larger than this (5%)
abssize = 0  # 2048  # absolute image size limit 2 KB: ignore if smaller

# IMG_DIR is nested several folders deep. os.makedirs also creates missing
# parent folders (os.mkdir would raise) and exist_ok makes re-running safe.
os.makedirs(IMG_DIR, exist_ok=True)


def recoverpix(doc, item):
    """Return the image data for one entry of ``doc.get_page_images``.

    Args:
        doc: Open PyMuPDF document the image belongs to.
        item: Image entry; ``item[0]`` is the xref of the image and
            ``item[1]`` the xref of its /SMask (values <= 0 are treated as
            "no mask").

    Returns:
        dict: In the two special cases, a dictionary with the keys ``"ext"``,
        ``"colorspace"`` and ``"image"``. Otherwise the result of
        ``doc.extract_image(xref)``, from which the caller reads the same keys.
    """
    xref = item[0]  # xref of PDF image
    smask = item[1]  # xref of its /SMask

    # special case: /SMask or /Mask exists
    if smask > 0:
        pix0 = pymupdf.Pixmap(doc.extract_image(xref)["image"])
        if pix0.alpha:  # catch irregular situation
            pix0 = pymupdf.Pixmap(pix0, 0)  # remove alpha channel
        mask = pymupdf.Pixmap(doc.extract_image(smask)["image"])

        try:
            pix = pymupdf.Pixmap(pix0, mask)
        except Exception:
            # Fall back to the unmasked base image. Catching Exception rather
            # than using a bare except lets KeyboardInterrupt / SystemExit
            # propagate.
            pix = pymupdf.Pixmap(doc.extract_image(xref)["image"])

        # More than 3 components per pixel is written as PAM instead of PNG
        # (presumably because PNG output cannot represent it — TODO confirm).
        if pix0.n > 3:
            ext = "pam"
        else:
            ext = "png"

        return {  # create dictionary expected by caller
            "ext": ext,
            "colorspace": pix.colorspace.n,
            "image": pix.tobytes(ext),
        }

    # special case: /ColorSpace definition exists
    # to be sure, we convert these cases to RGB PNG images
    if "/ColorSpace" in doc.xref_object(xref, compressed=True):
        pix = pymupdf.Pixmap(doc, xref)
        pix = pymupdf.Pixmap(pymupdf.csRGB, pix)
        return {  # create dictionary expected by caller
            "ext": "png",
            "colorspace": 3,
            "image": pix.tobytes("png"),
        }
    return doc.extract_image(xref)


def get_images_from_page(pno, imgdir=IMG_DIR, verbose=False):
    """Write the images referenced on one page to ``imgdir``.

    Relies on notebook-level globals: ``doc`` (the open document), ``imglist``
    (receives the xref of every image reference on the page), ``xreflist``
    (xrefs already written, used to skip repeats) and the filter thresholds
    ``dimlimit``, ``abssize`` and ``relsize``.

    Files are named ``page{pno:03d}_img{image_count:03d}.{ext}``, where
    ``image_count`` starts at 1 on each page and only advances when an image
    is actually written.

    Args:
        pno: Page index passed to ``doc.get_page_images``.
        imgdir: Folder the image files are written to.
        verbose: When True, print the page's image list and a line per image
            entry visited. Off by default because that output is one or more
            lines for every page of the document.

    Returns:
        None
    """
    image_list = doc.get_page_images(pno)
    if verbose:
        print(image_list)
    imglist.extend([x[0] for x in image_list])
    image_count = 1  # numbering of saved images restarts on every page

    for img in image_list:
        if verbose:
            print("found an image")
        xref = img[0]
        if xref in xreflist:  # already written while processing an earlier entry
            continue

        width = img[2]
        height = img[3]
        if min(width, height) <= dimlimit:
            continue

        image = recoverpix(doc, img)
        n = image["colorspace"]
        imgdata = image["image"]

        if len(imgdata) <= abssize:
            continue
        # Ratio of the stored byte size to width * height * colorspace value
        if len(imgdata) / (width * height * n) <= relsize:
            continue

        # Naming the image with page number and image number within the page
        imgfile = os.path.join(imgdir, f"page{pno:03d}_img{image_count:03d}.{image['ext']}")
        with open(imgfile, "wb") as fout:
            fout.write(imgdata)

        xreflist.append(xref)
        image_count += 1

In [None]:
# Extract the images of every page. xreflist and imglist are the module-level
# lists that get_images_from_page fills, so they are reset before each full run.
t0 = time.time()
xreflist = []  # xrefs of the images actually written to disk
imglist = []  # xrefs of every image reference seen on any page
page_count = doc.page_count  # number of pages


# Page indices are 0-based: start at 0 so the first page is scanned too
# (range(1, page_count) never looked at page index 0).
for pno in range(page_count):
  get_images_from_page(pno, IMG_DIR)
t1 = time.time()
imglist = list(set(imglist))  # drop duplicate xrefs
print(len(imglist), "images in total")
print(len(xreflist), "images extracted")
print("total time %g sec" % (t1 - t0))

[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(5327, 0, 896, 643, 8, 'DeviceRGB', '', 'X1', 'DCTDecode')]
found an image
[(5332, 0, 896, 430, 8, 'DeviceRGB', '', 'X2', 'FlateDecode')]
found an image
[]
[(5335, 0, 661, 502, 8, 'DeviceRGB', '', 'Im3', 'DCTDecode'), (5336, 0, 671, 548, 8, 'DeviceRGB', '', 'Im4', 'DCTDecode')]
found an image
found an image
[]
[(5343, 0, 1120, 389, 8, 'DeviceRGB', '', 'X1', 'DCTDecode')]
found an image
[]
[(5345, 0, 771, 301, 8, 'DeviceRGB', '', 'Im6', 'FlateDecode')]
found an image
[(5349, 0, 896, 633, 8, 'DeviceRGB', '', 'X1', 'DCTDecode')]
found an image
[]
[]
[]
[]
[(5353, 0, 460, 246, 8, 'DeviceRGB', '', 'X1', 'DCTDecode'), (5354, 0, 502, 246, 8, 'DeviceRGB', '', 'X2', 'DCTDecode'), (5355, 0, 491, 246, 8, 'DeviceRGB', '', 'X3', 'DCTDecode'), (5356, 0, 490, 246, 8, 'DeviceRGB', '', 'X4', 'DCTDecode')]
found an image
found an image
found an image
found an image
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[(8063, 0, 181, 107, 8, 'DeviceRGB', '', 'X1',

In [None]:
# De-duplicate the collected image xrefs (idempotent; the extraction cell above applies the same step).
imglist = list(set(imglist))