In [1]:
import pymupdf
import json
from tqdm import tqdm
from openai import OpenAI
import copy
import re
import io
import os
import sys
import time

import fitz
from pdf2image import convert_from_path
from PIL import Image

In [2]:
BOOK_PATH = "../../files/data-science_book/data-science.pdf"
TEX_PATH = "../../files/data-science_book/outputs/data-science_indexed.tex"
# IMG_DIR = "/content/drive/My Drive/pdf2latex/new_approach_test/images"  # found images are stored in this subfolder
OUTPUT_TEX_FILE = "../../files/data-science_book/outputs/data-science_cleaned.tex"

In [None]:
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("API_KEY")

In [None]:
OPENAI_API_KEY = api_key
client = OpenAI(api_key = OPENAI_API_KEY)

In [4]:
doc = pymupdf.open(BOOK_PATH)
doc.page_count

456

In [5]:
with open(TEX_PATH, 'r') as file:
    tex_file_contents = file.read()

In [6]:
tex_file_contents



In [7]:
def get_page_text_data(page_number, span_counter, text_data, doc):
    page = doc[page_number]
    # print(page)

    # Read page text as a dictionary, suppressing extra spaces in CJK fonts
    blocks = page.get_text("dict", flags=0)["blocks"]
    # print(blocks)
    line_number_in_page = 0
    span_number_in_page = 0
    # print("--Old--")
    for block_number, b in enumerate(blocks):  # Iterate through the text blocks
        span_number_in_block = 0  # Initialize span counter for the block

        # print(b["lines"])

        for l in b["lines"]:  # Iterate through the text lines
            # print(l)
            line_number_in_page += 1
            span_number_in_line = 0  # Initialize span counter for the line
            # print("Spans : "+ str(len(l["spans"])))
            for s in l["spans"]:  # Iterate through the text spans
                 # Create a deep copy of the original span dictionary to preserve all its properties
                span_data = copy.deepcopy(s)

                # Temporary removal to check hwo it works
                del span_data["size"]
                # del span_data["flags"]
                del span_data["bidi"]
                del span_data["char_flags"]
                del span_data["ascender"]
                del span_data["descender"]
                del span_data['origin']
                del span_data['bbox']
                del span_data['color']
                del span_data['font']




                # Add additional properties if needed
                # span_data["page_number"] = page_number
                # span_data["span_number_overall"] = span_counter
                # span_data["span_number_in_line"] = span_number_in_line
                # span_data["span_number_in_block"] = span_number_in_block
                # span_data["span_number_in_page"] = span_number_in_page
                # span_data["block_number"] = block_number

                # Extract and store bounding box information
                # x0, y0, x1, y1 = span_data["bbox"]
                # span_data["indent_left"] = x0
                # span_data["indent_top"] = y0
                # span_data["x1"] = x1
                # span_data["y1"] = y1

                # Decompose flags to determine font styles
                decomposed_flags = flags_decomposer(span_data["flags"])
                span_data["is_italic"] = "italic" in decomposed_flags
                span_data["is_bold"] = "bold" in decomposed_flags
                span_data["is_superscript"] = "superscript" in decomposed_flags

                del span_data["flags"]

                # Append the dictionary to the text_data list
                text_data.append(span_data)
                # Increase the overall counters
                span_counter += 1
                span_number_in_line += 1  # Increase the span counter within the line
                span_number_in_block += 1  # Increase the span counter within the block
                span_number_in_page += 1
    # print("---Old End---")
    return text_data, span_counter

def flags_decomposer(flags):
    """Make font flags human readable."""
    l = []
    if flags & 2 ** 0:
        l.append("superscript")
    if flags & 2 ** 1:
        l.append("italic")
    if flags & 2 ** 2:
        l.append("serifed")
    else:
        l.append("sans")
    if flags & 2 ** 3:
        l.append("monospaced")
    else:
        l.append("proportional")
    if flags & 2 ** 4:
        l.append("bold")
    return ", ".join(l)

In [8]:
text_data = []
span_counter = 0
for i in range(16, 16+16):
  text_data, span_counter = get_page_text_data(i, span_counter, text_data, doc)
  # text_data, span_counter = get_page_text_data(i, span_counter, doc)

print("Length : " + str(len(text_data)))

Length : 772


In [9]:
page_breaks = re.findall(r'%---- Page End Break Here ---- Page : (\d+)', tex_file_contents)
page_positions = {int(page): pos.start() for page, pos in zip(page_breaks, re.finditer(r'%---- Page End Break Here ---- Page : \d+', tex_file_contents))}


In [10]:
first_page_command = """
You will receive an unformatted LaTeX (.tex) file part of a book along with a separate JSON file containing formatting instructions.  
Your task is to format the LaTeX file part according to the JSON data while ensuring proper structure and presentation for a book.  

### **Formatting Guidelines:**  

**1. Apply JSON Formatting Instructions:**  
   - Modify only the necessary parts based on JSON data.  
   - Do **not** make arbitrary changes—only apply specified formatting corrections.  

**2. Book Structure:**  
   - Organize content into proper **chapters, sections, and subsections** only if explicitly marked in the `.tex` file.  
   - **Do not assume chapter starts based on recurring text** (e.g., headers repeated on every page).  
   - If chapter names and numbers appear on every page in the JSON, **ignore them** when determining chapter breaks.  
   - **Remove hardcoded numbering** for chapters and sections, allowing LaTeX to handle it automatically.  
   - Make the Contents Page dynamically if contents is present in the .tex file part. Do not hardcode the table of contents.

**3. Image Handling:**  
   - Convert all instances of `\includegraphics{}` into a proper `figure` environment:  

**4. Table Formatting:**  
   - Ensure tables are properly structured with appropriate spacing, alignment, and captions for readability.  

**5. Italics Handling:**  
   - Apply italics **only** to content explicitly marked as italicized in the JSON data.  

**6. Document Setup:**  
   - This is the **first part of the book**, so include **all necessary LaTeX imports and the document class**.  
   - **Do not modify LaTeX package imports unless explicitly required in the JSON file.** 
   - Do **not** manually start or end the document unless such commands are explicitly present.  

**7. Strict Output Requirements:**  
   - The output **must be pure LaTeX code**—**no explanations, comments, or markdown syntax.**  
   - The formatted output will be **directly appended** to the `.tex` file, so it must be immediately compilable.  

**8. Accuracy and Consistency:**  
   - Since the book is processed in parts, formatting should be **consistent across all sections**.  
   - **Do not introduce new formatting styles** that conflict with previous or upcoming sections.  
   - Ensure that all content is preserved and formatted correctly—no missing text, no misinterpretations.  

**Final Note:**  
Errors in formatting can **significantly affect the compiled document.** Ensure precise execution of all instructions while preserving the document's original meaning and intent.  
  
"""



next_pages_prompt = """
You will receive a portion of a LaTeX (.tex) file part of a book along with a separate JSON file containing formatting instructions.  
Your task is to format this LaTeX file part according to the provided JSON data while maintaining consistency with previous sections.  

### **Formatting Guidelines:**  

**1. Apply JSON Formatting Instructions:**  
   - Modify only the necessary parts as specified in the JSON data.  
   - Do **not** assume formatting—only apply explicit corrections.  

**2. Maintain Book Structure:**  
   - Organize content into proper **chapters, sections, and subsections** only if explicitly marked in the `.tex` file.  
   - **Do not assume chapter starts based on recurring text** (e.g., headers repeated on every page).  
   - If chapter names and numbers appear on every page in the JSON, **ignore them** when determining chapter breaks.  
   - **Remove hardcoded numbering** on chapters, sections and subsections and rely on LaTeX’s automatic numbering system strictly.  
   - Make the Contents Page dynamically if contents is present in the .tex file part. Do not hardcode the table of contents.
**3. Image Handling:**  
   - Convert `\includegraphics{}` into a properly formatted `figure` environment:  


**4. Table Formatting:**  
   - Ensure tables are properly structured, aligned, and formatted for readability.  

**5. Italics Handling:**  
   - Apply italics **only** to content explicitly marked as italicized in the JSON data.  

**6. Document Integrity:**  
   - **Do not add any LaTeX preamble, document class, or import statements.**  
   - **Do not modify LaTeX package imports unless explicitly required in the JSON file.** 
   - **Do not include `\begin{document}` or `\end{document}`** unless explicitly present in the provided `.tex` file.  

**7. Strict Output Requirements:**  
   - The output **must be pure LaTeX code**—no explanations, comments, or markdown syntax.  
   - The formatted output will be **directly appended** to an existing `.tex` file, so it must be immediately compilable.  

**8. Accuracy and Consistency:**  
   - Ensure formatting is **consistent with previous sections** of the book.  
   - **Do not introduce new formatting styles** that conflict with earlier parts.  
   - Ensure **all content is retained**, formatted correctly, and adheres to the document’s original intent.  

**Final Note:**  
Errors in formatting can **significantly impact the final compiled document.** Follow the instructions precisely to maintain a high-quality, structured LaTeX book.  

"""


"""
     ```
     \begin{figure}[h]
         \centering
         \includegraphics{filename}
         \caption{Caption text}
         \label{fig:label}
     \end{figure}
     ```  
"""


'\n     ```\n     \x08egin{figure}[h]\n         \\centering\n         \\includegraphics{filename}\n         \\caption{Caption text}\n         \\label{fig:label}\n     \\end{figure}\n     ```  \n'

In [11]:
def generate_response(command, data, prev_response, temperature=1):
  first_page_prompt = f"{command} \n {data}"
  default_page_prompt = f"""{command} \n{data}"""
  prompt_content = first_page_prompt if prev_response == "" else default_page_prompt
  response =  client.chat.completions.create(
  model="gpt-4o",
  messages=[
    {"role": "system", "content": "You are a helpful assistant. You convert PDF documents to LaTeX."},
    {"role": "user", "content": f"{prompt_content}"}
    ],
  temperature=temperature
  )
  return response.choices[0].message.content

In [12]:
book_page_data = {}
page_numbers = []
for i in range(len(doc)):
    page = doc[i]
    page_numbers.append(page.get_label())
    book_page_data[i] = page.get_text("text").replace("\n", " ")

In [13]:
page_breaks

['2',
 '3',
 '4',
 '5',
 '6',
 '7',
 '9',
 '10',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '19',
 '20',
 '21',
 '22',
 '23',
 '24',
 '25',
 '28',
 '30',
 '33',
 '34',
 '35',
 '36',
 '37',
 '38',
 '39',
 '41',
 '45',
 '46',
 '47',
 '48',
 '51',
 '52',
 '55',
 '58',
 '59',
 '60',
 '61',
 '62',
 '63',
 '64',
 '65',
 '66',
 '67',
 '68',
 '69',
 '70',
 '71',
 '72',
 '73',
 '74',
 '75',
 '76',
 '77',
 '78',
 '79',
 '80',
 '81',
 '82',
 '83',
 '84',
 '86',
 '87',
 '88',
 '89',
 '90',
 '92',
 '93',
 '97',
 '99',
 '100',
 '101',
 '102',
 '103',
 '104',
 '105',
 '107',
 '108',
 '109',
 '110',
 '111',
 '112',
 '114',
 '115',
 '116',
 '117',
 '118',
 '119',
 '122',
 '125',
 '126',
 '129',
 '133',
 '134',
 '135',
 '136',
 '137',
 '138',
 '139',
 '141',
 '142',
 '143',
 '144',
 '145',
 '147',
 '148',
 '152',
 '153',
 '156',
 '158',
 '159',
 '160',
 '161',
 '162',
 '163',
 '164',
 '168',
 '170',
 '171',
 '172',
 '174',
 '177',
 '178',
 '179',
 '180',
 '183',
 '186',
 '187',
 '188',
 '189',
 '

In [14]:
page_numbers.index(page_breaks[0])

19

In [15]:
def get_pages_data(start_indx, end_indx, doc):
    text_data = []
    span_counter = 0
    for i in range(start_indx, end_indx+1):
        text_data, span_counter = get_page_text_data(i, span_counter, text_data, doc)
    # text_data, span_counter = get_page_text_data(i, span_counter, doc)

    # print("Length : " + str(len(text_data)))
    return text_data

In [72]:
start_indx = 0

tex_start_pos = 0
tex_end_pos = 0

first_part = 1

parts = len(page_breaks)
counter = 1
for page in tqdm(page_breaks[:parts]):
    
    end_indx = page_numbers.index(page)
    text_data = get_pages_data(start_indx, end_indx, doc)

    tex_end_pos = page_positions[int(page)]
    tex_contents = tex_file_contents[tex_start_pos:tex_end_pos]

    # gpt api call
    combined_data = (
    "Below is pre-generated TeX code without proper formatting.\n\n"
    f"{tex_contents}\n\n"
    "Below is the JSON  data which contains formatting :\n\n"
    f"{text_data}"
    )
    if counter == parts:
      combined_data += "\n\n"
      combined_data += "This was the last part, close the latex document with end document. Before that, make an index using \makeindex command and similarly make a bibliography."
    else:
       combined_data += "\n\n"
       combined_data += f"This is the {counter} part of the book, do not close the latex document with end document."

    command = first_page_command if first_part==1 else next_pages_prompt
    response = generate_response(combined_data, command, "") # reversed the combined_data and command
        
    first_part = 0
    counter+=1
    with open(OUTPUT_TEX_FILE, 'a') as f:
      f.write(response + "\n")
      f.write(f"%---- Page End Break Here ---- Page : {page}\n")


    # update positions
    tex_start_pos = tex_end_pos+1
    start_indx = end_indx+1


100%|██████████| 323/323 [53:02<00:00,  9.85s/it]  


In [16]:
start_indx = 0

tex_start_pos = 0
tex_end_pos = 0

first_part = 1

parts = len(page_breaks)
counter = 1

skipper = 1
for page in tqdm(page_breaks[:parts]):

    if skipper != 10:
       skipper += 1
       continue

    skipper = 1
    
    end_indx = page_numbers.index(page)
    text_data = get_pages_data(start_indx, end_indx, doc)

    tex_end_pos = page_positions[int(page)]
    tex_contents = tex_file_contents[tex_start_pos:tex_end_pos]

    # gpt api call
    combined_data = (
    "Below is pre-generated TeX code without proper formatting.\n\n"
    f"{tex_contents}\n\n"
    "Below is the JSON  data which contains formatting :\n\n"
    f"{text_data}"
    )
    if counter == parts:
      combined_data += "\n\n"
      combined_data += "This was the last part, close the latex document with end document. Before that, make an index using \makeindex command and similarly make a bibliography."
    else:
       combined_data += "\n\n"
       combined_data += f"This is the {counter} part of the book, do not close the latex document with end document."

    command = first_page_command if first_part==1 else next_pages_prompt
    response = generate_response(combined_data, command, "") # reversed the combined_data and command
        
    first_part = 0
    counter+=1
    with open(OUTPUT_TEX_FILE, 'a') as f:
      f.write(response + "\n")
      f.write(f"%---- Page End Break Here ---- Page : {page}\n")


    # update positions
    tex_start_pos = tex_end_pos+1
    start_indx = end_indx+1


100%|██████████| 323/323 [30:39<00:00,  5.69s/it]
