In [1]:
import fitz
import os
import base64
import io
import pdfplumber
from PIL import Image
from together import Together
from typing import List

In [2]:
def get_image_caption(base64_image: str, mime_type: str = "image/jpeg") -> str:
    """Caption an image with a Together-hosted vision language model.

    Args:
        base64_image: The image bytes, base64-encoded and decoded to text
            (e.g. ``base64.b64encode(data).decode("utf-8")``).
        mime_type: MIME type written into the ``data:`` URL sent to the
            model. Defaults to ``"image/jpeg"``, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The caption assembled from the streamed response chunks, or an
        empty string when the stream carries no text.
    """
    client = Together()
    prompt = "Give a suitable caption for the provided image"

    stream = client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        # Other vision model choices
        # Meta Llama 3.2 90B Vision Instruct Turbo $ 1.2
        # Meta Llama 3.2 11B Vision Instruct Turbo $ 0.18
        # Meta Llama Guard 3 11B Vision Turbo $ 0.18
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        # A chunk may carry no text (content is None); concatenating None
        # onto the caption would raise TypeError, so skip those chunks.
        content = getattr(delta, "content", None) if delta else None
        if content:
            pieces.append(content)

    return "".join(pieces)

In [3]:
def process_pdf(pdf_path: str) -> str:
    """
        Processes a PDF page by page, extracting the text and a caption
        for every embedded image, and returns a Markdown string.

        Each page contributes a "## Page N" heading (1-based), an optional
        "### Text" section and an optional "### Images" section holding one
        caption per image, produced by `get_image_caption`.

        Tables are not emitted by this version.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            All collected fragments joined with blank lines.
    """
    final_doc = []

    # The context manager releases the document handle even if a page or a
    # captioning call raises part-way through.
    with fitz.open(pdf_path) as doc:
        print(f"Processing PDF: {pdf_path} with {len(doc)} pages.")

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_content = [f"\n## Page {page_num + 1}\n"]

            # Extracting Text
            text = page.get_text("text").strip()
            if text:
                page_content.append("### Text\n")
                page_content.append(text)

            # Extracting Images and getting caption
            image_list = page.get_images(full=True)
            print(f"Page: {page_num}")
            if image_list:
                page_content.append("### Images\n")

                for img in image_list:
                    # The first entry of each image record is its XREF.
                    xref = img[0]

                    # extract_image returns a dict; "image" holds the raw bytes.
                    image_bytes = doc.extract_image(xref)["image"]

                    # Base64 text is what get_image_caption embeds in its request.
                    base64_image = base64.b64encode(image_bytes).decode("utf-8")

                    page_content.append(get_image_caption(base64_image))

            # TODO: table extraction. The previous per-page find_tables() call
            # discarded its result, so it was dropped until the table-to-text
            # conversion exists.

            final_doc.extend(page_content)

    return "\n\n".join(final_doc)

In [4]:
# Path of the PDF parsed by the cells below (relative, so it is resolved
# against the notebook's working directory).
PDF_PATH = 'Data/Applications of Transformers.pdf'

In [5]:
# Run the extraction on the sample PDF; the result is printed and written to
# disk by later cells.
output = process_pdf(PDF_PATH)

Processing PDF: Data/Applications of Transformers.pdf with 58 pages.
Page: 0
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
Page: 46
Page: 47
Page: 48
Page: 49
Page: 50
Page: 51
Page: 52
Page: 53
Page: 54
Page: 55
Page: 56
Page: 57


In [6]:
# Show the extracted Markdown as plain text for a quick visual check.
print(output)


## Page 1


### Text


This work has been submitted to the Expert Systems With Applications journal (Elsevier) for
possible publication
A COMPREHENSIVE SURVEY ON APPLICATIONS OF
TRANSFORMERS FOR DEEP LEARNING TASKS
Saidul Islam1, Hanae Elmekki1, Ahmed Elsebai1, Jamal Bentahar1,2,∗, Najat Drawel 1, Gaith Rjoub3,1, Witold Pedrycz4,5,6,7
1Concordia Institute for Information Systems Engineering, Concordia University, Montreal, Canada
2Department of Electrical Engineering and Computer Science, Khalifa University, Abu Dhabi, UAE
3King Hussein School of Computing Sciences, Princess Sumaya University for Technology, Jordan
4Department of Electrical and Computer Engineering, University of Alberta, Edmonton, Canada
5Systems Research Institute, Polish Academy of Sciences, Warsaw, Poland
6Department of Computer Engineering, Istinye University, Sariyer/Istanbul, Turkiye
7Department of Electrical and Computer Engineering, King Abdulaziz University, Jeddah, Saudi Arabia
∗Corresponding Author’s Email

In [None]:
# with open("output.md", "w", encoding="utf-8") as f:
#     f.write(output)

In [49]:
# Persist the extracted Markdown. The encoding is set explicitly because the
# PDF text contains non-ASCII characters.
with open("transformers.md", "w", encoding="utf-8") as f:
    f.write(output)

In [None]:
def process_pdf(pdf_path: str) -> str:
    """
        Processes a PDF, extracts text, images (gets captions), and tables,
        and returns a Markdown string.

        Each page contributes a "## Page N" heading (1-based) followed by
        optional "### Text", "### Images" and "### Table" sections. Image
        captions come from `get_image_caption`; each table's raw rows are
        converted by `process_table`, which must be defined before this
        function is called.

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            All collected fragments joined with blank lines.
    """
    final_doc = []

    # The context manager releases the document handle even if a page, a
    # captioning call or a table conversion raises part-way through.
    with fitz.open(pdf_path) as doc:
        print(f"Processing PDF: {pdf_path} with {len(doc)} pages.")

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            page_content = [f"\n## Page {page_num + 1}\n"]

            # Extracting Text
            text = page.get_text("text").strip()
            if text:
                page_content.append("### Text\n")
                page_content.append(text)
                page_content.append("\n")

            # Extracting Images and getting caption
            image_list = page.get_images(full=True)
            print(f"Page: {page_num}")
            if image_list:
                page_content.append("### Images\n")

                for img in image_list:
                    # The first entry of each image record is its XREF.
                    xref = img[0]

                    # extract_image returns a dict; "image" holds the raw bytes.
                    image_bytes = doc.extract_image(xref)["image"]

                    # Base64 text is what get_image_caption embeds in its request.
                    base64_image = base64.b64encode(image_bytes).decode("utf-8")

                    page_content.append(get_image_caption(base64_image))
                    page_content.append("\n")

            # Extracting tables
            tables = page.find_tables()
            if tables.tables:
                page_content.append("### Table\n")
                for table in tables.tables:
                    data = table.extract()  # List[List[str]]
                    # Was `process_tables`, a name that is defined nowhere in
                    # the notebook; the helper is called `process_table`.
                    page_content.append(process_table(data))
                    page_content.append("\n")

            final_doc.extend(page_content)

    # Join the fragments so the result matches the declared `str` return type
    # (the raw fragment list used to be returned instead).
    return "\n\n".join(final_doc)

In [8]:
# NOTE(review): the recorded output below ("37") is not printed by any
# `process_pdf` definition visible in this notebook - presumably left over
# from an earlier revision; re-run on a fresh kernel to confirm.
process_pdf(pdf_path=PDF_PATH)

Processing PDF: Data/Applications of Transformers.pdf with 58 pages.
37


In [9]:
def process_pdf(pdf_path: str) -> list:
    """Collect the raw cell data of every table found in a PDF.

    Note: this redefines `process_pdf`; once this cell has run, the name no
    longer refers to the Markdown-producing versions defined earlier.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        One entry per detected table, in page order. Each entry is whatever
        `table.extract()` returns (a list of rows, each a list of cell
        values).
    """
    all_tables = []

    # The context manager releases the document handle even if table
    # detection raises on some page.
    with fitz.open(pdf_path) as doc:
        print(f"Processing PDF: {pdf_path} with {len(doc)} pages.")

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            found = page.find_tables()

            if found.tables:
                for table in found.tables:
                    all_tables.append(table.extract())  # List[List[str]]

    print(f"Extracted {len(all_tables)} tables.")
    return all_tables

In [10]:
# Uses the table-only `process_pdf` from the cell above, which shadows the
# earlier Markdown-returning definitions of the same name.
tables = process_pdf(PDF_PATH)

Processing PDF: Data/Applications of Transformers.pdf with 58 pages.
Extracted 43 tables.


In [14]:
# Inspect the first extracted table: a list of rows in which `None` marks a
# cell with no extracted value.
tables[0]

[['Fields of\nApplication',
  'Keywords for Paper Search',
  'Tasks Of Application',
  None,
  'Number of papers',
  None],
 [None,
  None,
  None,
  None,
  'Relevant models\nusing keywords',
  'Selected models\nfor Taxonomy'],
 ['Natural Language\nProcessing',
  '“Natural Language Processing”,\n“NLP”,“Text”,“Text Processing”,\n“Transformer”, “Attention”,\n“Self-attention”, “multi-head\nattention”, “Language model”.',
  'Language Translation',
  None,
  '257',
  '25'],
 [None, None, 'Text Classification & Segmentation', None, None, None],
 [None, None, 'Question Answering', None, None, None],
 [None, None, 'Text Summarization', None, None, None],
 [None, None, 'Text Generation', None, None, None],
 [None, None, 'Natural Language Reasoning', None, None, None],
 [None, None, 'Automated Symbolic\nReasoning', None, None, None],
 ['Computer Vision',
  '“Transformer”,“Attention”,\n“Self-attention”,“Image”,\n“Natural image”,“medical\nimage”,“Biomedical”,\n“health”,“Image processing”,\n“Compu

In [15]:
import json
# First extracted table, used as the sample input for the LLM formatting
# experiment in the following cells.
raw_table = tables[0]

# Serialize it as a JSON-like string (so LLM can parse it easily)
# `None` cells are written as `null` in the JSON text.
table_str = json.dumps(raw_table, indent=2)

In [18]:
# Chat prompt asking the model to turn the serialized table into Markdown.
# NOTE: the f-string is evaluated once, when this cell runs, so the prompt
# embeds the value `table_str` has at that moment; it does not pick up later
# changes to `table_str`.
messages = [ {"role": "system", "content": "You are a Markdown formatting assistant."},
    {
        "role": "user",
        "content": f"""
I have extracted a table from a PDF using OCR. It is in the form of a nested list of rows (some cells are `null` meaning continuation of above cell). Please convert this into a clean, readable markdown table.

If some cells are meant to span multiple rows, fill in the blanks based on context. Properly handle newlines inside cells too.

Here's the table:
    {table_str}
    Now return the cleaned markdown version of this table.
"""
    }
]


In [21]:
# Send the prompt built above in a single (non-streaming) chat completion.
client = Together() 

response = client.chat.completions.create(
    model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
    messages=messages
)
# The reply text of the first choice is the model's Markdown rendering.
md_table = response.choices[0].message.content
print(md_table)

### Cleaned Markdown Table
| Fields of Application | Keywords for Paper Search | Tasks Of Application | Number of papers | Selected models for Taxonomy |
| --- | --- | --- | --- | --- |
| Natural Language Processing | "Natural Language Processing", "NLP", "Text", "Text Processing", "Transformer", "Attention", "Self-attention", "multi-head attention", "Language model" | Language Translation | 257 | 25 |
|  |  | Text Classification & Segmentation |  |  |
|  |  | Question Answering |  |  |
|  |  | Text Summarization |  |  |
|  |  | Text Generation |  |  |
|  |  | Natural Language Reasoning |  |  |
|  |  | Automated Symbolic Reasoning |  |  |
| Computer Vision | "Transformer", "Attention", "Self-attention", "Image", "Natural image", "medical image", "Biomedical", "health", "Image processing", "Computer vision", "Vision" | Natural Image Processing | 197 | 27 |
|  |  |  | Image Classification |  |  |
|  |  |  | Recognition & Object Detection |  |  |
|  |  |  | Image Segmentation |  |  |
|  |

In [3]:
def process_table(table):
    """Convert one extracted table into a Markdown table using an LLM.

    The prompt is built from the `table` argument on every call. Previously
    the argument was ignored: the global `raw_table` was serialized into an
    unused local and the global `messages` prompt was sent, so every call
    formatted the same table.

    Args:
        table: Nested list of rows as returned by `table.extract()`; cells
            may be `None`, which is serialized as `null`.

    Returns:
        The text of the model's reply (expected to be a Markdown table).
    """
    table_str = json.dumps(table, indent=2)

    user_prompt = (
        "\n"
        "I have extracted a table from a PDF using OCR. It is in the form of "
        "a nested list of rows (some cells are `null` meaning continuation of "
        "above cell). Please convert this into a clean, readable markdown "
        "table.\n"
        "\n"
        "If some cells are meant to span multiple rows, fill in the blanks "
        "based on context. Properly handle newlines inside cells too.\n"
        "\n"
        "Here's the table:\n"
        f"    {table_str}\n"
        "    Now return the cleaned markdown version of this table.\n"
    )
    prompt_messages = [
        {"role": "system", "content": "You are a Markdown formatting assistant."},
        {"role": "user", "content": user_prompt},
    ]

    client = Together()
    response = client.chat.completions.create(
        model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
        messages=prompt_messages,
    )
    return response.choices[0].message.content

In [22]:
# Save the model's Markdown table. The encoding is set explicitly, as for the
# other Markdown export in this notebook, so non-ASCII characters in the
# reply are written as UTF-8 on every platform instead of in the
# locale-dependent default.
with open("table_trial.md", "w", encoding="utf-8") as f:
    f.write(md_table)