In [43]:
import fitz
import os
import base64
import io
from PIL import Image
from together import Together

In [None]:
def get_image_caption(base64_image: str, mime_type: str = "image/jpeg") -> str:
    """Caption an image with a vision language model served through Together.

    Args:
        base64_image: The image bytes, base64-encoded and decoded to text
            (just the payload, not a full ``data:`` URL).
        mime_type: Media type written into the ``data:`` URL sent to the
            model. Defaults to ``"image/jpeg"``, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        The caption, assembled from the streamed response chunks. An empty
        string is returned when the stream carries no text.
    """
    # Together() is constructed without arguments; presumably it picks up
    # credentials from the environment -- confirm against the SDK docs.
    client = Together()
    prompt = "Give a suitable caption for the provided image"

    stream = client.chat.completions.create(
        model="meta-llama/Llama-Vision-Free",
        # Other vision model choices
        # Meta Llama 3.2 90B Vision Instruct Turbo $ 1.2
        # Meta Llama 3.2 11B Vision Instruct Turbo $ 0.18
        # Meta Llama Guard 3 11B Vision Turbo $ 0.18
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{mime_type};base64,{base64_image}",
                        },
                    },
                ],
            }
        ],
        stream=True,
    )

    pieces = []
    for chunk in stream:
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        content = delta.content if delta else None
        # A delta may carry no text (content is None), e.g. role-only or
        # closing chunks in OpenAI-style streams; concatenating None onto a
        # str would raise TypeError, so such chunks are skipped.
        if content:
            pieces.append(content)

    return "".join(pieces)
        

In [45]:
def table_to_markdown(table_data: list) -> str:
    """Convert a table (list of row lists) into a Markdown pipe table.

    The first row is used as the header. Cells that are ``None`` become
    empty, short rows are padded with empty cells to the widest row, ``|``
    is escaped and line breaks inside a cell are replaced by ``<br>`` so each
    table row stays on one line.

    Args:
        table_data: Rows of the table; each row is a list of cell values
            (any value is rendered with ``str``; ``None`` means empty).

    Returns:
        The Markdown table without a trailing newline, or an empty string
        when there are no rows or no columns.
    """
    if not table_data:
        return ""

    def _cell(value) -> str:
        # Render one cell so it cannot break the pipe-table syntax.
        if value is None:
            return ""
        text = str(value).strip().replace("\r\n", "\n").replace("\r", "\n")
        return text.replace("|", "\\|").replace("\n", "<br>")

    def _line(cells) -> str:
        return "| " + " | ".join(cells) + " |"

    rows = [[_cell(value) for value in (row or [])] for row in table_data]
    width = max(len(row) for row in rows)
    if width == 0:
        return ""

    # Pad ragged rows so every line has the same number of columns.
    rows = [row + [""] * (width - len(row)) for row in rows]

    header, *body = rows
    lines = [_line(header), _line(["---"] * width)]
    lines.extend(_line(row) for row in body)
    return "\n".join(lines)

In [None]:
def process_pdf(pdf_path: str) -> str:
    """
    Convert a PDF into a Markdown string, page by page.

    For every page this emits a ``## Page N`` heading (1-based), followed by
    a ``### Text`` section with the page's plain text (when the page has
    any) and a ``### Images`` section with one model-generated caption per
    embedded image (when the page has any). Tables are not rendered yet.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF (``fitz``).

    Returns:
        All headings, text blocks and captions joined by blank lines.
    """
    sections = []
    # Captions keyed by image xref: an image object reused on several pages
    # (or several times on one page) is sent to the captioning model once.
    captions_by_xref = {}

    # The context manager closes the document even if extraction or
    # captioning raises part-way through.
    with fitz.open(pdf_path) as doc:
        print(f"Processing PDF: {pdf_path} with {len(doc)} pages.")

        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            sections.append(f"\n## Page {page_num + 1}\n")

            # Extracting Text
            text = page.get_text("text").strip()
            if text:
                sections.append("### Text\n")
                sections.append(text)

            # Extracting Images and getting caption
            image_list = page.get_images(full=True)
            print(f"Page: {page_num}")
            if image_list:
                sections.append("### Images\n")

                for img in image_list:
                    # First item of each entry is the image's XREF
                    xref = img[0]

                    if xref not in captions_by_xref:
                        # extract_image returns a dict; "image" holds the raw bytes
                        image_bytes = doc.extract_image(xref)["image"]

                        # base64 text is what get_image_caption embeds in its data URL.
                        # NOTE(review): get_image_caption labels the payload as
                        # image/jpeg by default, whatever the extracted format is.
                        base64_image = base64.b64encode(image_bytes).decode("utf-8")
                        captions_by_xref[xref] = get_image_caption(base64_image)

                    sections.append(captions_by_xref[xref])

            # TODO: table extraction is not implemented. page.find_tables() is
            # PyMuPDF's heuristic detector; for complex tables check pdfplumber
            # or camelot-py. The detector call was dropped because its result
            # was discarded.

    return "\n\n".join(sections)

In [47]:
# Input PDF; a relative path, so it is resolved against the current working directory.
PDF_PATH = 'Data/Applications of Transformers.pdf'

In [48]:
# Convert the whole PDF to one Markdown string. process_pdf calls
# get_image_caption for embedded images, so this cell issues remote model requests.
output = process_pdf(PDF_PATH)

Processing PDF: Data/Applications of Transformers.pdf with 58 pages.
Page: 0
Page: 1
Page: 2
Page: 3
Page: 4
Page: 5
Page: 6
Page: 7
Page: 8
Page: 9
Page: 10
Page: 11
Page: 12
Page: 13
Page: 14
Page: 15
Page: 16
Page: 17
Page: 18
Page: 19
Page: 20
Page: 21
Page: 22
Page: 23
Page: 24
Page: 25
Page: 26
Page: 27
Page: 28
Page: 29
Page: 30
Page: 31
Page: 32
Page: 33
Page: 34
Page: 35
Page: 36
Page: 37
Page: 38
Page: 39
Page: 40
Page: 41
Page: 42
Page: 43
Page: 44
Page: 45
Page: 46
Page: 47
Page: 48
Page: 49
Page: 50
Page: 51
Page: 52
Page: 53
Page: 54
Page: 55
Page: 56
Page: 57


In [41]:
# Dump the generated Markdown for a quick visual check.
# NOTE(review): this cell's execution count (41) is lower than the process_pdf
# run above (48), so the output displayed under it is likely stale -- re-run to confirm.
print(output)


## Page 1


Venkatakrishnan E  
23110357 
Problem 1:  
 
Discuss why Diffusion Maps work well for time-series clustering. 
 
●​ Diffusion maps capture the underlying manifold structure of high dimensional 
time series data 
●​ It constructs a graph based representation of the data, where we group points 
based on local similarity(which is basically the DTW distance), the edges of the 
graph are the transition probabilities​
 
●​ We can find non linear temporal patterns with the help of diffusion maps 
●​ Time-series data often contain noise and non-linear variations, but diffusion maps 
mainly focuses on the global structure thus reducing the impact of noise 
●​ Diffusion Maps reduce the dimensionality while preserving meaningful distances. 
​
 
Why Diffusion Maps outperform PCA/t-SNE? 
 
PCA assumes a linear structure, but the time series data generally is non linear, this is 
evident from the lower ARI and silhouette score compared to diffusion maps. 
t-SNE is designed for visualiza

In [None]:
# with open("output.md", "w", encoding="utf-8") as f:
#     f.write(output)

In [49]:
# Save the generated Markdown as UTF-8 text; "w" mode overwrites any existing transformers.md.
with open("transformers.md", "w", encoding="utf-8") as f:
    f.write(output)