In [1]:
import pathlib
import textwrap
import google.generativeai as genai
# from google.colab import userdata
from IPython.display import display
from IPython.display import Markdown
import PIL.Image
import google.ai.generativelanguage as glm

In [2]:
def to_markdown(text):
    text = text.replace('â€¢', '  *')
    return Markdown(textwrap.indent(text, '> ', predicate=lambda _: True))

In [3]:
genai.configure(api_key= 'AIzaSyC1GWn9MIie45OvptabsXjHOXINlbA_0mk')

In [4]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [10]:
%%time
model = genai.GenerativeModel('gemini-pro')
response = model.generate_content("What is Markov Chain Rule")

CPU times: total: 0 ns
Wall time: 8.42 s


In [11]:
response

response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=glm.GenerateContentResponse({
      "candidates": [
        {
          "content": {
            "parts": [
              {
                "text": "**Markov Chain Rule**\n\nThe Markov Chain Rule states that the probability of a sequence of events occurring in a Markov chain is equal to the product of the probabilities of each event conditioned on the previous event in the sequence.\n\n**Formula:**\n\n```\nP(X_1, X_2, ..., X_n) = P(X_1) * P(X_2 | X_1) * P(X_3 | X_2) * ... * P(X_n | X_{n-1})\n```\n\nwhere:\n\n* X_1, X_2, ..., X_n are the events in the sequence.\n* P(X_1) is the probability of the first event.\n* P(X_i | X_{i-1}) is the conditional probability of event X_i occurring given that event X_{i-1} occurred previously.\n\n**Assumptions:**\n\n* The Markov chain is memoryless, meaning that the probability of a future event depends only on the most recent event in the sequence and not on any previous ev

In [12]:
to_markdown(response.text)


> **Markov Chain Rule**
> 
> The Markov Chain Rule states that the probability of a sequence of events occurring in a Markov chain is equal to the product of the probabilities of each event conditioned on the previous event in the sequence.
> 
> **Formula:**
> 
> ```
> P(X_1, X_2, ..., X_n) = P(X_1) * P(X_2 | X_1) * P(X_3 | X_2) * ... * P(X_n | X_{n-1})
> ```
> 
> where:
> 
> * X_1, X_2, ..., X_n are the events in the sequence.
> * P(X_1) is the probability of the first event.
> * P(X_i | X_{i-1}) is the conditional probability of event X_i occurring given that event X_{i-1} occurred previously.
> 
> **Assumptions:**
> 
> * The Markov chain is memoryless, meaning that the probability of a future event depends only on the most recent event in the sequence and not on any previous events.
> * The transition probabilities between states are constant over time.
> 
> **Uses:**
> 
> The Markov Chain Rule is used in various applications, including:
> 
> * Modeling sequential data
> * Predicting future events based on past events
> * Simulating random processes
> * Analyzing queuing systems
> * Solving optimization problems
> * Natural language processing
> 
> **Example:**
> 
> Consider a Markov chain with two states, A and B. The transition probabilities are:
> 
> ```
> P(A | A) = 0.6
> P(B | A) = 0.4
> P(A | B) = 0.3
> P(B | B) = 0.7
> ```
> 
> The probability of the sequence A -> A -> B is:
> 
> ```
> P(A, A, B) = P(A) * P(A | A) * P(B | A) = 0.5 * 0.6 * 0.4 = 0.12
> ```

In [13]:
response.prompt_feedback




In [14]:
model = genai.GenerativeModel('gemini-pro-vision')
img = PIL.Image.open('trial_img.png')

response = model.generate_content(img)
to_markdown(response.text)

>  **Proximity Routing:**
> - Chooses the path with the least delay for message forwarding between nodes.
> - Finding optimal paths is complex (NP-hard).
> - Heuristic approach: forward to the closest neighbor.
> - May increase routing path length.
> - Well suited for Pastry networks.
> 
> **Proximity Neighbor Selection:**
> - Nodes prioritize physically close neighbors in the overlay network, while respecting identifier constraints.
> - Example in Chord network: choosing the closest successor within a specific range.
> 
> **Landmark clusters with dimensionality reduction:**
> - Measures distance to landmarks to create a position vector.
>  - Uses space-filling curves to map high-dimensional data to lower dimensions while preserving proximity.
>  - Limited accuracy for very close nodes, can be improved with Round-Trip Time (RTT) measurement.
> - Easier to optimize in unstructured overlays due to more flexible neighbor relationships.
> 
> Each of these approaches leverages network proximity information to enhance the efficiency and performance of structured P2P overlay networks. By exploiting proximity-aware techniques in neighbor selection, routing optimization, and data replication/placement, these networks can achieve better scalability, reliability, and responsiveness, making them suitable for a wide range of distributed applications and services.

In [35]:
for m in genai.list_models():
    if 'generateContent' in m.supported_generation_methods:
        print(m.name)

models/gemini-1.0-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-latest
models/gemini-1.0-pro-vision-latest
models/gemini-1.5-flash-latest
models/gemini-1.5-pro-latest
models/gemini-pro
models/gemini-pro-vision


In [36]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')


In [37]:
response = model.generate_content(["Extract all the text from the image. Remove bulletin points, markdowns and rephrase those as paragraphs", img], stream=True)
response.resolve()
# to_markdown(response.text)
print(response.text)

SAFETY

Recommended CRS Position as per the Vehicle Matrix

The suitability of seat position for carriage of children and recommended category of CRS is shown in the table below as per the child group.

X - Seat Position not suitable for children in this age group.

U - Suitable for "universal" category restraints approved for use in this age group.

L - Suitable for particular child restraints. These restraints may be of the specific vehicle, restricted or semi-universal categories.

Universal is a category in the AIS072 / ECE R44 norm.


If a child is seated in the front seat it may cause serious injury or even death during any collision. 



In [39]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')
img = PIL.Image.open('img_3.png')

response = model.generate_content(["Extract all the text from the image provided to you irrespective of the word limits. Remove bulletin points and markdowns if the image has any and rephrase those as paragraphs", img], stream=True)
response.resolve()
to_markdown(response.text)
# print(response.text)

> ## IMPORTANT INFORMATION 
> 
> In this Owner's Manual, you will find the text

In [48]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')
img = PIL.Image.open('img_4.png')

response = model.generate_content(["Extract all the text from the image provided to you irrespective of the word limits. If there is a table in the image, then understand and rephrase the entire table as a paragraph. Remove bulletin points and markdowns if the image has any and rephrase those as paragraphs. Don't give the table.", img], stream=True)
response.resolve()
to_markdown(response.text)
# print(response.text)

> SAFETY
> 
> Recommended CRS Position as per the Vehicle Matrix
> The suitability of seat position for carriage of children and recommended category of CRS is shown in the table below as per the child group. 
> 
> X - Seat Position not suitable for children in this age group. 
> U - Suitable for "universal" category restraints approved for use in this age group.
> L - Suitable for particular child restraints. These restraints may be of the specific vehicle, restricted or semi-universal categories. 
> Universal is a category in the AIS072 / ECE R44 norm. 
> 
> WARNING
> 
> If a child is seated in the front seat it may cause serious injury or even death during any collision. 
> 
> The table below shows recommended CRS positions for different age and weight categories.  A "U" denotes that the position is suitable for "Universal" category restraints, "U,L" denotes that the position is suitable for "Universal" and "L" category restraints. "X" denotes unsuitable, "0" denotes age groups up to 10 kilograms, "0+" denotes up to 13 kilograms, "I" denotes 9 to 18 kilograms, "II" denotes 15 to 25 kilograms and "III" denotes 22 to 36 kilograms.  The table also indicates the suitability of seating the child in the front passenger seat or in the rear seats, on either the left or right hand side.  It also states that this information is only relevant to vehicles within the ECE R44 standard. 


In [10]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')
img = PIL.Image.open('F:\\psg\\bosch_hackathon\\input_data\\engine.jpeg')

response = model.generate_content(["""
                                   Summarize the image provided to you in a paragraph.
                                   Try to summarize the image in a detailed manner.""", img], 
                                   stream=True)
response.resolve()
to_markdown(response.text)
# print(response.text)

> The image shows a diagram of a car engine with various components labeled with numbers. The engine is a Petrol Engine (Kappa 1.2 MPI). The components are identified as: 1, 2, 3, 4, 5, 6, 7, 8, and 9. The text below the diagram states that the actual engine room in the vehicle may differ from the illustration.  The image provides a detailed view of the engine layout, which is a helpful visual guide for understanding the engine's components.  It's important to remember that this is just a representation, and the actual engine may look slightly different. 


In [136]:
# models/gemini-1.0-pro
# models/gemini-1.0-pro-001
# models/gemini-1.0-pro-latest
# models/gemini-1.0-pro-vision-latest
# models/gemini-1.5-flash-latest
# models/gemini-1.5-pro-latest
# models/gemini-pro
# models/gemini-pro-vision

model = genai.GenerativeModel('gemini-1.5-flash-latest')
img = PIL.Image.open('img_5.png')

response = model.generate_content(["""
                                   Extract the entire text from the image and try to rephrase it as paragraphs. 
                                   Extract all the text inside note and warning also. 
                                   Provide the complete text from the image.
                                   Do not restrict to any kind of word limit. Just extract all the text.
                                   If there is a table in the image, then understand the table contents and rephrase the entire table as a paragraph. 
                                   Don't give the table.
                                   Also extract the page number that will be in either bottom left or bottom right of the image. Provide it at the start as Page Number is ...""", img], 
                                   stream=True)
response.resolve()
# to_markdown(response.text)
print(response.text)

Page Number is 10

**SAFETY**

If your vehicle is equipped with a front passenger Airbag (PAB) and does not have PAB deactivation switch, do not install a rear-facing CRS in the front passenger seat. If the PAB inflates, a child in a rear facing CRS could be seriously injured or killed.

If you install a CRS in the rear seat, slide the front seat far enough forward so that the child's feet do not touch the front seat back. This will help avoid injury to the child in the event of a collision.

**NOTE**
Children could be endangered in a collision if their CRS is not properly secured in the vehicle. Be sure to secure the child in the restraint system according to the manufacturer's instructions. 

Do not use an infant carrier or a child safety seat that "hooks" over a seat-back, it will not provide adequate protection in a collision.

After a collision, we recommend to get seat belts, seats, ISOFIX and top-tether anchorages (as may be applicable) investigated at TATA MOTORS Authorised ser

In [116]:
from PIL import Image
import io
import fitz
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.styles import getSampleStyleSheet
import time

def extract_text_from_pdfs(pdf_path, model, prompt, output_pdf_path):
    styles = getSampleStyleSheet()
    document = canvas.Canvas(output_pdf_path, pagesize=letter)
    story = []

    with fitz.open(pdf_path) as doc:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap()
            img = PIL.Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            response = model.generate_content(
                [prompt, img], stream=True
            )
            response.resolve()
            text = response.text
            story.append(text)

            # Add text to PDF
            document.drawString(10, 800 - (page_num * 20), text)  # Adjust y position for each page
            document.showPage()
            time.sleep(2)

    document.save()
    return story

In [117]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')
pdf_path = "F:\\psg\\bosch\\CC - Question Bank Answers.pdf"
prompt = """
        Extract the entire text from the image and try to rephrase it as paragraphs. 
        Extract all the text inside note and warning also. 
        Provide the complete text from the image.
        If there is a table in the image, then understand the table contents and rephrase the entire table as a paragraph. 
        Don't give the table.
        Also extract the page number that will be in either bottom left or bottom right of the image. Provide it at the start as Page Number is ...
"""

story = extract_text_from_pdfs(pdf_path, model, prompt, "F:\\psg\\bosch\\CC-gemini-2.pdf")

In [118]:
story[0]

'Page Number is 1\n\nCC Answers\n\n1. HDFS to distinguish it from other generic distributed file systems.\n\n- Fault Tolerance: HDFS is designed with fault tolerance as a core feature. It expects hardware failures and addresses them by:\n    - Block Replication: Automatically replicating data blocks across multiple nodes.\n    - Replica Placement: Strategically placing replicas in different locations to balance reliability and communication costs.\n    - Heartbeat and Blockreport: Using these messages to monitor the health and status of DataNodes.\n\n- High-Throughput Access: HDFS optimizes for high-throughput access to large data sets by:\n    - Large Block Size: Utilizing large block sizes (e.g., 64 MB) to reduce metadata storage and enhance streaming reads.\n    - Batch Processing Focus: Prioritizing data throughput over latency, suitable for batch processing applications with large data sets.\n\n2. Explain the HDFS read and write operations.\n\nReading a File:\n- A user sends an "o

In [149]:
model = genai.GenerativeModel('gemini-1.5-flash-latest')
img = PIL.Image.open('img_4.png')

response = model.generate_content(["""
                                   If there is no table with any columns in the page just return None and nothing more.
                                   Do not consider the Note and Warnings as tables.
                                   If there is a table in the image, then understand the table contents and rephrase the entire table as a paragraph.""", img], 
                                   stream=True)
response.resolve()
to_markdown(response.text)
# print(response.text)

> The suitability of seat position for carriage of children and recommended category of CRS is shown in the table below as per the child group. 
> X - Seat Position not suitable for children in this age group. 
> U - Suitable for "universal" category restraints approved for use in this age group. 
> L - Suitable for particular child restraints. These restraints may be of the specific vehicle, restricted or semi-universal categories. 
> Universal is a category in the AIS072 / ECE R44 norm. 
> Group 0 is suitable for children weighing up to 10 kg and aged up to 9 months. Front passenger, rear outboard left and right, and rear center are all unsuitable. 
> Group 0+ is suitable for children weighing up to 13 kg and aged up to 24 months. Front passenger is unsuitable, rear outboard left and right are unsuitable, and rear center is unsuitable. 
> Group I is suitable for children weighing 9 to 18 kg and aged 9 to 48 months. Front passenger is unsuitable, rear outboard left and right are unsuitable, and rear center is unsuitable. 
> Group II is suitable for children weighing 15 to 25 kg and aged approximately 3 to 7 years. Front passenger is unsuitable, rear outboard left and right are unsuitable, and rear center is unsuitable. 
> Group III is suitable for children weighing 22 to 36 kg and aged approximately 6 to 12 years. Front passenger is unsuitable, rear outboard left and right are unsuitable, and rear center is unsuitable.

In [29]:
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap a column."""
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'temp' has no intersections with items of 'bboxlist'.
        """
        for b in bboxlist:
            if not intersects_bboxes(temp, vert_bboxes) and (
                b == None or b == bb or (temp & b).is_empty
            ):
                continue
            return False

        return True

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        for bbox in bboxes:
            if not (bb & bbox).is_empty:
                return True
        return False

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            check = can_extend(temp, bb, bboxes)
            if check:
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b != None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning."""

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks
        start = blen - 1
        for i in range(start, -1, -1):
            bb1 = nblocks[i]
            bb0 = nblocks[i - 1]
            if bb0 == bb1:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return of no text found
    if bboxes == []:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb == None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check == True:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if check == False:
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks

In [30]:
import io
import os
import base64
import numpy as np
from PIL import Image
import pymupdf
import fitz

In [31]:
def summarize_image(image_path, page_data):
    model = genai.GenerativeModel('gemini-1.5-flash-latest')
    img = PIL.Image.open(image_path)

    response = model.generate_content([f"""
                                    Summarize the image provided to you in a paragraph.
                                    Try to summarize the image in a detailed manner.
                                    Use this data for summarization. This is the text data extracted from the pdf containing this image: {page_data}""", img], 
                                    stream=True)
    response.resolve()
    return response.text

In [32]:
path = 'F:\\psg\\bosch_hackathon\\input_data\\test_pdf.pdf'

In [33]:
COHERE_API_KEY = "g6WqGnL6XZVDQURCNwy2xtCTqEiihXr7nIZhL2UV"

In [34]:
import cohere

def get_prompt_table_summarization(page_data, table_data):
    prompt = f"""
        I want you to summarize the data extracted from the table: {table_data} using {page_data}.
        Try to understand the data and answer accordingly.
        Do not give content that is not related to the data.
        Do not use bulletin points while answering. Just summarize the table data as paragraphs.
    """

    return prompt

def generate_text(prompt, temp=0):
    co = cohere.Client(COHERE_API_KEY)
    response = co.chat(
        message=prompt,
        model="command-r",
        temperature=temp
    )

    return response.text

In [35]:
def encode_image(image_path):
    ''' Getting the base64 string '''
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')
    

def plt_img_base64(img_base64):

    # Create an HTML img tag with the base64 string as the source
    image_html = f'<img src="data:image/jpeg;base64,{img_base64}" />'

    # Display the image by rendering the HTML
    display(HTML(image_html))

In [41]:
def extract_data(file_path):
    doc = pymupdf.open(file_path)
    data = list()

    for i, page in enumerate(doc):
        print(f"Page no is {i}")
        try:
            page_data = ''
            tab_data = ''
            img_data = ''

            image_list = page.get_images(full=True)
            print(image_list)
            # tabs = page.find_tables()
            n = len(tabs.tables)

            print(f"No of tables in page is {n}")

            bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

            for rect in bboxes:
                # print(page.get_text(clip=rect, sort=True))
                page_data += page.get_text(clip=rect, sort=True)
                page_data = re.sub(r'\n\s*', ' ', page_data)
                page_data = re.sub(r' +', ' ', page_data)
                page_data = re.sub(r'[^\x20-\x7E]', '', page_data)
                page_data += '\n'

            print(page_data)
            # for i in range(0, n, 1):
            #     tab = tabs[i]
            #     lines = tab.extract()
            #     print(lines)
            #     if len(lines) > 2:
            #         print(lines)
            #         time.sleep(1)

            #         prompt = get_prompt_table_summarization(page_data, lines)
            #         tab_data += generate_text(prompt)
            #         print(tab_data)
            #         tab_data += '\n'

            page_data += '\n'
            page_data += tab_data

            if image_list:
                print(f"[+] Found a total of {len(image_list)} images in page {page_index}")
            else:
                print("[!] No images found on page", page_index)

            # file = "F:\\psg\\bosch_hackathon\\input_data\\CC - Question Bank Answers.pdf"
            # open the file
            pdf_file = fitz.open(file)


            for image_index, img in enumerate(page.get_images(full=True), start=1):
                # get the XREF of the image
                xref = img[0]
                # extract the image bytes
                base_image = pdf_file.extract_image(xref)
                image_bytes = base_image["image"]
                # get the image extension
                image_ext = base_image["ext"]
                # load it to PIL
                image = Image.open(io.BytesIO(image_bytes))
                # save it to local disk
                image.save(open(f"F:\\psg\\bosch_hackathon\\output_data\\image{page_index+1}_{image_index}.{image_ext}", "wb"))

                
                img_data += summarize_image(f"F:\\psg\\bosch_hackathon\\output_data\\image{page_index+1}_{image_index}.{image_ext}", page_data)
                encoded_val = encode_image(f"F:\\psg\\bosch_hackathon\\output_data\\image{page_index+1}_{image_index}.{image_ext}")
                plt_img_base64(encoded_val)
                print(img_data)
                
            if page_data.strip():  # Check if page_data is not empty or just whitespace
                data.append(page_data)

        except:
            print(f"No table in page {page}")
            page_data = ''
            bboxes = column_boxes(page, footer_margin=50, no_image_text=True)

            for rect in bboxes:
                # print(page.get_text(clip=rect, sort=True))
                page_data += page.get_text(clip=rect, sort=True)
                page_data = re.sub(r'\n\s*', ' ', page_data)
                page_data = re.sub(r' +', ' ', page_data)
                page_data = re.sub(r'[^\x20-\x7E]', '', page_data)

                page_data += '\n'

            # print(page_data)

            page_data += '\n'
            if page_data.strip():  # Check if page_data is not empty or just whitespace
                data.append(page_data)
            # print(data)

    return data

In [42]:
extract_data(path)

Page no is 0
[(4, 0, 580, 813, 8, 'DeviceRGB', '', 'Image1', 'DCTDecode', 0)]
No table in page page 0 of F:\psg\bosch_hackathon\input_data\test_pdf.pdf


[]