## Image Summary

This notebook prompts Gemini to generate summaries for the extracted images, which are stored as image chunks in the Qdrant DB.

In [1]:
def column_boxes(page, footer_margin=50, header_margin=50, no_image_text=True):
    """Determine bboxes which wrap the text columns of a page.

    Horizontal text blocks are collected from the page area left after
    cutting off the header and footer margins. Each block is widened to the
    right page border where nothing is in the way, and the blocks are then
    joined into larger rectangles.

    Args:
        page: page object providing ``rect``, ``get_drawings()``,
            ``get_images()``, ``get_image_rects()`` and ``get_text()``
            (presumably a PyMuPDF page; the module is used here as ``fitz``).
        footer_margin: amount subtracted from the bottom of the page
            rectangle before text is extracted.
        header_margin: amount added to the top of the page rectangle before
            text is extracted.
        no_image_text: if True, text blocks contained in an image bbox are
            ignored.

    Returns:
        list of rectangles wrapping the detected text areas, or an empty list
        if no text was found.
    """
    paths = page.get_drawings()
    bboxes = []

    # path rectangles
    path_rects = []

    # image bboxes
    img_bboxes = []

    # bboxes of non-horizontal text
    # avoid when expanding horizontal text boxes
    vert_bboxes = []

    # compute relevant page area
    clip = +page.rect
    clip.y1 -= footer_margin  # Remove footer area
    clip.y0 += header_margin  # Remove header area

    def can_extend(temp, bb, bboxlist):
        """Determines whether rectangle 'temp' can be extended by 'bb'
        without intersecting any of the rectangles contained in 'bboxlist'.

        Items of bboxlist may be None if they have been removed.

        Returns:
            True if 'bboxlist' is empty, or if 'temp' intersects neither a
            vertical-text bbox nor any item of 'bboxlist' other than 'bb'.
        """
        # An empty list imposes no constraint at all (not even the
        # vertical-text one).
        if not bboxlist:
            return True

        # This test does not depend on the list item, so do it only once.
        if intersects_bboxes(temp, vert_bboxes):
            return False

        return all(
            b is None or b == bb or (temp & b).is_empty for b in bboxlist
        )

    def in_bbox(bb, bboxes):
        """Return 1-based number if a bbox contains bb, else return 0."""
        for i, bbox in enumerate(bboxes):
            if bb in bbox:
                return i + 1
        return 0

    def intersects_bboxes(bb, bboxes):
        """Return True if a bbox intersects bb, else return False."""
        return any(not (bb & bbox).is_empty for bbox in bboxes)

    def extend_right(bboxes, width, path_bboxes, vert_bboxes, img_bboxes):
        """Extend a bbox to the right page border.

        Whenever there is no text to the right of a bbox, enlarge it up
        to the right page border.

        Args:
            bboxes: (list[IRect]) bboxes to check
            width: (int) page width
            path_bboxes: (list[IRect]) bboxes with a background color
            vert_bboxes: (list[IRect]) bboxes with vertical text
            img_bboxes: (list[IRect]) bboxes of images
        Returns:
            Potentially modified bboxes.
        """
        for i, bb in enumerate(bboxes):
            # do not extend text with background color
            if in_bbox(bb, path_bboxes):
                continue

            # do not extend text in images
            if in_bbox(bb, img_bboxes):
                continue

            # temp extends bb to the right page border
            temp = +bb
            temp.x1 = width

            # do not cut through colored background or images
            if intersects_bboxes(temp, path_bboxes + vert_bboxes + img_bboxes):
                continue

            # also, do not intersect other text bboxes
            if can_extend(temp, bb, bboxes):
                bboxes[i] = temp  # replace with enlarged bbox

        return [b for b in bboxes if b is not None]

    def clean_nblocks(nblocks):
        """Do some elementary cleaning.

        Removes consecutive duplicates, then sorts each run of consecutive
        bboxes with (almost) the same bottom coordinate by its left edge.
        """

        # 1. remove any duplicate blocks.
        blen = len(nblocks)
        if blen < 2:
            return nblocks

        # Walk backwards so that deletions never shift an index that is still
        # to be visited. Stop at index 1: item 0 has no predecessor, and
        # nblocks[0 - 1] would wrap around and compare it with the LAST item.
        for i in range(blen - 1, 0, -1):
            if nblocks[i - 1] == nblocks[i]:
                del nblocks[i]

        # 2. repair sequence in special cases:
        # consecutive bboxes with almost same bottom value are sorted ascending
        # by x-coordinate.
        y1 = nblocks[0].y1  # first bottom coordinate
        i0 = 0  # its index
        i1 = -1  # index of last bbox with same bottom

        # Iterate over bboxes, identifying segments with approx. same bottom value.
        # Replace every segment by its sorted version.
        for i in range(1, len(nblocks)):
            b1 = nblocks[i]
            if abs(b1.y1 - y1) > 10:  # different bottom
                if i1 > i0:  # segment length > 1? Sort it!
                    nblocks[i0 : i1 + 1] = sorted(
                        nblocks[i0 : i1 + 1], key=lambda b: b.x0
                    )
                y1 = b1.y1  # store new bottom value
                i0 = i  # store its start index
            i1 = i  # store current index
        if i1 > i0:  # segment waiting to be sorted
            nblocks[i0 : i1 + 1] = sorted(nblocks[i0 : i1 + 1], key=lambda b: b.x0)
        return nblocks

    # extract vector graphics
    for p in paths:
        path_rects.append(p["rect"].irect)
    path_bboxes = path_rects

    # sort path bboxes by ascending top, then left coordinates
    path_bboxes.sort(key=lambda b: (b.y0, b.x0))

    # bboxes of images on page, no need to sort them
    for item in page.get_images():
        img_bboxes.extend(page.get_image_rects(item[0]))

    # blocks of text on page
    blocks = page.get_text(
        "dict",
        flags=fitz.TEXTFLAGS_TEXT,
        clip=clip,
    )["blocks"]

    # Make block rectangles, ignoring non-horizontal text
    for b in blocks:
        bbox = fitz.IRect(b["bbox"])  # bbox of the block

        # ignore text written upon images
        if no_image_text and in_bbox(bbox, img_bboxes):
            continue

        # confirm first line to be horizontal
        line0 = b["lines"][0]  # get first line
        if line0["dir"] != (1, 0):  # only accept horizontal text
            vert_bboxes.append(bbox)
            continue

        # join the bboxes of all lines carrying more than one character
        srect = fitz.EMPTY_IRECT()
        for line in b["lines"]:
            lbbox = fitz.IRect(line["bbox"])
            text = "".join([s["text"].strip() for s in line["spans"]])
            if len(text) > 1:
                srect |= lbbox
        bbox = +srect

        if not bbox.is_empty:
            bboxes.append(bbox)

    # Sort text bboxes by ascending background, top, then left coordinates
    bboxes.sort(key=lambda k: (in_bbox(k, path_bboxes), k.y0, k.x0))

    # Extend bboxes to the right where possible
    bboxes = extend_right(
        bboxes, int(page.rect.width), path_bboxes, vert_bboxes, img_bboxes
    )

    # immediately return if no text found
    if not bboxes:
        return []

    # --------------------------------------------------------------------
    # Join bboxes to establish some column structure
    # --------------------------------------------------------------------
    # the final block bboxes on page
    nblocks = [bboxes[0]]  # pre-fill with first bbox
    bboxes = bboxes[1:]  # remaining old bboxes

    for i, bb in enumerate(bboxes):  # iterate old bboxes
        check = False  # indicates unwanted joins

        # check if bb can extend one of the new blocks
        for j in range(len(nblocks)):
            nbb = nblocks[j]  # a new block

            # never join across columns
            if bb is None or nbb.x1 < bb.x0 or bb.x1 < nbb.x0:
                continue

            # never join across different background colors
            if in_bbox(nbb, path_bboxes) != in_bbox(bb, path_bboxes):
                continue

            temp = bb | nbb  # temporary extension of new block
            check = can_extend(temp, nbb, nblocks)
            if check:
                break

        if not check:  # bb cannot be used to extend any of the new bboxes
            nblocks.append(bb)  # so add it to the list
            j = len(nblocks) - 1  # index of it
            temp = nblocks[j]  # new bbox added

        # check if some remaining bbox is contained in temp
        check = can_extend(temp, bb, bboxes)
        if not check:
            # may add bb a second time; duplicates are removed by the
            # cleaning step below
            nblocks.append(bb)
        else:
            nblocks[j] = temp
        bboxes[i] = None  # mark bb as processed

    # do some elementary cleaning
    nblocks = clean_nblocks(nblocks)

    # return identified text bboxes
    return nblocks

In [2]:
import pymupdf
import fitz

# Open the manual and gather the text of every detected column on every page
# into one flat list.
doc = pymupdf.open("F:\\psg\\bosch\\Next_Gen_Verna.pdf")
data = []

for page in doc:
    bboxes = column_boxes(page, footer_margin=50, no_image_text=True)
    for rect in bboxes:
        # sort=True requests the text of this rectangle in sorted order
        column_text = page.get_text(clip=rect, sort=True)
        data.append(column_text)

In [3]:
# Number of column text chunks collected in `data`.
len(data)

1395

In [4]:
# Inspect the first collected chunk.
data[0]

"Verna\nOWNER'S MANUAL\nOperation\nMaintenance\nSpecifications\n"

In [5]:
# Inspect the chunk at index 30.
data[30]

'ŕ\nThis warranty is the entire war-\nranty given by HMIL for Hyundai\nvehicles and no dealer or its or his \nagent or employee is authorized\nto extend or enlarge this warranty \nand no dealer or its or his agent\nor employee is authorized to\nmake any oral warranty on HMIL’s \nbehalf.\nŕ\nHMIL reserves the right to make\nany  change in design or make\nany improvement on the vehicle\nat any  time without any obliga-\ntion to make the same change on\nvehicles previously  sold.\nŕ\nHMIL reserves the right for the fi-\nnal decision in all warranty mat-\nters.\n2ZQHUŔV\x035HVSRQVLELOLWLHV\nŕ\nProper use, maintenance and care \nof vehicle in accordance with the\ninstructions contained in this\nOwner’s Manual and Service\nBooklet. If the vehicle is subject\n'

In [6]:
import os

# SECURITY: never embed the API key in the notebook. It is read from the
# COHERE_API_KEY environment variable instead; an empty string is used when
# the variable is unset, so the failure surfaces at the first API call.
# Any key that was ever committed in this file must be treated as leaked and
# rotated.
COHERE_API_KEY = os.environ.get("COHERE_API_KEY", "")


In [7]:
import cohere

def get_prompt_table_summarization(page_data, table_data):
    """Build the prompt asking the model for a prose summary of one table.

    Args:
        page_data: text of the page the table was found on; interpolated into
            the prompt as context.
        table_data: the extracted table content; interpolated into the prompt
            via its string representation.

    Returns:
        The prompt string.
    """
    prompt = f"""
        I want you to summarize the data extracted from the table: {table_data} using {page_data}.
        Try to understand the data and answer accordingly.
        Do not give content that is not related to the data.
        Do not use bullet points while answering. Just summarize the table data as paragraphs.
    """

    return prompt

def generate_text(prompt, temp=0):
    """Send `prompt` to the Cohere chat endpoint and return the reply text.

    Args:
        prompt: message passed to the chat call.
        temp: value passed as the `temperature` argument (default 0).

    Returns:
        The `text` attribute of the chat response.
    """
    # A new client is created on every call, authenticated with COHERE_API_KEY.
    client = cohere.Client(COHERE_API_KEY)
    reply = client.chat(model="command-r", message=prompt, temperature=temp)
    return reply.text

In [8]:
import time

In [19]:
def _page_column_text(page):
    """Return the page text column by column, each column followed by a newline."""
    return "".join(
        page.get_text(clip=rect, sort=True) + "\n"
        for rect in column_boxes(page, footer_margin=50, no_image_text=True)
    )


def extract_data(file_path):
    """Extract per-page text from a PDF, appending summaries of its tables.

    For every page the column text is collected via `column_boxes`. Each table
    found on the page that extracts to more than two rows is summarized with
    `generate_text` and the summary is appended to the page text. If anything
    fails while handling the tables, the page falls back to its column text
    only.

    Args:
        file_path: path of the PDF file to read.

    Returns:
        list of str: one entry per page whose text is not blank.
    """
    data = []

    # The context manager closes the document even if a page raises.
    with pymupdf.open(file_path) as doc:
        for page_no, page in enumerate(doc):
            print(f"Page no is {page_no}")
            try:
                tabs = page.find_tables()
                n = len(tabs.tables)
                print(f"No of tables in page is {n}")

                page_data = _page_column_text(page)
                print(page_data)

                tab_data = ""
                for tab in tabs.tables:
                    lines = tab.extract()
                    print(lines)
                    if len(lines) > 2:  # skip tables with at most two rows
                        time.sleep(1)  # pause before each summarization request

                        prompt = get_prompt_table_summarization(page_data, lines)
                        tab_data += generate_text(prompt)
                        print(tab_data)
                        tab_data += "\n"

                page_data += "\n" + tab_data
            except Exception as exc:
                # Best effort: report what went wrong and keep the plain page
                # text. KeyboardInterrupt / SystemExit are deliberately NOT
                # caught, so a long run can still be aborted.
                print(
                    f"Table processing failed on page {page_no}: {exc!r}; "
                    "keeping page text only"
                )
                page_data = _page_column_text(page) + "\n"

            if page_data.strip():  # skip pages that are empty or whitespace only
                data.append(page_data)

    return data

In [20]:
# Other manuals this cell has been run against:
#   F:\psg\bosch\Next_Gen_Verna.pdf
#   F:\psg\bosch\CC - Question Bank Answers.pdf
source_pdf = "F:\\psg\\bosch\\punch-bsvi-09-09-21.pdf"
data = extract_data(source_pdf)

Page no is 0
No of tables in page is 0
OWNER’S MANUAL
Revision: Rev 00/SEPT 2021


Page no is 1
No of tables in page is 0
In our constant endeavour to provide assistance and complete 
service backup, TATA  MOTORS has established an all India cus­
tomer assistance centre.   
In case you have a query regarding any aspect of your vehicle, 
our Customer Assistance Centre will be glad to assist you on our 
Toll Free no. 1800 209 8282

You can also approach nearest TATA  MOTORS dealer.  
A separate Dealer network address booklet is provided with the 
Owner’s manual. 
TATA MOTORS 24X7 Roadside Assistance Program offers tech­
nical help in the event of a breakdown. Call the toll-free Road-
side Assistance.  
For additional information, refer to "24X7 Roadside Assis-
tance" section in the Owner’s manual.  


Page no is 2
No of tables in page is 0
Dear Customer,  
  
Welcome to the TATA  MOTORS family.  
  
We congratulate you on the purchase of your new vehicle and are privileged to have you as

In [21]:
# Number of page entries returned by extract_data (blank pages are skipped).
len(data)

227

In [22]:
# Inspect the first extracted page entry.
data[0]

'OWNER’S MANUAL\nRevision: Rev 00/SEPT 2021\n\n\n'

In [23]:
from fpdf import FPDF

def create_pdf_from_list(string_list, output_filename):
    """Write the given strings to an A4 portrait PDF.

    Every string is printed. Each string that is not blank starts on a new
    page; blank strings produce no page.

    Args:
        string_list: iterable of strings to render.
        output_filename: path of the PDF file to write.
    """
    document = FPDF(orientation='P', unit='mm', format='A4')
    document.set_auto_page_break(auto=True, margin=15)
    document.set_font("Arial", size=12)

    for entry in string_list:
        print(entry)
        if not entry.strip():
            continue  # nothing to render for empty / whitespace-only entries

        document.add_page()
        document.set_xy(10, 10)
        # Characters outside latin-1 are substituted by the 'replace' handler
        # before the text is handed to the PDF writer.
        latin1_text = entry.encode('latin-1', 'replace').decode('latin-1')
        document.multi_cell(0, 10, txt=latin1_text)

    document.output(output_filename, 'F')

In [24]:
# Write the extracted page entries to a new PDF; each non-blank entry starts on its own page.
create_pdf_from_list(data, "F:\\psg\\bosch\\punch-bsvi.pdf")


OWNER’S MANUAL
Revision: Rev 00/SEPT 2021



In our constant endeavour to provide assistance and complete 
service backup, TATA  MOTORS has established an all India cus­
tomer assistance centre.   
In case you have a query regarding any aspect of your vehicle, 
our Customer Assistance Centre will be glad to assist you on our 
Toll Free no. 1800 209 8282

You can also approach nearest TATA  MOTORS dealer.  
A separate Dealer network address booklet is provided with the 
Owner’s manual. 
TATA MOTORS 24X7 Roadside Assistance Program offers tech­
nical help in the event of a breakdown. Call the toll-free Road-
side Assistance.  
For additional information, refer to "24X7 Roadside Assis-
tance" section in the Owner’s manual.  



Dear Customer,  
  
Welcome to the TATA  MOTORS family.  
  
We congratulate you on the purchase of your new vehicle and are privileged to have you as our valued customer. 
  
We urge you to read this Owner's Manual carefully and familiarize yourself with the equip