In [188]:
import fitz

In [233]:
# Input PDF whose pages are read in the cells below.
source_pdf_path = 'data/test.pdf'
document = fitz.open(source_pdf_path)

# Separate document object that receives the newly built pages and is
# saved at the end of the notebook.
output = fitz.Document()

In [234]:
def get_content(blocks, title_size=18, body_size=16):
    """Split extracted text blocks into title spans and body spans by font size.

    Only the first span of each line is inspected. Its "size" is truncated
    to an integer and compared against ``title_size`` and ``body_size``;
    lines whose first span matches neither size are ignored.

    Args:
        blocks: Iterable of block dicts. A block may hold a "lines" list;
            each line holds a "spans" list of span dicts with at least
            "size" and "text" keys.
        title_size: Integer font size that marks a span as title text.
        body_size: Integer font size that marks a span as body text.

    Returns:
        A dict ``{'title': str, 'spans': {'title': [...], 'body': [...]}}``
        where ``title`` is the concatenated text of all title spans and the
        span lists keep the original span dicts in encounter order.
    """
    title_spans = []
    body_spans = []

    for block in blocks:
        # A block may have no "lines" entry (presumably image blocks -- see
        # the PyMuPDF dictionary layout); treat that as "no text" instead of
        # failing on iteration over None.
        for line in block.get("lines") or ():
            line_spans = line.get("spans")
            if not line_spans:
                # Nothing to classify on a line without spans.
                continue

            span = line_spans[0]
            span_size = int(span.get("size"))
            if span_size == title_size:
                title_spans.append(span)
            elif span_size == body_size:
                body_spans.append(span)

    title = ''.join(span.get('text') for span in title_spans)
    return {'title': title, 'spans': {'title': title_spans, 'body': body_spans}}

In [235]:
def fill_page(page, spans, image_path, image_start=(50, 0), image_end=(350, 250)):
    """Write title spans, an optional image, and body spans onto ``page``.

    Args:
        page: Page-like object providing ``insertText``; when an image is
            requested it must also provide ``getPixmap`` and ``insertImage``.
        spans: Dict with 'title' and 'body' lists of span dicts. Each span
            needs 'bbox' (x0, y0, x1, y1), 'size' and 'text'.
        image_path: Path passed to ``fitz.Pixmap``, or ``None`` to skip the
            image.
        image_start: (x, y) of the image rectangle's first corner, relative
            to the horizontal offset and the current ``insert_y``.
        image_end: (x, y) of the image rectangle's opposite corner, relative
            to the same origin.

    Returns:
        None. The page is modified in place.

    Body insertion is best effort: the first failing span stops the body
    loop. Only ``Exception`` subclasses are absorbed (and reported through
    ``warnings.warn``); ``KeyboardInterrupt`` / ``SystemExit`` propagate.
    """
    import warnings  # local import: only used to report a skipped body

    insert_y = 0  # y location of the next item

    title_spans = spans['title']
    if title_spans:
        # The first title span is written at its own top edge (bbox y0);
        # every following one at the bottom edge (bbox y1) of its predecessor.
        insert_y = title_spans[0]['bbox'][1]

        for span in title_spans:
            insert_x = span['bbox'][0]
            page.insertText(
                (insert_x, insert_y),
                fontname='tiro',
                fontsize=span['size'],
                text=span['text']
            )
            insert_y = span['bbox'][3]

    # add image if provided
    y_delta = 0  # extra vertical shift applied to every body span

    if image_path is not None:
        img_width = image_end[0] - image_start[0]
        # The width of the page's pixmap is used as the document width.
        doc_width = page.getPixmap().width
        x_offset = (doc_width - img_width) / 2
        # NOTE(review): the rectangle starts at x_offset + image_start[0], so
        # with a non-zero image_start[0] the image sits right of centre by
        # that amount -- confirm this is intended.
        rect = fitz.Rect(
            x_offset + image_start[0],
            insert_y + image_start[1],
            x_offset + image_end[0],
            insert_y + image_end[1])
        pix = fitz.Pixmap(image_path)
        page.insertImage(rect, pixmap=pix, overlay=True)
        # Body text is pushed down by the height of the image rectangle.
        y_delta = rect.height

    # add body
    for span in spans['body']:
        try:
            insert_x = span['bbox'][0]
            insert_y = span['bbox'][3] + y_delta
            page.insertText(
                (insert_x, insert_y),
                fontname='tiro',
                fontsize=span['size'],
                text=span['text']
            )
        except Exception as exc:
            # Keep the best-effort contract (stop at the first bad span) but
            # make the failure visible instead of discarding it silently.
            warnings.warn(
                'fill_page: stopped writing body text after error: %r' % (exc,),
                RuntimeWarning,
            )
            break

In [236]:
# Rebuild every source page: extract its text blocks, create an output page
# with the same pixmap dimensions, and refill it with title, image and body.
for pno, page in enumerate(document.pages()):
    blocks = page.getDisplayList().getTextPage().extractDICT().get("blocks")

    # Fetch the pixmap once; both output dimensions come from it.
    page_pix = page.getPixmap()
    out_page = output.newPage(pno=pno, width=page_pix.width, height=page_pix.height)

    content = get_content(blocks)
    title = content['title']
    # The image file is looked up by page title; pages without a title get no image.
    image_path = 'data/images/%s.jpg' % title if title else None

    fill_page(out_page, content['spans'], image_path)

148.0


In [237]:
# Write the rebuilt document to disk, then close both documents.
# The save is issued before either close call.
output.save('data/output.pdf')
output.close()
document.close()