In [1]:
import fitz
from operator import itemgetter
import numpy as np
doc = fitz.open("C:\\Users\\Sofiane\\Documents\\TheAI\\Generate question\creationonline.pdf")

In [2]:
def getText_bis(page):
    p = page.getText("dict")["blocks"]
    liste = []
    for k in range(len(p)):
        liste.append(p[k]['bbox'][1])
    liste = np.argsort(liste)
    return [p[k] for k in liste]
    
    
    
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = getText_bis(page)
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


In [3]:
font, styles = fonts(doc)
styles[font[0][0]]['size']

9.5

In [4]:
def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag 
    font_sizes = []
    for (font_size, count) in font_counts:
        print(font_size,count)
        font_sizes.append(styles[font_size]['size'])
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

In [5]:
size_tag = font_tags(font, styles)
print(size_tag)

9.5 8245
9.0 2209
10.0 704
8.5 284
17.0 171
12.0 130
14.0 79
5.0 75
8.0 62
18.0 25
7.599899768829346 24
7.199900150299072 4
30.0 1
24.0 1
6.799900054931641 1
9.965530395507812 1
{30.0: '<h1>', 24.0: '<h2>', 18.0: '<h3>', 17.0: '<h4>', 14.0: '<h5>', 12.0: '<h6>', 10.0: '<h7>', 9.965530395507812: '<h8>', 9.5: '<p>', 9.0: '<s1>', 8.5: '<s2>', 8.0: '<s3>', 7.599899768829346: '<s4>', 7.199900150299072: '<s5>', 6.799900054931641: '<s6>', 5.0: '<s7>'}


In [6]:
def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = getText_bis(page)
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para

In [7]:
final = headers_para(doc, size_tag)

In [15]:
import re

questions = []
questions_index = []
for k in range(len(final)):
    if final[k][0:6]=='<h7>To' and "..." not in final[k]:
        quest = "how " + final[k][4:]
        questions_index.append(k)
        for k in quest.split("\n"):
            questions.append(re.sub(r"[^a-zA-Z0-9]+", ' ', k).lower())   
        

In [16]:
questions

['how to log on to creationonline using a smart card ',
 'how to log on to creationonline using a p12 credentials file ',
 'how to reveal the dashboard ',
 'how to conceal the dashboard ',
 'how to reveal the latest list of exception types ',
 'how to display all the current exceptions of a specific type in a list view ',
 'how to change your own p12 access credentials ',
 'how to change your own smart card access credentials ',
 'how to change your own p12 access password ',
 'how to change your own smart card access password in creationonline ',
 'how to set your own preferences ',
 'how to view your own permissions ',
 'how to set your ou preferences ',
 'how to view your current ou permissions ',
 'how to create a user ',
 'how to query users ',
 'how to view the details of an existing user ',
 'how to modify the details of a user ',
 'how to delete a user ',
 'how to confirm a user with pending status ',
 'how to reject a user with pending status ',
 'how to suspend a user ',
 'ho

In [10]:
f=open('questions.txt','w')
for ele in questions:
    f.write(ele+'\n')

f.close()

In [11]:
final[1100:1150]

['',
 '<p>Note:  User groups awaiting confirmation/rejection may also be listed in the alerts panel.|',
 '<p>1. In the navigator panel, select User Management-User Groups to display the User Groups page|',
 '<p>2. From the Action menu, select the Refresh command to list the user groups.|',
 '<p>3. Adjust the depth of the alerts panel as required.|',
 '<p>Note:  You can customise the displayed list (see  “Customising displayed lists”  on page 1-5).|',
 '<p>4. Either, on the User Groups tab, do one of the following:|',
 '<p>-| Highlight the appropriate user group in the list and select Reject from the Action menu.|',
 '<p>-| Right-click on the appropriate user group in the list and select Reject from the popup menu.|',
 '<p>Or, in the alerts panel:|',
 '<p>a) In the alerts panel, right-click on the appropriate alert in the list and select Goto Link from the | popup menu to display the details of the associated user group on a User Group tab.|',
 '<p>b) Select the Reject button or Action 

In [20]:
questions_index[:10]
par = []
for k in range(len(questions)):
    l = questions_index[k]+1
    par[k] = []
    while final[l][0:2] != "<h":
        par[k].append(final[l][4:])

IndexError: list assignment index out of range