In [28]:
import PyPDF2
import pickle
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import os
from datacleaningfuncs import *
# Shared WordNet lemmatizer instance, used by the heading and paragraph preprocessing cells.
lemmatizer = WordNetLemmatizer()

# stopwords
# NOTE: this rebinds the name `stopwords` that was imported from nltk.corpus above;
# from here on `stopwords` is a plain list built from the 'Word' column of the pickled table.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted 'stopwords.pickle'.
stopwords = pd.read_pickle('stopwords.pickle')
stopwords = list(stopwords['Word'])

# Convenience callable that runs the shell command `clear` through os.system.
clear = lambda: os.system('clear')

In [29]:
class stack:
    """Minimal LIFO stack backed by a Python list.

    The top of the stack is kept at the END of the internal list so that
    push(), peek() and pop() are all O(1) (inserting/removing at index 0 of
    a list shifts every other element).  str() still renders the items
    top-first.
    """

    def __init__(self):
        """Create an empty stack."""
        self.__index = []

    def __len__(self):
        """Return the number of items currently on the stack."""
        return len(self.__index)

    def push(self,item):
        """Place ``item`` on top of the stack."""
        self.__index.append(item)

    def peek(self):
        """Return the top item without removing it.

        :raises IndexError: if the stack is empty (IndexError is a subclass of
            Exception, so callers catching Exception keep working)
        """
        if len(self) == 0:
            raise IndexError("peek() called on empty stack.")
        return self.__index[-1]

    def pop(self):
        """Remove and return the top item.

        :raises IndexError: if the stack is empty (IndexError is a subclass of
            Exception, so callers catching Exception keep working)
        """
        if len(self) == 0:
            raise IndexError("pop() called on empty stack.")
        return self.__index.pop()

    def __str__(self):
        """Return the list representation of the items, top of the stack first."""
        return str(self.__index[::-1])

### Identifying what each piece of text in the PDF represents (headings vs. paragraphs)

#### Source - https://towardsdatascience.com/extracting-headers-and-paragraphs-from-pdf-using-pymupdf-676e8421c467

In [30]:
from operator import itemgetter
import fitz

In [31]:
def fonts(doc, granularity=False):
    """Collect the font styles used in a PDF and how often each one occurs.

    Every text span of every text block (block type 0) is visited once.  A span is
    identified by its size only or, with ``granularity`` enabled, by the combination
    of size, flags, font and color.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage count (most used first), and a dict
        mapping each identifier to the style of the last span seen with it
    :raises ValueError: if the document contains no text spans at all
    """
    usage = {}
    styles = {}

    # Lazily flatten page -> text block -> line -> span into a single stream of spans.
    text_spans = (
        span
        for page in doc
        for block in page.get_text("dict")["blocks"]
        if block['type'] == 0  # only blocks that contain text
        for line in block["lines"]
        for span in line["spans"]
    )

    for span in text_spans:
        if granularity:
            key = "{0}_{1}_{2}_{3}".format(span['size'], span['flags'], span['font'], span['color'])
            styles[key] = {'size': span['size'], 'flags': span['flags'], 'font': span['font'],
                           'color': span['color']}
        else:
            key = "{0}".format(span['size'])
            styles[key] = {'size': span['size'], 'font': span['font']}

        usage[key] = usage.get(key, 0) + 1  # one more span uses this style

    if not usage:
        raise ValueError("Zero discriminating fonts found!")

    # Most frequently used style first; ties keep first-seen order (stable sort).
    ranked = sorted(usage.items(), key=lambda entry: entry[1], reverse=True)

    return ranked, styles

def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The size of the most used style is treated as the paragraph size and tagged
    '<p>'.  Larger sizes get '<h1>', '<h2>', ... (largest first); smaller sizes get
    '<s1>', '<s2>', ... (largest of the small sizes first).

    Sizes are read from ``styles`` rather than parsed out of the identifier, so the
    composite identifiers produced by ``fonts(..., granularity=True)``
    ("size_flags_font_color") are supported as well.  Several identifiers sharing
    one size contribute that size only once, which keeps the numbering gap-free.

    :param font_counts: (identifier, count) for all fonts occuring in document,
        most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes, keyed by float size
    """
    def _size_of(identifier):
        # Prefer the recorded style; fall back to parsing the identifier itself so that
        # plain size identifiers missing from `styles` keep working as before.
        style = styles.get(identifier)
        return float(style['size'] if style is not None else identifier)

    p_size = _size_of(font_counts[0][0])  # size of the most used font (paragraph)

    # unique font sizes, high to low, so that we can append the right integer to each tag
    font_sizes = sorted({_size_of(identifier) for identifier, _count in font_counts}, reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag

def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Text spans are concatenated per block.  A new output string is started (with the
    tag of the span's size prepended) whenever the font size changes or a new block
    begins.  The end of every line inside a block is marked with a pipe ("|").

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    :raises KeyError: if a span's size has no entry in ``size_tag``
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first non-blank span has been seen
    previous_s = {}  # previous non-blank span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] != 0:  # only type-0 blocks contain text
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block

            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue

                    tagged = size_tag[s['size']] + s['text']
                    if first:
                        first = False
                        block_string = tagged
                    elif s['size'] != previous_s['size']:
                        # font size changed: flush what we have and start a new string
                        header_para.append(block_string)
                        block_string = tagged
                    elif all(c == "|" for c in block_string):
                        # block_string is empty or only contains pipes, i.e. no text has
                        # been collected for this block yet: start it with the size tag.
                        # These cases must be exclusive with the concatenation below,
                        # otherwise the span text would be added twice.
                        block_string = tagged
                    else:  # in the same block, so concatenate strings
                        block_string += " " + s['text']

                    previous_s = s

                # line finished, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para

In [32]:
# threshold = 100
# font_counts = [element for element in font_counts if element[1]>threshold]
# font_counts

In [33]:
# Empty tables that the later cells fill in. Columns are added one at a time from
# empty lists, exactly as before, so the resulting column dtypes are unchanged.

# One row per detected heading.
headings = pd.DataFrame()
for _column in ('headingnumber', 'heading', 'subheadings', 'paragraphs', 'nextheading'):
    headings[_column] = []

# One row per paragraph of body text.
paragraphs = pd.DataFrame()
for _column in ('text', 'lines', 'preprocessed_lines', 'preprocessed_words'):
    paragraphs[_column] = []

topics = pd.DataFrame()
for _column in ('headingID',):
    topics[_column] = []

documents = pd.DataFrame()
for _column in ('topicID',):
    documents[_column] = []

In [34]:
# Parse the book: collect font usage, derive a tag per font size, then extract the
# tagged header/paragraph strings.
book = fitz.open('books/book.pdf')
font_counts, styles = fonts(book, granularity=False)
# Keep the tag mapping under its own name. Assigning it to `font_tags` would replace
# the font_tags() function with a dict, so re-running this cell would fail with
# "'dict' object is not callable".
size_tag = font_tags(font_counts, styles)
parsed_data = headers_para(book, size_tag)
print(parsed_data[0:20])

['<s15>[', '<s27> i ', '<s15>]|', '', '<h2>Table of Contents|', '', '<h9>Preface | xv|', '<h9>Chapter 1: Blockchain 101 | 1|', '', '<h21>The growth of blockchain technology | 1|', '<h21>Progress toward maturity | 2|', '<h21>Increasing interest | 5|', '<h21>Distributed systems | 7|', '<h21>The history of blockchain and Bitcoin | 8|', '<h21>The events that led to blockchain | 9|', '<h21>Electronic cash | 10|', '<h21>Blockchain | 12|', '<h21>Blockchain defined | 12|', '<h21>Blockchain architecture | 14|', '<h21>Generic elements of a blockchain | 16|']


In [35]:
# Walk the tagged strings in `parsed_data` and fill the `headings` and `paragraphs` tables.
# - '<s..' elements are ignored.
# - '<h..' elements become heading rows, linked to each other through the
#   'subheadings' / 'nextheading' lists. A larger heading number is treated as a
#   deeper (nested) heading than a smaller one.
# - '<p..' elements become paragraph rows attached to the most recent heading.
# `insideheading` holds [headingnumber, row] pairs for headings that had a deeper heading opened under them.
insideheading = stack()
lastheading = 0  # heading number of the last heading row added (0 = none added yet)
lastheadingindex = 0  # row of the last heading row added
for elementindex in range(len(parsed_data)):
    element = parsed_data[elementindex]
    if element[0:2]=='<s':
        # not using subscripts right now
        pass
    elif element[0:2]=='<h':
        # the number between the first 'h' and the first '>' of the tag, e.g. 21 for '<h21>'
        headingnumber = int(element[element.find('h')+1:element.find('>')])
        if '|' in element:
#             heading = ''.join([character for character in element if character!='|'])
            # heading text = everything between the tag and the first pipe
            heading = element[element.find('>')+1:element.find('|')]
        else:
#             print('this got triggered')
#             continue
            # A heading-tagged element without any pipe is re-tagged by replacing the 'h'
            # with 'p' (the number stays, e.g. '<h21>x' -> '<p21>x') and stored as a paragraph.
            element = element[0:1]+'p'+element[2:]
            parsed_data[elementindex] = element
            paragraphindex = len(paragraphs.index)
            # NOTE(review): element[3:] only strips a 3-character '<p>' tag; for a re-tagged
            # '<pN>' element the remaining tag characters (e.g. '1>') stay in the text.
            paragraphtext = element[3:]
            paragraphtext = ''.join([atom for atom in paragraphtext if atom != '|'])
            paragraphs.loc[paragraphindex] = np.array([paragraphtext, [], [], []], dtype=object)
            # appends in place to the list stored in the last heading's 'paragraphs' cell
            # NOTE(review): presumably fails if no heading row exists yet - confirm.
            headings.iloc[lastheadingindex]['paragraphs'].append(paragraphindex)
            # NOTE(review): the `continue` above is commented out, so control falls through
            # to the heading-creation code below; `heading` then still holds the value from
            # an earlier element (or is undefined if none was set yet). Confirm this is intended.
        # NOTE(review): `element` is a string, so this iterates over its characters
        # (not words) and keeps those that are not entries of `stopwords`.
        temp = [word for word in element if word not in stopwords]
        if len(temp)!=0:
            newindex = len(headings.index)
            # new heading row: number, text, and empty subheadings / paragraphs / nextheading lists
            headings.loc[newindex] = np.array([headingnumber, heading, [], [], []], dtype=object)
            if lastheading!=0:
                if lastheading==headingnumber:
                    # same level as the previous heading: previous row points to this one as its next heading
                    headings.iloc[newindex-1]['nextheading'].append(newindex)
                elif lastheading<headingnumber:
                    # deeper than the previous heading: remember the previous heading on the
                    # stack and register this row as its sub-heading
                    insideheading.push([lastheading, newindex-1])
                    headings.iloc[newindex-1]['subheadings'].append(newindex)
                else:
                    # shallower than the previous heading: unwind the stack looking for an
                    # entry with the same heading number
                    while True:
                        try:
                            poppedheading = insideheading.pop()
                            if poppedheading[0]>headingnumber:
                                # deeper than the new heading: discard and keep unwinding
                                continue
                            elif poppedheading[0]==headingnumber:
                                # same level found: link it to the new row and stop
                                headings.iloc[poppedheading[1]]['nextheading'].append(newindex)
                                break
                            else:
                                # popped entry has a smaller number than the new heading:
                                # no link is recorded and the stack is reset
                                insideheading = stack()
                                break
                        except:
                            # pop() on an empty stack raises; the bare except also hides any
                            # other error raised in the try body. Either way the stack is reset.
                            insideheading = stack()
                            break
            lastheading = headingnumber
            lastheadingindex = newindex
    elif element[0:2]=='<p':
        paragraphindex = len(paragraphs.index)
        paragraphtext = element[3:]  # drop the 3-character '<p>' tag
        paragraphtext = ''.join([atom for atom in paragraphtext if atom != '|'])  # remove pipe markers
        paragraphs.loc[paragraphindex] = np.array([paragraphtext, [], [], []], dtype=object)
        # attach the paragraph to the most recently added heading
        # NOTE(review): presumably fails if no heading row exists yet - confirm.
        headings.iloc[lastheadingindex]['paragraphs'].append(paragraphindex)

In [36]:
headings.head(60)

Unnamed: 0,headingnumber,heading,subheadings,paragraphs,nextheading
0,2.0,Table of Contents,[1],[],[540]
1,9.0,Preface,[],[],[2]
2,9.0,Chapter 1: Blockchain 101,[3],[],[22]
3,21.0,The growth of blockchain technology,[],[],[4]
4,21.0,Progress toward maturity,[],[],[5]
5,21.0,Increasing interest,[],[],[6]
6,21.0,Distributed systems,[],[],[7]
7,21.0,The history of blockchain and Bitcoin,[],[],[8]
8,21.0,The events that led to blockchain,[],[],[9]
9,21.0,Electronic cash,[],[],[10]


In [37]:
paragraphs.head()

Unnamed: 0,text,lines,preprocessed_lines,preprocessed_words
0,This book has one goal. To teach the theory an...,[],[],[]
1,This approach makes this book a unique blend o...,[],[],[]
2,Soon after the advent of blockchain technology...,[],[],[]
3,"To fill this gap, the first edition of this bo...",[],[],[]
4,This book has four new chapters on some of the...,[],[],[]


In [38]:
# pre-processing headings
# The heading texts are passed through the helpers star-imported from datacleaningfuncs;
# what each helper does is described below from its name and the original comments only
# (their implementations are not visible here - confirm against that module).

# preprocessing the list of keyword-defining components
# removing numbers and special characters
listkdc = headings['heading']
listkdc = remove_number_and_special_characters_from_list(listkdc)
# split into words and make lowercase
listkdc = make_kws_and_lower(listkdc)
# remove stopwords
listkdc = remove_stopwords(listkdc, stopwords)
# delete any list element if it becomes empty after the above operations
#=====
# update database
# NOTE(review): if remove_empty_KWs drops entries, listkdc may no longer line up
# one-to-one with the rows of `headings` when it is assigned below - confirm.
listkdc = remove_empty_KWs(listkdc)
# lemmatize every word of every entry
for i in range(len(listkdc)):
    listkdc[i] = [lemmatizer.lemmatize(word) for word in listkdc[i]]
headings['preprocessed_heading'] = listkdc
#=====
#=====

In [39]:
# Persist the headings table (including the 'preprocessed_heading' column) as a pickle file.
headings.to_pickle('databasefrom1.pickle')

In [40]:
# pre-processing paragraphs
# For every paragraph: split the text into lines on '.', clean each line with the
# datacleaningfuncs helpers, lemmatize the words, and flatten the words of all lines.
#
# The three derived columns are built as plain lists and assigned once at the end.
# Writing through `paragraphs.iloc[i]['col'] = ...` is chained assignment: it may modify
# a temporary copy of the row instead of the DataFrame (it never reaches the DataFrame
# under pandas Copy-on-Write). Building fresh lists also makes this cell safe to re-run;
# appending to the stored 'preprocessed_words' lists would add the words again each time.
lines_column = []
preprocessed_lines_column = []
preprocessed_words_column = []
for paragraphtext in paragraphs['text']:
    rawlines = paragraphtext.split('.')
    listoflines = list(rawlines)
    listoflines = remove_number_and_special_characters_from_list(listoflines)
    listoflines = make_kws_and_lower(listoflines)
    listoflines = remove_stopwords(listoflines, stopwords)
    listoflines = remove_empty_KWs(listoflines)
    for i in range(len(listoflines)):
        listoflines[i] = [lemmatizer.lemmatize(word) for word in listoflines[i]] ### lemmatization
    lines_column.append(rawlines)
    preprocessed_lines_column.append(listoflines)
    # all words of the paragraph, in line order
    preprocessed_words_column.append([word for line in listoflines for word in line])

# dtype=object keeps each cell a Python list; the explicit index aligns rows by label.
paragraphs['lines'] = pd.Series(lines_column, index=paragraphs.index, dtype=object)
paragraphs['preprocessed_lines'] = pd.Series(preprocessed_lines_column, index=paragraphs.index, dtype=object)
paragraphs['preprocessed_words'] = pd.Series(preprocessed_words_column, index=paragraphs.index, dtype=object)

In [41]:
paragraphs.head()

Unnamed: 0,text,lines,preprocessed_lines,preprocessed_words
0,This book has one goal. To teach the theory an...,"[This book has one goal, To teach the theory ...","[[book, one, goal], [teach, theory, distribute...","[book, one, goal, teach, theory, distributed, ..."
1,This approach makes this book a unique blend o...,[This approach makes this book a unique blend ...,"[[approach, make, book, unique, blend, theoret...","[approach, make, book, unique, blend, theoreti..."
2,Soon after the advent of blockchain technology...,[Soon after the advent of blockchain technolog...,"[[soon, advent, blockchain, technology, severa...","[soon, advent, blockchain, technology, several..."
3,"To fill this gap, the first edition of this bo...","[To fill this gap, the first edition of this b...","[[fill, gap, first, edition, book, written], [...","[fill, gap, first, edition, book, written, due..."
4,This book has four new chapters on some of the...,[This book has four new chapters on some of th...,"[[book, four, new, chapter, latest, topic, inc...","[book, four, new, chapter, latest, topic, incl..."


In [42]:
# Persist the paragraphs table (text plus the derived line/word columns) as a pickle file.
paragraphs.to_pickle('databasetextcomponentfrom1.pickle')

In [43]:
len(paragraphs)

7056