## This script processes documents, segments them into pages, calculates the occurrences of certain user-defined keywords, highlights these words with different colours and annotates the document with the page numbers where the keyphrases occur. <br> <br> Viktoria, June 2021

In [1]:
#This script involves convertion between file types. In order to enable this, give jupyter notebook complete disk access.

In [2]:
import os
import pandas as pd
import numpy as np
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import docx2txt
import re
import pdfplumber
import textract
import ocrmypdf
import pluggy
from tqdm import tqdm
import time
from wordcloud import WordCloud
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
from statistics import mean
from statistics import stdev
from decimal import *
import requests
from bs4 import BeautifulSoup
import textract
import urllib.request, urllib.error, urllib.parse
import ocrmypdf
import pluggy
import pdfplumber
from tqdm import tqdm
from io import BytesIO
from docx2pdf import convert
import pdfkit
import PyPDF2
from PyPDF2 import PdfFileReader
import fitz
import stamper
from PDFNetPython3 import *
import seaborn as sns
import textwrap
import sys
from datetime import datetime
import codecs
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import datefinder
import datetime
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from textwrap import wrap
from itertools import chain

#import custom functions
from segment_pdf import segment_pdf
from segment_word import segment_word
from segment_html import segment_html

pd.options.display.max_rows = 1000
pd.options.mode.chained_assignment = None

In [3]:
base = '/Users/Viktoria/Desktop/NLP_AnnotatePdf'
raw = os.path.join(base, 'Raw_data')
preproc = os.path.join(base, 'Preprocessed_data')
keyphraselists = os.path.join(base, 'Keyphrases')
results = os.path.join(base, 'Results')

os.chdir(base)

In [4]:
#pip install -r requirements.txt

In [5]:
#pip install git+https://github.com/jbarlow83/OCRmyPDF.git

### Step 1. Access the raw data ('Source_List.csv'). This is a list of titles and URLs to be scraped.

In [6]:
#This is a messy table where text data and URLs are mixed. Access everything and bring them to a clean format.
os.chdir(raw)

df = pd.read_csv('Source_List.csv')

print('N = ', len(df))
df = df.fillna('')
df.head()

N =  485


Unnamed: 0,Source,Document,URL,Text,Date
0,Australia AUSTRAC,Money laundering/terrorism financing risk asse...,,,30-Mar-21
1,Australia AUSTRAC,Fintel Alliance,,Fintel Alliance is an AUSTRAC initiative estab...,11-May-21
2,Australia AUSTRAC,SMR case study examples,,These case study examples demonstrate the impo...,12-May-21
3,Australia AUSTRAC,Coronavirus (COVID-19) – Working with our repo...,,Coronavirus update The global pandemic...,07-Apr-21
4,Australia AUSTRAC,Junket tour operations in Australia risk asses...,,,10-May-21


In [7]:
#Create a tidy title. Special characters in the filename will break the download later.

def prettify_title(title):
    
    #Create a nice and tidy title. Special characters in the title will throw errors.
    name = re.sub(r'\W+', ' ', title)
    name = re.sub(r'.pdf', '', name)
    name = re.sub('^\s*', '', name)
    name = re.sub('\s*$', '', name)
    name = re.sub('[\[\]:?!-/\+=&)(\"\'\*,]', '', name)
    name = name.strip()
    
    return name

In [8]:
df['Document']=df['Document'].apply(prettify_title)

### Step 2. Visit the URLs and get the text. <br> Everything is converted into pdf and broken down into pages, because the density of keyphrases will be calculated on each page.

In [9]:
#Call the segmenting functions for pdf, word and html. Get the text directly if it is already in the dataframe.
#Change download=0 if you don't want the documents to arrive into the pre-processed folder.

def process_document(title, url, text):
    
    try:

        #If the text column is empty, access the URL
        if not text:
            
            response = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'})
            soup = BeautifulSoup(response.text, 'html.parser')

            #word
            if '.docx' in url:
                
                content = segment_word(title, response, 1)

            #pdf 
            elif '.pdf' in url or 'PDF' in soup.text[0:50]:

                content = segment_pdf(title, response, 1)
                

            #probably html
            else:
                content = segment_html(title, url, 1)


        #text is already in the dataframe: just write it out as pdf
        elif text:

            #download as pdf
            document = pdfkit.from_string(text, title + '.pdf')

            #read in the text as pages
            content = {}

            with pdfplumber.open(document) as pdf:

                pages = pdf.pages
                for count,page in enumerate(pages):
                    content[count+1] = page.extract_text()
                    
    except:
        content = 'download error'
        print(title, ' ', url)
    
    
    return content

In [None]:
#Now access the urls and get the text all in one go.
os.chdir(preproc)

start = time.time()

df['TextResult'] = [m for m in map(process_document, df.Document, tqdm(df.URL), df.Text)]

end = time.time()
print(end - start)

In [None]:
df = df[df.TextResult!='download error']
df = df.reset_index(drop=True)
df.head()

In [10]:
os.chdir(base)
df = pd.read_csv('Keyphrases_input.csv')
df = df.drop(df.columns.difference(['Source', 'Document', 'URL', 'TextResult', 'Date']), axis=1)
df = df[df.TextResult!='download error']
df = df.reset_index(drop=True)

#make sure cleaned_text is an orderly list of strings
if type(df.loc[0, 'TextResult']) == str:
     df['TextResult'] = [eval(df.loc[i, 'TextResult']) for i,v in df.iterrows()]

df.head(20)

Unnamed: 0,Source,Document,URL,Date,TextResult
0,BIS Working Papers,Global safe assets,https://www.bis.org/publ/work399.pdf,18-Dec-12,{1: ' BIS Working Papers No 399 Glo...
1,BIS Working Papers,Crises and rescues liquidity transmission thro...,https://www.bis.org/publ/work576.pdf,26-Aug-16,{1: ' BIS Working Papers No 576 Cris...
2,BIS Working Papers,German unification and the demand for German M3,https://www.bis.org/publ/work21.pdf,20-Sep-94,{1: 'BIS Working paper No. 21 GER...
3,BIS Working Papers,The housing meltdown Why did it happen in the ...,https://www.bis.org/publ/work259.pdf,18-Sep-08,{1: ' BIS Working Papers No 259 The...
4,BIS Working Papers,Big data and machine learning in central banking,https://www.bis.org/publ/work930.pdf,04-Mar-21,{1: ' BIS Working Papers No 930 Big...
5,BIS Working Papers,Recent RMB policy and currency co movements,https://www.bis.org/publ/work727.pdf,11-Jun-18,{1: ' BIS Working Papers No 727 Rec...
6,BIS Working Papers,Exchange rate risk and local currency sovereig...,https://www.bis.org/publ/work474.pdf,18-Dec-14,{1: ' BIS Working Papers No 474 Exch...
7,BIS Working Papers,The pricing of portfolio credit risk,https://www.bis.org/publ/work214.pdf,15-Sep-06,{1: ' BIS Working Papers No 214 The...
8,BIS Working Papers,The response of short term bank lending rates ...,https://www.bis.org/publ/work27.pdf,21-May-95,{1: 'BIS Working paper No. 27 THE ...
9,BIS Working Papers,Trade linkages and the globalisation of inflat...,https://www.bis.org/publ/work447.pdf,14-Apr-14,{1: ' BIS Working Papers No 447 Trad...


### Step 3. Import the keyphrases that we are interested in for each topic.

In [11]:
#Import keywords. Some keywords contain other keywords. Order them by length, because the longer phrases will be searched for first.

os.chdir(keyphraselists)

with open('topic1.txt', 'r+') as f:
    wordlist = f.readlines()  
    wordlist = [re.sub('\n', '', w) for w in wordlist]
    wordlist = [w.lower() for w in wordlist]
    t1 = sorted(wordlist, key=len, reverse=True)
    del f, wordlist
    
with open('topic2.txt', 'r+') as f:
    wordlist = f.readlines()  
    wordlist = [re.sub('\n', '', w) for w in wordlist]
    wordlist = [w.lower() for w in wordlist]
    t2 = sorted(wordlist, key=len, reverse=True)
    del f, wordlist
    
with open('topic3.txt', 'r+') as f:
    wordlist = f.readlines()  
    wordlist = [re.sub('\n', '', w) for w in wordlist]
    wordlist = [w.lower() for w in wordlist]
    t3 = sorted(wordlist, key=len, reverse=True)
    del f, wordlist
    
with open('topic4.txt', 'r+') as f:
    wordlist = f.readlines()  
    wordlist = [re.sub('\n', '', w) for w in wordlist]
    wordlist = [w.lower() for w in wordlist]
    t4 = sorted(wordlist, key=len, reverse=True)
    del f, wordlist
    
with open('topic5.txt', 'r+') as f:
    wordlist = f.readlines()  
    wordlist = [re.sub('\n', '', w) for w in wordlist]
    wordlist = [w.lower() for w in wordlist]
    t5 = sorted(wordlist, key=len, reverse=True) #longer words before shorter words
    del f, wordlist

### Step 4. Apply some basic text cleaning. <br> The proportion of keyphrases/page will be calculated, so don't count stopwords, artefacts etc into the total number of words per page.

In [12]:
# All non-English words will be disposed of. Keep important non-English words on this list.
os.chdir(base)

result = docx2txt.process("informative_words.docx")
informative = re.findall(r'\w+', result)
informative = [i.lower() for i in informative]

en_words = set(nltk.corpus.words.words())
stop_words = [s for s in stopwords.words('english') if s not in informative]

In [13]:
def basic_clean(text):
    
    cleaned_text = {}
    
    if isinstance(text, dict):
    
        for k,v in text.items():
            
            if isinstance(v, str):
                v = re.sub(r'[\t]', ' ', v)
                v = re.sub(r'[\n]', ' ', v)
                v = re.sub('[^a-zA-Z0-9 ]', ' ', v)
                v = ' '.join([word for word in v.split() if word in en_words and word not in stop_words])
                cleaned_text[k] = v
                
                
    return cleaned_text

In [14]:
df['TextResult'] = df['TextResult'].apply(basic_clean)

In [15]:
df = df[df.TextResult!={}]
df = df.reset_index(drop=True)
print(len(df))
df.head()

450


Unnamed: 0,Source,Document,URL,Date,TextResult
0,BIS Working Papers,Global safe assets,https://www.bis.org/publ/work399.pdf,18-Dec-12,{1: 'No safe assets Pierre Olivier Olivier Jea...
1,BIS Working Papers,Crises and rescues liquidity transmission thro...,https://www.bis.org/publ/work576.pdf,26-Aug-16,{1: 'No liquidity transmission international C...
2,BIS Working Papers,German unification and the demand for German M3,https://www.bis.org/publ/work21.pdf,20-Sep-94,"{1: 'paper No Stefan September', 3: 'Stefan Se..."
3,BIS Working Papers,The housing meltdown Why did it happen in the ...,https://www.bis.org/publ/work259.pdf,18-Sep-08,{1: 'No The housing happen September classific...
4,BIS Working Papers,Big data and machine learning in central banking,https://www.bis.org/publ/work930.pdf,04-Mar-21,{1: 'No data machine learning central banking ...


### Step 5. Calculate the percentage of key phrases in each topic on each page

In [16]:
#Give more meaningful names to the topics. Here the names have been changed to conceal the purpose of the project.

topics = ['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']
phrases = [t1, t2, t3, t4, t5]

for topic,phraselist in zip(topics, phrases):

    #the proportion of keyphrases on each page is saved here as an np array
    df[topic] = ''

    for r,v in df.iterrows():

        #each text
        doc = df.loc[r, 'TextResult']

        #number of hits on a page
        numsOnPage = [0] #Page 0 will not have anything
        
        #total number of words on a page
        lenPages = [0.001] #avoid division by zero

        #each page
        for k,v in doc.items():

            num = 0
            len_page = len(v.split()) #number of words on a page

            if len_page >= 50: #not an empty page

                #count the phrases in that topic
                for p in phraselist:

                    #length of the phrase
                    len_phrase = len(p.split())

                    #number of times the phrase appears (corrected for the length of the phrase)
                    target = ' '+p+' '  #only count the occurrence of full words
                    hits = v.lower().count(target)
                    v = v.replace(p, '') #pop the phrase we have already found
                    num = num + hits*len_phrase

            else:
                len_page = 0.001 

            numsOnPage.append(num)
            lenPages.append(len_page)
            
        #proportion of keyphrases on each page
        props = np.divide(numsOnPage, lenPages)
        
        
        if max(props) >= 0.05:
            df[topic][r] = props #list with the prop phrases on each page
        else:
            df[topic][r] = ''
            

In [18]:
#How many documents have been found?

topics = ['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']

vals=[]
for r,v in df.iterrows():
    for t in topics:
        if df.loc[r,t] != '':
            vals.append(r)
            
print('Total number of relevant documents: ', len(set(vals)))
            

Total number of relevant documents:  115


  if df.loc[r,t] != '':


### Step 6. Highlight the keyphrases in the text so that the reader can get a sense of the topic density <br> Each topic will be highlighted with a different colour.

In [19]:
#Define a dictinary with the colours

colourdict = {'Topic1': (0.866, 0.627, 0.866), 'Topic2': (1, 0.27, 0), 'Topic3': (0, 0.75, 1), 'Topic4': (0.56, 0.933, 0.56), 'Topic5': (1, 0.843, 0)}

In [None]:
topics = ['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']
phrases = [t1, t2, t3, t4, t5]

for r,v in tqdm(df.iterrows()):
    
    os.chdir(preproc)
    
    if r in vals:

        name = df.loc[r, 'Document'] + '.pdf'
        doc = fitz.open(name)

        for page in doc:
            ### SEARCH

            for topic,phraselist in zip(topics,phrases):

                for p in phraselist: 
                    text = ' '+p+' '
                    text_instances = page.searchFor(text)

                    ### HIGHLIGHT
                    for inst in text_instances:
                        highlight = page.addHighlightAnnot(inst)
                        highlight.setColors({"stroke": colourdict.get(topic)})
                        highlight.update()


        os.chdir(os.path.join(results))

        ### OUTPUT
        doc.save(name, incremental=True, encryption=fitz.PDF_ENCRYPT_KEEP) #same doc
        #doc.save('Rename '+name, garbage=4, deflate=True, clean=True) #new doc
        
    else:
        continue
        
        #Note that incremental save will only work if there have been changes made to the document. 
        #If the highlights are not found, the document will not be moved to the results folder. 
        #That is fine, because the document is likely to be an artefact.



54it [01:56,  1.44s/it]

### Step 7. Add a legend box to the first page of the documents to show which topic is highlighted with which colour.

In [None]:
PDFNet.Initialize()

os.chdir(os.path.join(results, 'Keyphrases'))

texts = os.listdir(os.path.join(results, 'Keyphrases'))

for text in tqdm(texts): 

    doc = PDFDoc(text)
    page = doc.GetPage(1)
    
    # Plum
    txtannot = FreeText.Create(doc.GetSDFDoc(), Rect(20, 760, 120, 770))
    txtannot.SetContentRect(Rect(20, 760, 120, 770))
    txtannot.SetContents( "Topic 1")
    txtannot.SetBorderStyle( BorderStyle( BorderStyle.e_solid, 0, 10, 20 ), True )
    txtannot.SetEndingStyle(LineAnnot.e_ClosedArrow )
    txtannot.SetColor(ColorPt( 1, 1, 1) )
    txtannot.SetLineColor(ColorPt(0.866, 0.627, 0.866), 3)
    txtannot.SetFontSize(8)
    txtannot.SetQuaddingFormat(1)
    page.AnnotPushBack(txtannot)
    txtannot.RefreshAppearance()
    
    # Orange red
    txtannot = FreeText.Create(doc.GetSDFDoc(), Rect(20, 780, 120, 790))
    txtannot.SetContentRect(Rect(20, 780, 120, 790))
    txtannot.SetContents( "Topic 2")
    txtannot.SetBorderStyle( BorderStyle( BorderStyle.e_solid, 0, 10, 20 ), True )
    txtannot.SetEndingStyle(LineAnnot.e_ClosedArrow )
    txtannot.SetColor(ColorPt( 1, 1, 1) )
    txtannot.SetLineColor(ColorPt(1, 0.27, 0), 3)
    txtannot.SetFontSize(8)
    txtannot.SetQuaddingFormat(1)
    page.AnnotPushBack(txtannot)
    txtannot.RefreshAppearance()
    
    # Deep skye blue
    txtannot = FreeText.Create(doc.GetSDFDoc(), Rect(20, 770, 120, 780))
    txtannot.SetContentRect(Rect(20, 770, 120, 780))
    txtannot.SetContents( "Topic 3")
    txtannot.SetBorderStyle( BorderStyle( BorderStyle.e_solid, 0, 10, 20 ), True )
    txtannot.SetEndingStyle(LineAnnot.e_ClosedArrow )
    txtannot.SetColor(ColorPt( 1, 1, 1) )
    txtannot.SetLineColor(ColorPt(0, 0.75, 1), 3)
    txtannot.SetFontSize(8)
    txtannot.SetQuaddingFormat(1)
    page.AnnotPushBack(txtannot)
    txtannot.RefreshAppearance()

    # Pale green
    txtannot = FreeText.Create(doc.GetSDFDoc(), Rect(20, 790, 120, 800))
    txtannot.SetContentRect(Rect(20, 790, 120, 800))
    txtannot.SetContents( "Topic 4")
    txtannot.SetBorderStyle( BorderStyle( BorderStyle.e_solid, 0, 10, 20 ), True )
    txtannot.SetEndingStyle(LineAnnot.e_ClosedArrow )
    txtannot.SetColor(ColorPt( 1, 1, 1) )
    txtannot.SetLineColor(ColorPt(0.56, 0.933, 0.56), 3)
    txtannot.SetFontSize(8)
    txtannot.SetQuaddingFormat(1)
    page.AnnotPushBack(txtannot)
    txtannot.RefreshAppearance()

    # Gold
    txtannot = FreeText.Create(doc.GetSDFDoc(), Rect(20, 750, 120, 760))
    txtannot.SetContentRect(Rect(20, 750, 120, 760))
    txtannot.SetContents( "Topic 5")
    txtannot.SetBorderStyle( BorderStyle( BorderStyle.e_solid, 0, 10, 20 ), True )
    txtannot.SetEndingStyle(LineAnnot.e_ClosedArrow )
    txtannot.SetColor(ColorPt( 1, 1, 1) )
    txtannot.SetLineColor(ColorPt(1, 0.843, 0), 3)
    txtannot.SetFontSize(8)
    txtannot.SetQuaddingFormat(1)
    page.AnnotPushBack(txtannot)
    txtannot.RefreshAppearance()

    doc.Save(text, SDFDoc.e_linearized)
    doc.Close()

### Step 8. Annotate the documents with the page numbers where relevant texts appear. <br> i.e. the proportion of keywords is > 5% on the page for a given topic.

In [None]:
# Create a dataframe called annotations where each document is represented with the page numbers for each topic

annotations = pd.DataFrame()

annotations['Document'] = df['Document']

topics = ['Topic1', 'Topic2', 'Topic3', 'Topic4', 'Topic5']
phrases = [t1, t2, t3, t4, t5]

for topic,phraselist in zip(topics, phrases):

    for r,v in annotations.iterrows():

        datapoint = df.loc[r, topic]

        #if at least 1 page exceeded the threshold on that topic
        if datapoint != '':

            #Select the page as relevant if the proportion of keyphrases is > x% on that page
            pages = []
            for pp,prop in enumerate(datapoint):
                if prop >= 0.05:
                    pages.append(pp)

            #Get the page annotations
            annot = []
            start = pages[0]

            for c,p in enumerate(pages):

                #only 1 page in total
                if len(pages) == 1:
                    annot.append(p)
                    break

                #if this is not the last page
                if p != pages[-1]:

                    if pages[c] + 1 == pages[c+1]:
                        continue
                    elif start != p:
                        annot.append((start,p))
                        start = pages[c+1]
                    elif start == p:
                        annot.append(p)
                        start = pages[c+1]

                #last page
                else:

                    if pages[-1] - 1 == pages[-2]:
                        annot.append((start, p))
                    else:
                        annot.append(p)


            tidy_annot = []
            for a in annot:
                if isinstance(a, tuple):
                    tidy_annot.append(''.join('p' + str(a[0]) + '-' + str(a[1])))
                else:
                    tidy_annot.append(''.join('p'+ str(a)))

            pagestring = ', '.join([t for t in tidy_annot])

            #add the nice and tidy pagestring into a column
            annotations.loc[r, topic] = pagestring
            
        else:
            annotations.loc[r, topic] = ''
                