In [1]:
#Python standard library imports
import datetime as dt
import pathlib
import re
#Scientific Python ecosystem imports
import pandas as pd
#Text mining packages
import nltk as nltk
from whoosh.lang.porter import stem
#MM Import
import datetime as dt
import pathlib
import logging
import copy

from docx2python import docx2python
from bs4 import BeautifulSoup
import html2text
import urllib.request

from polmap.polmap import preprocess_text, doc2text # replaced the keyword processing block

In [2]:
## 1.a) Read all files in input directory and select allowed filetypes

input_dir = pathlib.Path.cwd() / 'pdf_re' / 'Test' #MM let user provide an input dir
input_folder_name = input_dir.name

allowed_filetypes=['.pdf','.html','.mhtml','.doc','.docx']

files = sorted(input_dir.glob('**/*.*'))
files = [ file for file in files if file.suffix in allowed_filetypes]
print(*files, sep='\n')
#MM assert files==False and log assertion error.

/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/CELEX_52019DC0640_EN_TXT.html
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/cellar_b828d165-1c22-11ea-8c1f-01aa75ed71a1.0002.01_DOC_1.doc
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/cellar_b828d165-1c22-11ea-8c1f-01aa75ed71a1.0002.01_DOC_2.doc
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/cellar_b828d165-1c22-11ea-8c1f-01aa75ed71a1.0002.02_DOC_1.pdf
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/cellar_b828d165-1c22-11ea-8c1f-01aa75ed71a1.0002.02_DOC_2.pdf
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/JPB/JRC_group_A/WP_2021_P_PP_20000_PRJ_30000_08012021.docx
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/JPB/JRC_group_A/WP_2021_P_PP_20000_PRJ_30025_08012021.docx
/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/JPB/JRC_group_B/WP_2021_P_PP_20020_PRJ_3002

In [None]:
## 1.b) Create output folder structure based on input name, date and time of exectution

date = dt.datetime.now().date().isoformat() #def make_directories(project='TEI'): #MM start func definition
hour = dt.datetime.now().time().isoformat(timespec='seconds').replace(':', '')
current_date = '_'+date+'_T'+hour

project_title = input_folder_name+str(current_date) 

out_dir = pathlib.Path.cwd() / 'output' / project_title #Beginning of try block
log_dir = out_dir / 'logs'
results_dir = out_dir / 'results'
docs2txt_dir = out_dir / 'docs2txt'
stemmed_doctext_dir = out_dir / 'docs2txt_stemmed'

dir_dict = { directory: directory.mkdir(mode=0o777, parents=True, exist_ok=True) for directory in [out_dir, log_dir, results_dir, docs2txt_dir, ] } 

In [None]:
######################################
########### 2) MM Read the list of keywords and apply the prepare_keyords text processing function from polmap

keys = pd.read_excel('keys_update_15012020.xlsx', sheet_name= 'Target_keys' ) #MM 'keys_from_RAKE-GBV_DB_SB_v3.xlsx', sheet_name= 'Sheet1' 
goal_keys = pd.read_excel('keys_update_15012020.xlsx', sheet_name= 'Goal_keys' ) #MM Create a dictionary of dataframes for each sheet
dev_count_keys = pd.read_excel('keys_update_15012020.xlsx', sheet_name= 'MOI' ) #MM 'keys_from_RAKE-GBV_DB_SB_v3.xlsx', sheet_name= 'Sheet2' 

#remove all from stop_words to keep in keywords
stop_words = set(nltk.corpus.stopwords.words("english"))
stop_words.remove("all")

keys['Keys']=keys['Keys'].apply(lambda x: preprocess_text(x, stop_words))
goal_keys['Keys']=goal_keys['Keys'].apply(lambda x: preprocess_text(x, stop_words))
dev_count_keys['Keys']=dev_count_keys['Keys'].apply(lambda x: preprocess_text(x, stop_words))

##Country names
countries_in = pd.read_excel('keys_update_15012020.xlsx', sheet_name= 'developing_countries') #MM 'keys_from_RAKE-GBV_DB_SB_v3.xlsx', sheet_name= 'developing_countries'
countries = countries_in['Name'].values.tolist()
country_ls = []
for element in countries:
    element = [re.sub(r"[^a-zA-Z-]+", '', t.lower().strip()) for t in element.split()]
    # countries = [x.strip(' ') for x in countries]
    element = [stem(word) for word in element if not word in stop_words]
    element = ' '.join(element)
    country_ls.append(element)


In [None]:
#doctext_dict = {}
PDFtext=[]
counter = 0
for doc_path in files:
    counter += 1
    try:
        policy_text=[]
        doc_text = doc2text(doc_path)
        while '\n\n\n\n' in doc_text : doc_text = doc_text.replace('\n\n\n\n', '\n\n\n') #docx2python specific fix. would probably fit better elsewhere
        policy_text.append(doc_text)
        doctext_ = doc_path.parts[doc_path.parts.index(input_dir.name)+1:]
        doctext_name =  docs2txt_dir.joinpath(*doctext_)
        doctext_name.parent.mkdir(mode=0o777, parents=True, exist_ok=True)
        doctext_name = doctext_name.parent.joinpath(doctext_name.stem+'.txt')
        with open(doctext_name, 'w') as file_:
           file_.write(doc_text)
        PDFtext.append(['/'.join(doctext_),' ; '.join(policy_text)])
        PDFtext
        #doctext_dict['/'.join(doctext_)]=' ; '.join(policy_text)])
    except Exception as excptn: #MM I'd log errors as described in https://realpython.com/python-logging/, we need to test this.
        logging.exception('{doc_file} raised exception {exception} \n\n'.format(doc_file=doc_item.name, exception=excptn))

PDFtext_cpy = copy.deepcopy(PDFtext)


In [None]:
print(*PDFtext_cpy, sep='\n\n')

In [None]:
print(PDFtext_cpy[5][1]+'\n\n')

In [None]:
######################################
########### 4) Read document files and convert them into text

PDFtext = copy.deepcopy(PDFtext_cpy)

lemmatizer = nltk.stem.WordNetLemmatizer()
for item in PDFtext:
    #detect soft hyphen that separates words
    item[1] = item[1].replace('.', ' .')
    item[1] = [re.sub(r'-\n', '', t) for t in item[1].split()]
    # #get indices of soft hyphens
    indices = [i for i, s in enumerate(item[1]) if '\xad' in s]
    #merge the separated words
    for index in indices:
        item[1][index] = item[1][index].replace('\xad', '')
        item[1][index+1] = item[1][index]+item[1][index+1]
    print(str(item[1])+'\n\n')
    # #remove unnecessary list elements
    # for index in sorted(indices, reverse=True):
    #     del item[1][index]
    # #remove special character, numbers, lowercase #MM from here until @ this code is identical to prepare keywords correct?
    # item[1] = [re.sub(r"[^a-zA-Z-\.]+", '', t.lower().strip()) for t in item[1]]
    # #add whitespaces
    # item[1] = [word.center(len(word)+2) for word in item[1]]
    # #recover R&D for detection
    # item[1] = [w.replace(" rd ", "R&D") for w in item[1]]
    # # remove words > 2
    # item[1] = [word for word in item[1] if len(word) > 2 or word == "ph"]
    # # remove '
    # # item[1] = [s.replace('\'', '') for s in item[1]]
    # #remove whitespaces
    # item[1] = [x.strip(' ') for x in item[1]]
    # #add special char to prevent aids from being stemmed to aid
    # item[1] = [w.replace("aids", "ai&ds&") for w in item[1]]
    # item[1] = [w.replace("productivity", "pro&ductivity&") for w in item[1]]
    # item[1] = [w.replace("remittances", "remit&tance&") for w in item[1]]
    # item[1] = [w.replace("remittance", "remit&tance&") for w in item[1]]
    # # stem words
    # item[1] = [stem(word) for word in item[1] if not word in stop_words]
    # #remove special char for detection in text
    # item[1] = [w.replace("ai&ds&", "aids") for w in item[1]]
    # item[1] = [w.replace("pro&ductivity&", "productivity") for w in item[1]]
    # item[1] = [w.replace("remit&tance&", "remittance") for w in item[1]]
    # #try lemmatizing
    # # item[1] = [lemmatizer.lemmatize(word) for word in item[1] if not word in stop_words]
    # # merge back together to 1 string
    # item[1] = ' '.join(item[1])
    # #add trailing leading whitespace
    # item[1] = " " + item[1] + " "
    # #save out
    # # item_path = stemmed_doctext_dir / pathlib.PurePath(item[0]) #stemmed_doctext_dir / pathlib.PurePath(item[0])
    # # item_path.parent.mkdir(mode=0o777, parents=True, exist_ok=True)
    # # item_path = item_path.parent.joinpath(item_path.stem+'_stemmed.txt')
    # # with open(item_path, 'w') as stemdoctext:
    # #        stemdoctext.write(item[1]+'\n\nTextlenght: {}'.format(len(item[1])))
    # # #Append textlenght
    # # item = item.append(len(item[1])) #MM @

In [6]:
print(files[0])

with open(files[0], 'r') as file_:

    #html_object = urllib.request.urlopen(file_).read()
    bs_text = BeautifulSoup(file_).get_text()

    file_read_ = file_.read()

    html_to_text = html2text.html2text(file_.read()) #BeautifulSoup(files[0])

    print(type(html_to_text))



/mnt/d/OneDrive/SDG/Policy-Mapping/pdf_re/Test/Eurlex/The_EU_Green_Deal_52019DC0640/CELEX_52019DC0640_EN_TXT.html
<class 'str'>


In [4]:
with open('bs4_text.txt', 'w') as bs4_file:
    bs4_file.write(bs_text)

with open('html2text_text.txt', 'w') as h2t_file:
    h2t_file.write(html_to_text)


In [8]:
docx2python(files[0]).text

BadZipFile: File is not a zip file