In [3]:
import glob, os, sys; sys.path.append('../src')

from typing import Callable, Dict, List, Optional

import pandas as pd
from pathlib import Path
import re
import logging
import string 
import pandas as pd
#from keybert import KeyBERT

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

logger = logging.getLogger(__name__)
import haystack
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

from haystack.nodes import PreProcessor
import streamlit as st

In [4]:
# Function for reading txt,pdf and docx files 

def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    
    """
    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file.endswith('.txt'):
        converter = TextConverter()
    if file.endswith('.docx'):
        converter = DocxToTextConverter()

    print(converter)
    documents = []

    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
            )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys))
    
    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
    
    return documents

In [5]:
'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    s = s.lower() 
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation) 
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s) 
  
    # Remove distracting single quotes
    s = re.sub("\'", " ", s) 
    # Remove all remaining numbers and non alphanumeric characters
    s = re.sub(r'\d+', ' ', s) 
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    
    return s.strip()

 

def preprocessing(document):

    """
    takes in haystack document object and splits it into paragraphs and applies simple cleaning.

    Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and 
    list that contains all text joined together.
    """    

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    print("your document has been splitted to", len(docs_processed), "paragraphs")
    
    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return df

In [6]:
import os

# Change the current working directory
os.chdir('C:\\Users\\serva\\Downloads\\NDCs')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))


cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))


# Safe directory in a var
directory_in_str='C:\\Users\\serva\\Downloads\\NDCs'
directory = os.fsencode(directory_in_str)


Current working directory: C:\Users\serva\Downloads\NDCs
Files in 'C:\\Users\\serva\\Downloads\\NDCs': ['Australias NDC June 2022 Update.docx', 'BOTSWANA.docx', 'EU_NDC_Submission_December 2020.docx', 'Updated - First NDC - FINAL - PDF.docx']


In [38]:
#Test
""""data=pd.DataFrame(columns=["content","id","meta","score","embedding"])
df=preprocessing(docs)
data1=data.append(df)

print(data1)"""

data=pd.DataFrame(columns=["content","id","meta","score","embedding"])

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    docs=load_document(filename)
# Using the Preprocessor to create df and text 
    df["Country"]=filename
    df = preprocessing(docs)
    data=data.append(df)


INFO - __main__ -  Converting Australias NDC June 2022 Update.docx
2022-08-16 18:33:26.461 INFO    __main__: Converting Australias NDC June 2022 Update.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x00000260AD180AF0>


100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 37.15docs/s]

your document has been splitted to 14 paragraphs



  data=data.append(df)
INFO - __main__ -  Converting BOTSWANA.docx
2022-08-16 18:33:27.476 INFO    __main__: Converting BOTSWANA.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x00000260A622FA90>


100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 171.47docs/s]
  data=data.append(df)
INFO - __main__ -  Converting EU_NDC_Submission_December 2020.docx
2022-08-16 18:33:27.543 INFO    __main__: Converting EU_NDC_Submission_December 2020.docx


your document has been splitted to 9 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x00000260A622FA90>


100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 59.92docs/s]

your document has been splitted to 38 paragraphs



  data=data.append(df)


UnboundLocalError: local variable 'converter' referenced before assignment

In [39]:
data.head(5)
len(data)

61

In [40]:
# Rename the columns 

data=df.rename(columns = {'content':'Text'})

data.head(10)

Unnamed: 0,Text,content_type,id,meta,score,embedding
0,submission by germany and the european commission on behalf of the european ...,text,ae798d25c5884dcb2d424ee6bd9da412,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 0}",,
1,in troduction background on the development of the eu s enhanced ndc the eur...,text,a35d2411c59b8325d3c8e81956faf3af,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 1}",,
2,on march the council of the european union adopted a longterm low greenhouse...,text,ee37ec5bfddd272ad7524d32966057d8,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 2}",,
3,the plan for european recovery will need massive public and private investme...,text,74a51aaf160b8d8b0f98a689652760f2,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 3}",,
4,an overall climate target of will apply to the total amount of expenditure f...,text,d10f6bad1b2110ed45b38858e7471437,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 4}",,
5,an effective methodology for monitoring climatespending and its performance ...,text,f227bd92e20b2e48835850c373a49039,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 5}",,
6,following the withdrawal agreement between the eu and the uk and the transit...,text,95c535f3c1e772d914e45df70459ba9d,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 6}",,
7,ambitious climate action is not just a way to confront the climate crisis an...,text,7e39c9a01d832c58dc0670a13e63e8c7,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 7}",,
8,the combined effect of the eu policies currently in force under this framewo...,text,a8131087364d6f63baf594f415adae5f,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 8}",,
9,emissions reduction targets under current eu legislation are divided between...,text,2c111f4b58b4c5206f00d7a1639b32b8,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 9}",,


In [41]:
data.columns = map(str.lower, data.columns)
data.head(6)

Unnamed: 0,text,content_type,id,meta,score,embedding
0,submission by germany and the european commission on behalf of the european ...,text,ae798d25c5884dcb2d424ee6bd9da412,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 0}",,
1,in troduction background on the development of the eu s enhanced ndc the eur...,text,a35d2411c59b8325d3c8e81956faf3af,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 1}",,
2,on march the council of the european union adopted a longterm low greenhouse...,text,ee37ec5bfddd272ad7524d32966057d8,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 2}",,
3,the plan for european recovery will need massive public and private investme...,text,74a51aaf160b8d8b0f98a689652760f2,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 3}",,
4,an overall climate target of will apply to the total amount of expenditure f...,text,d10f6bad1b2110ed45b38858e7471437,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 4}",,
5,an effective methodology for monitoring climatespending and its performance ...,text,f227bd92e20b2e48835850c373a49039,"{'name': 'EU_NDC_Submission_December 2020.docx', '_split_id': 5}",,


In [42]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [43]:

documents=data["text"]

documents.head(10)



0    submission by germany and the european commission on behalf of the european ...
1    in troduction background on the development of the eu s enhanced ndc the eur...
2    on march the council of the european union adopted a longterm low greenhouse...
3    the plan for european recovery will need massive public and private investme...
4    an overall climate target of will apply to the total amount of expenditure f...
5    an effective methodology for monitoring climatespending and its performance ...
6    following the withdrawal agreement between the eu and the uk and the transit...
7    ambitious climate action is not just a way to confront the climate crisis an...
8    the combined effect of the eu policies currently in force under this framewo...
9    emissions reduction targets under current eu legislation are divided between...
Name: text, dtype: object

In [44]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [45]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [46]:

processed_docs = data['text'].map(preprocess)



In [47]:
processed_docs[:10]

0    [submiss, germani, european, commiss, behalf, european, union, member, state...
1    [troduct, background, develop, enhanc, european, union, member, state, submi...
2    [march, council, european, union, adopt, longterm, greenhous, emiss, develop...
3    [plan, european, recoveri, need, massiv, public, privat, invest, european, l...
4    [overal, climat, target, appli, total, expenditur, ngeu, reflect, appropri, ...
5    [effect, methodolog, monitor, climatespend, perform, includ, report, relev, ...
6    [follow, withdraw, agreement, transit, period, decemb, unit, kingdom, longer...
7    [ambiti, climat, action, confront, climat, crisi, biodivers, crisi, growth, ...
8    [combin, effect, polici, current, forc, framework, deliv, reduct, pledg, ini...
9    [emiss, reduct, target, current, legisl, divid, sector, cover, emiss, trade,...
Name: text, dtype: object

# Bag of Words


In [48]:
dictionary = gensim.corpora.Dictionary(processed_docs)

INFO - gensim.corpora.dictionary -  adding document #0 to Dictionary(0 unique tokens: [])
2022-08-16 18:33:31.347 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
INFO - gensim.corpora.dictionary -  built Dictionary(586 unique tokens: ['behalf', 'berlin', 'clariti', 'commiss', 'consist']...) from 38 documents (total 1814 corpus positions)
2022-08-16 18:33:31.353 INFO    gensim.corpora.dictionary: built Dictionary(586 unique tokens: ['behalf', 'berlin', 'clariti', 'commiss', 'consist']...) from 38 documents (total 1814 corpus positions)
INFO - gensim.utils -  Dictionary lifecycle event {'msg': "built Dictionary(586 unique tokens: ['behalf', 'berlin', 'clariti', 'commiss', 'consist']...) from 38 documents (total 1814 corpus positions)", 'datetime': '2022-08-16T18:33:31.356511', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2022-08-16 18:3

In [49]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 behalf
1 berlin
2 clariti
3 commiss
4 consist
5 contribut
6 decemb
7 determin
8 enhanc
9 european
10 facilit


In [50]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(15, 1),
 (19, 1),
 (57, 1),
 (63, 1),
 (65, 1),
 (99, 1),
 (121, 1),
 (128, 2),
 (132, 1),
 (133, 1),
 (136, 1),
 (138, 1),
 (152, 1),
 (165, 1),
 (172, 1),
 (178, 1),
 (192, 1),
 (214, 1),
 (218, 1),
 (223, 1),
 (245, 1),
 (248, 1),
 (249, 1),
 (250, 1),
 (251, 1),
 (252, 1),
 (253, 1),
 (254, 1),
 (255, 1),
 (256, 1),
 (257, 2),
 (258, 1),
 (259, 2),
 (260, 1),
 (261, 1),
 (262, 1),
 (263, 1),
 (264, 1)]

In [51]:
bow_doc_4310 = bow_corpus[10]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))



Word 15 ("member") appears 1 time.
Word 19 ("state") appears 1 time.
Word 57 ("agre") appears 1 time.
Word 63 ("emiss") appears 1 time.
Word 65 ("greenhous") appears 1 time.
Word 99 ("level") appears 1 time.
Word 121 ("appli") appears 1 time.
Word 128 ("legisl") appears 2 time.
Word 132 ("shall") appears 1 time.
Word 133 ("target") appears 1 time.
Word 136 ("address") appears 1 time.
Word 138 ("annual") appears 1 time.
Word 152 ("order") appears 1 time.
Word 165 ("longer") appears 1 time.
Word 172 ("bind") appears 1 time.
Word 178 ("deliv") appears 1 time.
Word 192 ("initi") appears 1 time.
Word 214 ("review") appears 1 time.
Word 218 ("allow") appears 1 time.
Word 223 ("cover") appears 1 time.
Word 245 ("set") appears 1 time.
Word 248 ("acceler") appears 1 time.
Word 249 ("amend") appears 1 time.
Word 250 ("aviat") appears 1 time.
Word 251 ("buildup") appears 1 time.
Word 252 ("certain") appears 1 time.
Word 253 ("decreas") appears 1 time.
Word 254 ("function") appears 1 time.
Word 25

In [52]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

INFO - gensim.models.ldamodel -  using symmetric alpha at 0.1
2022-08-16 18:33:32.215 INFO    gensim.models.ldamodel: using symmetric alpha at 0.1
INFO - gensim.models.ldamodel -  using symmetric eta at 0.1
2022-08-16 18:33:32.217 INFO    gensim.models.ldamodel: using symmetric eta at 0.1
INFO - gensim.models.ldamodel -  using serial LDA version on this node
2022-08-16 18:33:32.218 INFO    gensim.models.ldamodel: using serial LDA version on this node
INFO - gensim.models.ldamulticore -  running online LDA training, 10 topics, 2 passes over the supplied corpus of 38 documents, updating every 4000 documents, evaluating every ~38 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-16 18:33:32.222 INFO    gensim.models.ldamulticore: running online LDA training, 10 topics, 2 passes over the supplied corpus of 38 documents, updating every 4000 documents, evaluating every ~38 documents, iterating 50x with a convergence threshold of 0.001000
INFO - gensim.models.ldamultic

In [53]:
for index, score in sorted(lda_model[bow_corpus[12]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9799983501434326	 
Topic: 0.036*"target" + 0.024*"energi" + 0.024*"emiss" + 0.020*"reduct" + 0.015*"greenhous" + 0.011*"member" + 0.011*"state" + 0.010*"sector" + 0.010*"increas" + 0.010*"final"


In [54]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

import spacy

import pickle
import re 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt 
import pandas as pd

In [55]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
p

  default_term_info = default_term_info.sort_values(


In [57]:
# Save the visualization in a html file
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(p, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [56]:
unseen_document = 'Determining this huge process was quiet nice'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.699964165687561	 Topic: 0.033*"land" + 0.023*"categori" + 0.023*"includ" + 0.021*"account" + 0.020*"forest"
Score: 0.03334146738052368	 Topic: 0.025*"european" + 0.019*"union" + 0.019*"member" + 0.019*"state" + 0.018*"updat"
Score: 0.03334113582968712	 Topic: 0.025*"inform" + 0.022*"agreement" + 0.021*"european" + 0.020*"emiss" + 0.018*"pari"
Score: 0.03334016725420952	 Topic: 0.029*"agreement" + 0.028*"pari" + 0.022*"articl" + 0.022*"contribut" + 0.017*"emiss"
Score: 0.03333818539977074	 Topic: 0.044*"forest" + 0.027*"account" + 0.026*"land" + 0.019*"ageclass" + 0.018*"decis"
Score: 0.033336568623781204	 Topic: 0.037*"target" + 0.032*"climat" + 0.020*"european" + 0.017*"contribut" + 0.016*"emiss"
Score: 0.03333577513694763	 Topic: 0.026*"climat" + 0.023*"emiss" + 0.023*"european" + 0.018*"union" + 0.018*"energi"
Score: 0.03333428129553795	 Topic: 0.036*"target" + 0.024*"energi" + 0.024*"emiss" + 0.020*"reduct" + 0.015*"greenhous"
Score: 0.033334165811538696	 Topic: 0.041*"emi