In [6]:
import glob, os, sys; sys.path.append('../src')

from typing import Callable, Dict, List, Optional

import pandas as pd
from pathlib import Path
import re
import logging
import string 
import pandas as pd
#from keybert import KeyBERT

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

logger = logging.getLogger(__name__)
import haystack
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

from haystack.nodes import PreProcessor
import streamlit as st

In [7]:
# Function for reading txt,pdf and docx files 

def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    
    """
    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file.endswith('.txt'):
        converter = TextConverter()
    if file.endswith('.docx'):
        converter = DocxToTextConverter()

    print(converter)
    documents = []

    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
            )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys))
    
    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
    
    return documents

In [8]:
'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    s = s.lower() 
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation) 
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s) 
  
    # Remove distracting single quotes
    s = re.sub("\'", " ", s) 
    # Remove all remaining numbers and non alphanumeric characters
    s = re.sub(r'\d+', ' ', s) 
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    
    return s.strip()

 

def preprocessing(document):

    """
    takes in haystack document object and splits it into paragraphs and applies simple cleaning.

    Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and 
    list that contains all text joined together.
    """    

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    print("your document has been splitted to", len(docs_processed), "paragraphs")
    
    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return df

In [9]:
import os

# Change the current working directory
os.chdir('C:\\Users\\serva\\Downloads\\NDCs')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))


cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))


# Safe directory in a var
directory_in_str='C:\\Users\\serva\\Downloads\\NDCs'
directory = os.fsencode(directory_in_str)


Current working directory: C:\Users\serva\Downloads\NDCs
Files in 'C:\\Users\\serva\\Downloads\\NDCs': ['Australias NDC June 2022 Update.docx', 'BOTSWANA.docx', 'EU_NDC_Submission_December 2020.docx', 'Updated - First NDC - FINAL - PDF.docx']


In [11]:
#Test
""""data=pd.DataFrame(columns=["content","id","meta","score","embedding"])
df=preprocessing(docs)
data1=data.append(df)

print(data1)"""

data=pd.DataFrame(columns=["content","id","meta","score","embedding"])

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    docs=load_document(filename)
# Using the Preprocessor to create df and text 
    df = preprocessing(docs)
    df["Country"]=filename
    data=data.append(df)


INFO - __main__ -  Converting Australias NDC June 2022 Update.docx
2022-08-16 18:56:49.913 INFO    __main__: Converting Australias NDC June 2022 Update.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023AFC734760>


100%|██████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 81.32docs/s]

your document has been splitted to 14 paragraphs



  data=data.append(df)
INFO - __main__ -  Converting BOTSWANA.docx
2022-08-16 18:56:51.201 INFO    __main__: Converting BOTSWANA.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023AFBD26CA0>


100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 634.35docs/s]
  data=data.append(df)
INFO - __main__ -  Converting EU_NDC_Submission_December 2020.docx
2022-08-16 18:56:51.251 INFO    __main__: Converting EU_NDC_Submission_December 2020.docx


your document has been splitted to 9 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023AFBC1BF40>


100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 164.22docs/s]
  data=data.append(df)
INFO - __main__ -  Converting Updated - First NDC - FINAL - PDF.docx
2022-08-16 18:56:51.319 INFO    __main__: Converting Updated - First NDC - FINAL - PDF.docx


your document has been splitted to 38 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023AFBD1FA90>


100%|█████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 140.95docs/s]

your document has been splitted to 40 paragraphs



  data=data.append(df)


In [12]:
data.head(5)
len(data)

101

In [13]:
# Rename the columns 

data=df.rename(columns = {'content':'Text'})

data.head(10)

Unnamed: 0,Text,content_type,id,meta,score,embedding,Country
0,federative republic of brazil paris agreement nationally determined contribu...,text,986c33e39b7ad3e80d26bb72742c73d6,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 0}",,,Updated - First NDC - FINAL - PDF.docx
1,brazil s updated ndc is broad in scope and includes a consideration of means...,text,76065630447e731a0abdb92a88f14ea2,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 1}",,,Updated - First NDC - FINAL - PDF.docx
2,annex information to facilitate clarity transparency and understanding of br...,text,7165ebf036a1aa030c7d5ce64244c15b,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 2}",,,Updated - First NDC - FINAL - PDF.docx
3,brazil will adopt the latest national inventory report available and submitt...,text,ac64b0d84efa5f32553ac705532ba503,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 3}",,,Updated - First NDC - FINAL - PDF.docx
4,information on sources of data used in quantifying the reference points nati...,text,d93c9fc0d4b66988120244c50593d662,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 4}",,,Updated - First NDC - FINAL - PDF.docx
5,net emissions from to compared with net emissions from to whether it is a si...,text,f4192996eea8b1b3d7955372b5815946,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 5}",,,Updated - First NDC - FINAL - PDF.docx
6,how the party has taken into consideration paragraph c and d of decision cp ...,text,ecbf8a1825265edc67e394a357c8c9f3,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 6}",,,Updated - First NDC - FINAL - PDF.docx
7,according to the working group i contribution to the sixth assessment report...,text,783a824d994caeffb6b4e40378471545,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 7}",,,Updated - First NDC - FINAL - PDF.docx
8,adaptation actions implemented in the context of this ndc will aim at reduci...,text,87727313fe09b33ffe8f651608b87c50,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 8}",,,Updated - First NDC - FINAL - PDF.docx
9,adaptation policies will be based on the best available science regarding cl...,text,3a502ab088e2c33f7ce088babee719a,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 9}",,,Updated - First NDC - FINAL - PDF.docx


In [14]:
data.columns = map(str.lower, data.columns)
data.head(6)

Unnamed: 0,text,content_type,id,meta,score,embedding,country
0,federative republic of brazil paris agreement nationally determined contribu...,text,986c33e39b7ad3e80d26bb72742c73d6,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 0}",,,Updated - First NDC - FINAL - PDF.docx
1,brazil s updated ndc is broad in scope and includes a consideration of means...,text,76065630447e731a0abdb92a88f14ea2,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 1}",,,Updated - First NDC - FINAL - PDF.docx
2,annex information to facilitate clarity transparency and understanding of br...,text,7165ebf036a1aa030c7d5ce64244c15b,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 2}",,,Updated - First NDC - FINAL - PDF.docx
3,brazil will adopt the latest national inventory report available and submitt...,text,ac64b0d84efa5f32553ac705532ba503,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 3}",,,Updated - First NDC - FINAL - PDF.docx
4,information on sources of data used in quantifying the reference points nati...,text,d93c9fc0d4b66988120244c50593d662,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 4}",,,Updated - First NDC - FINAL - PDF.docx
5,net emissions from to compared with net emissions from to whether it is a si...,text,f4192996eea8b1b3d7955372b5815946,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 5}",,,Updated - First NDC - FINAL - PDF.docx


In [15]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [16]:

documents=data["text"]

documents.head(10)



0    federative republic of brazil paris agreement nationally determined contribu...
1    brazil s updated ndc is broad in scope and includes a consideration of means...
2    annex information to facilitate clarity transparency and understanding of br...
3    brazil will adopt the latest national inventory report available and submitt...
4    information on sources of data used in quantifying the reference points nati...
5    net emissions from to compared with net emissions from to whether it is a si...
6    how the party has taken into consideration paragraph c and d of decision cp ...
7    according to the working group i contribution to the sixth assessment report...
8    adaptation actions implemented in the context of this ndc will aim at reduci...
9    adaptation policies will be based on the best available science regarding cl...
Name: text, dtype: object

In [17]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [18]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [19]:

processed_docs = data['text'].map(preprocess)



In [20]:
processed_docs[:10]

0    [feder, republ, brazil, pari, agreement, nation, determin, contribut, brasíl...
1    [brazil, updat, broad, scope, includ, consider, mean, implement, implement, ...
2    [annex, inform, facilit, clariti, transpar, understand, brazil, quantifi, in...
3    [brazil, adopt, latest, nation, inventori, report, avail, submit, unfccc, ti...
4    [inform, sourc, data, quantifi, refer, point, nation, inventori, anthropogen...
5    [emiss, compar, emiss, singleyear, multiyear, target, applic, singleyear, ta...
6    [parti, take, consider, paragraph, decis, gas, previous, indic, indc, keep, ...
7    [accord, work, group, contribut, sixth, assess, report, ipcc, publish, augus...
8    [adapt, action, implement, context, reduc, vulner, term, water, energi, food...
9    [adapt, polici, base, best, avail, scienc, climat, chang, nation, circumst, ...
Name: text, dtype: object

# Bag of Words


In [21]:
dictionary = gensim.corpora.Dictionary(processed_docs)

INFO - gensim.corpora.dictionary -  adding document #0 to Dictionary(0 unique tokens: [])
2022-08-16 18:56:57.469 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
INFO - gensim.corpora.dictionary -  built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)
2022-08-16 18:56:57.472 INFO    gensim.corpora.dictionary: built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)
INFO - gensim.utils -  Dictionary lifecycle event {'msg': "built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)", 'datetime': '2022-08-16T18:56:57.473964', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2022-08-16 18:5

In [22]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 achiev
1 addit
2 adopt
3 agreement
4 brasília
5 brazil
6 chang
7 climat
8 commit
9 communic
10 compar


In [23]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(6, 1),
 (7, 1),
 (18, 1),
 (21, 1),
 (26, 1),
 (40, 2),
 (71, 1),
 (115, 3),
 (153, 1),
 (163, 2),
 (221, 1),
 (249, 1),
 (254, 1),
 (255, 1),
 (256, 1),
 (257, 1),
 (258, 1),
 (259, 1),
 (260, 1),
 (261, 1),
 (262, 1),
 (263, 1),
 (264, 1)]

In [24]:
bow_doc_4310 = bow_corpus[10]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))



Word 6 ("chang") appears 1 time.
Word 7 ("climat") appears 1 time.
Word 18 ("feder") appears 1 time.
Word 21 ("govern") appears 1 time.
Word 26 ("nation") appears 1 time.
Word 40 ("adapt") appears 2 time.
Word 71 ("sector") appears 1 time.
Word 115 ("plan") appears 3 time.
Word 153 ("polici") appears 1 time.
Word 163 ("effort") appears 2 time.
Word 221 ("resili") appears 1 time.
Word 249 ("scientif") appears 1 time.
Word 254 ("complement") appears 1 time.
Word 255 ("criteria") appears 1 time.
Word 256 ("entiti") appears 1 time.
Word 257 ("incorpor") appears 1 time.
Word 258 ("knowledg") appears 1 time.
Word 259 ("mainstream") appears 1 time.
Word 260 ("promot") appears 1 time.
Word 261 ("strateg") appears 1 time.
Word 262 ("subject") appears 1 time.
Word 263 ("subnat") appears 1 time.
Word 264 ("support") appears 1 time.


In [25]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

INFO - gensim.models.ldamodel -  using symmetric alpha at 0.1
2022-08-16 18:56:57.924 INFO    gensim.models.ldamodel: using symmetric alpha at 0.1
INFO - gensim.models.ldamodel -  using symmetric eta at 0.1
2022-08-16 18:56:57.926 INFO    gensim.models.ldamodel: using symmetric eta at 0.1
INFO - gensim.models.ldamodel -  using serial LDA version on this node
2022-08-16 18:56:57.927 INFO    gensim.models.ldamodel: using serial LDA version on this node
INFO - gensim.models.ldamulticore -  running online LDA training, 10 topics, 2 passes over the supplied corpus of 40 documents, updating every 4000 documents, evaluating every ~40 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-16 18:56:57.930 INFO    gensim.models.ldamulticore: running online LDA training, 10 topics, 2 passes over the supplied corpus of 40 documents, updating every 4000 documents, evaluating every ~40 documents, iterating 50x with a convergence threshold of 0.001000
INFO - gensim.models.ldamultic

In [26]:
for index, score in sorted(lda_model[bow_corpus[12]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9859341382980347	 
Topic: 0.032*"refer" + 0.026*"approach" + 0.025*"nation" + 0.023*"includ" + 0.022*"inform" + 0.021*"brazil" + 0.020*"emiss" + 0.020*"applic" + 0.019*"contribut" + 0.019*"methodolog"


In [27]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

import spacy

import pickle
import re 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt 
import pandas as pd

In [28]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
p

  default_term_info = default_term_info.sort_values(


In [29]:
# Save the visualization in a html file
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(p, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [30]:
unseen_document = 'Determining this huge process was quiet nice'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.6999505162239075	 Topic: 0.024*"climat" + 0.020*"nation" + 0.020*"chang" + 0.016*"implement" + 0.014*"contribut"
Score: 0.033346109092235565	 Topic: 0.033*"nation" + 0.025*"climat" + 0.023*"chang" + 0.016*"includ" + 0.013*"impact"
Score: 0.033341437578201294	 Topic: 0.032*"refer" + 0.026*"approach" + 0.025*"nation" + 0.023*"includ" + 0.022*"inform"
Score: 0.0333399698138237	 Topic: 0.040*"brazil" + 0.028*"climat" + 0.021*"commit" + 0.018*"nation" + 0.016*"contribut"
Score: 0.03333881497383118	 Topic: 0.022*"nation" + 0.019*"applic" + 0.018*"refer" + 0.016*"level" + 0.014*"contribut"
Score: 0.033337995409965515	 Topic: 0.019*"climat" + 0.017*"contribut" + 0.017*"articl" + 0.015*"energi" + 0.014*"emiss"
Score: 0.03333771973848343	 Topic: 0.038*"agreement" + 0.036*"articl" + 0.035*"pari" + 0.026*"increas" + 0.025*"level"
Score: 0.03333723545074463	 Topic: 0.022*"brazil" + 0.019*"emiss" + 0.019*"sector" + 0.018*"ipcc" + 0.017*"global"
Score: 0.03333597257733345	 Topic: 0.019*"braz