In [1]:
import glob, os, sys; sys.path.append('../src')

from typing import Callable, Dict, List, Optional

import pandas as pd
from pathlib import Path
import re
import logging
import string 
import pandas as pd
#from keybert import KeyBERT

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

logger = logging.getLogger(__name__)
import haystack
from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

from haystack.nodes import PreProcessor
import streamlit as st

In [2]:
# Function for reading txt,pdf and docx files 

def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    
    """
    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file.endswith('.txt'):
        converter = TextConverter()
    if file.endswith('.docx'):
        converter = DocxToTextConverter()

    print(converter)
    documents = []

    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
            )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys))
    
    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
    
    return documents

In [3]:
'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    s = s.lower() 
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation) 
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s) 
  
    # Remove distracting single quotes
    s = re.sub("\'", " ", s) 
    # Remove all remaining numbers and non alphanumeric characters
    s = re.sub(r'\d+', ' ', s) 
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    
    return s.strip()

 

def preprocessing(document):

    """
    takes in haystack document object and splits it into paragraphs and applies simple cleaning.

    Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and 
    list that contains all text joined together.
    """    

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    print("your document has been splitted to", len(docs_processed), "paragraphs")
    
    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return df

In [4]:
import os

# Change the current working directory
os.chdir('C:\\Users\\serva\\Downloads\\NDCs')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))


cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))


# Safe directory in a var
directory_in_str='C:\\Users\\serva\\Downloads\\NDCs'
directory = os.fsencode(directory_in_str)


Current working directory: C:\Users\serva\Downloads\NDCs
Files in 'C:\\Users\\serva\\Downloads\\NDCs': ['Australias NDC June 2022 Update.docx', 'BOTSWANA.docx', 'EU_NDC_Submission_December 2020.docx', 'Updated - First NDC - FINAL - PDF.docx']


In [5]:
#Test
""""data=pd.DataFrame(columns=["content","id","meta","score","embedding"])
df=preprocessing(docs)
data1=data.append(df)

print(data1)"""

data=pd.DataFrame(columns=["content","id","meta","score","embedding"])

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    docs=load_document(filename)
# Using the Preprocessor to create df and text 
    df = preprocessing(docs)
    df["Country"]=filename
    data=data.append(df)


INFO - __main__ -  Converting Australias NDC June 2022 Update.docx
2022-08-17 10:04:28.839 INFO    __main__: Converting Australias NDC June 2022 Update.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023933D4C6A0>


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 91.33docs/s]

your document has been splitted to 14 paragraphs



  data=data.append(df)
INFO - __main__ -  Converting BOTSWANA.docx
2022-08-17 10:04:30.163 INFO    __main__: Converting BOTSWANA.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x000002391F2E4550>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 499.98docs/s]
  data=data.append(df)
INFO - __main__ -  Converting EU_NDC_Submission_December 2020.docx
2022-08-17 10:04:30.214 INFO    __main__: Converting EU_NDC_Submission_December 2020.docx


your document has been splitted to 9 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x000002391F2E4550>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 199.47docs/s]
  data=data.append(df)
INFO - __main__ -  Converting Updated - First NDC - FINAL - PDF.docx
2022-08-17 10:04:30.288 INFO    __main__: Converting Updated - First NDC - FINAL - PDF.docx


your document has been splitted to 38 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000023933DA9FD0>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 152.02docs/s]

your document has been splitted to 40 paragraphs



  data=data.append(df)


In [6]:
data.head(5)
len(data)

101

In [7]:
# Rename the columns 

data=df.rename(columns = {'content':'Text'})

data.head(10)

Unnamed: 0,Text,content_type,id,meta,score,embedding,Country
0,federative republic of brazil paris agreement nationally determined contribu...,text,986c33e39b7ad3e80d26bb72742c73d6,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 0}",,,Updated - First NDC - FINAL - PDF.docx
1,brazil s updated ndc is broad in scope and includes a consideration of means...,text,76065630447e731a0abdb92a88f14ea2,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 1}",,,Updated - First NDC - FINAL - PDF.docx
2,annex information to facilitate clarity transparency and understanding of br...,text,7165ebf036a1aa030c7d5ce64244c15b,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 2}",,,Updated - First NDC - FINAL - PDF.docx
3,brazil will adopt the latest national inventory report available and submitt...,text,ac64b0d84efa5f32553ac705532ba503,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 3}",,,Updated - First NDC - FINAL - PDF.docx
4,information on sources of data used in quantifying the reference points nati...,text,d93c9fc0d4b66988120244c50593d662,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 4}",,,Updated - First NDC - FINAL - PDF.docx
5,net emissions from to compared with net emissions from to whether it is a si...,text,f4192996eea8b1b3d7955372b5815946,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 5}",,,Updated - First NDC - FINAL - PDF.docx
6,how the party has taken into consideration paragraph c and d of decision cp ...,text,ecbf8a1825265edc67e394a357c8c9f3,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 6}",,,Updated - First NDC - FINAL - PDF.docx
7,according to the working group i contribution to the sixth assessment report...,text,783a824d994caeffb6b4e40378471545,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 7}",,,Updated - First NDC - FINAL - PDF.docx
8,adaptation actions implemented in the context of this ndc will aim at reduci...,text,87727313fe09b33ffe8f651608b87c50,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 8}",,,Updated - First NDC - FINAL - PDF.docx
9,adaptation policies will be based on the best available science regarding cl...,text,3a502ab088e2c33f7ce088babee719a,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 9}",,,Updated - First NDC - FINAL - PDF.docx


In [8]:
data.columns = map(str.lower, data.columns)
data.head(6)

Unnamed: 0,text,content_type,id,meta,score,embedding,country
0,federative republic of brazil paris agreement nationally determined contribu...,text,986c33e39b7ad3e80d26bb72742c73d6,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 0}",,,Updated - First NDC - FINAL - PDF.docx
1,brazil s updated ndc is broad in scope and includes a consideration of means...,text,76065630447e731a0abdb92a88f14ea2,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 1}",,,Updated - First NDC - FINAL - PDF.docx
2,annex information to facilitate clarity transparency and understanding of br...,text,7165ebf036a1aa030c7d5ce64244c15b,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 2}",,,Updated - First NDC - FINAL - PDF.docx
3,brazil will adopt the latest national inventory report available and submitt...,text,ac64b0d84efa5f32553ac705532ba503,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 3}",,,Updated - First NDC - FINAL - PDF.docx
4,information on sources of data used in quantifying the reference points nati...,text,d93c9fc0d4b66988120244c50593d662,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 4}",,,Updated - First NDC - FINAL - PDF.docx
5,net emissions from to compared with net emissions from to whether it is a si...,text,f4192996eea8b1b3d7955372b5815946,"{'name': 'Updated - First NDC - FINAL - PDF.docx', '_split_id': 5}",,,Updated - First NDC - FINAL - PDF.docx


In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [10]:

documents=data["text"]

documents.head(10)



0    federative republic of brazil paris agreement nationally determined contribu...
1    brazil s updated ndc is broad in scope and includes a consideration of means...
2    annex information to facilitate clarity transparency and understanding of br...
3    brazil will adopt the latest national inventory report available and submitt...
4    information on sources of data used in quantifying the reference points nati...
5    net emissions from to compared with net emissions from to whether it is a si...
6    how the party has taken into consideration paragraph c and d of decision cp ...
7    according to the working group i contribution to the sixth assessment report...
8    adaptation actions implemented in the context of this ndc will aim at reduci...
9    adaptation policies will be based on the best available science regarding cl...
Name: text, dtype: object

In [11]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [13]:

processed_docs = data['text'].map(preprocess)



In [14]:
processed_docs[:10]

0    [feder, republ, brazil, pari, agreement, nation, determin, contribut, brasíl...
1    [brazil, updat, broad, scope, includ, consider, mean, implement, implement, ...
2    [annex, inform, facilit, clariti, transpar, understand, brazil, quantifi, in...
3    [brazil, adopt, latest, nation, inventori, report, avail, submit, unfccc, ti...
4    [inform, sourc, data, quantifi, refer, point, nation, inventori, anthropogen...
5    [emiss, compar, emiss, singleyear, multiyear, target, applic, singleyear, ta...
6    [parti, take, consider, paragraph, decis, gas, previous, indic, indc, keep, ...
7    [accord, work, group, contribut, sixth, assess, report, ipcc, publish, augus...
8    [adapt, action, implement, context, reduc, vulner, term, water, energi, food...
9    [adapt, polici, base, best, avail, scienc, climat, chang, nation, circumst, ...
Name: text, dtype: object

# Bag of Words


In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)

INFO - gensim.corpora.dictionary -  adding document #0 to Dictionary(0 unique tokens: [])
2022-08-17 10:04:35.398 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
INFO - gensim.corpora.dictionary -  built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)
2022-08-17 10:04:35.401 INFO    gensim.corpora.dictionary: built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)
INFO - gensim.utils -  Dictionary lifecycle event {'msg': "built Dictionary(631 unique tokens: ['achiev', 'addit', 'adopt', 'agreement', 'brasília']...) from 40 documents (total 2198 corpus positions)", 'datetime': '2022-08-17T10:04:35.402556', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2022-08-17 10:0

In [34]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(6, 1),
 (7, 1),
 (18, 1),
 (21, 1),
 (26, 1),
 (40, 2),
 (71, 1),
 (115, 3),
 (153, 1),
 (163, 2),
 (221, 1),
 (249, 1),
 (254, 1),
 (255, 1),
 (256, 1),
 (257, 1),
 (258, 1),
 (259, 1),
 (260, 1),
 (261, 1),
 (262, 1),
 (263, 1),
 (264, 1)]

In [26]:
# Calculate coherence score for optimal num of topics

from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import numpy as np 

best_num = float('NaN')
best_score = 0

# compute the coherence scores for each number of topics
for i in range(2,11):
    
    # create lda model with i topics
    lda = LdaModel(corpus=bow_corpus, num_topics=i, id2word=dictionary, random_state=42)
    
    # obtain the coherence score
    coherence_model = CoherenceModel(model=lda, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence_score = np.round(coherence_model.get_coherence(),2)
    if coherence_score > best_score:
        best_num = i
        best_score = coherence_score

print(f'The coherence score is highest ({best_score}) with {best_num} topics.')

INFO - gensim.models.ldamodel -  using symmetric alpha at 0.5
2022-08-17 10:14:07.877 INFO    gensim.models.ldamodel: using symmetric alpha at 0.5
INFO - gensim.models.ldamodel -  using symmetric eta at 0.5
2022-08-17 10:14:07.879 INFO    gensim.models.ldamodel: using symmetric eta at 0.5
INFO - gensim.models.ldamodel -  using serial LDA version on this node
2022-08-17 10:14:07.882 INFO    gensim.models.ldamodel: using serial LDA version on this node
INFO - gensim.models.ldamodel -  running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 40 documents, updating model once every 40 documents, evaluating perplexity every 40 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-17 10:14:07.883 INFO    gensim.models.ldamodel: running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 40 documents, updating model once every 40 documents, evaluating perplexity every 40 documents, iterating 50x with a converge

2022-08-17 10:14:14.247 INFO    gensim.utils: LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=631, num_topics=3, decay=0.5, chunksize=2000) in 0.09s', 'datetime': '2022-08-17T10:14:14.247619', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
INFO - gensim.topic_coherence.probability_estimation -  using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
2022-08-17 10:14:14.249 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
INFO - gensim.topic_coherence.text_analysis -  15 accumulators retrieved from output queue
2022-08-17 10:14:20.756 INFO    gensim.topic_coherence.text_analysis: 15 accumulators retrieved from output queue
INFO - gensim.topic_coherence.text_analysis -  accumulated word 

INFO - gensim.models.ldamodel -  PROGRESS: pass 0, at document #40/40
2022-08-17 10:14:27.348 INFO    gensim.models.ldamodel: PROGRESS: pass 0, at document #40/40
INFO - gensim.models.ldamodel -  topic #0 (0.200): 0.017*"nation" + 0.013*"adapt" + 0.013*"implement" + 0.012*"includ" + 0.011*"brazil" + 0.010*"inform" + 0.010*"account" + 0.010*"develop" + 0.010*"plan" + 0.010*"polici"
2022-08-17 10:14:27.374 INFO    gensim.models.ldamodel: topic #0 (0.200): 0.017*"nation" + 0.013*"adapt" + 0.013*"implement" + 0.012*"includ" + 0.011*"brazil" + 0.010*"inform" + 0.010*"account" + 0.010*"develop" + 0.010*"plan" + 0.010*"polici"
INFO - gensim.models.ldamodel -  topic #1 (0.200): 0.025*"brazil" + 0.022*"contribut" + 0.021*"nation" + 0.019*"agreement" + 0.018*"emiss" + 0.018*"articl" + 0.017*"pari" + 0.013*"parti" + 0.013*"determin" + 0.013*"climat"
2022-08-17 10:14:27.377 INFO    gensim.models.ldamodel: topic #1 (0.200): 0.025*"brazil" + 0.022*"contribut" + 0.021*"nation" + 0.019*"agreement" + 0

2022-08-17 10:14:34.321 INFO    gensim.models.ldamodel: topic #4 (0.167): 0.027*"climat" + 0.022*"chang" + 0.021*"nation" + 0.017*"plan" + 0.015*"adapt" + 0.014*"includ" + 0.011*"brazil" + 0.011*"contribut" + 0.010*"inform" + 0.010*"refer"
INFO - gensim.models.ldamodel -  topic diff=2.827023, rho=1.000000
2022-08-17 10:14:34.323 INFO    gensim.models.ldamodel: topic diff=2.827023, rho=1.000000
INFO - gensim.utils -  LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=631, num_topics=6, decay=0.5, chunksize=2000) in 0.09s', 'datetime': '2022-08-17T10:14:34.324088', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}
2022-08-17 10:14:34.324 INFO    gensim.utils: LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=631, num_topics=6, decay=0.5, chunksize=2000) in 0.09s', 'datetime': '2022-08-17T10:14:34.324088', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022,

INFO - gensim.models.ldamodel -  running online (single-pass) LDA training, 8 topics, 1 passes over the supplied corpus of 40 documents, updating model once every 40 documents, evaluating perplexity every 40 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-17 10:14:47.509 INFO    gensim.models.ldamodel: running online (single-pass) LDA training, 8 topics, 1 passes over the supplied corpus of 40 documents, updating model once every 40 documents, evaluating perplexity every 40 documents, iterating 50x with a convergence threshold of 0.001000
INFO - gensim.models.ldamodel -  -9.301 per-word bound, 630.7 perplexity estimate based on a held-out corpus of 40 documents with 2198 words
2022-08-17 10:14:47.550 INFO    gensim.models.ldamodel: -9.301 per-word bound, 630.7 perplexity estimate based on a held-out corpus of 40 documents with 2198 words
INFO - gensim.models.ldamodel -  PROGRESS: pass 0, at document #40/40
2022-08-17 10:14:47.551 INFO    gensim.models.ldamodel

INFO - gensim.models.ldamodel -  topic #6 (0.111): 0.039*"applic" + 0.023*"target" + 0.020*"agreement" + 0.020*"inform" + 0.020*"includ" + 0.020*"contribut" + 0.019*"emiss" + 0.019*"nation" + 0.019*"determin" + 0.019*"pari"
2022-08-17 10:14:54.269 INFO    gensim.models.ldamodel: topic #6 (0.111): 0.039*"applic" + 0.023*"target" + 0.020*"agreement" + 0.020*"inform" + 0.020*"includ" + 0.020*"contribut" + 0.019*"emiss" + 0.019*"nation" + 0.019*"determin" + 0.019*"pari"
INFO - gensim.models.ldamodel -  topic #2 (0.111): 0.028*"brazilian" + 0.018*"implement" + 0.012*"nation" + 0.011*"energi" + 0.010*"emiss" + 0.010*"protect" + 0.010*"polici" + 0.010*"account" + 0.009*"govern" + 0.009*"brazil"
2022-08-17 10:14:54.270 INFO    gensim.models.ldamodel: topic #2 (0.111): 0.028*"brazilian" + 0.018*"implement" + 0.012*"nation" + 0.011*"energi" + 0.010*"emiss" + 0.010*"protect" + 0.010*"polici" + 0.010*"account" + 0.009*"govern" + 0.009*"brazil"
INFO - gensim.models.ldamodel -  topic #5 (0.111): 0.0

2022-08-17 10:15:00.934 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
INFO - gensim.topic_coherence.text_analysis -  15 accumulators retrieved from output queue
2022-08-17 10:15:07.308 INFO    gensim.topic_coherence.text_analysis: 15 accumulators retrieved from output queue
INFO - gensim.topic_coherence.text_analysis -  accumulated word occurrence stats for 40 virtual documents
2022-08-17 10:15:07.331 INFO    gensim.topic_coherence.text_analysis: accumulated word occurrence stats for 40 virtual documents


The coherence score is highest (0.42) with 10 topics.


In [27]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)

INFO - gensim.models.ldamodel -  using symmetric alpha at 0.1
2022-08-17 10:24:21.296 INFO    gensim.models.ldamodel: using symmetric alpha at 0.1
INFO - gensim.models.ldamodel -  using symmetric eta at 0.1
2022-08-17 10:24:21.301 INFO    gensim.models.ldamodel: using symmetric eta at 0.1
INFO - gensim.models.ldamodel -  using serial LDA version on this node
2022-08-17 10:24:21.303 INFO    gensim.models.ldamodel: using serial LDA version on this node
INFO - gensim.models.ldamulticore -  running online LDA training, 10 topics, 2 passes over the supplied corpus of 40 documents, updating every 4000 documents, evaluating every ~40 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-17 10:24:21.307 INFO    gensim.models.ldamulticore: running online LDA training, 10 topics, 2 passes over the supplied corpus of 40 documents, updating every 4000 documents, evaluating every ~40 documents, iterating 50x with a convergence threshold of 0.001000
INFO - gensim.models.ldamultic

In [33]:
# show the words most strongly associated with each topic
for topic in lda_model.print_topics():
    print(topic)

INFO - gensim.models.ldamodel -  topic #0 (0.100): 0.027*"plan" + 0.026*"adapt" + 0.021*"year" + 0.020*"refer" + 0.019*"climat" + 0.016*"brazil" + 0.015*"nation" + 0.015*"mitig" + 0.014*"chang" + 0.014*"implement"
2022-08-17 10:25:46.085 INFO    gensim.models.ldamodel: topic #0 (0.100): 0.027*"plan" + 0.026*"adapt" + 0.021*"year" + 0.020*"refer" + 0.019*"climat" + 0.016*"brazil" + 0.015*"nation" + 0.015*"mitig" + 0.014*"chang" + 0.014*"implement"
INFO - gensim.models.ldamodel -  topic #1 (0.100): 0.021*"plan" + 0.020*"increas" + 0.019*"individu" + 0.016*"temperatur" + 0.015*"contribut" + 0.015*"relat" + 0.015*"sector" + 0.014*"includ" + 0.013*"carbon" + 0.012*"climat"
2022-08-17 10:25:46.087 INFO    gensim.models.ldamodel: topic #1 (0.100): 0.021*"plan" + 0.020*"increas" + 0.019*"individu" + 0.016*"temperatur" + 0.015*"contribut" + 0.015*"relat" + 0.015*"sector" + 0.014*"includ" + 0.013*"carbon" + 0.012*"climat"
INFO - gensim.models.ldamodel -  topic #2 (0.100): 0.030*"brazil" + 0.020*

(0, '0.027*"plan" + 0.026*"adapt" + 0.021*"year" + 0.020*"refer" + 0.019*"climat" + 0.016*"brazil" + 0.015*"nation" + 0.015*"mitig" + 0.014*"chang" + 0.014*"implement"')
(1, '0.021*"plan" + 0.020*"increas" + 0.019*"individu" + 0.016*"temperatur" + 0.015*"contribut" + 0.015*"relat" + 0.015*"sector" + 0.014*"includ" + 0.013*"carbon" + 0.012*"climat"')
(2, '0.030*"brazil" + 0.020*"climat" + 0.017*"commit" + 0.014*"brazilian" + 0.012*"effort" + 0.011*"glasgow" + 0.011*"pact" + 0.010*"unit" + 0.009*"chang" + 0.009*"communic"')
(3, '0.025*"account" + 0.025*"energi" + 0.024*"sourc" + 0.018*"renew" + 0.013*"solar" + 0.013*"biomass" + 0.012*"electr" + 0.012*"demand" + 0.012*"wind" + 0.012*"transport"')
(4, '0.029*"approach" + 0.026*"brazilian" + 0.021*"brazil" + 0.019*"emiss" + 0.015*"adapt" + 0.014*"sector" + 0.013*"subsequ" + 0.012*"consist" + 0.011*"govern" + 0.011*"product"')
(5, '0.029*"institut" + 0.027*"nation" + 0.026*"chang" + 0.025*"climat" + 0.016*"polici" + 0.015*"plan" + 0.012*"imp

In [28]:
for index, score in sorted(lda_model[bow_corpus[12]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))


Score: 0.9859353303909302	 
Topic: 0.039*"nation" + 0.031*"contribut" + 0.021*"determin" + 0.021*"brazil" + 0.017*"climat" + 0.016*"brazilian" + 0.015*"agreement" + 0.015*"articl" + 0.015*"parti" + 0.013*"includ"


In [29]:
import gensim
import gensim.corpora as corpora
from gensim.corpora import Dictionary
from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel

from pprint import pprint

import spacy

import pickle
import re 
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

import matplotlib.pyplot as plt 
import pandas as pd

In [30]:
#Creating Topic Distance Visualization 
pyLDAvis.enable_notebook()
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
p

  default_term_info = default_term_info.sort_values(


In [29]:
# Save the visualization in a html file
p = pyLDAvis.gensim_models.prepare(lda_model, bow_corpus, dictionary)
pyLDAvis.save_html(p, 'lda.html')

  default_term_info = default_term_info.sort_values(


In [31]:
unseen_document = 'Determining this huge process was quiet nice'
bow_vector = dictionary.doc2bow(preprocess(unseen_document))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.39343810081481934	 Topic: 0.029*"institut" + 0.027*"nation" + 0.026*"chang" + 0.025*"climat" + 0.016*"polici"
Score: 0.33987119793891907	 Topic: 0.039*"nation" + 0.031*"contribut" + 0.021*"determin" + 0.021*"brazil" + 0.017*"climat"
Score: 0.03334071487188339	 Topic: 0.042*"agreement" + 0.034*"pari" + 0.031*"articl" + 0.030*"applic" + 0.029*"paragraph"
Score: 0.03333934769034386	 Topic: 0.023*"emiss" + 0.021*"target" + 0.021*"temperatur" + 0.020*"increas" + 0.019*"level"
Score: 0.0333390049636364	 Topic: 0.021*"plan" + 0.020*"increas" + 0.019*"individu" + 0.016*"temperatur" + 0.015*"contribut"
Score: 0.033334605395793915	 Topic: 0.027*"plan" + 0.026*"adapt" + 0.021*"year" + 0.020*"refer" + 0.019*"climat"
Score: 0.03333446756005287	 Topic: 0.038*"climat" + 0.025*"chang" + 0.023*"global" + 0.017*"temperatur" + 0.017*"brazil"
Score: 0.033334311097860336	 Topic: 0.030*"brazil" + 0.020*"climat" + 0.017*"commit" + 0.014*"brazilian" + 0.012*"effort"
Score: 0.03333412855863571	 Topic: