In [18]:
from typing import Callable, Dict, List, Optional

from pathlib import Path
import re
import logging
import string 
import streamlit as st
logger = logging.getLogger(__name__)

import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from haystack.utils import convert_files_to_docs, fetch_archive_from_http
from haystack.nodes.file_converter import BaseConverter, DocxToTextConverter, PDFToTextConverter, TextConverter
from haystack.schema import Document
import pdfplumber

import pandas as pd
from haystack.nodes import PreProcessor
import tempfile
import sqlite3

# Text import and cleaning


In [19]:
# Function for reading txt,pdf and docx files 

def load_document(
    file: str,
    encoding: Optional[str] = None,
    id_hash_keys: Optional[List[str]] = None,
) -> List[Document]:
    
    """
    takes docx, txt and pdf files as input and extracts text as well as the filename as metadata. Since haystack
    does not take care of all pdf files, pdfplumber is attached to the pipeline in case the pdf extraction fails
    via Haystack.

    Returns a list of type haystack.schema.Document
    """

    if file.endswith('.pdf'):
        converter = PDFToTextConverter(remove_numeric_tables=True)
    if file.endswith('.txt'):
        converter = TextConverter()
    if file.endswith('.docx'):
        converter = DocxToTextConverter()

    print(converter)
    documents = []

    logger.info("Converting {}".format(file))
    # PDFToTextConverter, TextConverter, and DocxToTextConverter return a list containing a single Document
    document = converter.convert(
                file_path=file, meta=None, encoding=encoding, id_hash_keys=id_hash_keys
            )[0]
    text = document.content
    documents.append(Document(content=text, meta={"name": file}, id_hash_keys=id_hash_keys))
    
    '''check if text is empty and apply different pdf processor. This can happen whith certain pdf types.'''
    
    return documents

In [20]:
'''basic cleaning - suitable for transformer models'''
def basic(s):
    """
    :param s: string to be processed
    :return: processed string: see comments in the source code for more info
    """
    # Text Lowercase
    s = s.lower() 
    # Remove punctuation
    translator = str.maketrans(' ', ' ', string.punctuation) 
    s = s.translate(translator)
    # Remove URLs
    s = re.sub(r'^https?:\/\/.*[\r\n]*', ' ', s, flags=re.MULTILINE)
    s = re.sub(r"http\S+", " ", s)
    # Remove new line characters
    s = re.sub('\n', ' ', s) 
  
    # Remove distracting single quotes
    s = re.sub("\'", " ", s) 
    # Remove all remaining numbers and non alphanumeric characters
    s = re.sub(r'\d+', ' ', s) 
    s = re.sub(r'\W+', ' ', s)

    # define custom words to replace:
    #s = re.sub(r'strengthenedstakeholder', 'strengthened stakeholder', s)
    
    return s.strip()

 

def preprocessing(document):

    """
    takes in haystack document object and splits it into paragraphs and applies simple cleaning.

    Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and 
    list that contains all text joined together.
    """    

    preprocessor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        split_by="word",
        split_length=120,
        split_respect_sentence_boundary=True,
        #split_overlap=5
    )
    for i in document:
        docs_processed = preprocessor.process([i])
        for item in docs_processed:
            item.content = basic(item.content)

    print("your document has been splitted to", len(docs_processed), "paragraphs")
    
    # create dataframe of text and list of all text
    df = pd.DataFrame(docs_processed)
    all_text = " ".join(df.content.to_list())
    par_list = df.content.to_list()

    return df

In [21]:
import os

# Change the current working directory
os.chdir('C:\\Users\\serva\\Downloads\\NDCs')

# Print the current working directory
print("Current working directory: {0}".format(os.getcwd()))


cwd = os.getcwd()  # Get the current working directory (cwd)
files = os.listdir(cwd)  # Get all the files in that directory
print("Files in %r: %s" % (cwd, files))


# Safe directory in a var
directory_in_str='C:\\Users\\serva\\Downloads\\NDCs'
directory = os.fsencode(directory_in_str)


Current working directory: C:\Users\serva\Downloads\NDCs
Files in 'C:\\Users\\serva\\Downloads\\NDCs': ['Australias NDC June 2022 Update.docx', 'BOTSWANA.docx', 'EU_NDC_Submission_December 2020.docx', 'Updated - First NDC - FINAL - PDF.docx']


In [22]:
#Test
""""data=pd.DataFrame(columns=["content","text","id","meta","score","embedding"])
df=preprocessing(docs)
data1=data.append(df)

print(data1)"""

#Create an empty df to append the others to

data=pd.DataFrame(columns=["content","id","meta","score","embedding"])

# Iterate through the files in that directory

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    docs=load_document(filename)
    
# Using the Preprocessor to create a df and append it to data 
    df = preprocessing(docs)
    df["Country"]=filename
    data=data.append(df)


2022-08-17 10:14:31.987 INFO    __main__: Converting Australias NDC June 2022 Update.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000016B4CA489A0>


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 166.06docs/s]

your document has been splitted to 14 paragraphs



  data=data.append(df)
2022-08-17 10:14:32.369 INFO    __main__: Converting BOTSWANA.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000016B4CA040A0>


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<?, ?docs/s]
  data=data.append(df)
2022-08-17 10:14:32.485 INFO    __main__: Converting EU_NDC_Submission_December 2020.docx


your document has been splitted to 9 paragraphs
<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000016B49C21880>


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 86.02docs/s]


your document has been splitted to 38 paragraphs


  data=data.append(df)
2022-08-17 10:14:32.639 INFO    __main__: Converting Updated - First NDC - FINAL - PDF.docx


<haystack.nodes.file_converter.docx.DocxToTextConverter object at 0x0000016B49C21880>


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 45.58docs/s]
  data=data.append(df)


your document has been splitted to 40 paragraphs


In [23]:
data.head(5)

Unnamed: 0,content,id,meta,score,embedding,content_type,Country
0,commonwealth of australia creative commons attribution international licence...,d2055926c3e3d72d804f0a85ff79b42,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 0}",,,text,Australias NDC June 2022 Update.docx
1,the full licence terms are available from content contained herein should be...,cedb83948b28dc12f54f342b734deeeb,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 1}",,,text,Australias NDC June 2022 Update.docx
2,no representation expressed or implied is made as to the currency accuracy r...,470a3cb556e9eb11cb75633cac46e8b2,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 2}",,,text,Australias NDC June 2022 Update.docx
3,both targets are economywide emissions reduction commitments covering all se...,677fca625f05d36da91dad3b9fc1de07,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 3}",,,text,Australias NDC June 2022 Update.docx
4,it reflects the australian government s resolve to urgently step up action a...,4e12db243fde45e6e1b8b99fb865bdad,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 4}",,,text,Australias NDC June 2022 Update.docx


In [24]:
# Rename the columns 

data=data.rename(columns = {'content':'Text'})

data.head(10)

Unnamed: 0,Text,id,meta,score,embedding,content_type,Country
0,commonwealth of australia creative commons attribution international licence...,d2055926c3e3d72d804f0a85ff79b42,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 0}",,,text,Australias NDC June 2022 Update.docx
1,the full licence terms are available from content contained herein should be...,cedb83948b28dc12f54f342b734deeeb,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 1}",,,text,Australias NDC June 2022 Update.docx
2,no representation expressed or implied is made as to the currency accuracy r...,470a3cb556e9eb11cb75633cac46e8b2,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 2}",,,text,Australias NDC June 2022 Update.docx
3,both targets are economywide emissions reduction commitments covering all se...,677fca625f05d36da91dad3b9fc1de07,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 3}",,,text,Australias NDC June 2022 Update.docx
4,it reflects the australian government s resolve to urgently step up action a...,4e12db243fde45e6e1b8b99fb865bdad,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 4}",,,text,Australias NDC June 2022 Update.docx
5,the australian government is working to urgently implement these policies to...,c2ee6398537ac65e7591b85b957bdd79,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 5}",,,text,Australias NDC June 2022 Update.docx
6,a powering the regions fund to support the development of new clean energy i...,604457c5620e049fb968306436885fa3,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 6}",,,text,Australias NDC June 2022 Update.docx
7,australia s first national electric vehicle strategy to reduce emissions and...,6d20bd5f94f1c1306614dafac6eb4c4e,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 7}",,,text,Australias NDC June 2022 Update.docx
8,these new measures will build on existing emissions reduction and low emissi...,29888d9e51beeface2fe54477104ba68,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 8}",,,text,Australias NDC June 2022 Update.docx
9,the annual statement and other climate policy will be informed by australia ...,177da64dfdc21898f9fdf2972508994c,"{'name': 'Australias NDC June 2022 Update.docx', '_split_id': 9}",,,text,Australias NDC June 2022 Update.docx


In [25]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [26]:

documents=data["Text"]

documents.head(10)



0    commonwealth of australia creative commons attribution international licence...
1    the full licence terms are available from content contained herein should be...
2    no representation expressed or implied is made as to the currency accuracy r...
3    both targets are economywide emissions reduction commitments covering all se...
4    it reflects the australian government s resolve to urgently step up action a...
5    the australian government is working to urgently implement these policies to...
6    a powering the regions fund to support the development of new clean energy i...
7    australia s first national electric vehicle strategy to reduce emissions and...
8    these new measures will build on existing emissions reduction and low emissi...
9    the annual statement and other climate policy will be informed by australia ...
Name: Text, dtype: object

In [27]:
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 2:
            result.append(lemmatize_stemming(token))
    return result

In [28]:
stemmer = SnowballStemmer('english')
original_words = ['caresses', 'flies', 'dies', 'mules', 'denied','died', 'agreed', 'owned', 
           'humbled', 'sized','meeting', 'stating', 'siezing', 'itemization','sensational', 
           'traditional', 'reference', 'colonizer','plotted']
singles = [stemmer.stem(plural) for plural in original_words]
pd.DataFrame(data = {'original word': original_words, 'stemmed': singles})

Unnamed: 0,original word,stemmed
0,caresses,caress
1,flies,fli
2,dies,die
3,mules,mule
4,denied,deni
5,died,die
6,agreed,agre
7,owned,own
8,humbled,humbl
9,sized,size


In [29]:

processed_docs = data['Text'].map(preprocess)



In [30]:
processed_docs[:10]

0    [commonwealth, australia, creativ, common, attribut, intern, licenc, note, c...
1    [licenc, term, avail, content, contain, attribut, australia, nation, determi...
2    [represent, express, impli, currenc, accuraci, reliabl, complet, inform, con...
3    [target, economywid, emiss, reduct, commit, cover, sector, gas, includ, aust...
4    [reflect, australian, govern, resolv, urgent, step, action, work, alongsid, ...
5    [australian, govern, work, urgent, implement, polici, maximis, emiss, reduct...
6    [power, region, fund, support, develop, new, clean, energi, industri, decarb...
7    [australia, nation, electr, vehicl, strategi, reduc, emiss, acceler, uptak, ...
8    [new, measur, build, exist, emiss, reduct, low, emiss, technolog, acceler, p...
9    [annual, statement, climat, polici, inform, australia, climat, chang, author...
Name: Text, dtype: object

# Bag of Words


In [40]:
dictionary = gensim.corpora.Dictionary(processed_docs)

2022-08-17 10:16:10.759 INFO    gensim.corpora.dictionary: adding document #0 to Dictionary(0 unique tokens: [])
2022-08-17 10:16:10.767 INFO    gensim.corpora.dictionary: built Dictionary(1152 unique tokens: ['adapt', 'agreement', 'allow', 'arm', 'attribut']...) from 101 documents (total 5635 corpus positions)
2022-08-17 10:16:10.768 INFO    gensim.utils: Dictionary lifecycle event {'msg': "built Dictionary(1152 unique tokens: ['adapt', 'agreement', 'allow', 'arm', 'attribut']...) from 101 documents (total 5635 corpus positions)", 'datetime': '2022-08-17T10:16:10.768046', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'created'}


In [41]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
bow_corpus[10]

[(0, 1),
 (1, 1),
 (5, 4),
 (27, 2),
 (40, 1),
 (58, 1),
 (63, 1),
 (69, 2),
 (85, 1),
 (88, 2),
 (95, 1),
 (107, 1),
 (108, 1),
 (117, 3),
 (119, 1),
 (125, 1),
 (127, 1),
 (135, 1),
 (137, 1),
 (151, 2),
 (159, 1),
 (184, 1),
 (205, 1),
 (257, 4),
 (268, 1),
 (283, 1),
 (285, 1),
 (295, 2),
 (313, 1),
 (314, 2),
 (315, 1),
 (316, 1),
 (317, 2),
 (318, 1),
 (319, 1),
 (320, 1),
 (321, 1),
 (322, 1),
 (323, 1),
 (324, 1),
 (325, 1),
 (326, 1),
 (327, 2)]

# TF-IDF


In [42]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)

2022-08-17 10:16:13.507 INFO    gensim.models.tfidfmodel: collecting document frequencies
2022-08-17 10:16:13.509 INFO    gensim.models.tfidfmodel: PROGRESS: processing document #0
2022-08-17 10:16:13.514 INFO    gensim.utils: TfidfModel lifecycle event {'msg': 'calculated IDF weights for 101 documents and 1152 features (4346 matrix non-zeros)', 'datetime': '2022-08-17T10:16:13.514599', 'gensim': '4.1.2', 'python': '3.9.12 (main, Apr  4 2022, 05:22:27) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'initialize'}


In [43]:
corpus_tfidf = tfidf[bow_corpus]



In [44]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.0386516605937095),
 (1, 0.027528524873248885),
 (2, 0.07945985312622485),
 (3, 0.11357588818407703),
 (4, 0.3178394125048994),
 (5, 0.10484707001246571),
 (6, 0.056910380511656374),
 (7, 0.11357588818407703),
 (8, 0.20844465760547684),
 (9, 0.2596187101922551),
 (10, 0.09651787065515095),
 (11, 0.11357588818407703),
 (12, 0.11357588818407703),
 (13, 0.3407276645522311),
 (14, 0.11357588818407703),
 (15, 0.11357588818407703),
 (16, 0.11357588818407703),
 (17, 0.16369452897972267),
 (18, 0.48258935327575475),
 (19, 0.11357588818407703),
 (20, 0.11357588818407703),
 (21, 0.22715177636815406),
 (22, 0.22715177636815406),
 (23, 0.11357588818407703),
 (24, 0.03436090789708789),
 (25, 0.11357588818407703),
 (26, 0.07396839804058246),
 (27, 0.05456484299324088),
 (28, 0.20181489042916792),
 (29, 0.09651787065515095),
 (30, 0.11357588818407703),
 (31, 0.08653957006408501),
 (32, 0.11357588818407703),
 (33, 0.11357588818407703),
 (34, 0.052423535006232855),
 (35, 0.11357588818407703),
 (3

In [46]:
from gensim.models import LdaModel
from gensim.models.coherencemodel import CoherenceModel
import numpy as np 

best_num = float('NaN')
best_score = 0

# compute the coherence scores for each number of topics
for i in range(2,11):
    
    # create lda model with i topics
    lda = LdaModel(corpus=corpus_tfidf, num_topics=i, id2word=dictionary, random_state=42)
    
    # obtain the coherence score
    coherence_model = CoherenceModel(model=lda, texts=processed_docs, dictionary=dictionary, coherence='c_v')
    coherence_score = np.round(coherence_model.get_coherence(),2)
    if coherence_score > best_score:
        best_num = i
        best_score = coherence_score

print(f'The coherence score is highest ({best_score}) with {best_num} topics.')

2022-08-17 10:16:33.726 INFO    gensim.models.ldamodel: using symmetric alpha at 0.5
2022-08-17 10:16:33.726 INFO    gensim.models.ldamodel: using symmetric eta at 0.5
2022-08-17 10:16:33.727 INFO    gensim.models.ldamodel: using serial LDA version on this node
2022-08-17 10:16:33.729 INFO    gensim.models.ldamodel: running online (single-pass) LDA training, 2 topics, 1 passes over the supplied corpus of 101 documents, updating model once every 101 documents, evaluating perplexity every 101 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-17 10:16:33.831 INFO    gensim.models.ldamodel: -8.711 per-word bound, 419.2 perplexity estimate based on a held-out corpus of 101 documents with 587 words
2022-08-17 10:16:33.832 INFO    gensim.models.ldamodel: PROGRESS: pass 0, at document #101/101
2022-08-17 10:16:33.883 INFO    gensim.models.ldamodel: topic #0 (0.500): 0.003*"target" + 0.003*"climat" + 0.003*"refer" + 0.003*"emiss" + 0.003*"articl" + 0.003*"agreement" + 0.

2022-08-17 10:16:52.740 INFO    gensim.models.ldamodel: -14.667 per-word bound, 26005.5 perplexity estimate based on a held-out corpus of 101 documents with 587 words
2022-08-17 10:16:52.748 INFO    gensim.models.ldamodel: PROGRESS: pass 0, at document #101/101
2022-08-17 10:16:52.774 INFO    gensim.models.ldamodel: topic #0 (0.200): 0.005*"refer" + 0.004*"applic" + 0.004*"account" + 0.003*"remov" + 0.003*"methodolog" + 0.003*"emiss" + 0.003*"target" + 0.003*"assumpt" + 0.003*"agreement" + 0.003*"approach"
2022-08-17 10:16:52.782 INFO    gensim.models.ldamodel: topic #1 (0.200): 0.003*"energi" + 0.003*"european" + 0.002*"state" + 0.002*"member" + 0.002*"emiss" + 0.002*"institut" + 0.002*"chang" + 0.002*"climat" + 0.002*"water" + 0.002*"green"
2022-08-17 10:16:52.783 INFO    gensim.models.ldamodel: topic #2 (0.200): 0.005*"articl" + 0.004*"paragraph" + 0.004*"target" + 0.003*"polici" + 0.003*"forest" + 0.003*"address" + 0.003*"pari" + 0.003*"contribut" + 0.003*"goal" + 0.003*"object"
20

2022-08-17 10:17:05.367 INFO    gensim.topic_coherence.probability_estimation: using ParallelWordOccurrenceAccumulator(processes=15, batch_size=64) to estimate probabilities from sliding windows
2022-08-17 10:17:11.367 INFO    gensim.topic_coherence.text_analysis: 15 accumulators retrieved from output queue
2022-08-17 10:17:11.384 INFO    gensim.topic_coherence.text_analysis: accumulated word occurrence stats for 101 virtual documents
2022-08-17 10:17:11.567 INFO    gensim.models.ldamodel: using symmetric alpha at 0.125
2022-08-17 10:17:11.574 INFO    gensim.models.ldamodel: using symmetric eta at 0.125
2022-08-17 10:17:11.575 INFO    gensim.models.ldamodel: using serial LDA version on this node
2022-08-17 10:17:11.576 INFO    gensim.models.ldamodel: running online (single-pass) LDA training, 8 topics, 1 passes over the supplied corpus of 101 documents, updating model once every 101 documents, evaluating perplexity every 101 documents, iterating 50x with a convergence threshold of 0.00

2022-08-17 10:17:25.310 INFO    gensim.models.ldamodel: topic #3 (0.100): 0.005*"agreement" + 0.004*"articl" + 0.004*"joint" + 0.004*"pari" + 0.004*"climat" + 0.004*"australia" + 0.004*"applic" + 0.004*"transpar" + 0.004*"transit" + 0.004*"econom"
2022-08-17 10:17:25.310 INFO    gensim.models.ldamodel: topic #8 (0.100): 0.004*"botswana" + 0.004*"period" + 0.004*"combin" + 0.004*"place" + 0.003*"energi" + 0.003*"frame" + 0.003*"intend" + 0.003*"achiev" + 0.003*"recoveri" + 0.003*"refer"
2022-08-17 10:17:25.311 INFO    gensim.models.ldamodel: topic #4 (0.100): 0.004*"light" + 0.004*"subject" + 0.004*"develop" + 0.004*"inform" + 0.004*"temperatur" + 0.004*"approach" + 0.003*"brazil" + 0.003*"brazilian" + 0.003*"plan" + 0.003*"climat"
2022-08-17 10:17:25.312 INFO    gensim.models.ldamodel: topic diff=6.390325, rho=1.000000
2022-08-17 10:17:25.313 INFO    gensim.utils: LdaModel lifecycle event {'msg': 'trained LdaModel(num_terms=1152, num_topics=10, decay=0.5, chunksize=2000) in 0.10s', 'da

The coherence score is highest (0.38) with 4 topics.


In [48]:
#Generating the lda_model with TF-IDF

lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=4, id2word=dictionary, passes=2, workers=4)

2022-08-17 10:28:14.945 INFO    gensim.models.ldamodel: using symmetric alpha at 0.25
2022-08-17 10:28:14.946 INFO    gensim.models.ldamodel: using symmetric eta at 0.25
2022-08-17 10:28:14.946 INFO    gensim.models.ldamodel: using serial LDA version on this node
2022-08-17 10:28:14.948 INFO    gensim.models.ldamulticore: running online LDA training, 4 topics, 2 passes over the supplied corpus of 101 documents, updating every 8000 documents, evaluating every ~101 documents, iterating 50x with a convergence threshold of 0.001000
2022-08-17 10:28:14.952 INFO    gensim.models.ldamulticore: training LDA model using 4 processes
2022-08-17 10:28:15.439 INFO    gensim.models.ldamulticore: PROGRESS: pass 0, dispatched chunk #0 = documents up to #101/101, outstanding queue size 1
2022-08-17 10:28:17.529 INFO    gensim.models.ldamodel: topic #0 (0.250): 0.004*"australia" + 0.004*"energi" + 0.004*"polici" + 0.003*"account" + 0.003*"climat" + 0.003*"methodolog" + 0.003*"target" + 0.003*"articl" + 

In [49]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))


2022-08-17 10:28:17.708 INFO    gensim.models.ldamodel: topic #0 (0.250): 0.004*"australia" + 0.004*"energi" + 0.004*"polici" + 0.003*"account" + 0.003*"climat" + 0.003*"methodolog" + 0.003*"target" + 0.003*"remov" + 0.003*"articl" + 0.003*"contribut"
2022-08-17 10:28:17.710 INFO    gensim.models.ldamodel: topic #1 (0.250): 0.005*"paragraph" + 0.004*"articl" + 0.004*"agreement" + 0.004*"pari" + 0.003*"brazil" + 0.003*"refer" + 0.003*"parti" + 0.003*"target" + 0.003*"increas" + 0.003*"emiss"
2022-08-17 10:28:17.711 INFO    gensim.models.ldamodel: topic #2 (0.250): 0.003*"ndc" + 0.003*"approach" + 0.003*"adapt" + 0.003*"european" + 0.003*"applic" + 0.003*"account" + 0.002*"reduct" + 0.002*"member" + 0.002*"nation" + 0.002*"sector"
2022-08-17 10:28:17.711 INFO    gensim.models.ldamodel: topic #3 (0.250): 0.003*"transpar" + 0.003*"target" + 0.003*"clariti" + 0.003*"climat" + 0.003*"institut" + 0.003*"expenditur" + 0.003*"understand" + 0.002*"emiss" + 0.002*"follow" + 0.002*"submiss"


Topic: 0 Word: 0.004*"australia" + 0.004*"energi" + 0.004*"polici" + 0.003*"account" + 0.003*"climat" + 0.003*"methodolog" + 0.003*"target" + 0.003*"remov" + 0.003*"articl" + 0.003*"contribut"
Topic: 1 Word: 0.005*"paragraph" + 0.004*"articl" + 0.004*"agreement" + 0.004*"pari" + 0.003*"brazil" + 0.003*"refer" + 0.003*"parti" + 0.003*"target" + 0.003*"increas" + 0.003*"emiss"
Topic: 2 Word: 0.003*"ndc" + 0.003*"approach" + 0.003*"adapt" + 0.003*"european" + 0.003*"applic" + 0.003*"account" + 0.002*"reduct" + 0.002*"member" + 0.002*"nation" + 0.002*"sector"
Topic: 3 Word: 0.003*"transpar" + 0.003*"target" + 0.003*"clariti" + 0.003*"climat" + 0.003*"institut" + 0.003*"expenditur" + 0.003*"understand" + 0.002*"emiss" + 0.002*"follow" + 0.002*"submiss"


In [51]:
# show the words most strongly associated with each topic
for topic in lda_model_tfidf.print_topics():
    print(topic)

2022-08-17 10:29:02.450 INFO    gensim.models.ldamodel: topic #0 (0.250): 0.004*"australia" + 0.004*"energi" + 0.004*"polici" + 0.003*"account" + 0.003*"climat" + 0.003*"methodolog" + 0.003*"target" + 0.003*"remov" + 0.003*"articl" + 0.003*"contribut"
2022-08-17 10:29:02.452 INFO    gensim.models.ldamodel: topic #1 (0.250): 0.005*"paragraph" + 0.004*"articl" + 0.004*"agreement" + 0.004*"pari" + 0.003*"brazil" + 0.003*"refer" + 0.003*"parti" + 0.003*"target" + 0.003*"increas" + 0.003*"emiss"
2022-08-17 10:29:02.453 INFO    gensim.models.ldamodel: topic #2 (0.250): 0.003*"ndc" + 0.003*"approach" + 0.003*"adapt" + 0.003*"european" + 0.003*"applic" + 0.003*"account" + 0.002*"reduct" + 0.002*"member" + 0.002*"nation" + 0.002*"sector"
2022-08-17 10:29:02.454 INFO    gensim.models.ldamodel: topic #3 (0.250): 0.003*"transpar" + 0.003*"target" + 0.003*"clariti" + 0.003*"climat" + 0.003*"institut" + 0.003*"expenditur" + 0.003*"understand" + 0.002*"emiss" + 0.002*"follow" + 0.002*"submiss"


(0, '0.004*"australia" + 0.004*"energi" + 0.004*"polici" + 0.003*"account" + 0.003*"climat" + 0.003*"methodolog" + 0.003*"target" + 0.003*"remov" + 0.003*"articl" + 0.003*"contribut"')
(1, '0.005*"paragraph" + 0.004*"articl" + 0.004*"agreement" + 0.004*"pari" + 0.003*"brazil" + 0.003*"refer" + 0.003*"parti" + 0.003*"target" + 0.003*"increas" + 0.003*"emiss"')
(2, '0.003*"ndc" + 0.003*"approach" + 0.003*"adapt" + 0.003*"european" + 0.003*"applic" + 0.003*"account" + 0.002*"reduct" + 0.002*"member" + 0.002*"nation" + 0.002*"sector"')
(3, '0.003*"transpar" + 0.003*"target" + 0.003*"clariti" + 0.003*"climat" + 0.003*"institut" + 0.003*"expenditur" + 0.003*"understand" + 0.002*"emiss" + 0.002*"follow" + 0.002*"submiss"')


In [50]:
for index, score in sorted(lda_model_tfidf[bow_corpus[5]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))


Score: 0.9888909459114075	 
Topic: 0.005*"paragraph" + 0.004*"articl" + 0.004*"agreement" + 0.004*"pari" + 0.003*"brazil" + 0.003*"refer" + 0.003*"parti" + 0.003*"target" + 0.003*"increas" + 0.003*"emiss"
