# NLP and Statistical analysis of the Estonian National Reform Program Action Plan
## TF-IDF, LDA Model, Summarization, Cosine Similarity, Word Embeddings

I use the tools described above to supplement my human/manual analysis of the Estonia NRP-AP. The paper can be found at http://jeanmonnetchair.eubga.uom.gr/download/jean-monnet-essay-3-2020-podiotis/ . For word vectors used see https://nlp.stanford.edu/projects/glove/ .

In [1]:
from tika import parser
import pandas as pd
import re
import numpy as np
import nltk
import heapq
from nltk import PorterStemmer, word_tokenize

In [2]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update(["estonia"])
paragraphs_list = []
processed_list =[]

In [3]:
pdf = parser.from_file('nrp_estonia_2020_action_plan_2019-2020_30.05.2019.pdf')
pdf_text = pdf['content']
paragraphs_list = pdf_text.split("\n\n")

In [4]:
for paragraph in paragraphs_list:
    paragraph_tokenized = []
    paragraph_tokenized_lemmatized = []

    # STRING PROCESSING
    paragraph = ' '.join(paragraph.split())
    paragraph = paragraph.lower()
    paragraph = re.sub("x ", "", paragraph)
    paragraph = ' '.join(paragraph.split())
    paragraph = re.sub("ministry", "", paragraph)
    paragraph = ' '.join(paragraph.split())
    paragraph = re.sub("of education and research", "", paragraph)
    paragraph = re.sub("of culture", "", paragraph)
    paragraph = re.sub("of social affairs", "", paragraph)
    paragraph = re.sub("of economic affairs and communications", "", paragraph)
    paragraph = re.sub("of finance", "", paragraph)
    paragraph = re.sub("government office", "", paragraph)
    paragraph = re.sub("of justice", "", paragraph)
    paragraph = re.sub("of rural affairs", "", paragraph)
    paragraph = re.sub("of the interior", "", paragraph)
    paragraph = re.sub("of the environment", "", paragraph)
    paragraph = re.sub("of defence", "", paragraph)
    paragraph = re.sub("[\d+()‘’“”\%:\*,&.\-–;:!?__]", "", paragraph)

    # TOKENIZATON
    paragraph_tokenized = word_tokenize(paragraph)

    # POS TAGGING
    tags = nltk.pos_tag(paragraph_tokenized)
    paragraph_tagged = []
    for i in tags:
        if "NN" in i[1]:
            paragraph_tagged.append(i[0])
    paragraph_tokenized = paragraph_tagged


    # LEMMATIZATION/STEMMING & STOPWORDS
    for word in paragraph_tokenized:
        if word not in stop_words and len(word) > 1:
            #lemmatizer = WordNetLemmatizer()
            #word = lemmatizer.lemmatize(word)
            stemmer = PorterStemmer()
            word = stemmer.stem(word)
            paragraph_tokenized_lemmatized.append(word)



    if len(paragraph_tokenized_lemmatized) > 0:
        processed_list.append(paragraph_tokenized_lemmatized)

del processed_list[0:3]

In [5]:
wordfreq = {}
for paragraph in processed_list:
    for word in paragraph:
        if word not in wordfreq.keys():
            wordfreq[word] = 1
        else:
            wordfreq[word] += 1
most_freq = heapq.nlargest(100, wordfreq, key=wordfreq.get)

In [6]:
word_idf_values = {}
for token in most_freq:
    doc_containing_word = 0
    for document in processed_list:
        if token in document:
            doc_containing_word += 1
    word_idf_values[token] = np.log(len(processed_list)/(1 + doc_containing_word))

In [7]:
word_tf_values = {}
for token in most_freq:
    sent_tf_vector = []
    for document in processed_list:
        doc_freq = 0
        for word in document:
            if token == word:
                  doc_freq += 1
        word_tf = doc_freq/len(document)
        sent_tf_vector.append(word_tf)
    word_tf_values[token] = sent_tf_vector

In [8]:
word_tf_values_total = word_tf_values
for key in word_tf_values:
    average = 0
    average = sum(word_tf_values[key]) / len(word_tf_values[key])
    word_tf_values_total[key] = average

In [9]:
word_tfidf_values_total = word_tf_values
for key in word_tf_values:
    product = 0
    product = word_tf_values[key] * word_idf_values[key]
    word_tfidf_values_total[key] = product

In [10]:
index = word_tfidf_values_total.keys()
dftfid = pd.DataFrame(index=index, columns=["TF-IDF / Weight"])
for term in index:
    dftfid.loc[term, "TF-IDF / Weight"] = word_tfidf_values_total[term]
dftfid.to_excel("TF-IDF.xlsx")
print(dftfid)

          TF-IDF / Weight
develop         0.0419833
servic           0.028245
activ           0.0489176
measur          0.0373603
project         0.0219841
...                   ...
unemploy       0.00495458
insur          0.00608646
strategi        0.0123694
construct      0.00504765
process        0.00543679

[100 rows x 1 columns]


In [11]:
'''
dictionary = gensim.corpora.Dictionary(processed_list)
df = pd.DataFrame(list(dictionary.items()), columns=["0", "word"])
df = df.drop(["0"], axis=1)
# BAG OF WORDS
corpus = [dictionary.doc2bow(paragraph) for paragraph in processed_list]
'''

'\ndictionary = gensim.corpora.Dictionary(processed_list)\ndf = pd.DataFrame(list(dictionary.items()), columns=["0", "word"])\ndf = df.drop(["0"], axis=1)\n# BAG OF WORDS\ncorpus = [dictionary.doc2bow(paragraph) for paragraph in processed_list]\n'

In [12]:
from gensim.models import Phrases
bigram_model = Phrases(processed_list)
trigram_model = Phrases(bigram_model[processed_list], min_count=1)
tokens = list(trigram_model[bigram_model[processed_list]])
print(bigram_model)

Phrases<7454 vocab, min_count=5, threshold=10.0, max_vocab_size=40000000>


In [13]:
'''
model = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=100)
model.save('ldamodel.gensim')
topics = model.print_topics(num_words=7)
for topic in topics:
    print(topic)

# Compute Perplexity
print('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.
'''

"\nmodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = 10, id2word=dictionary, passes=100)\nmodel.save('ldamodel.gensim')\ntopics = model.print_topics(num_words=7)\nfor topic in topics:\n    print(topic)\n\n# Compute Perplexity\nprint('\nPerplexity: ', model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.\n"

In [14]:
from gensim import corpora
dictionary_LDA = corpora.Dictionary(tokens)
dictionary_LDA.filter_extremes(no_below=3)
corpus = [dictionary_LDA.doc2bow(tok) for tok in tokens]
from gensim import models
import numpy as np
num_topics = 20
lda_model = models.LdaModel(corpus, num_topics=num_topics, \
                                  id2word=dictionary_LDA, \
                                  passes=3, alpha=[0.01]*num_topics, \
                                  eta=[0.01]*len(dictionary_LDA.keys()))

for i,topic in lda_model.show_topics(formatted=True, num_topics=num_topics, num_words=5):
    print(str(i)+": "+ topic)
    
    
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

0: 0.033*"market" + 0.031*"project" + 0.027*"measur" + 0.023*"state" + 0.022*"research"
1: 0.062*"research" + 0.062*"activ" + 0.046*"develop" + 0.030*"measur" + 0.028*"implement"
2: 0.042*"implement" + 0.037*"oil_shale" + 0.037*"act" + 0.033*"govern" + 0.031*"use"
3: 0.049*"fund" + 0.032*"servic" + 0.030*"project" + 0.023*"march" + 0.020*"support"
4: 0.057*"construct" + 0.043*"prepar" + 0.041*"plan" + 0.040*"project" + 0.032*"implement"
5: 0.173*"servic" + 0.037*"support" + 0.029*"develop" + 0.023*"provis" + 0.020*"transport"
6: 0.047*"measur" + 0.045*"develop" + 0.034*"activ" + 0.025*"servic" + 0.025*"system"
7: 0.043*"act" + 0.042*"busi" + 0.036*"student" + 0.035*"propos" + 0.033*"develop"
8: 0.032*"economi" + 0.028*"energi" + 0.027*"measur" + 0.026*"data" + 0.025*"eu"
9: 0.063*"project" + 0.037*"system" + 0.032*"develop" + 0.031*"research" + 0.023*"activ"
10: 0.063*"develop" + 0.043*"measur" + 0.037*"plan" + 0.032*"implement" + 0.026*"resourc"
11: 0.037*"support" + 0.035*"project" +

In [15]:
nltk.download('punkt') # one time execution
from tika import parser

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [16]:
stop_words = set(nltk.corpus.stopwords.words('english'))
stop_words.update()
paragraphs_list = []
processed_list =[]

In [17]:
pdf = parser.from_file('nrp_estonia_2020_action_plan_2019-2020_30.05.2019.pdf')
pdf_text = pdf['content']
pdf_text = pdf_text.lower()
pdf_text = re.sub("x ", "", pdf_text)
pdf_text = ' '.join(pdf_text.split())
pdf_text = pdf_text.split(". ")

In [18]:
del pdf_text[0:2]
sentences = pdf_text

In [19]:
# remove punctuations, numbers and special characters
clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ")

# make alphabets lowercase
clean_sentences = [s.lower() for s in clean_sentences]

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

# function to remove stopwords
def remove_stopwords(sen):
    sen_new = " ".join([i for i in sen if i not in stop_words])
    return sen_new

# remove stopwords from the sentences
clean_sentences = [remove_stopwords(r.split()) for r in clean_sentences]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

sentence_vectors = []
for i in clean_sentences:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [21]:
# similarity matrix
sim_mat = np.zeros([len(sentences), len(sentences)])

from sklearn.metrics.pairwise import cosine_similarity
for i in range(len(sentences)):
  for j in range(len(sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [22]:
import networkx as nx

nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(sentences)), reverse=True)

In [23]:
# Extract top 10 sentences as the summary
for i in range(10):
  print(ranked_sentences[i][1])
#https://www.analyticsvidhya.com/blog/2018/11/introduction-text-summarization-textrank-python/

to establish a support scheme for involving the development workers in companies that make a significant contribution to employment but currently have low added value ministry of economic affairs and communications the development voucher measure is still open
to increase the motivation of universities and businesses to diversity financing sources, by making use of the research and development needs of companies located outside of estonia, but also the eu’s various r&d&i financing programmes (e.g., horizon 2020, era-net, jti, kic etc.)
activities: a) information about coming to work in estonia, development and maintenance of a common e-environment of information, materials, and public services; b) increasing readiness of entrepreneurs to involve foreign specialists; c) it labour force campaigns in target countries, including the industrial sector; d) development of settlement services for foreign specialists and their families
creating a system for assessing work capacity, the provisio