In [3]:
from nltk import ngrams
from nltk.tokenize import word_tokenize
import unicodedata as ud
import pymongo
import re
import networkx as nx
# Connect to the local MongoDB server and bind the handles used throughout the notebook.
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
#mongo_client.drop_database("GreekParliamentProceedings")
# NOTE: despite the names, `client` is the "GreekParliamentProceedings" database handle,
# while `index` and `database` are the "InvertedIndex" and "Database" collections inside it.
client = mongo_client["GreekParliamentProceedings"]
index = client["InvertedIndex"]
database = client["Database"]

### Tests

In [None]:
import spacy
# Load the small Greek spaCy pipeline used by the scratch cells in this section.
nlp = spacy.load('el_core_news_sm')

In [None]:
# Scratch check: print the part-of-speech tag the model assigns to a purely numeric token.
text = '0015'
doc = nlp(text)
for token in doc:
    print(token.pos_)

In [None]:

# Scratch check: print how the pipeline tokenizes a lower-cased line with numbering and initials.
text = "ΟΙ ΠΡΟΤΕΙΝΟΝΤΕΣ ΒΟΥΛΕΥΤΕΣ 1. Ι. ΧΑΡΑΛΑΜΠΟΠΟΥΛΟΣ"
text = text.lower()
doc = nlp(text)
for token in doc:
    print(token)

### Methods

In [4]:
from greek_stemmer import GreekStemmer
import spacy
def get_words(sentence: "spacy.tokens.Span", stemmer, domain_specific_stopwords):
    """Return the candidate keyword tokens of one sentence, in sentence order.

    A token is kept only when all of the following hold:
      * its text does not start with three digits,
      * its part-of-speech tag is NOUN or ADJ,
      * it is neither a stop word nor punctuation,
      * its stem (computed on the upper-cased text with the combining acute
        accent removed, then lower-cased) is not in `domain_specific_stopwords`.

    Parameters
    ----------
    sentence : iterable of tokens exposing `.text`, `.pos_`, `.is_stop`, `.is_punct`
        (the caller passes the spans yielded by `doc.sents`).
    stemmer : object with a `stem(str) -> str` method.
    domain_specific_stopwords : container of lower-case stems to exclude.

    Returns
    -------
    list[str]
        The original (unstemmed) text of every token that passed the filters.
    """
    # Both helpers are loop-invariant, so build them once per call.
    numeric_prefix = re.compile("^[0-9][0-9][0-9].*")
    strip_acute = {ord('\N{COMBINING ACUTE ACCENT}'): None}

    words = []
    for token in sentence:
        # drop tokens with the specified numeric format
        if numeric_prefix.search(token.text) is not None:
            continue
        # only keep NOUNs and ADJECTIVEs that are not stopwords or punctuation
        if token.pos_ not in ('NOUN', 'ADJ') or token.is_stop or token.is_punct:
            continue
        # NFD splits accented letters into base letter + combining mark,
        # which lets the translate() call drop the accent before stemming.
        unaccented = ud.normalize('NFD', token.text.upper()).translate(strip_acute)
        if stemmer.stem(unaccented).lower() not in domain_specific_stopwords:
            words.append(token.text)
    return words
def get_candidate_words(text, stopwords, domain_specific_stopwords, language):
    """Split a text into sentences and keep the candidate words of each one.

    The text is lower-cased, run through the spaCy pipeline `language`, and each
    sentence is filtered with `get_words`. Sentences with no candidate words
    are dropped.

    Note: `stopwords` are merged into `language.Defaults.stop_words` in place,
    so the pipeline object passed in is modified by this call.

    Returns a list of word lists, one per sentence that kept at least one word.
    """
    stemmer = GreekStemmer()
    # Extend the pipeline's stop-word set before processing the text.
    language.Defaults.stop_words |= set(stopwords)
    parsed = language(text.lower())
    per_sentence = (
        get_words(sentence, stemmer, domain_specific_stopwords)
        for sentence in parsed.sents
    )
    # Keep only sentences that still contain candidate words.
    return [kept for kept in per_sentence if len(kept) > 0]

In [5]:
def extract_pairs(word_list, window):
    """Build co-occurrence pairs and a weighted co-occurrence graph.

    Every sentence (a list of words) is cut into sliding windows of `window`
    consecutive words; a sentence no longer than `window` forms one window.
    Inside each window every unordered pair of positions co-occurs.

    Parameters
    ----------
    word_list : list of word lists, one per sentence.
    window : number of consecutive words per window.

    Returns
    -------
    (pairs, g)
        pairs : list of (word, word) tuples, one per co-occurrence. It includes
            repeats from overlapping windows and pairs of identical words.
        g : undirected `nx.Graph` whose edge attribute `weight` counts how many
            times the two words co-occurred. Pairs of identical words get no
            edge, so the graph has no self-loops.

    The `weight` attribute is what makes repeated co-occurrences matter:
    `nx.pagerank` reads it by default.
    """
    pairs = []
    g = nx.Graph()
    for a_list in word_list:
        if len(a_list) > window:
            candidate_lists = [a_list[k:k + window] for k in range(len(a_list) - window + 1)]
        else:
            candidate_lists = [a_list]
        for candidate_list in candidate_lists:
            for i, first in enumerate(candidate_list):
                for second in candidate_list[i + 1:]:
                    pairs.append((first, second))
                    # avoid self-loops
                    if first == second:
                        continue
                    # if the edge does not exist create it, else increase its weight
                    if g.has_edge(first, second):
                        g[first][second]['weight'] += 1
                    else:
                        g.add_edge(first, second, weight=1)

    return pairs, g

In [6]:
# NOTE(review): `keywords_len` is not referenced anywhere else in the visible notebook — confirm it is still needed.
keywords_len = {}
import math
def document_keywords(speech, stopwords, domain_specific_stopwords, window, language):
    """Score the candidate words of one speech with PageRank and select the top ones.

    The speech is reduced to candidate words per sentence, turned into a
    co-occurrence graph with the given `window`, and ranked with `nx.pagerank`.

    Returns
    -------
    (scores, keywords)
        scores : dict mapping word -> PageRank score, ordered from highest to lowest.
        keywords : the first int(log2(number of graph nodes) + 8) words of that
            ranking, or an empty list when the graph has no nodes.
    """
    candidate_sentences = get_candidate_words(speech, stopwords, domain_specific_stopwords, language)
    _pairs, cooccurrence_graph = extract_pairs(candidate_sentences, window=window)

    ranked = sorted(
        nx.pagerank(cooccurrence_graph).items(),
        key=lambda entry: entry[1],
        reverse=True,
    )
    scores = dict(ranked)

    node_count = len(cooccurrence_graph.nodes)
    if node_count == 0:
        return scores, []
    cutoff = int(math.log2(node_count) + 8)
    return scores, list(scores.keys())[:cutoff]

In [7]:
import string
def extract_phrases(speech, keywords):
    """Merge keywords that appear next to each other in the speech into two-word phrases.

    The speech is lower-cased before it is split on whitespace, so its tokens
    can match the lower-case keywords. Each pair of consecutive tokens is
    checked in order. When both tokens are different keywords currently in
    the list, the phrase "first second" is appended and the two single words
    are removed, so a word is consumed by the first phrase it takes part in.

    Parameters
    ----------
    speech : str, the raw speech text.
    keywords : list[str] of single-word keywords. It is modified in place.

    Returns
    -------
    list[str]
        The same `keywords` list object, after merging.
    """
    tokenized = speech.lower().split()
    # Consecutive token pairs (bigrams) of the speech.
    for first, second in zip(tokenized, tokenized[1:]):
        if first != second and first in keywords and second in keywords:
            keywords.append(first + " " + second)
            keywords.remove(first)
            keywords.remove(second)
    return keywords
def stem_top_keywords(keywords):
    """Stem keywords and keyphrases and return them as a de-duplicated set.

    Each entry is tokenized with `word_tokenize`; ASCII punctuation is stripped
    from every token and empty tokens are dropped. The remaining words are
    stemmed on their upper-cased form with the combining acute accent removed,
    and the stem is lower-cased.

    * An entry that yields one stem is a single keyword.
    * An entry that yields several stems is a keyphrase, stored as the stems
      joined by single spaces (no trailing space, no partial prefixes).
    * A single keyword whose stem also occurs inside any keyphrase is left
      out, regardless of the order of `keywords`.
    * Entries that yield no stems contribute nothing.

    Parameters
    ----------
    keywords : iterable of keyword / keyphrase strings.

    Returns
    -------
    set[str]
        Stemmed keyphrases plus the stemmed single keywords not covered by one.
    """
    stemmer = GreekStemmer()
    strip_acute = {ord('\N{COMBINING ACUTE ACCENT}'): None}
    strip_punct = str.maketrans('', '', string.punctuation)

    singles = set()
    phrases = set()
    phrase_parts = set()
    for token in keywords:
        stems = []
        # split to tokens
        for word in word_tokenize(token):
            # remove punctuation
            word = word.translate(strip_punct)
            if word == '':
                continue
            stemmed_word = stemmer.stem(ud.normalize('NFD', word.upper()).translate(strip_acute)).lower()
            if stemmed_word:
                stems.append(stemmed_word)

        if len(stems) == 1:
            singles.add(stems[0])
        elif stems:
            phrases.add(" ".join(stems))
            phrase_parts.update(stems)

    # remove single words that already exist in a keyphrase
    return phrases | (singles - phrase_parts)

def get_member_keywords(member_name: str, stopwords, domain_specific_stopwords, collection) -> dict:
    """Return the most frequent stored keywords of one parliament member.

    Runs an aggregation on `collection` that matches the member's documents and
    groups them by their `keywords` value, so each distinct keyword list is
    counted once. It then counts how often every keyword occurs across those
    lists.

    Parameters
    ----------
    member_name : value matched against the `member_name` field.
    stopwords, domain_specific_stopwords : unused; kept so existing calls keep working.
    collection : object with an `aggregate(pipeline)` method (the MongoDB collection).

    Returns
    -------
    dict
        keyword -> frequency for the top int(ln(total keyword occurrences))
        keywords, most frequent first. Empty when the member has no keywords.
    """
    from collections import Counter

    # get keywords for member from MongoDB
    pipeline = [{'$match': {'member_name': member_name}},
                {'$group': {'_id': '$keywords'}}]
    keywords_by_speech = collection.aggregate(pipeline)
    print(f"Member {member_name}")

    keyword_counts = Counter()
    for keyword_list in keywords_by_speech:
        # Documents without a `keywords` field group under a None _id.
        keyword_counts.update(keyword_list['_id'] or [])

    total = sum(keyword_counts.values())
    # math.log(0) is undefined, so a member without keywords gets none.
    number_of_keywords = int(math.log(total)) if total > 0 else 0
    return dict(keyword_counts.most_common(number_of_keywords))

### Keywords for a single document

In [None]:
# Re-create the MongoDB handles (same bindings as the setup cell at the top of the notebook).
mongo_client = pymongo.MongoClient("mongodb://localhost:27017/")
#mongo_client.drop_database("GreekParliamentProceedings")
# `client` is the "GreekParliamentProceedings" database; `index` / `database` are collections in it.
client = mongo_client["GreekParliamentProceedings"]
index = client["InvertedIndex"]
database = client["Database"]

In [None]:
# Fetch only the `speech` field of the document whose _id is "9433".
document = list(database.find({"_id":"9433"}, { "_id": 0, "speech": 1 }))
# document[0] raises IndexError when no document matched.
speech = document[0]['speech']

In [None]:
# Load both stop-word lists (one entry per line) and print the keywords of `speech`.
#words_in_row = ind.preprocess_doc(document[0]['speech'], stopwords)
with open('../stopwords.txt', encoding='utf-8') as file:
  stopwords = [line.rstrip() for line in file]
  stopwords = set(stopwords)
with open('../domain-specific-stopwords.txt', encoding='utf-8') as file:
  domain_specific_stopwords = [line.rstrip() for line in file]
language = spacy.load('el_core_news_sm')
# window is the co-occurrence window size forwarded to extract_pairs.
scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 3, language=language)
print(keywords)
# NOTE(review): the commented-out calls below omit the required `language` argument.
#scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 4)
#print(keywords)
#scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 2)
#print(keywords)


In [None]:
# Same experiment for the document with _id "1": print the speech, then its keywords.
document = list(database.find({"_id":"1"}, { "_id": 0, "speech": 1 }))
# document[0] raises IndexError when no document matched.
speech = document[0]['speech']
#words_in_row = ind.preprocess_doc(document[0]['speech'], stopwords)
with open('../stopwords.txt', encoding='utf-8') as file:
  stopwords = [line.rstrip() for line in file]
  stopwords = set(stopwords)
with open('../domain-specific-stopwords.txt', encoding='utf-8') as file:
  domain_specific_stopwords = [line.rstrip() for line in file]
language = spacy.load('el_core_news_sm')
scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 3, language = language )
# NOTE(review): the commented-out calls below omit the required `language` argument.
#scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 4)
print(speech)
#scores, keywords = document_keywords(speech, stopwords, domain_specific_stopwords, window = 2)
print(keywords)


### Keywords per party

In [None]:
# Distinct values of the `political_party` field across the collection.
parties = database.distinct("political_party")

In [None]:
# Keyword aggregation for one political party
def get_party_keywords(party, stopwords, domain_specific_stopwords, collection) -> dict:
    """Return the most frequent stored keywords of one political party.

    Runs an aggregation on `collection` that matches the party's documents and
    groups them by their `keywords` value, so each distinct keyword list is
    counted once. It then counts how often every keyword occurs across those
    lists.

    Parameters
    ----------
    party : value matched against the `political_party` field.
    stopwords, domain_specific_stopwords : unused; kept so existing calls keep working.
    collection : object with an `aggregate(pipeline)` method (the MongoDB collection).

    Returns
    -------
    dict
        keyword -> frequency for the top int(ln(total keyword occurrences)) + 5
        keywords, most frequent first. Empty when the party has no keywords.
    """
    from collections import Counter

    # get keywords for party from MongoDB
    pipeline = [{'$match': {'political_party': party}},
                {'$group': {'_id': '$keywords'}}]
    keywords_by_speech = collection.aggregate(pipeline)
    print(f"Party {party}")

    keyword_counts = Counter()
    for keyword_list in keywords_by_speech:
        # Documents without a `keywords` field group under a None _id.
        keyword_counts.update(keyword_list['_id'] or [])

    total = sum(keyword_counts.values())
    # math.log(0) is undefined, so fall back to the fixed +5 allowance only.
    number_of_keywords = int(math.log(total)) if total > 0 else 0
    return dict(keyword_counts.most_common(number_of_keywords + 5))

In [None]:
# TEST FOR ONE party
# Reload the stop-word lists and print the top keywords of the party at index 15 of `parties`.
with open('../stopwords.txt', encoding='utf-8') as file:
    stopwords = [line.rstrip() for line in file]
    stopwords = set(stopwords)
with open('../domain-specific-stopwords.txt', encoding='utf-8') as file:
    domain_specific_stopwords = [line.rstrip() for line in file]
# NOTE(review): index 15 is hard-coded and depends on the order returned by distinct().
top_keywords = get_party_keywords(parties[15], stopwords, domain_specific_stopwords, database)
print(top_keywords)

In [None]:
# Collect the top keywords of every party, keyed by party name.
keywords_by_party = {}
# NOTE(review): the loop starts at 1, so parties[0] is never processed — confirm this is intended.
for i in range(1, len(parties)):
    top_keywords = get_party_keywords(parties[i], stopwords, domain_specific_stopwords, database)
    keywords_by_party[parties[i]] = top_keywords

In [None]:
# Inspect the result for one party; raises KeyError if that name is not in the dict.
print(keywords_by_party['συνασπισμος ριζοσπαστικης αριστερας'])

### Keywords per parliament member

In [None]:
#get distinct parliament member names in a list
member_names = database.distinct("member_name")
#years = database.distinct("year")

In [None]:
# Scratch query: group the documents of the member at index 3 by their `keywords` value.
pipeline = [{'$match' : {'member_name':member_names[3]}},
                {'$group':{'_id':'$keywords'}}]
# `r` is not read by any later cell in the visible notebook.
r = database.aggregate(pipeline)

In [None]:
# TEST FOR ONE MEMBER
keywords_by_member = {}
# NOTE(review): these paths have no '../' prefix, unlike the other stop-word loading
# cells in this notebook — confirm which location is correct.
with open('stopwords.txt', encoding='utf-8') as file:
    stopwords = [line.rstrip() for line in file]
    stopwords = set(stopwords)
with open('domain-specific-stopwords.txt', encoding='utf-8') as file:
    domain_specific_stopwords = [line.rstrip() for line in file]
# Print the top keywords of the member at index 5 of `member_names`.
top_keywords = get_member_keywords(member_names[5], stopwords, domain_specific_stopwords, database)
print(top_keywords)


In [None]:
# Scratch query: group all documents by member and push each document's keywords
# into a `keywords` array; print only the first group.
pipeline = [{'$group':{'_id':'$member_name', 'keywords':{'$push':{'total':'$keywords'}}}}]
keywords_by_speech = database.aggregate(pipeline)
for i in keywords_by_speech:
    print(i)
    break

In [None]:
# RUN FOR ALL MEMBERS
def get_member_keywords(member_name, stopwords, domain_specific_stopwords, collection) -> dict:
    """Return the most frequent stored keywords of one parliament member.

    Runs an aggregation on `collection` that matches the member's documents and
    groups them by their `keywords` value, so each distinct keyword list is
    counted once. It then counts how often every keyword occurs across those
    lists.

    Parameters
    ----------
    member_name : value matched against the `member_name` field.
    stopwords, domain_specific_stopwords : unused; kept so existing calls keep working.
    collection : object with an `aggregate(pipeline)` method (the MongoDB collection).

    Returns
    -------
    dict
        keyword -> frequency for the top int(ln(total keyword occurrences))
        keywords, most frequent first. Empty when the member has no keywords.
    """
    from collections import Counter

    # get keywords for member from MongoDB
    pipeline = [{'$match': {'member_name': member_name}},
                {'$group': {'_id': '$keywords'}}]
    keywords_by_speech = collection.aggregate(pipeline)
    print(f"Member {member_name}")

    keyword_counts = Counter()
    for keyword_list in keywords_by_speech:
        # Documents without a `keywords` field group under a None _id.
        keyword_counts.update(keyword_list['_id'] or [])

    total = sum(keyword_counts.values())
    # math.log(0) is undefined, so a member without keywords gets none.
    number_of_keywords = int(math.log(total)) if total > 0 else 0
    return dict(keyword_counts.most_common(number_of_keywords))
# Collect the top keywords of every member, keyed by member name.
keywords_by_member = {}
with open('../stopwords.txt', encoding='utf-8') as file:
    stopwords = [line.rstrip() for line in file]
    stopwords = set(stopwords)
with open('../domain-specific-stopwords.txt', encoding='utf-8') as file:
    domain_specific_stopwords = [line.rstrip() for line in file]
# NOTE(review): the loop starts at 1, so member_names[0] is never processed — confirm this is intended.
for i in range(1, len(member_names)):
    top_keywords = get_member_keywords(member_names[i], stopwords, domain_specific_stopwords, database)
    keywords_by_member[member_names[i]] = top_keywords

In [None]:
# Inspect the result for one member; raises KeyError if that name is not in the dict.
print(keywords_by_member['αβδελας κωνσταντινου αποστολος'])

In [None]:
import pickle

# The dump below is commented out; this cell only reads a previously saved file.
#with open('member_keywords.pickle', 'wb') as handle:
#    pickle.dump(keywords_by_member, handle, protocol=pickle.HIGHEST_PROTOCOL)

# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open('member_keywords.pickle', 'rb') as handle:
    mm = pickle.load(handle)


In [None]:
# Print the whole object loaded from member_keywords.pickle.
print(mm)

### Keywords by speech

In [None]:
# Load `_id` and `speech` of every document from cursor position 874674 onward.
# NOTE(review): the offset is hard-coded — presumably it resumes an interrupted run; confirm.
speeches = list(database.find({ }, { "_id": 1, "speech": 1 })[874674:])

In [None]:
# Number of speeches loaded in the previous cell.
print(len(speeches))

In [None]:
# Scratch check: strip every character listed in string.punctuation from a sample string.
s = "~~αποψη~~"
new_string = s.translate(str.maketrans('', '', string.punctuation))
print(new_string)

In [None]:
# Compute the keywords of every loaded speech and store them on its MongoDB document.
# `keywords_by_speech` is initialised here but never filled in this cell.
keywords_by_speech = {}
language = spacy.load('el_core_news_sm')

# NOTE(review): these paths have no '../' prefix, unlike the other stop-word loading
# cells in this notebook — confirm which location is correct.
with open('stopwords.txt', encoding='utf-8') as file:
    stopwords = [line.rstrip() for line in file]
    stopwords = set(stopwords)
with open('domain-specific-stopwords.txt', encoding='utf-8') as file:
    domain_specific_stopwords = [line.rstrip() for line in file]
for i in range(0, len(speeches)):
    print("Doc:", speeches[i]['_id'])
    # Rank words, merge adjacent keywords into phrases, stem, then write the result back.
    scores, keywords = document_keywords(speeches[i]['speech'], stopwords, domain_specific_stopwords, window = 3, language = language)
    keywords = extract_phrases(speeches[i]['speech'], keywords)
    top_keywords = stem_top_keywords(keywords)
    database.update_one({"_id":speeches[i]['_id']},{'$set':{'keywords':list(top_keywords)}})
    

In [1]:
import pickle
# Load the document ids to reprocess (printed as integers by the recalculation cell).
# NOTE: pickle.load can execute arbitrary code — only load files you created yourself.
with open('../speech_indexes.pickle', 'rb') as handle:
    speech_indexes = pickle.load(handle)

In [13]:
# Print the speech text of the document with _id "1"; speech[0] raises IndexError if it is missing.
speech = list(database.find({"_id":"1" }, { "_id": 0, "speech": 1 }))
print(speech[0]['speech'])

 Παρακαλείται ο κύριος Γραμματέας να συνοδεύσει την Ιερά Σύνοδο εκτός της Αιθούσης της Βουλής.  . Παρακαλείται ο συνάδελφος Βουλευτής κ. Σαδίκ Αμέτ, που ανήκει στο Μωαμεθανικό Θρήσκευμα να προσέλθει και να δώσει τον οριζόμενο από το Σύνταγμα όρκο επί του Κορανίου.  : ~"Ορκίζομαι στο όνομα του Παντοδύναμου Θεού και του μόνου αυτού Προφήτη ο οποίος είναι ο Μωάμεθ να είμαι πιστός στην πατρίδα και το δημοκρατικό πολίτευμα, να υπακούω στο Σύνταγμα και τους νόμους και να εκπληρώνω ευσυνείδητα τα καθήκοντά μου".


In [16]:
#RECALCULATE KEYWORDS ONLY FOR DOCUMENTS IN INDEX
with open('../stopwords.txt', encoding='utf-8') as file:
    stopwords = [line.rstrip() for line in file]
    stopwords = set(stopwords)
with open('../domain-specific-stopwords.txt', encoding='utf-8') as file:
    domain_specific_stopwords = [line.rstrip() for line in file]
language = spacy.load('el_core_news_sm')

for i in range(0, len(speech_indexes)):
    print("Doc:", speech_indexes[i])
    # Ids are converted with str() before they are used as the `_id` filter.
    speech = list(database.find({"_id": str(speech_indexes[i]) }, { "_id": 0, "speech": 1 }))
    # speech[0] raises IndexError when the id has no matching document, which stops the loop.
    scores, keywords = document_keywords(speech[0]['speech'], stopwords, domain_specific_stopwords, window = 3, language = language)
    keywords = extract_phrases(speech[0]['speech'], keywords)
    top_keywords = stem_top_keywords(keywords)
    database.update_one({"_id":str(speech_indexes[i])},{'$set':{'keywords':list(top_keywords)}})

Doc: 1236023
Doc: 621711
Doc: 609477
Doc: 825209
Doc: 804047
Doc: 217461
Doc: 331980
Doc: 1192378
Doc: 46007
Doc: 164325
Doc: 742927
Doc: 349936
Doc: 361403
Doc: 479411
Doc: 902100
Doc: 481185
Doc: 1185942
Doc: 321327
Doc: 917748
Doc: 73606
Doc: 765393
Doc: 594938
Doc: 318564
Doc: 590843
Doc: 1127669
Doc: 150654
Doc: 520874
Doc: 149044
Doc: 505939
Doc: 1221967
Doc: 393682
Doc: 261441
Doc: 666965
Doc: 854897
Doc: 1272563
Doc: 683310
Doc: 263909
Doc: 765147
Doc: 812145
Doc: 1219123
Doc: 11031
Doc: 641362
Doc: 1024735
Doc: 316735
Doc: 135670
Doc: 1053611
Doc: 1023266
Doc: 79378
Doc: 668612
Doc: 7985
Doc: 834809
Doc: 459966
Doc: 616998
Doc: 39700
Doc: 463493
Doc: 551343
Doc: 905096
Doc: 947223
Doc: 458641
Doc: 473405
Doc: 786342
Doc: 485372
Doc: 478656
Doc: 170095
Doc: 113714
Doc: 660624
Doc: 324885
Doc: 888827
Doc: 589135
Doc: 1259127
Doc: 59455
Doc: 55165
Doc: 138764
Doc: 264581
Doc: 916269
Doc: 252500
Doc: 51516
Doc: 522903
Doc: 1023730
Doc: 12622
Doc: 700121
Doc: 4520
Doc: 438584
Doc: 