In [132]:
from gensim.models import KeyedVectors
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer

import json
import pandas as pd
import string
import numpy as np
import collections
import math

# import natural language toolkit
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize

In [133]:
# prepare stopword list
stop_words = stopwords.words('english')

In [134]:
wiki_en_align = './../data/fasttext/wiki.en.align.vec' #'../../data/fasttext/wiki.en.align.vec'
# get fasttext wiki embeddings for english
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
print('english words {}'.format(len(list(wv_wiki_en.vocab.keys()))))

KeyboardInterrupt: 

In [None]:
#list of terms
def tokenize(text, stopwords):
    """Tokenizes and removes stopwords from the document"""
    tokens = word_tokenize(text)
    filtered = [w.lower() for w in tokens if not w in stopwords]
    return filtered

In [None]:
def similarity(token, token_list, wv ):
    """calculates the similarity between token and list of tokens
    Args:
        token_list (list): List of tokens to which we are calculating similarity.
        token (str): String for wich we are calculating similarity.
        wv (Word2VecKeyedVectors): Word embeddings.
    Returns:
        avreage_similarity (float): Number that signifes the similarity of token to token list words.
    """
    similarity = 0
    num_of_tokens = 0
    for toks in token_list:
        # check if the token is in the vocabulary
        if toks in wv.vocab.keys():
            num_of_tokens += 1
            similarity += wv.similarity(toks, token)
            avreage_similarity = similarity/num_of_tokens
    return avreage_similarity


In [None]:
# import postgresql
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.library.postgresql import PostgresQL
# connect to the postgresql database
pg = PostgresQL() 
pg.connect(database="eurlex_environment_only", user="postgres", password="dbpass")

In [None]:
#import documents 
documents = pg.execute("""
    SELECT * FROM documents;
""")

In [None]:
docs = documents[0:1000]

In [None]:
tokenized_docs = {}
tokenized_titles = {}
textsT = {}
titles = {}
for document in docs:
    doc_id = document.get('document_id')
    text = document.get('document_text')
    textsT.update({doc_id: text})
    title = document.get('document_title')
    titles.update({doc_id: title})
    text = text.translate(str.maketrans('','',string.punctuation))
    tokenized = tokenize(text, stop_words)
    title = title.translate(str.maketrans('','',string.punctuation))
    tokenized_title = tokenize(title, stop_words)
    for token in tokenized:
        if len(token) == 1:
            if token.isalpha():
                tokenized.remove(token)
    tokenized_docs.update({doc_id: tokenized})
    for title in tokenized_title:
        if len(title) == 1:
            if title.isalpha():
                tokenized_title.remove(title)
    tokenized_titles.update({doc_id: tokenized_title})


In [135]:
test = ['water', 'pollution', 'underground']
top_list = ['pollution', 'pollutions', 'undergrounding', 'earpollution', 'pollution,']

In [136]:
def probability_multiply(probability, token_frequency, n):
    probability_value = probability*(token_frequency/n)
    return probability_value

In [137]:
def probability_sum(probability, token_frequency, n):
    probability_value = probability+(token_frequency/n) 
    return probability_value

In [138]:
def word_value(word, alpha, original_tokens, top_expansion, wv):
    """values word based on whether is in original token set or expanded, if alpha -1 value equals to cosine similarity
    Args:
        word (string): Word or token for which we are calculating value.
        alpha (float): Number between 0 and 1. Weight that emphasizes the difference between original query words and expansions. 
                        Usually between 0.5 (all words are treated equal) and 1 (expansion words have value 0). 
                        For alpha -1 values equal to cosine similarity to query words.
        original_tokens(list): List of strings. Tokenized original query. Usually also ext (extension by summation of 2 consecutive words)
        top_expansion (list): List of expanded words. Usually candidates (kNN expansion).
        wv (): 
    Returns:
        value (float): Value of the word based on whether is in original token set or expanded set.
    """
    only_expanded = []
    for token in top_expansion:
        if token not in original_tokens:
            only_expanded.append(token)       
    sum_similarity = 0
    for exp_token in only_expanded:
            sum_similarity += similarity(exp_token,original_tokens, wv)        
    if alpha == -1:
        if word in original_tokens:
            value = 1
        else:
            value = similarity(word, original_tokens, wv)/sum_similarity
    else:
        if word in original_tokens:
            value = alpha
        else:
            value = (1-alpha)*similarity(word, original_tokens, wv)/sum_similarity
    return value

In [139]:
def probability_sum_weight(probability, token_frequency, n, word, alpha, original_tokens, top_expansion, wv):
    probability_value = probability+(token_frequency/n)*word_value(word, alpha, original_tokens, top_expansion, wv)
    return probability_value

In [140]:
def top_positives(dictionary,n):
    """Takes dict and returns first n tuples of key,values sorted by values descending, returns only items with positive values.
    Args:
        dictionary (dict): Dictionary we want to sort by values.
        n (int): Number of returned items. If there are less than n items in dictonary or less than n items with positive values,
                 returns all items (with positive valuses) sorted.
    Returns:
        sorted_positives_top (list): List of n tuples. If there are less than n items in dictonary or less than n items with 
                                     positive values, returns all items (with positive valuses) sorted.
    """
    positives = {} 
    for k,v in dictionary.items():
        if v > 0:
            positives.update({k: v})
    sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
    if len(sorted_positives) > n:
        sorted_positives_top = sorted_positives[0:n]
    else:
        sorted_positives_top = sorted_positives
    return sorted_positives_top

In [141]:
def probability_score(tokens,texts, probability_function,n, *args):
    break_loop = False
    #args[0] == top_expansion
    #args[1] == alpha
    #args[2] == wv
    document_probability = {}
    for k, v in texts.items():
        n = len(v)
        if probability_function == probability_multiply:
            probability = 1
        else:
            probability = 0
        if probability_function == probability_sum_weight:
            if len(args) == 3:
                for i in range(len(tokens)):
                    token_frequency = v.count(tokens[i])
                    probability = probability_sum_weight(probability, token_frequency, n,tokens[i], args[1], tokens, args[0], args[2])
                document_probability.update({k: probability})
            else:
                print("Error, number of arguments does not match.")
                break_loop = True
                break 
        elif break_loop:
            break
        elif probability_function == probability_sum or probability_function == probability_multiply:
            if len(args) == 0:
                for i in range(len(tokens)):
                    token_frequency = v.count(tokens[i])
                    probability = probability_function(probability, token_frequency, n)
                document_probability.update({k: probability})
            else:
                print("Error, number of arguments does not match.")
                break_loop = True
                break 
        elif break_loop:
            break
        else:
            print("Error, metric function not defined.")
            
    document_probability = top_positives(document_probability,n)        
    return document_probability

In [142]:
type(probability_sum)

function

In [152]:
probability_score(test,textsT, probability_sum, 10)

[(161, 0.0028789782244556116),
 (565, 0.0028431808085295426),
 (313, 0.0024473924977127175),
 (190, 0.0021591912847188145),
 (1219, 0.0021296014602981443),
 (402, 0.0019286403085824494),
 (438, 0.001875784654013736),
 (12, 0.001874573960463531),
 (226, 0.0018612731108077925),
 (79, 0.00173053152039555),
 (22, 0.0016660716410805665),
 (101, 0.0015841215114382893),
 (51, 0.0015501247365998982),
 (95, 0.001544799176107106),
 (258, 0.001490056031032572),
 (2, 0.0014892373958965781),
 (261, 0.00145441323516044),
 (73, 0.001422556048708319),
 (12679, 0.0014181302498090978),
 (257, 0.00141247988557125),
 (463, 0.0014044943820224719),
 (225, 0.0012693807235470125),
 (647, 0.0011530084980996712),
 (309, 0.0011179429849077697),
 (574, 0.0010999486690621103),
 (567, 0.001089070400622326),
 (787, 0.001081665765278529),
 (634, 0.001055594651653765),
 (698, 0.0010230179028132991),
 (662, 0.001006036217303823),
 (926, 0.001001134619235133),
 (516, 0.000998003992015968),
 (144, 0.0009958299620339828),

In [144]:
def probability_score_sum_weights(original_tokens, top_expansion, texts,n, alpha, wv): 
    """As probability_score_sum only weighted.
        Args:
        texts (dict):  Keys represent document ids, values are document text.
        alpha (float): Number between 0 and 1. Weight that emphasizes the difference between original query words and expansions. 
                       For alpha 0.5 all words have same weights (but not same values!), for alpha 1 expansion words have value 0. 
                        For alpha -1 values equal to cosine similarity to query words.
        original_tokens(list): List of strings. Tokenized original query. Usually also extension (extension by summation of 2 consecutive words)
        top_expansion (list): List of expanded words. Usually candidates (kNN expansion).
        wv (): 
    Returns:
        document_probability (dict): Keys represent document ids, values are scores that measure adequacy of the document.
    """
    tokens = original_tokens+top_expansion
    document_score = probability_score(tokens,texts, probability_sum_weight,n, top_expansion, alpha, wv)
    return document_score


In [145]:
probability_score_sum_weights(test, top_list,textsT,10, 0.6, wv_wiki_en)

[(565, 0.0027187916481563753),
 (1219, 0.002555521752357773),
 (12, 0.002198364008179959),
 (226, 0.0021590768085370388),
 (22, 0.002070689039628704),
 (95, 0.0020082389289392375),
 (190, 0.0019432721562469332),
 (161, 0.0019158291457286433),
 (313, 0.0017840805123513267),
 (73, 0.001775349948787982),
 (402, 0.0017357762777242044),
 (463, 0.0016853932584269661),
 (51, 0.0016712282316467654),
 (258, 0.0014186318576442335),
 (257, 0.001394600393348829),
 (438, 0.001302710287275681),
 (2, 0.0012303392024560962),
 (652, 0.0011583011583011582),
 (793, 0.0011299435028248586),
 (11, 0.0011202240448089618),
 (79, 0.00103831891223733),
 (574, 0.0010339517489183839),
 (931, 0.001015146632291331),
 (101, 0.0009504729068629736),
 (647, 0.0009224067984797369),
 (698, 0.0009207161125319692),
 (662, 0.0009054325955734407),
 (575, 0.0008967578753736492),
 (261, 0.000872647941096264),
 (807, 0.0008724100327153762),
 (691, 0.0008649687650168188),
 (12679, 0.0008508781498854587),
 (80, 0.0008382016764033

In [146]:
def number_documents_tokens_appear(tokens,texts):
    """For each token in tokens counts the number of documents in which token has appeared.
        Args:
        tokens (list): List of tokens.
        texts (dict):  Keys represent document ids, values are document text.
    Returns:
        documents_per_token (list): List of numbers that count number of documnets in which certain token appears.
                                    Index of element in tokens list is the same as index in documents_per_token list for that element value.
    """
    documents_per_token = []
    for i in range(len(tokens)):
        documents_per_token.append(0)
    for text in texts.values():
        for i in range(len(tokens)):
            token = tokens[i]
            if token in text:
                documents_per_token[i] = documents_per_token[i]+1
    return documents_per_token



In [147]:
def tfidf_sum(probability,  token_frequency, n, idf):
    tfidf_value = probability+((token_frequency/n)*idf)
    return tfidf_value
    

In [148]:
def tfidf_sum_weight(probability,  token_frequency, n, idf, word, alpha, original_tokens, top_expansion, wv):
    tfidf_value = probability+((token_frequency/n)*idf)*word_value( word, alpha, original_tokens, top_expansion, wv)
    return tfidf_value

In [149]:
def tfidf_score(tokens, texts, tfidf_function,n, *args):
    #args[0] == top_expansion
    #args[1] == alpha
    #args[2] == wv
    break_loop = False
    if len(args):
        tokens_together = tokens+args[0]
    else:
        tokens_together = tokens
    nb_docs_tokens_appeared = number_documents_tokens_appear(tokens_together,texts)
    filtered_nb_docs_tokens_appeared = [elt for elt in nb_docs_tokens_appeared if not elt == 0]
    not_appear = []
    appear = []
    for i in range(len(nb_docs_tokens_appeared)):
        if nb_docs_tokens_appeared[i] == 0:
            not_appear.append(tokens_together[i])
        else:
            appear.append(tokens_together[i])    
    l = len(texts)
    document_probability = {}
    for k, v in texts.items():
        n = len(v)
        probability = 0
        for i in range(len(appear)):
            token_frequency = v.count(appear[i])
            idf = math.log(l/filtered_nb_docs_tokens_appeared[i])
            if tfidf_function == tfidf_sum:
                if len(args) == 0:
                    probability = tfidf_sum(probability,  token_frequency, n, idf)
                else:
                    print("Error, number of arguments does not match")
                    break_loop = True
                    break 
            elif tfidf_function == tfidf_sum_weight:
                if len(args) == 3:
                    probability = tfidf_sum_weight(probability,  token_frequency, n, idf,appear[i], args[1], tokens, args[0], args[2])
                else:
                    print("Error, number of arguments does not match")
                    break_loop = True
                    break 
        if break_loop:
            break
        document_probability.update({k: probability})
        
    document_probability = top_positives(document_probability,n)
    return document_probability, not_appear
    

In [153]:
tfidf_score(test, textsT, tfidf_sum_weight,10, top_list, 0.6,wv_wiki_en)


([(1219, 0.0066871760661186995),
  (565, 0.0060723867296320925),
  (12, 0.0057214189628176735),
  (226, 0.005594400302676522),
  (22, 0.005330541854712906),
  (95, 0.005100355257824119),
  (73, 0.004653882845817686),
  (463, 0.004555723583580625),
  (190, 0.004351410664252874),
  (51, 0.0040858550199482715),
  (402, 0.004056263658691651),
  (313, 0.003467572632799459),
  (161, 0.003386705179939872),
  (257, 0.003370564069466134),
  (258, 0.00333355513254562),
  (652, 0.0031309606095650633),
  (793, 0.0030543080899335263),
  (11, 0.002877639772514816),
  (931, 0.002744004955756426),
  (2, 0.0026523715939707954),
  (574, 0.0024806479118889696),
  (575, 0.0024075255417902),
  (438, 0.00239390574925898),
  (807, 0.002358178983285645),
  (698, 0.0021515833320016583),
  (662, 0.0021158679145187338),
  (691, 0.002021309665575801),
  (739, 0.0020209814277317166),
  (647, 0.0019866396519431914),
  (530, 0.001839861140958256),
  (576, 0.0018389846598593378),
  (80, 0.0018219920203983223),
  (534