# Query Expansion
### Using FastText Word Embedding
Based on this paper: https://arxiv.org/pdf/1606.07608.pdf

Pre-made vector models: https://fasttext.cc/docs/en/aligned-vectors.html

In [1]:
from gensim.models import KeyedVectors
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer

import json
import pandas as pd
import string
import numpy as np
import collections
import math

# import natural language toolkit
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize



In [2]:
# prepare stopword list
stop_words = stopwords.words('english')

In [3]:
def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

In [4]:
import os
os.listdir()

['.ipynb_checkpoints',
 'document_embeddings.ipynb',
 'enviroLENS-deliverable-D4.2-images.ipynb',
 'query-expansion.ipynb',
 'query_modules.ipynb',
 'testing db functions.ipynb',
 'testing.ipynb',
 'titles_topics',
 'trectesting2.ipynb',
 't_tfidf_wsum006',
 't_tfidf_wsum007',
 't_tfidf_wsum008',
 't_tfidf_wsum009',
 't_tfidf_wsum01',
 'Untitled.ipynb']

In [5]:
wiki_en_align = './../data/fasttext/wiki.en.align.vec' #'../../data/fasttext/wiki.en.align.vec'
# get fasttext wiki embeddings for english
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
print('english words {}'.format(len(list(wv_wiki_en.vocab.keys()))))

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


english words 2519370


In [6]:
type(wv_wiki_en)

gensim.models.keyedvectors.Word2VecKeyedVectors

## Pre-retrieval kNN Based Approach

In [7]:
#list of terms
def tokenize(text, stopwords):
    """Tokenizes and removes stopwords from the document"""
    tokens = word_tokenize(text)
    filtered = [w.lower() for w in tokens if not w in stopwords]
    return filtered

In [8]:
#extended list of terms ###
def extend_tokens(token_list, wv):
    """Extends token list summing vector pairs"""
    tokens = []
    for token in token_list:
        # check if the token is in the vocabulary
        if token in wv.vocab.keys():
            tokens.append(token)
    extention = set()
    for i in range(len(tokens)-1):
        new_token = wv.most_similar(positive=[tokens[i], tokens[i+1]])[0][0]
        extention.add(new_token)
    extention = list(extention)
    return extention

In [9]:
test = tokenize('water pollution underground', stop_words)
print(test)
ext = extend_tokens(test,wv_wiki_en)
print(ext)


['water', 'pollution', 'underground']
['undergrounding', 'pollutions']


In [10]:
test1 = tokenize('annex fishing agreement europe', stop_words)
print(test1)
ext1 = extend_tokens(test1,wv_wiki_en)
print(ext1)

['annex', 'fishing', 'agreement', 'europe']
['agreements', 'flwfishing']


In [11]:
# knn nearest
def get_candidate_expansion_terms(tokens, k, wv):
    """Gets the candidate expansion terms"""
    candidates = set()
    for token in tokens:
        # check if the token is in the vocabulary
        if token in wv.vocab.keys():
            result = wv.similar_by_word(token)
            limit = k if len(result) > k else len(result)
            # iterate through the most similar words
            for i in range(limit):
                candidates.add(result[i][0])
    # return list of candidates
    candidates = list(candidates)
    return candidates
        

In [12]:
get_candidate_expansion_terms(["fish"], 5, wv_wiki_en)

['baitfish', 'fishes', 'milkfishes', 'gamefish', 'shellfishes']

In [13]:
candidates = get_candidate_expansion_terms(test+ext, 5, wv_wiki_en)
print(candidates)
witout = get_candidate_expansion_terms(test, 5, wv_wiki_en)
print(witout)

['undergrounded', 'undergrounder', 'sewage', 'undergrounding', 'pollution,', 'undergrounds', 'undergrounders', '#pollution', 'water—', 'undergroung', 'seawater', 'biopollution', 'potable', 'groundwater', 'pollutions', 'pollution', 'earpollution', 'undergroun', 'pollutants']
['undergrounded', '#pollution', 'water—', 'sewage', 'groundwater', 'undergroung', 'pollutions', 'undergrounding', 'seawater', 'undergrounds', 'earpollution', 'undergroun', 'biopollution', 'potable', 'pollution,']


In [14]:
# similarity between word and list of words
def similarity(token, token_list, wv ):
    """calculates the similarity between word and list of words"""
    # calculate the similarity of the token to all tokens
    similarity = 0
    num_of_tokens = 0
    for toks in token_list:
        # check if the token is in the vocabulary
        if toks in wv.vocab.keys():
            num_of_tokens += 1
            similarity += wv.similarity(toks, token)
    return similarity/num_of_tokens

In [15]:
# calculates similarity and sorts
def get_top_expansion_terms(tokens, candidates, wv):
    """Gets the actual expansion terms"""
    similarity_pairs = []
    for candidate in candidates:
        sim = similarity(candidate, tokens, wv)
        similarity_pairs.append((candidate, sim))
    # return the list of expansion terms with their similarities
    return similarity_pairs

In [17]:
# get actual expansion terms for test set; with and without extension
top = get_top_expansion_terms(test+ext, candidates,  wv_wiki_en)
topwithout = get_top_expansion_terms(test, candidates,  wv_wiki_en)
def takeSecond(elem):
    return elem[1]
top = sorted(top, key=takeSecond)[::-1]
topw = sorted(topwithout, key=takeSecond)[::-1]
print((top))
print((topw))
top = top[0:5]
topw = topw[0:5]
top_list = []
for tupl in top:
    top_list.append(tupl[0])
topw_list = []
for tupl in topw:
    topw_list.append(tupl[0])

# get actual expansion terms for test1 set; with and without extension
top1 = get_top_expansion_terms(test1+ext1, candidates,  wv_wiki_en)
topwithout1 = get_top_expansion_terms(test1, candidates,  wv_wiki_en)

top1 = sorted(top1, key=takeSecond)[::-1]
topw1 = sorted(topwithout1, key=takeSecond)[::-1]
top1 = top1[0:5]
topw1 = topw1[0:5]
top_list1 = []
for tupl in top1:
    top_list1.append(tupl[0])
topw_list1 = []
for tupl in topw1:
    topw_list1.append(tupl[0])


[('pollution', 0.6260276407951795), ('pollutions', 0.5989783010567384), ('undergrounding', 0.5868486694404182), ('earpollution', 0.5791309410113682), ('pollution,', 0.5584437076938438), ('pollutants', 0.5473505877035749), ('groundwater', 0.5472185974670385), ('sewage', 0.5438533174483885), ('undergrounds', 0.5373294534380543), ('#pollution', 0.5307803259871704), ('biopollution', 0.5239432296935493), ('undergrounded', 0.5166503920961191), ('undergrounders', 0.5048834686641379), ('undergroun', 0.4968187167322256), ('undergrounder', 0.4930006599112927), ('undergroung', 0.489668825408463), ('seawater', 0.48062588166948944), ('potable', 0.47213486261999227), ('water—', 0.4339509338693007)]
[('pollution', 0.6073097696558459), ('groundwater', 0.572225614085378), ('sewage', 0.5672777675468184), ('earpollution', 0.5479508482970045), ('pollutions', 0.5392823009654663), ('pollution,', 0.5361565112564385), ('pollutants', 0.5332377192240293), ('seawater', 0.5219145986771884), ('undergrounding', 0.5

In [16]:
# all functions together, finds k nearest for each term, returns top n
def pre_retrieval_KNN(string, k, wv, n):
    """Find the most similar tokens to the given query"""
    tokens = tokenize(string, stop_words)
    candidates = get_candidate_expansion_terms(tokens, k, wv)
    candidates_sim = get_top_expansion_terms(tokens, candidates, wv)
    def takeSecond(elem):
        return elem[1]
    sort = sorted(candidates_sim, key=takeSecond)[::-1]
    return sort[:n]

In [17]:
pre_retrieval_KNN('deep', 15, wv_wiki_en, 15)

[('deeper', 0.7391939305931274),
 ('deepest', 0.6943081694334498),
 ('shallow', 0.6193613248537374),
 ('deeps', 0.6047575334790283),
 ('depths', 0.6011057073807676),
 ('deepe', 0.5938460628287918),
 ('deep,', 0.5859666598255622),
 ('shallowed', 0.583433385064088),
 ('deepers', 0.5757998018436017),
 ('deepesh', 0.570137993578689)]

In [18]:
pre_retrieval_KNN('fishing and pollution', 15, wv_wiki_en, 15)

[('pollutions', 0.6271618716914852),
 ('—fishing', 0.6187853691901083),
 ('shellfishing', 0.6062972260940005),
 ('‘fishing', 0.6062562445167072),
 ('earpollution', 0.6010677191786318),
 ('flwfishing', 0.5958800042388847),
 ('pollution,', 0.5895427243095528),
 ('fishing,\u3000', 0.5847228653899453),
 ('#pollution', 0.5825691062808801),
 ('biopollution', 0.5803963372819559),
 ('fishings', 0.579665719484446),
 ('polluting', 0.5758477844891869),
 ('billfishing', 0.5739594348713932),
 ('sollution', 0.5724698912220131),
 ('gamefishing', 0.5716465085305145)]

In [21]:
pre_retrieval_KNN('underground', 15, wv_wiki_en, 15)

[('undergrounds', 0.9179600940200481),
 ('undergroun', 0.8998362698613858),
 ('undergroung', 0.8908817840768637),
 ('undergrounded', 0.8905654223562418),
 ('undergrounding', 0.8774652255413747),
 ('underground,', 0.864975977115042),
 ('undergrounder', 0.863371207446057),
 ('‘underground', 0.8486077380180744),
 ('undergrounders', 0.8468026308820736),
 ('undergroud', 0.8240030184734265)]

In [22]:
pre_retrieval_KNN('pollution', 15, wv_wiki_en, 15)

[('pollutions', 0.9166142395275622),
 ('pollution,', 0.8794701382056926),
 ('#pollution', 0.8381261902767326),
 ('earpollution', 0.8319100859328133),
 ('biopollution', 0.8275063703183863),
 ('pollutants', 0.811238387992271),
 ('antipollution', 0.8036468757467374),
 ('pollut', 0.7968592254436011),
 ('polluting', 0.7878968364118702),
 ('sollution', 0.7834910856693191)]

In [23]:
pre_retrieval_KNN('deforestation', 15, wv_wiki_en, 15)

[('deforestations', 0.9543662655883809),
 ('deforestated', 0.8770535809858295),
 ('forestation', 0.8603426012583533),
 ('reforestation', 0.8474253223194705),
 ('deforestator', 0.8424493806368417),
 ('rainforestation', 0.8416130982869243),
 ('deforested', 0.840407202191005),
 ('deforesting', 0.837824533688038),
 ('deforestion', 0.8273205199154788),
 ('afforestation', 0.7714247425497361)]

In [24]:
pre_retrieval_KNN('fishing', 15, wv_wiki_en, 15)

[('—fishing', 0.8224531025707755),
 ('fishing,\u3000', 0.8175126643141182),
 ('fishings', 0.7937207347071881),
 ('‘fishing', 0.7848391574115785),
 ('fishing,', 0.7788249395298493),
 ('codfishing', 0.7764357551424427),
 ('gamefishing', 0.7726032676663466),
 ('shellfishing', 0.7702744225755196),
 ('billfishing', 0.7617344350893832),
 ('flwfishing', 0.757403056207396)]

In [26]:
pre_retrieval_KNN('fish', 15, wv_wiki_en, 15)

[('fishes', 0.8091816234334873),
 ('baitfish', 0.7484850728532674),
 ('milkfishes', 0.7438243794721577),
 ('shellfishes', 0.737795091718359),
 ('gamefish', 0.728902819872663),
 ('mudfishes', 0.7288577509453343),
 ('beakfish', 0.7276970772384057),
 ('goldfishes', 0.7234903439609776),
 ('billfishes', 0.7199549836696606),
 ('fishwater', 0.7198671529918804)]

In [25]:
pre_retrieval_KNN('annex', 15, wv_wiki_en, 15)

[('annexes', 0.716713979048553),
 ('annexet', 0.7042431294278589),
 ('mannex', 0.6553835809055547),
 ('annexe', 0.6403126926256983),
 ('annexy', 0.6340643399412271),
 ('annext', 0.592006018305464),
 ('brannex', 0.5619266512662793),
 ('annexed', 0.5303272978298281),
 ('building', 0.5265608624567143),
 ('gannex', 0.5140592604975713)]

### Document retrieval

In [22]:
import sys
print(sys.path)

['D:\\Users\\sarab\\work\\enviroLens\\final\\enviroLENS\\word-embeddings\\notebooks', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\python37.zip', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\DLLs', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens', '', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\modules-1.0-py3.7.egg', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\win32', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\win32\\lib', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\Pythonwin', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\IPython\\extensions', 'D:\\Users\

In [2]:
# import postgresql
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)
print(sys.path)
from modules.library.postgresql import PostgresQL
# connect to the postgresql database
pg = PostgresQL() 
pg.connect(database="eurlex_environment_only", user="postgres", password="dbpass") #"eurlex_env_only" "solata.2018"

['D:\\Users\\sarab\\work\\enviroLens\\final\\enviroLENS\\word-embeddings', 'D:\\Users\\sarab\\work\\enviroLens\\final\\enviroLENS\\word-embeddings\\notebooks', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\python37.zip', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\DLLs', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens', '', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\modules-1.0-py3.7.egg', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\win32', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\win32\\lib', 'D:\\Users\\sarab\\AppData\\Local\\conda\\conda\\envs\\EnviroLens\\lib\\site-packages\\Pythonwin', 'D:\\Users\\sarab\\AppData\\Local\\conda\\cond

In [23]:
#import documents 
# documents = pg.execute("""
#     SELECT * FROM documents WHERE LOWER(document_text) ~ '(LOWER({}))';
# """.format('|'.join(['fishing'])))
documents = pg.execute("""
    SELECT * FROM documents;
""")

In [None]:
words =[ "agreement", "fish"]
output = '|'.join(words)
output

In [7]:
# ta dela ampak pocasi, pocakaj na Samota
def db_query(query_words):
    """ Vrne seznam dokumentov iz baze, ki vsebujejo vsaj eno od besed iz seznama(list) query_words"""
    output = '|'.join(query_words)
    SQL = """
        SELECT * FROM documents
        WHERE document_text @@ to_tsquery(""" + '\''+ output + '\'' + """);"""
    documents = pg.execute(SQL)
    return(documents)

In [8]:
words =[ "agreement", "fish"]
docs = db_query(words)

In [5]:
print(len(dos
         ))

23722


In [25]:
documents[1]

{'document_id': 33,
 'document_celex_num': '21981A0916(01)',
 'document_title': 'Exchange of letters between the European Economic Community and the Peoples Republic of Poland on trade in sheepmeat and goatmeat - Exchange of letters relevant to the consultations foreseen in clause 8 of the exchange of letters - Exchange of letters relevant to clause 2 of the exchange of letters',
 'document_author': 'European Economic Community',
 'document_form': 'Exchange of letters',
 'document_date': datetime.date(1981, 4, 28),
 'document_text': 'Avis juridique important|21981A0916(01)Exchange of letters between the European Economic Community and the Peoples Republic of Poland on trade in sheepmeat and goatmeat - Exchange of letters relevant to the consultations foreseen in clause 8 of the exchange of letters - Exchange of letters relevant to clause 2 of the exchange of lettersOfficial Journal L 137 , 23/05/1981 P. 0013 - 0020 EXCHANGE OF LETTERS between the European Economic Community and the Peo

In [None]:
(document_text <> '') IS NOT TRUE

In [23]:
print(len(documents))


123157


In [112]:
docs = documents
# some docs are empty !!!!!!!!!!!!!!!!!!!

# for doc in docs:
#     idd= doc.get('document_id')
#     if idd == 39722:
#         print(doc)

delete = []
for doc in docs:
    n = len(doc.get('document_text'))
    if n == 0:
        id_doc = doc.get('document_id')
        delete.append(id_doc)
            
# remove empty docs
for doc in docs:
    id_doc = doc.get('document_id')
    if id_doc in delete:
        docs.remove(doc)
    

In [113]:
print(len(docs))


99369


In [26]:
#get tokenzized documents (texts), texts, tokenzized titles and titles #use postgress
tokenized_docs = {}
tokenized_titles = {}
texts = {}
titles = {}
for document in docs:
    doc_id = document.get('document_id')
    text = document.get('document_text')
    texts.update({doc_id: text})
    title = document.get('document_title')
    titles.update({doc_id: title})
    text = text.translate(str.maketrans('','',string.punctuation))
    tokenized = tokenize(text, stop_words)
    title = title.translate(str.maketrans('','',string.punctuation))
    tokenized_title = tokenize(title, stop_words)
    for token in tokenized:
        if len(token) == 1:
            if token.isalpha():
                tokenized.remove(token)
    tokenized_docs.update({doc_id: tokenized})
    for title in tokenized_title:
        if len(title) == 1:
            if title.isalpha():
                tokenized_title.remove(title)
    tokenized_titles.update({doc_id: tokenized_title})


In [115]:

print(tokenized_titles.get(3))
print(titles.get(3))

['agreement', 'relating', 'principally', 'chemicals', 'supplementary', 'geneva', '1967', 'protocol', 'general', 'agreement', 'tariffs', 'trade', 'negotiated', 'geneva', '30', 'june', '1967']
/* Agreement relating principally to chemicals, supplementary to the Geneva (1967) Protocol to the General Agreement on Tariffs and Trade, negotiated in Geneva on 30 June 1967 */


In [116]:
print(tokenized_docs.get(39703))
print(texts.get(39703))

[]



In [27]:
# still some empty documents
print(len(tokenized_docs))
#print(take(1, tokenized_docs.items()))
empt=[]
for k,v in tokenized_docs.items():
    l =len(v)
    if l==0:
        empt.append(k)
print(len(empt))
#print((empt))

for k in empt:
    del tokenized_docs[k]
    del tokenized_titles[k]
    del texts[k]
    del titles[k]

  

99412
23151


In [118]:
print(len(tokenized_docs))
print(len(tokenized_titles))
print(len(texts))
print(len(titles))

76261
76261
76261
76261


In [24]:
# just making smaller document set for faster testing, can delete later
tokenized_docs1 = {}
tokenized_titles1 = {}
texts1= {}
titles1={}
for k in range(1,1000):
    vtd = tokenized_docs.get(k)
    vtt = tokenized_titles.get(k)
    vx = texts.get(k)
    vt =titles.get(k)
    tokenized_docs1.update({k:vtd})
    tokenized_titles1.update({k:vtt})
    texts1.update({k:vx})
    titles1.update({k:vt})

    
    
    

NameError: name 'tokenized_docs' is not defined

In [None]:
print(len(tokenized_docs1))
print(len(tokenized_titles1))
print(len(texts1))
print(len(titles1))
tokenized_docs1 = {k:v for k,v in tokenized_docs1.items() if v is not None}


#### search full words (not lemmatized), search as substrings


In [None]:
#### 1. probability scoring

In [31]:
# probability scoring
### all query words have to be in the document (multiplying)

def probab_score(tokens,tokenized_docs,texts):
    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        probability = 1
        text = texts.get(k)
        for token in tokens:
            token_frequency = text.count(token)
            probability = probability*(token_frequency/n)
        doc_probab.update({k: probability})
    return doc_probab

    
    

In [123]:
# only original query
score =probab_score(test,tokenized_docs,texts)
#how many docs have positive score?
positives = dict([(k,v) for k,v in score.items() if v > 0])
sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
sorted_positives_top = sorted_positives[0:10]
print(sorted_positives_top)

# original query + extension
score =probab_score(test+ext,tokenized_docs,texts)
#how many docs have positive score?
print(([(k,v) for k,v in score.items() if v > 0]))


# only original query
score =probab_score(test1,tokenized_docs,texts)
#how many docs have positive score?
positives = dict([(k,v) for k,v in score.items() if v > 0])
sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
sorted_positives_top = sorted_positives[0:10]
print(sorted_positives_top)

# original query + extension
score =probab_score(test1+ext1,tokenized_docs,texts)
#how many docs have positive score?
print(([(k,v) for k,v in score.items() if v > 0]))


# no point having an extention, empty results are ok

[(120739, 2.2778939503692463e-06), (90952, 1.4623827626019007e-06), (98346, 9.60986510312026e-07), (100875, 9.116079910329329e-07), (96934, 7.795624765214125e-07), (98891, 4.7381201508301593e-07), (99214, 4.374815382790846e-07), (53800, 2.5508770567849454e-07), (101429, 2.4695734439852766e-07), (101872, 2.433802400069899e-07)]
[]
[(72, 4.866706628529354e-11), (4098, 3.9980157168428714e-11), (31030, 1.0620609744608273e-11), (85, 6.56566844179315e-12), (47616, 4.413577090077561e-12), (59689, 3.662677012225006e-12), (2146, 2.529433698703274e-12), (25303, 2.2094496103487563e-12), (57414, 2.1926876323265894e-12), (31131, 1.3631354904463345e-12)]
[]


In [None]:
    def probability_score(tokens,texts):
        """Assigns score to document based on multiplication of probabilities. Used only for query searches which demand that all query words 
        should be in doucuments returned.
        Args:
            tokens (list): List of tokens (tokenized query).
            tokenized_docs (dict): Keys represent document ids, values are lists of tokenized text (content) of documents.
            texts (dict):  Keys represent document ids, values are document text.
        Returns:
            document_probability (dict): Keys represent document ids, values are scores that measure adequacy of the document.
        """
        document_probability = {}
        for k, v in texts.items():
            n = len(v)
            probability = 1
            for token in tokens:
                token_frequency = v.count(token)
                probability = probability*(token_frequency/n)
            document_probability.update({k: probability})
        return document_probability

{1: 0.0,
 2: 5.682684467100214e-12,
 3: 0.0,
 5: 0.0,
 6: 0.0,
 7: 0.0,
 8: 0.0,
 9: 0.0,
 10: 0.0,
 11: 0.0,
 12: 0.0,
 13: 0.0,
 14: 0.0,
 15: 0.0,
 16: 0.0,
 17: 0.0,
 18: 0.0,
 19: 0.0,
 20: 0.0,
 21: 0.0,
 22: 0.0,
 23: 0.0,
 24: 0.0,
 25: 0.0,
 26: 0.0,
 27: 0.0,
 28: 0.0,
 29: 0.0,
 30: 0.0,
 31: 0.0,
 32: 0.0,
 33: 0.0,
 34: 0.0,
 35: 0.0,
 36: 0.0,
 37: 0.0,
 38: 0.0,
 39: 0.0,
 58: 0.0,
 116: 0.0,
 2624: 0.0,
 40: 0.0,
 41: 0.0,
 42: 0.0,
 43: 0.0,
 3021: 0.0,
 44: 0.0,
 45: 0.0,
 46: 0.0,
 47: 0.0,
 48: 0.0,
 49: 0.0,
 50: 0.0,
 51: 1.1509197350428859e-11,
 52: 0.0,
 53: 0.0,
 54: 0.0,
 55: 0.0,
 56: 0.0,
 57: 0.0,
 59: 0.0,
 60: 0.0,
 61: 0.0,
 62: 0.0,
 63: 0.0,
 64: 0.0,
 65: 0.0,
 66: 0.0,
 67: 0.0,
 3022: 0.0,
 68: 0.0,
 69: 0.0,
 70: 0.0,
 71: 0.0,
 72: 0.0,
 73: 0.0,
 74: 0.0,
 75: 0.0,
 76: 0.0,
 77: 0.0,
 78: 0.0,
 79: 0.0,
 80: 1.6387162322777285e-11,
 81: 0.0,
 82: 0.0,
 83: 0.0,
 91: 0.0,
 92: 0.0,
 84: 0.0,
 85: 0.0,
 86: 0.0,
 87: 0.0,
 88: 0.0,
 89: 0.0,
 90: 

In [125]:
## query words summation
def probab_score_sum(tokens,tokenized_docs,texts):
    '''assigns score to document based on summation of probabilities'''
    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        probability = 0
        text = texts.get(k)
        for token in tokens:
            token_frequency = text.count(token)
            probability = probability+(token_frequency/n)
        doc_probab.update({k: probability})
    return doc_probab
    

In [126]:
#zavedaj se:
"gabla is bla2".count("bla")
# kar pomeni da bi bilo bolje uporabiti lemmatized words! popravi!! neke stvari zdaj 2x stejes

2

In [127]:
def top_positives(dictionary,n):
    """Takes dict and returns first n tuples of k,v sorted by v"""
    positives = {} 
    for k,v in dictionary.items():
        if v > 0:
            positives.update({k: v})
    sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
    sorted_positives_top = sorted_positives[0:n]
    return sorted_positives_top

In [128]:
# only original query
score_sum =probab_score_sum(test,tokenized_docs,texts)

#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)

[(32476, 0.11656671664167916), (33546, 0.11182108626198083), (37122, 0.10662729658792651), (34869, 0.10542476970317298), (30565, 0.103206106870229), (36068, 0.10318609703113686), (3867, 0.10294117647058823), (95196, 0.0990990990990991), (94467, 0.0984251968503937), (38921, 0.0975609756097561)]


In [129]:
# create dataframe
df_sum_original = pd.DataFrame(sorted_positives_top, columns =['id_sum_original', 'score'])
df_sum_original

Unnamed: 0,id_sum_original,score
0,32476,0.116567
1,33546,0.111821
2,37122,0.106627
3,34869,0.105425
4,30565,0.103206
5,36068,0.103186
6,3867,0.102941
7,95196,0.099099
8,94467,0.098425
9,38921,0.097561


In [130]:
# print(titles.get(565))
# print(texts.get(565))

In [131]:
## original query plus ext
score_sum =probab_score_sum(test+ext,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)

[(32476, 0.11656671664167916), (33546, 0.11182108626198083), (37122, 0.10662729658792651), (34869, 0.10542476970317298), (30565, 0.103206106870229), (36068, 0.10318609703113686), (3867, 0.10294117647058823), (95196, 0.0990990990990991), (94467, 0.0984251968503937), (38921, 0.0975609756097561)]


In [132]:
# create dataframe
df_sum_original_ext = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext', 'score'])

In [133]:
# only original query
score =probab_score(test1,tokenized_docs,texts)
#how many docs have positive score?
sorted_positives_top = top_positives(score,10)
print(sorted_positives_top)



## original query
score_sum =probab_score_sum(test1,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
#dataframe
df_sum_original1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original1', 'score'])
df_sum_original1

[(72, 4.866706628529354e-11), (4098, 3.9980157168428714e-11), (31030, 1.0620609744608273e-11), (85, 6.56566844179315e-12), (47616, 4.413577090077561e-12), (59689, 3.662677012225006e-12), (2146, 2.529433698703274e-12), (25303, 2.2094496103487563e-12), (57414, 2.1926876323265894e-12), (31131, 1.3631354904463345e-12)]
[(94870, 0.1276595744680851), (11790, 0.09090909090909091), (100258, 0.08527131782945736), (97436, 0.08280254777070063), (28599, 0.08163833937029813), (92702, 0.07514450867052024), (97265, 0.07430340557275542), (50112, 0.07235621521335807), (62459, 0.07103825136612021), (49539, 0.07056113902847572)]


Unnamed: 0,id_sum_original1,score
0,94870,0.12766
1,11790,0.090909
2,100258,0.085271
3,97436,0.082803
4,28599,0.081638
5,92702,0.075145
6,97265,0.074303
7,50112,0.072356
8,62459,0.071038
9,49539,0.070561


In [134]:
## original query plus ext
score_sum =probab_score_sum(test1+ext1,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
positives = dict([(k,v) for k,v in score_sum.items() if v > 0])
sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
sorted_positives_top = sorted_positives[0:10]
#dataframe
df_sum_original_ext1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext1', 'score'])
df_sum_original_ext1

Unnamed: 0,id_sum_original_ext1,score
0,94870,0.12766
1,92702,0.121387
2,93604,0.116959
3,97781,0.114583
4,93549,0.104348
5,14571,0.102041
6,97436,0.101911
7,100258,0.100775
8,44382,0.100151
9,98394,0.099099


In [135]:
# adding candidates
## without weights
# no point using multiplication

# summation:
## original query
score_sum =probab_score_sum(test+topw_list,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
#dataframe
df_sum_original_cand = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_cand', 'score'])


## original query plus ext
score_sum =probab_score_sum(test+ext+top_list,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
#dataframe
df_sum_original_ext_cand = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext_cand', 'score'])


## original query
score_sum =probab_score_sum(test1+topw_list1,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
#dataframe
df_sum_original_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_cand1', 'score'])


## original query plus ext
score_sum =probab_score_sum(test1+ext1+top_list1,tokenized_docs,texts)
#original + ext gives same score, add global and state gives different score, same order
#how many docs have positive score?
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
#dataframe
df_sum_original_ext_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext_cand1', 'score'])



[(12744, 0.15163934426229508), (92986, 0.1473429951690821), (92988, 0.14669421487603304), (9649, 0.13286713286713286), (93921, 0.11695906432748537), (32476, 0.11656671664167916), (62937, 0.11578947368421053), (94936, 0.11494252873563218), (97319, 0.11363636363636363), (95196, 0.11261261261261261)]
[(94936, 0.12643678160919541), (3453, 0.11875000000000001), (32476, 0.11656671664167916), (97319, 0.11363636363636363), (33546, 0.11182108626198083), (95196, 0.10810810810810811), (37122, 0.10662729658792651), (34869, 0.10542476970317298), (30565, 0.103206106870229), (36068, 0.10318609703113686)]
[(94870, 0.1276595744680851), (11790, 0.09090909090909091), (100258, 0.08527131782945736), (97436, 0.08280254777070063), (28599, 0.08163833937029813), (92702, 0.07514450867052024), (97265, 0.07430340557275542), (50112, 0.07235621521335807), (62459, 0.07103825136612021), (49539, 0.07056113902847572)]
[(94870, 0.1276595744680851), (92702, 0.12138728323699421), (93604, 0.11695906432748537), (97781, 0.11

In [136]:
def word_value(word, alpha, original_tokens, top_expansion, wv):
    """values word based on whether is in original token set or expanded, if alpha -1 value equals to cosine similarity"""
    only_expanded = []
    for token in top_expansion:
        if token not in original_tokens:
            only_expanded.append(token)
            
    sum_similarity = 0
    for exp_token in only_expanded:
            sum_similarity += similarity(exp_token,original_tokens, wv)
            
    if alpha == -1:
        if word in original_tokens:
            value = 1
        else:
            value = similarity(word, original_tokens, wv)/sum_similarity


    else:
        if word in original_tokens:
            value = alpha
        else:
            value = (1-alpha)*similarity(word, original_tokens, wv)/sum_similarity
    return value

In [137]:
# ce ni ext zraven je so cudni rezultati, zamenja vrstni red pomembnsti med sewage in undergrounding??
top = top[0:4]
top_words = [i[0] for i in top]
print(word_value("water", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("sewage", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("undergrounding", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("biopollution", 0.7, test+ext ,top_words, wv_wiki_en))

0.7
0.28172557133554
0.7
0.27141179615367744


In [138]:
def probab_score_sum_weights(original_tokens, top_expansion,tokenized_docs,texts, wv, alpha): 
    '''As probab_score_sum only weighted; usually extention added to original tokens, candidates = top_expansion - have weights'''
    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        probability = 0
        text = texts.get(k)
        for token in original_tokens+top_expansion:
            token_frequency = text.count(token)
            probability = probability+(token_frequency/n)*word_value(token, alpha, original_tokens, top_expansion, wv)
        doc_probab.update({k: probability})
    return doc_probab

In [139]:
#check word values
wvals = []
for token in test+ext+top_list:
    wvals.append(word_value(token, 0.35, test+ext, top_list, wv_wiki_en))
wvals


[0.35,
 0.35,
 0.35,
 0.35,
 0.35,
 0.35,
 0.35,
 0.35,
 0.33091025022915893,
 0.31908974977084104]

In [34]:
top_list

['pollution', 'pollutions', 'undergrounding', 'earpollution', 'pollution,']

In [48]:
#check word frequences
n = len(tokenized_docs1)
print(n)
t_freqs = []
for k, v in tokenized_docs1.items():
    n = len(v)
    text = texts1.get(k)
    for token in test+ext+top_list:
        token_frequency = text.count(token)
        t_freqs.append(token_frequency)

998


In [141]:
## with weights
# summation 
original_query_cand = []
for alpha in [-1,0.35,0.4,0.45,0.5,0.6,0.7,0.8,0.9,1]:
    ## original query
    score_sum = probab_score_sum_weights(test, topw_list,tokenized_docs, texts, wv_wiki_en, alpha)
    #original + ext gives same score, add global and state gives different score, same order
    #how many docs have positive score?
    sorted_positives_top = top_positives(score_sum,10)
    #dataframe
    df_wsum_original_cand = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_cand'+str(alpha), 'score'+str(alpha)])
    original_query_cand.append(df_wsum_original_cand)

In [142]:
## original query plus ext
original_query_ext_cand = []
for alpha in [-1,0.35,0.4,0.45,0.5,0.6,0.7,0.8,0.9,1]:
    score_sum =probab_score_sum_weights(test+ext, top_list,tokenized_docs,texts,  wv_wiki_en, alpha)
    #original + ext gives same score, add global and state gives different score, same order
    #how many docs have positive score?
    sorted_positives_top = top_positives(score_sum,10)
    #dataframe
    df_wsum_original_ext_cand = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_ext_cand'+str(alpha), 'score'+str(alpha)])
    original_query_ext_cand.append(df_wsum_original_ext_cand)

In [143]:
# comparing sorting for each alpha
doubleframes =[]
for i in range(len(original_query_cand)):
    frst = original_query_cand[i].take([0], axis=1)
    snd = original_query_ext_cand[i].take([0], axis=1)
    con = pd.concat([frst,snd], axis=1)
    doubleframes.append(con)
    
    

In [144]:
doubleframes[2]

Unnamed: 0,id_wsum_original_cand0.4,id_wsum_original_ext_cand0.4
0,32476,94936
1,12744,32476
2,92988,3453
3,94936,97319
4,97319,33546
5,92986,95196
6,33546,37122
7,95196,34869
8,9649,30565
9,37122,36068


In [145]:
# comparing sorting for different alphas, original + cand
frames =[]
for i in range(len(original_query_cand)):
    dataf = original_query_cand[i].take([0], axis=1)
    frames.append(dataf)
con = pd.concat(frames, axis=1)
con

Unnamed: 0,id_wsum_original_cand-1,id_wsum_original_cand0.35,id_wsum_original_cand0.4,id_wsum_original_cand0.45,id_wsum_original_cand0.5,id_wsum_original_cand0.6,id_wsum_original_cand0.7,id_wsum_original_cand0.8,id_wsum_original_cand0.9,id_wsum_original_cand1
0,32476,12744,32476,32476,32476,32476,32476,32476,32476,32476
1,94936,92988,12744,94936,94936,94936,94936,94936,94936,94936
2,97319,92986,92988,97319,97319,97319,97319,97319,97319,97319
3,33546,32476,94936,92988,33546,33546,33546,33546,33546,33546
4,92988,94936,97319,12744,92988,95196,95196,95196,95196,95196
5,95196,97319,92986,33546,95196,37122,37122,37122,37122,37122
6,12744,33546,33546,95196,12744,3453,3453,3453,3453,3453
7,37122,9649,95196,92986,37122,92988,34869,34869,34869,34869
8,3453,95196,9649,37122,3453,34869,30565,30565,30565,30565
9,92986,93921,37122,3453,92986,12744,36068,36068,36068,36068


- alpha 1 - 0.5 first 4 docs the same, 0.45 first 3 the same, 0.4 1st the same
- alpha -1 same as alpha 0.5
- alpha 0.35 even 1st doc different than others

In [146]:
#counting number of occurances of  documents
values = con.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
counter=collections.Counter(flat_vals)
print((counter))
print(len(counter))

Counter({32476: 10, 94936: 10, 97319: 10, 33546: 10, 95196: 10, 37122: 9, 3453: 8, 12744: 6, 92988: 6, 92986: 5, 34869: 5, 30565: 4, 36068: 4, 9649: 2, 93921: 1})
15


- 5 docs in every column 
- 11 of 15 different docs appear at least in half of the columns

In [147]:
# comparing sorting for different alphas, original +ext + cand
frames =[]
for i in range(len(original_query_ext_cand)):
    dataf = original_query_ext_cand[i].take([0], axis=1)
    frames.append(dataf)
conex = pd.concat(frames, axis=1)
conex

Unnamed: 0,id_wsum_original_ext_cand-1,id_wsum_original_ext_cand0.35,id_wsum_original_ext_cand0.4,id_wsum_original_ext_cand0.45,id_wsum_original_ext_cand0.5,id_wsum_original_ext_cand0.6,id_wsum_original_ext_cand0.7,id_wsum_original_ext_cand0.8,id_wsum_original_ext_cand0.9,id_wsum_original_ext_cand1
0,94936,94936,94936,94936,94936,94936,94936,32476,32476,32476
1,32476,3453,32476,32476,32476,32476,32476,94936,94936,94936
2,97319,32476,3453,3453,97319,97319,97319,97319,97319,97319
3,3453,97319,97319,97319,3453,33546,33546,33546,33546,33546
4,33546,33546,33546,33546,33546,3453,3453,95196,95196,95196
5,95196,95196,95196,95196,95196,95196,95196,3453,3453,37122
6,37122,37122,37122,37122,37122,37122,37122,37122,37122,3453
7,34869,34869,34869,34869,34869,34869,34869,34869,34869,34869
8,30565,30565,30565,30565,30565,30565,30565,30565,30565,30565
9,36068,36068,36068,36068,36068,36068,36068,36068,36068,36068


- 0.8 - 1 1st doc 32476, rest 1st doc 94936
- 0.8 - 1 first 5 docs the same



In [148]:
#counting number of occurances of  documents
values = conex.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
counter=collections.Counter(flat_vals)
print((counter))
print(len(counter))

Counter({94936: 10, 32476: 10, 3453: 10, 97319: 10, 33546: 10, 95196: 10, 37122: 10, 34869: 10, 30565: 10, 36068: 10})
10


- all columns contain same documents, just order is different

In [149]:
print(test)
print(ext)
print(top_list)
# 94936 light pollution -, 32476 fishing quotas o, 97319 light pollution - , 3453 marine pollution o
# without weighted sum (below): 
# 32476 already appeared, 33546 fishing quotas o,37122 fishing quotas o,12744 groundwater protection +,92988 groundwater protection +
# 33546 fishing quotas o, 92986 groundwater protection +
# id_sum_original_cand best choice

['water', 'pollution', 'underground']
['pollutions', 'undergrounding']
['pollution', 'pollutions', 'undergrounding', 'earpollution', 'pollution,']


In [150]:
texts.get(92986)

'Avis juridique important|91997E1892WRITTEN QUESTION No. 1892/97 by Amedeo AMADEO to the Commission. Integrated groundwater protection and managementOfficial Journal C 045 , 10/02/1998 P. 0120 WRITTEN QUESTION E-1892/97 by Amedeo Amadeo (NI) to the Commission (4 June 1997)Subject: Integrated groundwater protection and managementWith reference to the Commission Proposal for a European Parliament and Council Decision on an action programme for integrated groundwater protection and management (COM(96) 0315 final - 96/0181 COD) ((OJ C 355, 25.11.1996, p. 1.)),the proposed action programme comprises four main lines of action: planning and management of groundwater protection, creating a regulatory framework for fresh water abstraction, development of instruments for control of groundwater polution from diffuse sources and development of instruments for control of point source emissions and discharges.With regard to the first line of action, will the Commission:1. Classify waters according t

In [151]:
#check word values
wvals = []
for token in test1+ext1+top_list1:
    wvals.append(word_value(token, 0.35, test+ext, top_list, wv_wiki_en))
wvals

[0.10124015377562287,
 0.15817081118234286,
 0.07172285285022235,
 0.11798534515333482,
 0.18960660975970187,
 0.1010925155782609,
 0.33091025022915893,
 0.35,
 0.3127512400283096,
 0.29937648460119626,
 0.26977364610954957]

In [152]:
#test1:
## original query
original_query_cand1 = []
for alpha in [-1,0.35,0.4,0.45,0.5,0.6,0.7,0.8,0.9,1]:
    score_sum = probab_score_sum_weights(test1, topw_list1,tokenized_docs,texts,  wv_wiki_en, alpha)
    #original + ext gives same score, add global and state gives different score, same order
    #how many docs have positive score?
    sorted_positives_top = top_positives(score_sum,10)
    #dataframe
    df_wsum_original_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_cand1'+str(alpha), 'score'+str(alpha)])
    original_query_cand1.append(df_wsum_original_cand1)


## original query plus ext
original_query_ext_cand1 = []
for alpha in [-1,0.35,0.4,0.45,0.5,0.6,0.7,0.8,0.9,1]:
    score_sum =probab_score_sum_weights(test1+ext1, top_list1,tokenized_docs,texts,  wv_wiki_en, alpha)
    #original + ext gives same score, add global and state gives different score, same order
    #how many docs have positive score?
    sorted_positives_top = top_positives(score_sum,10)
    #dataframe
    df_wsum_original_ext_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_ext_cand1'+str(alpha), 'score'+str(alpha)])
    original_query_ext_cand1.append(df_wsum_original_ext_cand1)
    

In [153]:
# comparing sorting for each alpha
doubleframes1 =[]
for i in range(len(original_query_cand1)):
    frst = original_query_cand1[i].take([0], axis=1)
    snd = original_query_ext_cand1[i].take([0], axis=1)
    con = pd.concat([frst,snd], axis=1)
    doubleframes1.append(con)

In [154]:
doubleframes1[0]

Unnamed: 0,id_wsum_original_cand1-1,id_wsum_original_ext_cand1-1
0,94870,94870
1,11790,92702
2,100258,93604
3,97436,97781
4,28599,93549
5,92702,14571
6,97265,97436
7,50112,100258
8,62459,44382
9,49539,98394


In [155]:
# comparing sorting for different alphas, original + cand
frames =[]
for i in range(len(original_query_cand1)):
    dataf = original_query_cand1[i].take([0], axis=1)
    frames.append(dataf)
con1 = pd.concat(frames, axis=1)
con1

Unnamed: 0,id_wsum_original_cand1-1,id_wsum_original_cand10.35,id_wsum_original_cand10.4,id_wsum_original_cand10.45,id_wsum_original_cand10.5,id_wsum_original_cand10.6,id_wsum_original_cand10.7,id_wsum_original_cand10.8,id_wsum_original_cand10.9,id_wsum_original_cand11
0,94870,94870,94870,94870,94870,94870,94870,94870,94870,94870
1,11790,11790,11790,11790,11790,11790,11790,11790,11790,11790
2,100258,100258,100258,100258,100258,100258,100258,100258,100258,100258
3,97436,97436,97436,97436,97436,97436,97436,97436,97436,97436
4,28599,28599,28599,28599,28599,28599,28599,28599,28599,28599
5,92702,92702,92702,92702,92702,92702,92702,92702,92702,92702
6,97265,97265,97265,97265,97265,97265,97265,97265,97265,97265
7,50112,50112,50112,50112,50112,50112,50112,50112,50112,50112
8,62459,62459,62459,62459,62459,62459,62459,62459,62459,62459
9,49539,49539,49539,49539,49539,49539,49539,49539,49539,49539


- all columns the same

In [156]:
print(test1)
print(ext1)
print(top_list1)
#94870 fishing agreement EU +, 11790 fishing agreement EU +, 100258 fishing agreement +
#94870 11790, 100258 already, 92702 fishing agreement +, 93604 environmental agreement o ;better without extention

['annex', 'fishing', 'agreement', 'europe']
['flwfishing', 'agreements']
['earpollution', 'pollution', 'pollutants', 'biopollution', 'potable']


In [157]:
texts.get(93604)

'Avis juridique important|91997E3072WRITTEN QUESTION No. 3072/97 by Amedeo AMADEO to the Commission. Environmental agreementsOfficial Journal C 117 , 16/04/1998 P. 0146 WRITTEN QUESTION E-3072/97 by Amedeo Amadeo (NI) to the Commission (2 October 1997)Subject: Environmental agreementsThe main objective of the Commissions communication on environmental agreements (COM(96) 561 final) is to promote and facilitate the use of effective and acceptable environmental agreements. These agreements are instruments for the integration or implementation of environment law in the Community. The communication should be seen in the light of the strategy outlined in the fifth action programme to extend the range of environment policy instruments and put into practice the concept of shared responsibility.The communication also seeks to clarify certain aspects of how environmental agreements can be used to implement certain provisions of Community directives in the Member States and how environmental agr

In [158]:
# comparison of summation method versions on test set
frms = [df_sum_original["id_sum_original"], df_sum_original_cand["id_sum_original_cand"], df_sum_original_ext['id_sum_original_ext'],df_sum_original_ext_cand['id_sum_original_ext_cand'],doubleframes[2]]
sum_result = pd.concat(frms, axis=1)
sum_result


Unnamed: 0,id_sum_original,id_sum_original_cand,id_sum_original_ext,id_sum_original_ext_cand,id_wsum_original_cand0.4,id_wsum_original_ext_cand0.4
0,32476,12744,32476,94936,32476,94936
1,33546,92986,33546,3453,12744,32476
2,37122,92988,37122,32476,92988,3453
3,34869,9649,34869,97319,94936,97319
4,30565,93921,30565,33546,97319,33546
5,36068,32476,36068,95196,92986,95196
6,3867,62937,3867,37122,33546,37122
7,95196,94936,95196,34869,95196,34869
8,94467,97319,94467,30565,9649,30565
9,38921,95196,38921,36068,37122,36068


In [159]:
values = sum_result.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
counter=collections.Counter(flat_vals)
print((counter))
print(len(counter))

Counter({32476: 6, 95196: 6, 33546: 5, 37122: 5, 94936: 4, 34869: 4, 97319: 4, 30565: 4, 36068: 4, 12744: 2, 92986: 2, 3453: 2, 92988: 2, 9649: 2, 3867: 2, 94467: 2, 38921: 2, 93921: 1, 62937: 1})
19


- same results for sum original and sum original_ext
- slight diff. between original ext cand and original ext cand wsum  
- ..




In [160]:
# comparison of summation method versions on test1 set
frames = [df_sum_original1["id_sum_original1"], df_sum_original_cand1["id_sum_original_cand1"], df_sum_original_ext1['id_sum_original_ext1'],df_sum_original_ext_cand1['id_sum_original_ext_cand1'],doubleframes1[2]]
sum_result1 = pd.concat(frames, axis=1)
sum_result1


Unnamed: 0,id_sum_original1,id_sum_original_cand1,id_sum_original_ext1,id_sum_original_ext_cand1,id_wsum_original_cand10.4,id_wsum_original_ext_cand10.4
0,94870,94870,94870,94870,94870,94870
1,11790,11790,92702,92702,11790,92702
2,100258,100258,93604,93604,100258,93604
3,97436,97436,97781,97781,97436,97781
4,28599,28599,93549,93549,28599,93549
5,92702,92702,14571,14571,92702,14571
6,97265,97265,97436,97436,97265,97436
7,50112,50112,100258,100258,50112,100258
8,62459,62459,44382,44382,62459,44382
9,49539,49539,98394,98394,49539,98394


- 1st place the same
- sum original, sum original cand, wsum original cand same
- sum original ext, sum original ext cand, wsum original ext  cand the same
  --> having ext or not gives different results

In [161]:
values = sum_result1.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
counter1=collections.Counter(flat_vals)
print((counter1))
print(len(counter1))

Counter({94870: 6, 92702: 6, 100258: 6, 97436: 6, 11790: 3, 93604: 3, 97781: 3, 28599: 3, 93549: 3, 14571: 3, 97265: 3, 50112: 3, 62459: 3, 44382: 3, 49539: 3, 98394: 3})
16


- 4 values appear in all columns, rest of the values appear in half columns (ext/not ext)


In [162]:
# for first 5 returned docs no difference between weighted and unweighted for alpha = 0.6,  alpha = 0.8, 1 #if all weight the same is same as
# if expansion would not exist
# only tokens / tokens + ext
# [(99, 0.048582995951417005), (380, 0.046610169491525424), (244, 0.04477611940298507), (89, 0.04371584699453552), (376, 0.04034065441506051)]
# [(244, 0.08955223880597014), (243, 0.08), (903, 0.04964539007092198), (99, 0.048582995951417005), (380, 0.046610169491525424)]
# unweighted with candidate exp:
# [(565, 0.048730964467005075), (1219, 0.0461864406779661), (12, 0.04042348411934552), (226, 0.039756782039289056), (22, 0.03749147920927062)]
# [(565, 0.05177664974619289), (1219, 0.04745762711864407), (12, 0.04138594802694899), (226, 0.04069223573433115), (22, 0.03953646898432175)]
# [(99, 0.048582995951417005), (380, 0.046610169491525424), (244, 0.04477611940298507), (89, 0.04371584699453552), (376, 0.04034065441506051)]
# [(244, 0.08955223880597014), (243, 0.08), (903, 0.04964539007092198), (99, 0.048582995951417005), (380, 0.046610169491525424)]

In [163]:
# for 10
# alpha 0.8, 1
# [(161, 0.027947874459039665), (313, 0.027129979796553974), (73, 0.025445292620865142), (402, 0.022429906542056073)]
# [(161, 0.027915369391449767), (313, 0.02712754175646111), (73, 0.025570205421714953), (402, 0.022429906542056073)]
# [(243, 0.032), (925, 0.031578947368421054), (1212, 0.03118536197295147), (910, 0.031168831168831172), (108, 0.03114754098360656)]
# [(1212, 0.036276849642004776), (89, 0.034972677595628415), (376, 0.032989690721649485), (925, 0.031578947368421054), (910, 0.031168831168831172)]
# for alpha 0.6 one change in one case
# only tokens/tokens+ext
# [(243, 0.04), (925, 0.039473684210526314), (1212, 0.03898170246618934), (910, 0.03896103896103896), (108, 0.0389344262295082)]
# [(1212, 0.045346062052505964), (89, 0.04371584699453552), (376, 0.041237113402061855), (925, 0.039473684210526314), (910, 0.03896103896103896)]

In [164]:
# #### 2.TFIDF evaluation
# texts_keys = []
# texts_values = []
# for key in sorted(texts.keys()) :
#     texts_keys.append(key)
#     texts_values.append(texts[key])

In [165]:
# vectorizer = TfidfVectorizer(stop_words = "english")
# vectors_t = vectorizer.fit_transform(texts_values)

In [166]:
# # get the first vector out (for the first document)
# vector_t = vectors_t[0]


In [167]:
# # place tf-idf values in a pandas data frame
# vector_dframe_t = pd.DataFrame(vector_t.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
# vector_dframe_t = vector_dframe_t.sort_values(by=["tfidf"],ascending=False)

In [168]:
# vector_dframe_t.head(50) #treaty ne sesteje lepo, lahko bi text olepsal preden gre v vectorizer, a pojavitev pomeni istost?

In [169]:
# # try to solve with transformation into string of tokenized text:
# strings_keys = []
# strings = []
# for key in sorted(tokenized_docs.keys()) :
#     strings_keys.append(key)
#     list_tokens = tokenized_docs[key]
#     corrected = " ".join(list_tokens)
#     strings.append(corrected)



In [170]:
# vectors = vectorizer.fit_transform(strings)


In [171]:
# # get the first vector out (for the first document)
# vector = vectors[0]

In [172]:
# # place tf-idf values in a pandas data frame
# vector_dframe = pd.DataFrame(vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
# vector_dframe = vector_dframe.sort_values(by=["tfidf"],ascending=False)

In [173]:
# vector_dframe.head(50) #not much difference

In [174]:
# calculate tfidf only for query words:

In [175]:
def nb_docs_tokens_appear(tokensI,tokenized_docsI,textsI):
    '''For each token in tokensI counts the number of documents the token has appeared'''
    docs_per_token = []
    for i in range(len(tokensI)):
        docs_per_token.append(0)
    for k, v in tokenized_docsI.items():
        content = tokenized_docsI.get(k)
        text = textsI.get(k)
        for i in range(len(tokensI)):
            token = tokensI[i]
            if token in text:
                docs_per_token[i] = docs_per_token[i]+1
    return docs_per_token
                    

In [176]:
def tfidf_sum(tokensI,tokenized_docsI, textsI):
    '''First tuple argument similar to probab_score_sum function but different metric - tfidf, second returns words that did not occure in any document'''
    nb_docs_tokens_appeared = nb_docs_tokens_appear(tokensI,tokenized_docsI,textsI)
    filtered_nb_docs_tokens_appeared = [elt for elt in nb_docs_tokens_appeared if not elt == 0]
    not_appear = []
    appear = []
    for i in range(len(nb_docs_tokens_appeared)):
        if nb_docs_tokens_appeared[i] == 0:
            not_appear.append(tokensI[i])
        else:
            appear.append(tokensI[i])    
    l = len(tokenized_docsI)
    doc_probab = {}
    for k, v in tokenized_docsI.items():
        n = len(v)
        text = textsI.get(k)
        probability = 0
        for i in range(len(appear)):
            token_frequency = text.count(appear[i])
            idf = math.log(l/filtered_nb_docs_tokens_appeared[i])
            probability = probability+((token_frequency/n)*idf)
        doc_probab.update({k: probability})
    return doc_probab, not_appear

In [177]:
# tfidf metric for test set
tf = tfidf_sum(test,tokenized_docs, texts)
df_tfidf_sum_original = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original', 'score'])
tf[1] #query words that did not appear in any doc
df_tfidf_sum_original

Unnamed: 0,id_tfidf_sum_original,score
0,32476,0.197593
1,33546,0.189548
2,37122,0.180744
3,34869,0.178706
4,95196,0.178279
5,30565,0.174945
6,36068,0.174911
7,3867,0.174496
8,63388,0.167434
9,94467,0.166841


In [178]:
tf = tfidf_sum(test+ext,tokenized_docs, texts)
df_tfidf_sum_original_ext = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_ext', 'score'])
tf[1]
#df_tfidf_sum_original_ext

[]

In [179]:
tf = tfidf_sum(test+ext+top_list,tokenized_docs, texts)
df_tfidf_sum_original_ext_cand = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_ext_cand', 'score'])
tf[1]
#df_tfidf_sum_original_ext_cand

['earpollution']

In [180]:
tf = tfidf_sum(test+topw_list,tokenized_docs, texts)
df_tfidf_sum_original_cand = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_cand', 'score'])
tf[1]
#df_tfidf_sum_original_cand

['earpollution']

In [181]:
# compare results of tfidf sum for different input sets
frames = [df_tfidf_sum_original['id_tfidf_sum_original'], df_tfidf_sum_original_ext['id_tfidf_sum_original_ext'], df_tfidf_sum_original_ext_cand['id_tfidf_sum_original_ext_cand'],df_tfidf_sum_original_cand['id_tfidf_sum_original_cand']]
tfidf_sum_result = pd.concat(frames, axis=1)
tfidf_sum_result

Unnamed: 0,id_tfidf_sum_original,id_tfidf_sum_original_ext,id_tfidf_sum_original_ext_cand,id_tfidf_sum_original_cand
0,32476,32476,94936,92988
1,33546,33546,3453,12744
2,37122,37122,97319,92986
3,34869,34869,93533,9649
4,95196,95196,97320,94936
5,30565,30565,4505,97319
6,36068,36068,94953,92987
7,3867,3867,59175,93921
8,63388,63388,93921,47944
9,94467,94467,92988,3453


- tfifd sum original and tfidf sum original ext the same


In [182]:
#count number of appearances for each doument in upper dataframe
values = tfidf_sum_result.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
countertf=collections.Counter(flat_vals)
print((countertf))
print(len(countertf))

Counter({32476: 2, 94936: 2, 92988: 2, 33546: 2, 3453: 2, 37122: 2, 97319: 2, 34869: 2, 95196: 2, 30565: 2, 36068: 2, 3867: 2, 93921: 2, 63388: 2, 94467: 2, 12744: 1, 92986: 1, 93533: 1, 9649: 1, 97320: 1, 4505: 1, 94953: 1, 92987: 1, 59175: 1, 47944: 1})
25


In [183]:
#evaluating different merics
print(test)
print(ext)
print(top_list)
##############################################
#SUM
###############################################
# 94936 light pollution -, 32476 fishing quotas o, 97319 light pollution - , 3453 marine pollution o
# without weighted sum (below): 
# 32476 already appeared, 33546 fishing quotas o,37122 fishing quotas o,12744 groundwater protection +,92988 groundwater protection +
# 33546 fishing quotas o, 92986 groundwater protection +
# id_sum_original_cand best choice
##############################################
# TFIDF
##########################################
# 32476, 94936, 92988, 33546, 3453, 12744, ... 
# id_tfidf_sum_original_ext_cand -, id_tfidf_sum_original_cand +, id_tfidf_sum_original and id_tfidf_sum_original_ext o

['water', 'pollution', 'underground']
['pollutions', 'undergrounding']
['pollution', 'pollutions', 'undergrounding', 'earpollution', 'pollution,']


In [184]:
texts.get(94870)

'Avis juridique important|91998E2572WRITTEN QUESTION No. 2572/98 by John McCARTIN Fishing agreement with the Comores 1994-1997Official Journal C 320 , 06/11/1999 P. 0008 WRITTEN QUESTION E-2572/98by John McCartin (PPE) to the Commission(1 September 1998)Subject: Fishing agreement with the Comores 1994-1997Can the Commission state how many fishing vessels were involved in the 1994-1997 EU fishing agreement with the Comores, what was the tonnage of these vessels and how many days they fished under the agreement? Top'

In [185]:
# comparing best sum and best tfidf metric/combination of input data
best_test = pd.concat([tfidf_sum_result['id_tfidf_sum_original_cand'], sum_result['id_sum_original_cand']], axis=1)
best_test

Unnamed: 0,id_tfidf_sum_original_cand,id_sum_original_cand
0,92988,12744
1,12744,92986
2,92986,92988
3,9649,9649
4,94936,93921
5,97319,32476
6,92987,62937
7,93921,94936
8,47944,97319
9,3453,95196


In [186]:
values = best_test.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
countbestTest=collections.Counter(flat_vals)
print((countbestTest))
print(len(countbestTest))

Counter({92988: 2, 12744: 2, 92986: 2, 9649: 2, 94936: 2, 93921: 2, 97319: 2, 32476: 1, 92987: 1, 62937: 1, 47944: 1, 3453: 1, 95196: 1})
13


In [187]:
# different docs in dataframe best_test:
# sum probab:32476 o, 62937 +, 95196 o+
# tfidf sum :92987 +,47944 +, 3453 o,
# tfidf better

In [188]:
texts.get(95196)

'Avis juridique important|91998E3492WRITTEN QUESTION No. 3492/98 by Luigi MORETTI to the Commission. Pollution of surface waterOfficial Journal C 207 , 21/07/1999 P. 0077 WRITTEN QUESTION E-3492/98by Luigi Moretti (NI) to the Commission(25 November 1998)Subject: Pollution of surface waterThe drainage systems in built-up areas are often not designed to convey surface water, or water from recent rainfall, to water treatment plants. As a result, these waters flow into rivers, streams and lakes.To my knowledge there are currently no laws or provisions requiring these waters to be treated before they enter waterways.In view of the fact that surface water and water from recent rainfall are more polluted than sewage, since they contain over 2010 exhaust gases and heavy metals, can the Commission say what measures it intends to adopt in this area?Answer given by Mrs Bjerregaard on behalf of the Commission(12 January 1999)Rainwater on impermeable urban surfaces can be collected either separatel

In [189]:
# test1 set, query words that do not occure in any of documents
tf = tfidf_sum(test1,tokenized_docs, texts)
print(tf[1])
df_tfidf_sum_original1 = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original1', 'score'])
tf = tfidf_sum(test1+ext1,tokenized_docs, texts)
print(tf[1])
df_tfidf_sum_original_ext1 = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_ext1', 'score'])
tf = tfidf_sum(test1+ext1+top_list1,tokenized_docs, texts)
print(tf[1])
df_tfidf_sum_original_ext_cand1 = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_ext_cand1', 'score'])
tf = tfidf_sum(test1+topw_list1,tokenized_docs, texts)
print(tf[1])
df_tfidf_sum_original_cand1 = pd.DataFrame(top_positives(tf[0],10), columns =['id_tfidf_sum_original_cand1', 'score'])


[]
['flwfishing']
['flwfishing', 'earpollution', 'biopollution']
['earpollution', 'biopollution']


In [190]:
frames = [df_tfidf_sum_original1['id_tfidf_sum_original1'], df_tfidf_sum_original_ext1['id_tfidf_sum_original_ext1'], df_tfidf_sum_original_ext_cand1['id_tfidf_sum_original_ext_cand1'],df_tfidf_sum_original_cand1['id_tfidf_sum_original_cand1']]
tfidf_sum_result1 = pd.concat(frames, axis=1)
tfidf_sum_result1

Unnamed: 0,id_tfidf_sum_original1,id_tfidf_sum_original_ext1,id_tfidf_sum_original_ext_cand1,id_tfidf_sum_original_cand1
0,94870,92702,92702,94870
1,11790,94870,94870,11790
2,97436,93604,93604,97436
3,100258,97781,97781,100258
4,28599,97436,97436,38528
5,50112,93549,93549,28599
6,97265,100258,100258,3455
7,39238,14571,14571,94936
8,49539,96942,96942,97319
9,96093,11790,11790,50112


- sum_original_ext and sum_original_ext_cand same, the other two columns same in first 4

In [191]:
values = tfidf_sum_result1.values.tolist()
flat_vals = []
for sublist in values:
    for item in sublist:
        flat_vals.append(item)
countertf1=collections.Counter(flat_vals)
print((countertf1))
print(len(countertf1))

Counter({94870: 4, 11790: 4, 97436: 4, 100258: 4, 92702: 2, 93604: 2, 97781: 2, 28599: 2, 50112: 2, 93549: 2, 14571: 2, 96942: 2, 38528: 1, 97265: 1, 3455: 1, 39238: 1, 94936: 1, 49539: 1, 97319: 1, 96093: 1})
20


In [192]:
# comparing results for test1 set
print(test1)
print(ext1)
print(top_list1)
############
#SUM
############################################################
#94870 fishing agreement EU +, 11790 fishing agreement EU +, 100258 fishing agreement +
#94870 11790, 100258 already, 92702 fishing agreement +, 93604 environmental agreement o ;better without extention
############################################################
# TFIDF
#94870, 92702, 11790, 94870,97436 overfishing, 93604
# better without ext

['annex', 'fishing', 'agreement', 'europe']
['flwfishing', 'agreements']
['earpollution', 'pollution', 'pollutants', 'biopollution', 'potable']


In [193]:
texts.get(97436)

'Avis juridique important|92000E3661WRITTEN QUESTION E-3661/00 by Glenys Kinnock (PSE) to the Commission. Coastal fishing in ACP countries.Official Journal 174 E , 19/06/2001 P. 0104 - 0105 WRITTEN QUESTION E-3661/00by Glenys Kinnock (PSE) to the Commission(27 November 2000)Subject: Coastal fishing in ACP countriesWould the Commission outline what measures it is taking to ensure that Community fishing vessels, operating under EU-ACP fishing agreements, respect the needs and rights of small-scale, coastal fishing communities in ACP countries and do not damage the local ACP fisheries sector?What action is the Commission taking to improve the capacity of ACP countries to patrol the waters under their jurisdiction, so as to control the activities of both Community and ACP fishing fleets and thereby prevent overfishing?Answer given by Mr Fischler on behalf of the Commission(5 January 2001)The Commission thanks the Honourable Member for her question and informs her that, in order to avoid cl

In [194]:
def tfidf_sum_weights(original_tokens, top_expansion,tokenized_docs,texts, wv, alpha): 
    tokens_together = original_tokens+top_expansion
    nb_docs_tokens_appeared = nb_docs_tokens_appear(tokens_together,tokenized_docs,texts)
    filtered_nb_docs_tokens_appeared = [elt for elt in nb_docs_tokens_appeared if not elt == 0]
    not_appear = []
    appear = []
    for i in range(len(nb_docs_tokens_appeared)):
        if nb_docs_tokens_appeared[i] == 0:
            not_appear.append(tokens_together[i])
        else:
            appear.append(tokens_together[i])  
    l = len(tokenized_docs)
    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        text = texts.get(k)
        probability = 0
        for i in range(len(appear)):
            token_frequency = text.count(appear[i])
            idf = math.log(l/filtered_nb_docs_tokens_appeared[i])
            probability = probability+((token_frequency/n)*idf)*word_value(appear[i], alpha, original_tokens, top_expansion, wv)
        doc_probab.update({k: probability})
    return doc_probab,not_appear

In [195]:
tfidf_original_query_ext_cand = []
for alpha in [0.5,0.6,0.7,0.8,0.9,1]:
    tfw = tfidf_sum_weights(test+ext, top_list,tokenized_docs,texts, wv_wiki_en, alpha)
    df_tfidf_wsum_original_ext_cand = pd.DataFrame(top_positives(tfw[0],10), columns =['id_tfidf_wsum_original_ext_cand'+str(alpha), 'score'+str(alpha)])
    tfidf_original_query_ext_cand.append(df_tfidf_wsum_original_ext_cand)


In [196]:
# comparing sorting for different alphas, original + ext + cand
frames =[]
for i in range(len(tfidf_original_query_ext_cand)):
    dataf = tfidf_original_query_ext_cand[i].take([0], axis=1)
    frames.append(dataf)
tfidfcon = pd.concat(frames, axis=1)
tfidfcon

Unnamed: 0,id_tfidf_wsum_original_ext_cand0.5,id_tfidf_wsum_original_ext_cand0.6,id_tfidf_wsum_original_ext_cand0.7,id_tfidf_wsum_original_ext_cand0.8,id_tfidf_wsum_original_ext_cand0.9,id_tfidf_wsum_original_ext_cand1
0,94936,94936,94936,94936,94936,94936
1,97319,97319,97319,97319,97319,97319
2,3453,3453,3453,3453,3453,3453
3,93533,93533,93533,93533,93533,93533
4,97320,97320,97320,97320,97320,97320
5,4505,59175,59175,59175,59175,59175
6,59175,4505,93921,93921,93921,93921
7,93921,93921,4505,4505,4505,4505
8,94953,94953,94953,94953,94953,94953
9,62286,62286,62286,62286,62286,62286


In [197]:
# ##########################
# #set json format in readable form (starting wih multiple objec format, need list)
# annotation_dir='D:/Users/sarab/work/enviroLens/files/'
# print('Loading annotations')
# annotations=[]
# for filename in os.listdir(annotation_dir):
#     print('loading file ',filename)
#     lines = [line.rstrip('\n') for line in open(annotation_dir+filename,encoding='utf-8')]
#     for line in lines:
#         js=json.loads(line)
#         annotations.append(js)