# Query Expansion
### Using FastText Word Embedding
Based on this paper: https://arxiv.org/pdf/1606.07608.pdf

Pre-made vector models: https://fasttext.cc/docs/en/aligned-vectors.html

In [None]:
from gensim.models import KeyedVectors
from itertools import islice
from sklearn.feature_extraction.text import TfidfVectorizer

import pandas as pd
import string
import numpy as np
import collections

# import natural language toolkit
from nltk.corpus   import stopwords
from nltk.tokenize import word_tokenize
# prepare stopword list
stop_words = stopwords.words('english')

In [None]:
def take(n, iterable):
    """Collect at most the first ``n`` elements of ``iterable`` into a list."""
    head = islice(iterable, n)
    return [element for element in head]

In [None]:
import os
os.listdir()

In [None]:
# Location of the aligned English fastText vectors.
# NOTE: switch the directory back to the old one when running on marquis.
wiki_en_align = './data/fasttext/wiki.en.align.vec'
# Load the word2vec-format text file into a KeyedVectors instance.
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
# Report how many words the loaded vocabulary holds.
print('english words {}'.format(len(wv_wiki_en.vocab)))

## Pre-retrieval kNN Based Approach

In [None]:
#list of terms
def tokenize(text, stopwords):
    """Tokenize ``text``, lowercase every token and drop stopwords.

    Args:
        text: The raw string to split with ``word_tokenize``.
        stopwords: Iterable of words to discard.

    Returns:
        List of lowercased tokens, in their original order, that are not
        stopwords.
    """
    # Compare in lowercase on both sides: the previous check ran before
    # lowercasing, so capitalised stopwords such as "The" slipped through.
    stopword_set = {word.lower() for word in stopwords}
    lowered = (token.lower() for token in word_tokenize(text))
    return [token for token in lowered if token not in stopword_set]

In [None]:
#extended list of terms ###
def extend_tokens(token_list, wv):
    """Derive extension terms from consecutive pairs of query tokens.

    Tokens missing from ``wv.vocab`` are dropped first. For every pair of
    neighbouring remaining tokens the single nearest word to the pair (as
    returned by ``wv.most_similar(positive=[a, b])``) is collected.

    Args:
        token_list: Query tokens, in query order.
        wv: Word-vector model exposing ``vocab`` and ``most_similar``.

    Returns:
        List of distinct extension terms, in order of first appearance.
        Empty when fewer than two tokens are in the vocabulary.
    """
    known = [token for token in token_list if token in wv.vocab]
    extension = []
    seen = set()
    for first, second in zip(known, known[1:]):
        # Use the model passed in, not the notebook-global one.
        neighbours = wv.most_similar(positive=[first, second], topn=1)
        if not neighbours:
            continue
        new_token = neighbours[0][0]
        if new_token not in seen:
            seen.add(new_token)
            extension.append(new_token)
    return extension

In [None]:
# First sample query: tokenize it, then derive extension terms from its
# consecutive token pairs using the English vectors.
test = tokenize('water pollution underground', stop_words)
print(test)
ext = extend_tokens(test,wv_wiki_en)
print(ext)


In [None]:
# Second sample query, processed the same way as the first one.
test1 = tokenize('annex fishing agreement europe', stop_words)
print(test1)
ext1 = extend_tokens(test1,wv_wiki_en)
print(ext1)

In [None]:
# knn nearest
def get_candidate_expansion_terms(tokens, k, wv):
    """Collect the ``k`` nearest neighbours of every in-vocabulary token.

    Args:
        tokens: Query tokens; those missing from ``wv.vocab`` are ignored.
        k: Number of neighbours to request per token.
        wv: Word-vector model exposing ``vocab`` and ``similar_by_word``.

    Returns:
        List of distinct candidate words, in order of first appearance.
        Empty when ``k`` is not positive or no token is in the vocabulary.
    """
    if k <= 0:
        return []
    # A dict keeps insertion order, so the result is deduplicated and stable.
    candidates = {}
    for token in tokens:
        if token not in wv.vocab:
            continue
        # Request exactly k neighbours; the default topn would silently cap
        # the result for larger k.
        for word, _score in wv.similar_by_word(token, topn=k)[:k]:
            candidates[word] = None
    return list(candidates)
        

In [None]:
# Candidate pool for the first query: 5 neighbours per token, once with the
# extension terms included and once for the original tokens only.
candidates = get_candidate_expansion_terms(test+ext, 5, wv_wiki_en)
print(candidates)
witout = get_candidate_expansion_terms(test, 5, wv_wiki_en)
print(witout)

In [None]:
# similarity between word and list of words
def similarity(token, token_list, wv):
    """Average similarity between ``token`` and the words of ``token_list``.

    Only the words of ``token_list`` that are in ``wv.vocab`` take part in
    the average.

    Args:
        token: The word being scored.
        token_list: Reference words to compare against.
        wv: Word-vector model exposing ``vocab`` and ``similarity``.

    Returns:
        Mean of ``wv.similarity(word, token)`` over the in-vocabulary
        reference words, or ``0.0`` when none of them is in the vocabulary
        (previously a ZeroDivisionError).
    """
    known = [word for word in token_list if word in wv.vocab]
    if not known:
        return 0.0
    total = 0
    for word in known:
        total += wv.similarity(word, token)
    return total / len(known)

In [None]:
# calculates similarity and sorts
def get_top_expansion_terms(tokens, candidates, wv):
    """Pair every candidate with its average similarity to ``tokens``.

    The pairs are returned in the order of ``candidates``; no sorting or
    truncation happens here.
    """
    return [
        (candidate, similarity(candidate, tokens, wv))
        for candidate in candidates
    ]

In [None]:
def takeSecond(elem):
    """Sort key: the similarity score of a (term, score) pair."""
    return elem[1]


# --- first query -----------------------------------------------------------
# Score the candidate pool against the query with and without the extension
# terms, then order by similarity, highest first.
top = get_top_expansion_terms(test+ext, candidates,  wv_wiki_en)
topwithout = get_top_expansion_terms(test, candidates,  wv_wiki_en)
top = sorted(top, key=takeSecond)[::-1]
topw = sorted(topwithout, key=takeSecond)[::-1]
print((top))
print((topw))
# Keep the five best pairs and extract just the terms.
top = top[0:5]
topw = topw[0:5]
top_list = [term for term, _ in top]
topw_list = [term for term, _ in topw]

# --- second query ----------------------------------------------------------
# The second query needs its own candidate pool: it was previously scored
# against the neighbours of the first query's tokens.
candidates1 = get_candidate_expansion_terms(test1+ext1, 5, wv_wiki_en)
top1 = get_top_expansion_terms(test1+ext1, candidates1,  wv_wiki_en)
topwithout1 = get_top_expansion_terms(test1, candidates1,  wv_wiki_en)
top1 = sorted(top1, key=takeSecond)[::-1]
topw1 = sorted(topwithout1, key=takeSecond)[::-1]
top1 = top1[0:5]
topw1 = topw1[0:5]
top_list1 = [term for term, _ in top1]
topw_list1 = [term for term, _ in topw1]


In [None]:
# all functions together, finds k nearest for each term, returns top n
def pre_retrieval_KNN(string, k, wv, n):
    """Return the ``n`` best expansion terms for the query ``string``.

    The query is tokenized with the notebook's stopword list, ``k`` nearest
    neighbours are gathered per token, and each candidate is scored by its
    average similarity to the query tokens.

    Returns:
        List of up to ``n`` ``(term, similarity)`` pairs, highest similarity
        first.
    """
    query_tokens = tokenize(string, stop_words)
    scored = get_top_expansion_terms(
        query_tokens,
        get_candidate_expansion_terms(query_tokens, k, wv),
        wv,
    )
    # Ascending sort followed by a reversal, so the best pair comes first.
    ranked = sorted(scored, key=lambda pair: pair[1])
    ranked.reverse()
    return ranked[:n]

In [None]:
# Demo: 5 neighbours per query token, keep the 10 best-scoring terms.
pre_retrieval_KNN('deep learning', 5, wv_wiki_en, 10)

In [None]:
# Make the parent directory importable so the local `modules` package
# resolves, then import the PostgreSQL helper from it.
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from modules.library.postgresql import PostgresQL
# connect to the postgresql database
pg = PostgresQL()
# The password can be supplied through the PGPASSWORD environment variable so
# it does not have to live in the notebook; the previous literal remains the
# fallback, so existing setups keep working.
pg.connect(
    database="eurlex_environment_only",
    user="postgres",
    password=os.environ.get("PGPASSWORD", "dbpass"),
)

In [None]:
# Fetch every row of the `documents` table.
# NOTE(review): later cells call .get('document_id') / .get('document_text')
# on each row, so rows are presumably dict-like -- confirm against PostgresQL.
documents = pg.execute("""
    SELECT * FROM documents;
""")

In [None]:
# Sanity check: number of fetched rows and a look at one sample row.
print(len(documents))
documents[100]

### Document search

In [None]:
# Some documents have no text at all; they cannot be scored, so they are
# dropped here.

# Ids of the documents being discarded (kept for inspection).
delete = [doc.get('document_id') for doc in documents
          if not doc.get('document_text')]

# Build a new list rather than calling remove() while iterating: removing
# from the list being looped over skips the element after each removal, so
# adjacent empty documents survived.
docs = [doc for doc in documents if doc.get('document_text')]
    

In [None]:
# Number of documents left after the empty ones were dropped.
print(len(docs))


In [None]:
# Build four lookups keyed by document id: raw texts, raw titles, and the
# tokenized versions of both.
tokenized_docs = {}
tokenized_titles = {}
texts = {}
titles = {}

# One translation table for all documents: deletes every ASCII punctuation
# character.
strip_punctuation = str.maketrans('', '', string.punctuation)


def _drop_single_letters(tokens):
    """Return ``tokens`` without one-character alphabetic tokens."""
    # Filtering into a new list avoids the skipped elements that
    # list.remove() causes when called on the list being iterated.
    return [tok for tok in tokens if not (len(tok) == 1 and tok.isalpha())]


for document in docs:
    doc_id = document.get('document_id')
    text = document.get('document_text')
    title = document.get('document_title')
    # Keep the untouched originals; scoring counts substrings in these.
    texts[doc_id] = text
    titles[doc_id] = title
    tokenized_docs[doc_id] = _drop_single_letters(
        tokenize(text.translate(strip_punctuation), stop_words))
    tokenized_titles[doc_id] = _drop_single_letters(
        tokenize(title.translate(strip_punctuation), stop_words))


In [None]:

# Compare the tokenized and the raw title of document 3.
print(tokenized_titles.get(3))
print(titles.get(3))

In [None]:
# Compare the tokenized and the raw text of document 39703.
print(tokenized_docs.get(39703))
print(texts.get(39703))

In [None]:
print(len(tokenized_docs))
# Ids of documents whose token list ended up empty after tokenization.
empt = [doc_id for doc_id, doc_tokens in tokenized_docs.items()
        if len(doc_tokens) == 0]
print(len(empt))

# Remove those ids from all four lookups so the key sets stay aligned.
for doc_id in empt:
    for lookup in (tokenized_docs, tokenized_titles, texts, titles):
        del lookup[doc_id]

  

In [None]:
# All four lookups should report the same size after the purge.
print(len(tokenized_docs))
print(len(tokenized_titles))
print(len(texts))
print(len(titles))

In [None]:
# Smaller working set: the documents whose id lies in range(1000).
tokenized_docs1 = {}
tokenized_titles1 = {}
texts1 = {}
titles1 = {}
for k in range(1000):
    # Skip ids that do not exist (never present, or removed as empty);
    # copying via .get() used to insert None entries for them.
    if k not in tokenized_docs:
        continue
    tokenized_docs1[k] = tokenized_docs[k]
    tokenized_titles1[k] = tokenized_titles[k]
    texts1[k] = texts[k]
    titles1[k] = titles[k]

    
    
    

In [None]:
# Sizes of the four subset lookups.
print(len(tokenized_docs1))
print(len(tokenized_titles1))
print(len(texts1))
print(len(titles1))


#### search full words (not lemmatized), search as substrings


In [None]:
#### 1. probability scoring

In [None]:
# probability scoring
### all query words have to be in the document (multiplying)

def probab_score(tokens,tokenized_docs,texts):
    """Score every document by the product of relative query-term frequencies.

    For each document the score is the product, over ``tokens``, of
    ``texts[id].count(token) / len(tokenized_docs[id])``. A single term with
    no occurrence therefore zeroes the whole document score.

    Args:
        tokens: Query terms.
        tokenized_docs: Mapping of document id to its token list; only the
            length of the list is used.
        texts: Mapping of document id to raw text; occurrences are counted
            as substrings of this text.

    Returns:
        Dict mapping document id to its score.
    """
    doc_probab = {}
    for doc_id, doc_tokens in tokenized_docs.items():
        doc_length = len(doc_tokens)
        raw_text = texts.get(doc_id)
        score = 1
        for query_term in tokens:
            score = score * (raw_text.count(query_term) / doc_length)
        doc_probab[doc_id] = score
    return doc_probab

    
    

In [None]:
# --- first query, original tokens only -------------------------------------
score = probab_score(test, tokenized_docs, texts)
# Keep the documents with a strictly positive score, best first, top 10.
positives = {doc_id: value for doc_id, value in score.items() if value > 0}
sorted_positives = sorted(positives.items(), key=lambda pair: pair[1], reverse=True)
sorted_positives_top = sorted_positives[:10]
print(sorted_positives_top)

# --- first query plus extension terms --------------------------------------
score = probab_score(test + ext, tokenized_docs, texts)
# Every document that still has a positive score.
print([(doc_id, value) for doc_id, value in score.items() if value > 0])


# --- second query, original tokens only ------------------------------------
score = probab_score(test1, tokenized_docs, texts)
positives = {doc_id: value for doc_id, value in score.items() if value > 0}
sorted_positives = sorted(positives.items(), key=lambda pair: pair[1], reverse=True)
sorted_positives_top = sorted_positives[:10]
print(sorted_positives_top)

# --- second query plus extension terms -------------------------------------
score = probab_score(test1 + ext1, tokenized_docs, texts)
print([(doc_id, value) for doc_id, value in score.items() if value > 0])


# no point having an extention, empty results are ok

In [None]:
# no point having an extention, empty results are ok

In [None]:
# print(titles.get(80))
# print(texts.get(80))



In [None]:
## query words summation
def probab_score_sum(tokens,tokenized_docs,texts):
    """Score every document by the sum of relative query-term frequencies.

    For each document the score is the sum, over ``tokens``, of
    ``texts[id].count(token) / len(tokenized_docs[id])``, so a document is
    still scored when only some of the terms occur in it.

    Args:
        tokens: Query terms.
        tokenized_docs: Mapping of document id to its token list; only the
            length of the list is used.
        texts: Mapping of document id to raw text; occurrences are counted
            as substrings of this text.

    Returns:
        Dict mapping document id to its score.
    """
    doc_probab = {}
    for doc_id in tokenized_docs:
        doc_length = len(tokenized_docs[doc_id])
        raw_text = texts.get(doc_id)
        total = 0
        for query_term in tokens:
            relative_frequency = raw_text.count(query_term) / doc_length
            total = total + relative_frequency
        doc_probab[doc_id] = total
    return doc_probab
    

In [None]:
# Be aware: str.count matches substrings, not whole words -- here "bla" is
# found inside both "gabla" and "bla2".
"gabla is bla2".count("bla")
# which means it would be better to use lemmatized words! TODO: fix!!

In [None]:
def top_positives(dictionary,n):
    """Return the ``n`` highest-valued positive entries of ``dictionary``.

    Args:
        dictionary: Mapping of key to numeric score.
        n: Maximum number of entries to return.

    Returns:
        List of ``(key, value)`` tuples with ``value > 0``, sorted by value
        in descending order (ties keep the dictionary's order), truncated to
        ``n`` entries.
    """
    # Read from the argument; the previous body iterated the notebook-global
    # `score_sum` and ignored whatever was passed in.
    positives = [(key, value) for key, value in dictionary.items() if value > 0]
    positives.sort(key=lambda pair: pair[1], reverse=True)
    return positives[:n]

In [None]:
# Additive score for the first query, original tokens only.
score_sum =probab_score_sum(test,tokenized_docs,texts)

# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)

In [None]:
# Tabulate the (document id, score) pairs of the previous cell.
df_sum_original = pd.DataFrame(sorted_positives_top, columns =['id_sum_original', 'score'])
df_sum_original

In [None]:
# print(titles.get(565))
# print(texts.get(565))

In [None]:
# Additive score for the first query plus its extension terms.
score_sum =probab_score_sum(test+ext,tokenized_docs,texts)
# Author's observation: original + ext gives the same score; adding "global"
# and "state" gives a different score but the same order.
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)

In [None]:
# Tabulate the (document id, score) pairs of the previous cell.
df_sum_original_ext = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext', 'score'])

In [None]:
# Second query, original tokens only: multiplicative score first.
score =probab_score(test1,tokenized_docs,texts)
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score,10)
print(sorted_positives_top)



# Second query, original tokens only: additive score.
score_sum =probab_score_sum(test1,tokenized_docs,texts)
# Author's observation: original + ext gives the same score; adding "global"
# and "state" gives a different score but the same order.
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
# Tabulate the (document id, score) pairs.
df_sum_original1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original1', 'score'])
df_sum_original1

In [None]:
# Additive score for the second query plus its extension terms.
score_sum =probab_score_sum(test1+ext1,tokenized_docs,texts)
# Author's observation: original + ext gives the same score; adding "global"
# and "state" gives a different score but the same order.
# Keep the positive scores, sort them in descending order, take the top 10.
positives = dict([(k,v) for k,v in score_sum.items() if v > 0])
sorted_positives = sorted(positives.items(), key=lambda x: x[1],reverse=True)
sorted_positives_top = sorted_positives[0:10]
# Tabulate the (document id, score) pairs.
df_sum_original_ext1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext1', 'score'])
df_sum_original_ext1

In [None]:
# Adding the candidate expansion terms to the query.
## without weights
# The multiplicative score is not used here (author: "no point").

# Additive score:
## first query: original tokens + candidates ranked without the extension
score_sum =probab_score_sum(test+topw_list,tokenized_docs,texts)
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
# Tabulate the (document id, score) pairs.
df_sum_original_cand = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_cand', 'score'])


## first query: original tokens + extension + candidates ranked with it
score_sum =probab_score_sum(test+ext+top_list,tokenized_docs,texts)
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
# Tabulate the (document id, score) pairs.
df_sum_original_ext_cand = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext_cand', 'score'])


## second query: original tokens + candidates ranked without the extension
score_sum =probab_score_sum(test1+topw_list1,tokenized_docs,texts)
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
# Tabulate the (document id, score) pairs.
df_sum_original_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_cand1', 'score'])


## second query: original tokens + extension + candidates ranked with it
score_sum =probab_score_sum(test1+ext1+top_list1,tokenized_docs,texts)
# Top 10 documents with a positive score.
sorted_positives_top = top_positives(score_sum,10)
print(sorted_positives_top)
# Tabulate the (document id, score) pairs.
df_sum_original_ext_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_sum_original_ext_cand1', 'score'])



In [None]:
def word_value(word, alpha, original_tokens, top_expansion, wv):
    """Weight of ``word`` in a query made of original and expansion terms.

    Original query tokens get the fixed weight ``alpha``. Any other word gets
    ``(1 - alpha)`` multiplied by its share of the total similarity that the
    expansion-only terms have to the original tokens.

    Args:
        word: The term to weight.
        alpha: Weight assigned to every original token.
        original_tokens: Tokens of the original query.
        top_expansion: Selected expansion terms (may overlap the originals).
        wv: Word-vector model passed on to ``similarity``.

    Returns:
        ``alpha`` for an original token; otherwise the normalised expansion
        weight, or ``0.0`` when the normaliser is zero (e.g. there are no
        expansion-only terms), which previously raised ZeroDivisionError.
    """
    # Original tokens need no normaliser, so return before computing it.
    if word in original_tokens:
        return alpha

    only_expanded = [token for token in top_expansion
                     if token not in original_tokens]

    # Normaliser: total similarity of the expansion-only terms to the query.
    sum_similarity = 0
    for exp_token in only_expanded:
        sum_similarity += similarity(exp_token, original_tokens, wv)

    if sum_similarity == 0:
        return 0.0
    return (1-alpha)*similarity(word, original_tokens, wv)/sum_similarity

In [None]:
# If ext is not included the results are strange: the order of importance
# between "sewage" and "undergrounding" is swapped??
# Narrow the ranked pairs to the best four and keep only their terms.
top = top[0:4]
top_words = [i[0] for i in top]
# Weight of one original token and of three other words at alpha = 0.7.
print(word_value("water", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("sewage", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("undergrounding", 0.7, test+ext ,top_words, wv_wiki_en))
print(word_value("biopollution", 0.7, test+ext ,top_words, wv_wiki_en))

In [None]:
def probab_score_sum_weights(original_tokens, top_expansion,tokenized_docs,texts, wv, alpha):
    """Additive document score with per-term weights from ``word_value``.

    For each document the score is the sum, over the original tokens followed
    by the expansion terms, of
    ``texts[id].count(term) / len(tokenized_docs[id]) * word_value(term, ...)``.

    Args:
        original_tokens: Tokens of the original query (list).
        top_expansion: Selected expansion terms (list).
        tokenized_docs: Mapping of document id to its token list; only the
            length of the list is used.
        texts: Mapping of document id to raw text; occurrences are counted
            as substrings of this text.
        wv: Word-vector model passed on to ``word_value``.
        alpha: Weight of an original token, passed on to ``word_value``.

    Returns:
        Dict mapping document id to its weighted score.
    """
    if not tokenized_docs:
        return {}

    query_terms = original_tokens + top_expansion
    # A term's weight does not depend on the document, so compute it once per
    # distinct term instead of once per term per document.
    weights = {
        term: word_value(term, alpha, original_tokens, top_expansion, wv)
        for term in dict.fromkeys(query_terms)
    }

    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        probability = 0
        text = texts.get(k)
        for token in query_terms:
            token_frequency = text.count(token)
            probability = probability+(token_frequency/n)*weights[token]
        doc_probab[k] = probability
    return doc_probab

In [None]:
## with weights
# Additive score; one result frame is collected per alpha.
original_query_cand = []
for alpha in [0.5,0.6,0.7,0.8,0.9,1]:
    ## first query: original tokens + candidates ranked without the extension
    score_sum = probab_score_sum_weights(test, topw_list,tokenized_docs, texts, wv_wiki_en, alpha)
    # Top 10 documents with a positive score.
    sorted_positives_top = top_positives(score_sum,10)
    # Both column names carry the alpha value as a suffix.
    df_wsum_original_cand = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_cand'+str(alpha), 'score'+str(alpha)])
    original_query_cand.append(df_wsum_original_cand)

In [None]:
## first query plus extension, weighted; one result frame per alpha
original_query_ext_cand = []
for alpha in [0.5,0.6,0.7,0.8,0.9,1]:
    score_sum =probab_score_sum_weights(test+ext, top_list,tokenized_docs, texts, wv_wiki_en, alpha)
    # Top 10 documents with a positive score.
    sorted_positives_top = top_positives(score_sum,10)
    # Both column names carry the alpha value as a suffix.
    df_wsum_original_ext_cand = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_ext_cand'+str(alpha), 'score'+str(alpha)])
    original_query_ext_cand.append(df_wsum_original_ext_cand)

In [None]:
# For each alpha, put the id column (column 0) of the run without ext next
# to the id column of the run with ext.
frames =[]
for i in range(len(original_query_cand)):
    frst = original_query_cand[i].take([0], axis=1)
    snd = original_query_ext_cand[i].take([0], axis=1)
    con = pd.concat([frst,snd], axis=1)
    frames.append(con)
    
    

In [None]:
# Side-by-side id columns for the first alpha in the list (0.5).
frames[0]

In [None]:
# Compare the ranking across alphas for original + cand: one id column
# (column 0) per alpha, concatenated side by side.
frames =[]
for i in range(len(original_query_cand)):
    dataf = original_query_cand[i].take([0], axis=1)
    frames.append(dataf)
con = pd.concat(frames, axis=1)
con

- The first 4 rows hold the same values for every alpha.
- In rows 5, 6 and 7 only the values for alpha 0.5 differ.
- Below row 7 the values for alpha 0.5 and 0.6 differ.
- Half of the values ranked below 4th place for alpha 0.5 also appear in the other columns.

In [None]:
# Second query (test1), weighted additive score.
## original tokens + candidates ranked without the extension; one frame per alpha
original_query_cand1 = []
for alpha in [0.5,0.6,0.7,0.8,0.9,1]:
    score_sum = probab_score_sum_weights(test1, topw_list1,tokenized_docs,texts,  wv_wiki_en, alpha)
    # Top 10 documents with a positive score.
    sorted_positives_top = top_positives(score_sum,10)
    # Both column names carry the alpha value as a suffix.
    df_wsum_original_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_cand1'+str(alpha), 'score'+str(alpha)])
    original_query_cand1.append(df_wsum_original_cand1)


## original tokens + extension + candidates ranked with it; one frame per alpha
original_query_ext_cand1 = []
for alpha in [0.5,0.6,0.7,0.8,0.9,1]:
    score_sum =probab_score_sum_weights(test1+ext1, top_list1,tokenized_docs,texts,  wv_wiki_en, alpha)
    # Top 10 documents with a positive score.
    sorted_positives_top = top_positives(score_sum,10)
    # Both column names carry the alpha value as a suffix.
    df_wsum_original_ext_cand1 = pd.DataFrame(sorted_positives_top, columns =['id_wsum_original_ext_cand1'+str(alpha), 'score'+str(alpha)])
    original_query_ext_cand1.append(df_wsum_original_ext_cand1)
    

In [None]:
# Second query: for each alpha, put the id column (column 0) of the run
# without ext next to the id column of the run with ext.
frames1 =[]
for i in range(len(original_query_cand1)):
    frst = original_query_cand1[i].take([0], axis=1)
    snd = original_query_ext_cand1[i].take([0], axis=1)
    con = pd.concat([frst,snd], axis=1)
    frames1.append(con)

In [None]:
# Side-by-side id columns for the first alpha in the list (0.5).
frames1[0]

In [None]:
# Second query: compare the ranking across alphas for original + cand, one
# id column (column 0) per alpha, concatenated side by side.
frames =[]
for i in range(len(original_query_cand1)):
    dataf = original_query_cand1[i].take([0], axis=1)
    frames.append(dataf)
con1 = pd.concat(frames, axis=1)
con1

In [None]:
# comparison of summation method versions on test set
# Compare the document ids returned by each variant of the additive method
# for the first query.
frames = [
    df_sum_original["id_sum_original"],
    df_sum_original_cand["id_sum_original_cand"],
    df_sum_original_ext['id_sum_original_ext'],
    df_sum_original_ext_cand['id_sum_original_ext_cand'],
    # The weighted frames name their id column with an alpha suffix, so the
    # bare names raised KeyError. Select the id column by position and give
    # it the suffix-free name. These frames hold the last alpha of the loop.
    df_wsum_original_cand.iloc[:, 0].rename('id_wsum_original_cand'),
    df_wsum_original_ext_cand.iloc[:, 0].rename('id_wsum_original_ext_cand'),
]
sum_result = pd.concat(frames, axis=1)
sum_result

In [None]:
# Flatten the comparison table row by row and count how often each document
# id occurs across all columns.
values = sum_result.values.tolist()
flat_vals = [cell for row in values for cell in row]
counter = collections.Counter(flat_vals)
print((counter))
print(len(counter))

- Same results for sum original and sum original_ext.
- Slight difference between original cand and original ext cand in wsum  
 --> ext has an impact on cand.  
- For the top 5, the only difference is whether cand is used or not.
- The ids in 5th and 6th place differ depending on whether ext is used.
- 9th place differs in the column sum original cand.
- 8 values occur in all cases, so at most 2 differ in each list.




In [None]:
# Compare the document ids returned by each variant of the additive method
# for the second query (test1).
frames = [
    df_sum_original1["id_sum_original1"],
    df_sum_original_cand1["id_sum_original_cand1"],
    df_sum_original_ext1['id_sum_original_ext1'],
    df_sum_original_ext_cand1['id_sum_original_ext_cand1'],
    # The weighted frames name their id column with an alpha suffix, so the
    # bare names raised KeyError. Select the id column by position and give
    # it the suffix-free name. These frames hold the last alpha of the loop.
    df_wsum_original_cand1.iloc[:, 0].rename('id_wsum_original_cand1'),
    df_wsum_original_ext_cand1.iloc[:, 0].rename('id_wsum_original_ext_cand1'),
]
sum_result1 = pd.concat(frames, axis=1)
sum_result1

In [None]:
# Flatten the second comparison table row by row and count how often each
# document id occurs across all columns.
values = sum_result1.values.tolist()
flat_vals = [cell for row in values for cell in row]
counter1 = collections.Counter(flat_vals)
print((counter1))
print(len(counter1))

- same results for all without ext and all with ext
- rows do not match
- 9 values appear in all columns


In [None]:
# for first 5 returned docs no difference between weighted and unweighted for alpha = 0.6,  alpha = 0.8, 1 #if all weight the same is same as
# if expansion would not exist
# only tokens / tokens + ext
# [(99, 0.048582995951417005), (380, 0.046610169491525424), (244, 0.04477611940298507), (89, 0.04371584699453552), (376, 0.04034065441506051)]
# [(244, 0.08955223880597014), (243, 0.08), (903, 0.04964539007092198), (99, 0.048582995951417005), (380, 0.046610169491525424)]
# unweighted with candidate exp:
# [(565, 0.048730964467005075), (1219, 0.0461864406779661), (12, 0.04042348411934552), (226, 0.039756782039289056), (22, 0.03749147920927062)]
# [(565, 0.05177664974619289), (1219, 0.04745762711864407), (12, 0.04138594802694899), (226, 0.04069223573433115), (22, 0.03953646898432175)]
# [(99, 0.048582995951417005), (380, 0.046610169491525424), (244, 0.04477611940298507), (89, 0.04371584699453552), (376, 0.04034065441506051)]
# [(244, 0.08955223880597014), (243, 0.08), (903, 0.04964539007092198), (99, 0.048582995951417005), (380, 0.046610169491525424)]

In [None]:
# for 10
# alpha 0.8, 1
# [(161, 0.027947874459039665), (313, 0.027129979796553974), (73, 0.025445292620865142), (402, 0.022429906542056073)]
# [(161, 0.027915369391449767), (313, 0.02712754175646111), (73, 0.025570205421714953), (402, 0.022429906542056073)]
# [(243, 0.032), (925, 0.031578947368421054), (1212, 0.03118536197295147), (910, 0.031168831168831172), (108, 0.03114754098360656)]
# [(1212, 0.036276849642004776), (89, 0.034972677595628415), (376, 0.032989690721649485), (925, 0.031578947368421054), (910, 0.031168831168831172)]
# for alpha 0.6 one change in one case
# only tokens/tokens+ext
# [(243, 0.04), (925, 0.039473684210526314), (1212, 0.03898170246618934), (910, 0.03896103896103896), (108, 0.0389344262295082)]
# [(1212, 0.045346062052505964), (89, 0.04371584699453552), (376, 0.041237113402061855), (925, 0.039473684210526314), (910, 0.03896103896103896)]

In [None]:
# #### 2.TFIDF evaluation
# texts_keys = []
# texts_values = []
# for key in sorted(texts.keys()) :
#     texts_keys.append(key)
#     texts_values.append(texts[key])

In [None]:
# vectorizer = TfidfVectorizer(stop_words = "english")
# vectors_t = vectorizer.fit_transform(texts_values)

In [None]:
# # get the first vector out (for the first document)
# vector_t = vectors_t[0]


In [None]:
# # place tf-idf values in a pandas data frame
# vector_dframe_t = pd.DataFrame(vector_t.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
# vector_dframe_t = vector_dframe_t.sort_values(by=["tfidf"],ascending=False)

In [None]:
# vector_dframe_t.head(50) # "treaty" is not summed up nicely; the text could be cleaned up before it goes into the vectorizer -- does an occurrence mean the same word?

In [None]:
# # try to solve with transformation into string of tokenized text:
# strings_keys = []
# strings = []
# for key in sorted(tokenized_docs.keys()) :
#     strings_keys.append(key)
#     list_tokens = tokenized_docs[key]
#     corrected = " ".join(list_tokens)
#     strings.append(corrected)



In [None]:
# vectors = vectorizer.fit_transform(strings)


In [None]:
# # get the first vector out (for the first document)
# vector = vectors[0]

In [None]:
# # place tf-idf values in a pandas data frame
# vector_dframe = pd.DataFrame(vector.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
# vector_dframe = vector_dframe.sort_values(by=["tfidf"],ascending=False)

In [None]:
# vector_dframe.head(50) #not much difference

In [None]:
# calculate tfidf only for query words:

In [None]:
def calculate_nb_docs_token_appears(tokens,tokenized_docs,texts):
    """Count, for every token, the documents whose raw text contains it.

    Args:
        tokens: Query terms.
        tokenized_docs: Mapping whose keys are the document ids to inspect.
        texts: Mapping of document id to raw text; containment is a
            substring test on this text.

    Returns:
        List of counts, aligned by position with ``tokens``.
    """
    doc_counts = [0] * len(tokens)
    for doc_id in tokenized_docs:
        raw_text = texts.get(doc_id)
        for position, token in enumerate(tokens):
            if token in raw_text:
                doc_counts[position] += 1
    return doc_counts
                    

In [None]:
def tfidf_sum(tokens,tokenized_docs, texts):
    """Additive tf-idf score of every document for the query ``tokens``.

    For each document the score is the sum, over ``tokens``, of
    ``tf * idf`` where ``tf = texts[id].count(token) / len(tokenized_docs[id])``
    and ``idf = number_of_documents / documents_containing_token`` (a plain
    ratio, no logarithm).

    Args:
        tokens: Query terms.
        tokenized_docs: Mapping of document id to its token list; only the
            length of the list is used.
        texts: Mapping of document id to raw text; occurrences are counted
            as substrings of this text.

    Returns:
        Dict mapping document id to its score.
    """
    nb_docs_token_appears = calculate_nb_docs_token_appears(tokens,tokenized_docs,texts)
    nb_docs = len(tokenized_docs)
    # A term found in no document has tf == 0 everywhere, so give it idf 0
    # instead of dividing by zero.
    idfs = [nb_docs / doc_freq if doc_freq else 0.0
            for doc_freq in nb_docs_token_appears]

    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        text = texts.get(k)
        probability = 0
        for token, idf in zip(tokens, idfs):
            # Fixes two NameErrors: `token[i]` (should index `tokens`) and
            # the misspelled `token_frequecy`.
            token_frequency = text.count(token)
            probability = probability+((token_frequency/n)*idf)
        doc_probab[k] = probability
    return doc_probab

In [None]:
def tfidf_sum_weights(original_tokens, top_expansion,tokenized_docs,texts, wv, alpha):
    """Additive tf-idf score with per-term weights from ``word_value``.

    For each document the score is the sum, over the original tokens followed
    by the expansion terms, of ``tf * idf * word_value(term, ...)`` where
    ``tf = texts[id].count(term) / len(tokenized_docs[id])`` and
    ``idf = number_of_documents / documents_containing_term`` (a plain ratio,
    no logarithm). The previous body applied no idf factor at all.

    Args:
        original_tokens: Tokens of the original query (list).
        top_expansion: Selected expansion terms (list).
        tokenized_docs: Mapping of document id to its token list; only the
            length of the list is used.
        texts: Mapping of document id to raw text; occurrences are counted
            as substrings of this text.
        wv: Word-vector model passed on to ``word_value``.
        alpha: Weight of an original token, passed on to ``word_value``.

    Returns:
        Dict mapping document id to its weighted tf-idf score.
    """
    if not tokenized_docs:
        return {}

    query_terms = original_tokens + top_expansion
    nb_docs_token_appears = calculate_nb_docs_token_appears(query_terms, tokenized_docs, texts)
    nb_docs = len(tokenized_docs)

    # idf * weight does not depend on the document, so compute it once per
    # query term. Terms found in no document contribute nothing.
    term_factors = []
    for token, doc_freq in zip(query_terms, nb_docs_token_appears):
        if not doc_freq:
            continue
        idf = nb_docs / doc_freq
        weight = word_value(token, alpha, original_tokens, top_expansion, wv)
        term_factors.append((token, idf * weight))

    doc_probab = {}
    for k, v in tokenized_docs.items():
        n = len(v)
        text = texts.get(k)
        probability = 0
        for token, factor in term_factors:
            token_frequency = text.count(token)
            probability = probability+(token_frequency/n)*factor
        doc_probab[k] = probability
    return doc_probab