In [294]:
import collections
import json
import math
import string
from itertools import islice

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors, FastText
from sklearn.feature_extraction.text import TfidfVectorizer

# import natural language toolkit
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Shared WordNet lemmatizer instance; used by tokenized_query and pre_retrieval_KNN.
lemmatizer = WordNetLemmatizer() 

In [295]:
# NLTK's English stop-word list; passed to tokenized_query / pre_retrieval_KNN as the words to drop.
stop_words = stopwords.words('english')

In [296]:
# Relative paths to the English fastText wiki embedding files.
# Text (.vec) vectors; read with KeyedVectors.load_word2vec_format.
wiki_en_align = './../data/fasttext/wiki.en.align.vec' #'../../data/fasttext/wiki.en.align.vec'
# Native fastText binary model; read with FastText.load_fasttext_format.
wiki_en_bin = './../data/fasttext/wiki.en.bin'

In [298]:
# Load the English fastText wiki vectors (word2vec text format) and report the vocabulary size.
wv_wiki_en = KeyedVectors.load_word2vec_format(wiki_en_align)
print('english words {}'.format(len(wv_wiki_en.vocab)))

In [297]:
# Load the full fastText model from the native .bin file at wiki_en_bin.
# NOTE(review): load_fasttext_format appears to be deprecated in newer gensim releases — confirm against the installed version.
ft_wiki_en = FastText.load_fasttext_format(wiki_en_bin)

In [332]:


def get_wordnet_pos(word):
    """Map the NLTK POS tag of ``word`` to the POS constant ``lemmatize()`` accepts.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``) and only the first letter of the resulting tag is used.

    Args:
        word (str): Single token to tag.
    Returns:
        The matching ``wordnet`` constant (ADJ, NOUN, VERB or ADV);
        ``wordnet.NOUN`` when the tag starts with none of J/N/V/R.
    """
    # Needs the ``nltk`` module itself in scope: the ``from nltk... import``
    # lines do not bind the name ``nltk``.
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()
    pos_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    # Anything else falls back to NOUN.
    return pos_by_initial.get(tag_initial, wordnet.NOUN)



def tokenized_query(text, stopwords):
    """Tokenize ``text``, lower-case the tokens, drop stop words and lemmatize the rest.

        Args:
            text (str): Text we want to tokenize.
            stopwords (iterable of str): Words we want to remove from the tokenized text.
                A token is dropped when either its original or its lower-cased
                form is in this collection.
        Returns:
            filtered (list): Lower-cased, lemmatized tokens that are not stop words,
                in their original order.
        """
    # Strip ASCII punctuation up front so it never shows up as a token.
    without_punctuations = text.translate(str.maketrans('', '', string.punctuation))
    # Set membership is O(1) per token instead of a scan over the whole list.
    stopword_set = set(stopwords)
    filtered = []
    for raw_token in word_tokenize(without_punctuations):
        token = raw_token.lower()
        # Check the lower-cased form as well: otherwise capitalised stop words
        # ("The", "And") slip through a lower-case stop list.
        if raw_token in stopword_set or token in stopword_set:
            continue
        filtered.append(lemmatizer.lemmatize(token, get_wordnet_pos(token)))
    return filtered

def extend_tokens(token_list, model, model_format=None):
    """Extend a token list with the word most similar to each consecutive token pair.

        Tokens missing from the model vocabulary are dropped first; the pairs are
        then formed from the remaining tokens in order.

        Args:
            token_list (list): List of tokens we want to extend.
            model: Embedding model. For 'word2vec' it must expose ``vocab`` and
                ``most_similar``; for 'fasttext' it must expose ``wv.vocab`` and
                ``most_similar``.
            model_format (str, optional): 'word2vec' or 'fasttext'. When omitted it
                is inferred from the model: 'fasttext' if the model has a ``wv``
                attribute, otherwise 'word2vec'.
        Returns:
            extention (list): Unique extension words, in no particular order.
        Raises:
            Exception: If ``model_format`` is neither 'word2vec' nor 'fasttext'.
        """
    if model_format is None:
        # The fasttext branch reads the vocabulary from ``model.wv``, the
        # word2vec branch from ``model`` itself, so the attribute tells them apart.
        model_format = 'fasttext' if hasattr(model, 'wv') else 'word2vec'

    if model_format == 'word2vec':
        vocab = model.vocab
    elif model_format == 'fasttext':
        vocab = model.wv.vocab
    else:
        raise Exception('Model type incorrect')

    # Keep only the tokens the model knows about.
    tokens = [token for token in token_list if token in vocab]

    extention = set()
    for first, second in zip(tokens, tokens[1:]):
        neighbours = model.most_similar(positive=[first, second])
        if neighbours:
            # The top hit for the pair is the extension word.
            extention.add(neighbours[0][0])
    return list(extention)

def candidate_expansion_terms(tokens, k, model, model_format):
    """Collect the k nearest neighbours of every in-vocabulary token.

        Args:
            tokens (list): Query tokens; tokens missing from the vocabulary are skipped.
            k (int): Number of nearest neighbours requested per token.
            model: Embedding model. 'word2vec' uses ``model.vocab`` and
                ``model.similar_by_word``; 'fasttext' uses ``model.wv.vocab`` and
                ``model.most_similar``.
            model_format (str): 'word2vec' or 'fasttext'.
        Returns:
            candidates (list): Unique candidate words, in no particular order.
        Raises:
            Exception: If ``model_format`` is neither 'word2vec' nor 'fasttext'.
        """
    if model_format == 'word2vec':
        vocab = model.vocab
        nearest = model.similar_by_word
    elif model_format == 'fasttext':
        vocab = model.wv.vocab
        nearest = model.most_similar
    else:
        raise Exception('Model type incorrect')

    if k <= 0:
        return []

    candidates = set()
    for token in tokens:
        # check if the token is in the vocabulary
        if token not in vocab:
            continue
        # Ask for k neighbours explicitly; without topn the lookup returns its
        # own default number of results, which silently caps larger k.
        for word, _score in nearest(token, topn=k)[:k]:
            candidates.add(word)
    return list(candidates)

def similarity(token, token_list, model, model_format ):
    """Average similarity between ``token`` and the in-vocabulary words of ``token_list``.

        Args:
            token (str): Word whose similarity to the list is measured.
            token_list (list): Words to compare against; words missing from the
                model vocabulary are ignored.
            model: Embedding model. 'word2vec' reads ``model.vocab``; 'fasttext'
                reads ``model.wv.vocab``. Both call ``model.similarity``.
            model_format (str): 'word2vec' or 'fasttext'.
        Returns:
            Mean of ``model.similarity(word, token)`` over the in-vocabulary words,
            or 0.0 when none of the words is in the vocabulary.
        Raises:
            Exception: If ``model_format`` is neither 'word2vec' nor 'fasttext'.
        """
    if model_format == 'word2vec':
        vocab = model.vocab
    elif model_format == 'fasttext':
        vocab = model.wv.vocab
    else:
        raise Exception('Model type incorrect')

    scores = [model.similarity(known, token) for known in token_list if known in vocab]
    if not scores:
        # Nothing to average over; avoid dividing by zero.
        return 0.0
    return sum(scores) / len(scores)


def get_similarity_pairs(tokens, candidates, wv, model_format):
    """Score every candidate by its similarity to the query tokens.

        Args:
            tokens (list): Tokens to which the similarity is calculated.
            candidates (list): Tokens for which the similarity is calculated.
            wv: Embedding model, forwarded unchanged to ``similarity``.
            model_format (str): Model type, forwarded unchanged to ``similarity``.
        Returns:
            similarity_pairs (list): ``(candidate, similarity)`` tuples in the
                same order as ``candidates``.
        """
    return [
        (candidate, similarity(candidate, tokens, wv, model_format))
        for candidate in candidates
    ]

# updated function
def pre_retrieval_KNN(query, k, wv, n, stop_words, model_format='word2vec', extension=False):
    """Find the n candidate expansion terms most similar to the given query.

        Optionally the query tokens are first extended (see ``extend_tokens``) and
        the candidates are then found for the extended token list.

        Args:
            query (string): User query we want to expand.
            k (int): Number of nearest neighbours requested per query token.
            wv: Embedding model matching ``model_format``.
            n (int): Number of candidates (with the highest similarity) that is returned.
            stop_words (list): List of words we want to remove from the tokenized query.
            model_format (str): 'word2vec' (default) or 'fasttext'.
            extension (bool): Whether to extend the query tokens before looking
                for candidates.
        Returns:
            candidate_list (list): At most n distinct, lemmatized, purely alphabetic
                candidates, ordered by decreasing similarity; the original query
                tokens are excluded.
        """
    tokens = tokenized_query(query, stop_words)
    if extension:
        query_terms = tokens + extend_tokens(tokens, wv)
    else:
        query_terms = tokens

    candidates = candidate_expansion_terms(query_terms, k, wv, model_format)
    candidates_sim = get_similarity_pairs(query_terms, candidates, wv, model_format)
    ranked = sorted(candidates_sim, key=lambda pair: pair[1], reverse=True)

    candidate_list = []
    # Seeding with the query tokens filters them out; adding each accepted lemma
    # drops duplicates created when several candidates share a lemma.
    seen = set(tokens)
    for word, _sim in ranked:
        if not word.isalpha():
            continue
        lemma = lemmatizer.lemmatize(word, get_wordnet_pos(word))
        if lemma in seen:
            continue
        seen.add(lemma)
        candidate_list.append(lemma)
    return candidate_list[:n]

In [333]:
# Expansion terms for a sample query using the fastText model.
pre_retrieval_KNN('fishing and pollution', k=10, wv=ft_wiki_en, n=10,
                  stop_words=stop_words, model_format='fasttext')




['shellfishing',
 'earpollution',
 'flwfishing',
 'biopollution',
 'pollute',
 'billfishing',
 'sollution',
 'gamefishing',
 'pollut',
 'antipollution']

In [320]:
# Same query against the word2vec-format vectors; the model format is passed explicitly.
pre_retrieval_KNN('fishing and pollution', 10, wv_wiki_en, 10, stop_words, 'word2vec')

['shellfishing',
 'earpollution',
 'flwfishing',
 'biopollution',
 'pollute',
 'billfishing',
 'sollution',
 'gamefishing',
 'pollut',
 'antipollution']

In [321]:
# Three-word query with a generous cap of n=100 returned candidates.
pre_retrieval_KNN('underground fish water', 10, wv_wiki_en, 100, stop_words, 'word2vec')

['seawater',
 'fishwater',
 'groundwaters',
 'sewage',
 'groundwater',
 'potable',
 'undergrounding',
 'undergrounders',
 'ρwater',
 'wastewater',
 'undergrounded',
 'undergroung',
 'undergrounder',
 'baitfish',
 'undergroun',
 'gamefish',
 'shellfish',
 'beakfish',
 'undergroud',
 'mudfishes',
 'goldfish',
 'milkfishes',
 'billfish']

In [271]:
# Same query, capped at n=20 returned candidates.
pre_retrieval_KNN('underground fish water', 10, wv_wiki_en, 20, stop_words, 'word2vec')

['seawater',
 'fishwater',
 'groundwaters',
 'sewage',
 'groundwater',
 'potable',
 'undergrounding',
 'undergrounders',
 'ρwater',
 'wastewater',
 'undergrounded',
 'undergroung',
 'undergrounder',
 'baitfish',
 'undergroun']

In [131]:
# Spot check: lemmatize one word using the POS chosen by get_wordnet_pos.
lemmatizer.lemmatize('stood', get_wordnet_pos('stood'))

'stood'

In [111]:
# Number of candidates actually returned for a three-word query.
len(pre_retrieval_KNN('underground fish water', 10, wv_wiki_en, 40, stop_words, 'word2vec'))

23

In [112]:
# Number of candidates actually returned for a four-word query.
len(pre_retrieval_KNN('underground fish water bull', 10, wv_wiki_en, 100, stop_words, 'word2vec'))

32

In [None]:
# Each query word yields 10 kNN candidates, so the list can grow to ~100 terms; a cap is probably needed, otherwise doc_retrieval will take forever.

In [None]:
# See the query modules for tests. Search time depends heavily on the number of documents, i.e. on the tokenized query; 4 words is already a lot:
# 20 extensions take 12 s, 10 take 8 s, 5 take 6 s.
# With two words: 20 extensions take 6 s, 10 take 4 s, 5 take 2 s, i.e. roughly 2x faster.

In [154]:
# adding annotations

In [163]:
import json

# Read the annotated TREC topic titles: one JSON object per line (JSON Lines).
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs elsewhere.
annotated_topics = []
# The encoding is given explicitly so decoding does not depend on the platform's
# locale default. Assumes the file is UTF-8 — TODO confirm.
with open('C:\\Users\\sarab\\work\\try\\TREC\\processed_data\\titles_annotated.jsonl', 'r', encoding='utf-8') as jsonl_file:
    for line in jsonl_file:
        # A blank line would make json.loads raise; skip it.
        if not line.strip():
            continue
        annotated_topics.append(json.loads(line))



In [262]:



# Condense every annotated topic into {id: {'text', 'NE', 'wiki', 'synonyms'}}.
annotated_cleaned = {}

for dicti in annotated_topics:
    ID = dicti.get('id')
    text = dicti.get('text')
    # Tokens of the topic title; synonyms equal to one of them are skipped below.
    tokenized_text = tokenized_query(text, stop_words)
    # Missing annotation sections are treated as empty instead of raising on None.
    annotations = dicti.get('annotations') or {}

    # Named entities as (text, type) pairs.
    NE_tuples = [(NE.get('text'), NE.get('type')) for NE in annotations.get('NE') or []]

    # Keep only multi-word wiki concept names, lower-cased.
    wiki_words = []
    for wiki in annotations.get('wiki') or []:
        name = wiki.get("name") or ''
        if len(name.split()) > 1:
            wiki_words.append(name.lower())

    synonyms = []
    annotated_words_list = (annotations.get('annotatedWords') or {}).get("words") or []
    for word in annotated_words_list:
        for syn in word.get('synonyms') or []:
            for w in word_tokenize(syn):
                # Replace every character that is neither a letter nor '-' with a
                # space, then trim so padded or whitespace-only strings are not stored.
                cleaned = ''.join(
                    ch if ch.isalpha() or ch == '-' else ' ' for ch in w.lower()
                ).strip()
                if cleaned and cleaned not in tokenized_text:
                    synonyms.append(cleaned)

    annotated_cleaned[ID] = {
        'text': text,
        'NE': NE_tuples,
        'wiki': wiki_words,
        'synonyms': synonyms,
    }



In [265]:
# Preview only the first few topics instead of dumping the whole mapping.
dict(islice(annotated_cleaned.items(), 6))

{'301': {'text': 'International Organized Crime',
  'NE': [],
  'wiki': ['organized crime'],
  'synonyms': ['law-breaking']},
 '302': {'text': 'Poliomyelitis and Post-Polio',
  'NE': [],
  'wiki': [],
  'synonyms': ['polio',
   'infantile paralysis',
   'acute anterior poliomyelitis']},
 '303': {'text': 'Hubble Telescope Achievements',
  'NE': [],
  'wiki': ['hubble space telescope'],
  'synonyms': ['edwin hubble',
   'edwin powell hubble',
   'scope',
   'accomplishment']},
 '304': {'text': 'Endangered Species (Mammals)',
  'NE': [],
  'wiki': ['endangered species'],
  'synonyms': ['species', 'mammalian']},
 '305': {'text': 'Most Dangerous Vehicles',
  'NE': [],
  'wiki': [],
  'synonyms': ['unsafe',
   'grave',
   'grievous',
   'serious',
   'severe',
   'life-threatening']},
 '306': {'text': 'African Civilian Deaths',
  'NE': [('African Civilian Deaths', 'MISC')],
  'wiki': [],
  'synonyms': ['decease',
   'expiry',
   'last',
   'dying',
   'demise',
   'end',
   'destruction']},


In [266]:
# TREC testing annotations

In [282]:
from gensim.models import Word2Vec