In [2]:
import pandas as pd
import pickle
import annoy
import numpy as np
from gensim.models import Word2Vec

In [3]:
# Load Word Embeddings
# NOTE(review): absolute, machine-specific path -- this cell only runs where this exact file exists.
word2vec_model = Word2Vec.load(r'C:\Users\Stefan\Desktop\licenta_mea_de_10\03_word_embadings\word2vec_embeddings-unique_phrases-ep20\word2vec_embeddings-unique_phrases-ep20.model')

# Load Annoy
# The index dimensionality must equal the embedding size, because the mean
# embeddings produced from the model are used directly as Annoy query vectors.
vector_length = 100
if word2vec_model.vector_size != vector_length:
    raise ValueError(
        "Word2Vec vector size ({}) does not match the Annoy index dimensionality ({})".format(
            word2vec_model.vector_size, vector_length
        )
    )
annoy_index = annoy.AnnoyIndex(vector_length, metric='angular')
# Kept as the last expression so the cell still displays load()'s return value.
annoy_index.load('annoy_index-10ktrees.ann', prefault=True)

True

In [4]:
# Load pickle file in chunks (test set)
def load_data(filename):
    """Read every object pickled back-to-back in ``filename`` and merge them.

    The file is unpickled repeatedly until ``pickle.load`` signals the end of
    the stream with ``EOFError``. The collected objects are then combined with
    ``pd.concat(..., ignore_index=True)`` and the result is returned.

    NOTE(review): unpickling can execute arbitrary code, so only use this on
    files from a trusted source.
    """
    frames = []
    with open(filename, 'rb') as stream:
        exhausted = False
        while not exhausted:
            try:
                frames.append(pickle.load(stream))
            except EOFError:
                # No further pickled objects are left in the file.
                exhausted = True
    return pd.concat(frames, ignore_index=True)

In [5]:
# Calculate word embeddings mean
def get_embeddings(word_list, model):
    """Return the mean embedding of the tokens in ``word_list``.

    Parameters
    ----------
    word_list : iterable
        Tokens to embed.
    model :
        Object exposing ``wv`` (supporting ``word in model.wv`` and
        ``model.wv[word]``) and ``vector_size``.

    Returns
    -------
    numpy.ndarray or numpy.float64
        Element-wise mean over the per-token vectors. Tokens missing from
        ``model.wv`` contribute an all-zero vector of length
        ``model.vector_size``. For an empty ``word_list`` the result is NaN,
        so such rows can be detected with ``isna()`` / ``notna()``.
    """
    embeddings = [
        model.wv[word] if word in model.wv else np.zeros(model.vector_size)
        for word in word_list
    ]
    if not embeddings:
        # np.mean([]) also evaluates to NaN, but it emits RuntimeWarnings on
        # the way; return the same value explicitly and silently instead.
        return np.float64(np.nan)
    return np.mean(embeddings, axis=0)

In [6]:
# Check if true value is among the top k predicted values
def check_in_list(row):
    """Tell whether the row's ``men_href_title_id`` is one of its ``predicted_candidates_list`` entries."""
    true_id = row['men_href_title_id']
    candidates = row['predicted_candidates_list']
    return true_id in candidates

In [7]:
# Obtain the top k candidates using Annoy
def get_top_k_candidates(embedding, num_matches):
    """Query the global ``annoy_index`` for the ``num_matches`` nearest items to ``embedding``.

    Returns the item ids reported by Annoy as a new list; distances are not
    requested.
    """
    neighbour_ids = annoy_index.get_nns_by_vector(
        embedding, num_matches, search_k=-1, include_distances=False
    )
    # Hand back a fresh list holding the ids in the order Annoy returned them.
    return list(neighbour_ids)

In [8]:
# Load test data
loaded_df = load_data('test_data-less_then_4_men_filtered.pkl')

# Mean embedding of each mention's tokens; the model is forwarded to
# get_embeddings as its second positional argument.
loaded_df['emb_men_text'] = loaded_df['men_text'].apply(get_embeddings, args=(word2vec_model,))

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [9]:
# Preview the first rows, now including the emb_men_text column.
loaded_df.head()

Unnamed: 0,men_id,men_text,phrase,men_href_title,men_href_title_id,emb_men_text
0,565003,"[15, lege, contencios, administrativ, număr, 5...","[obligație, fiscal, stabili, act, administrati...","[lege, număr, 554, 2, decembrie, 2004]",10083,"[-1.5641718, -0.6049332, -2.4777005, 0.7451549..."
1,155608,"[legea-cadru, număr, 153/2017]","[29, însă, dată, 1, iulie, 2017, intra, vigoar...","[lege-cadru, număr, 153, 28, iunie, 2017]",10497,"[1.261335, 0.52220327, -4.000471, 4.380332, -3..."
2,1320370,"[cod, deontologic]","[hotărâre, număr, 10, 21, aprilie, 2017, aprob...","[cod, deontologic, 14, noiembrie, 2015]",507,"[2.5022848, 1.6882548, 0.93107533, 1.908083, -..."
3,992802,[3],"[65, articol, 220, alineat, 2, 3, modifica, ur...","[regulament, 5, martie, 2018]",18969,"[0.2195049, -3.3490834, -2.8692324, 0.79235923..."
4,1391928,"[articol, 20, alineat, 1, lege, număr, 213/1998]","[temei, articol, 108, constituție, România, re...","[lege, număr, 213, 17, noiembrie, 1998]",8906,"[0.17845795, -1.6992744, -2.5976417, -0.271391..."


In [22]:
# Inspect the rows whose emb_men_text is NaN (in the output below these are the
# mentions with an empty men_text list). This cell only displays them; nothing
# is removed here.
loaded_df[loaded_df['emb_men_text'].isna()]

Unnamed: 0,men_id,men_text,phrase,men_href_title,men_href_title_id,emb_men_text
108,1046852,[],"[interpretare, aplicare, unitar, dispoziție, a...","[lege, număr, 554, 2, decembrie, 2004]",10083,
181,1123390,[],"[avea, vedere, obiectiv, comisie, anchetă, pre...","[hotărâre, număr, 11, 20, februarie, 2018]",5894,
314,474434,[],"[14, articol, 176, literă, modifica, următor, ...","[lege, număr, 302, 26, iunie, 2004, republicat]",10532,
359,1381640,[],"[ordin, număr, 75, 9, august, 2017, modificare...","[ordonanţă, urgenţă, număr, 33, 4, 2007, actua...",16699,
525,1401143,[],"[articol, punct, 1, literă, alineat, 7, ^, 1, ...","[ordonanță, urgență, număr, 69, 12, octombrie,...",17137,
...,...,...,...,...,...,...
267671,780171,[],"[valoare, maxim, garanție, individual, determi...","[hotărâre, număr, 717, 17, iunie, 2009]",7293,
267803,1410382,[],"[penitenciar, miercurea-ciuc, spațiu, cazare, ...","[lege, număr, 254, 19, iulie, 2013]",9227,
267892,316082,[],"[197, articol, 133, alineat, 1, literă, modifi...","[ordonanță, urgență, număr, 111, 14, decembrie...",16112,
268252,780283,[],"[87, anexă, număr, 5, anexă, număr, 6, articol...","[ordin, număr, 2.225/994/2009]",13406,


In [23]:
# Keep only the rows that have a mention embedding. The explicit copy makes the
# result an independent frame, so adding columns to it later does not operate on
# a slice of the unfiltered data (which raises SettingWithCopyWarning).
loaded_df = loaded_df[loaded_df['emb_men_text'].notna()].copy()

In [24]:
# (rows, columns) of the frame at this point in the notebook.
loaded_df.shape

(266994, 6)

In [25]:
# Pull the two columns out as standalone Series: the mention embeddings and the
# true target ids they are evaluated against.
emb_men_text = loaded_df['emb_men_text']
men_href_title_id = loaded_df['men_href_title_id']

In [26]:
# Predicted tops
# Top-10 Annoy candidates for every mention embedding, gathered in row order.
predicted_candidates_list = [
    get_top_k_candidates(mention_vector, 10) for mention_vector in emb_men_text
]

In [27]:
# Number of candidate lists produced (one per mention embedding).
len(predicted_candidates_list)

266994

In [29]:
# Peek at the candidate id lists of the first five mentions.
predicted_candidates_list[:5]

[[655, 18028, 18498, 5340, 18021, 18285, 18208, 18362, 18024, 18499],
 [502, 503, 543, 546, 729, 8675, 8101, 11811, 516, 10511],
 [18351, 11221, 518, 519, 10456, 11223, 17454, 17466, 17438, 6025],
 [677, 723, 656, 650, 720, 644, 526, 628, 655, 651],
 [677, 655, 526, 8756, 5141, 16543, 723, 483, 17454, 547]]

In [30]:
# Attach the candidate lists as a new column; they were computed from this
# frame's emb_men_text column, so they line up with the rows one-to-one.
loaded_df['predicted_candidates_list'] = predicted_candidates_list

In [31]:
# Preview the first rows, now including the predicted_candidates_list column.
loaded_df.head()

Unnamed: 0,men_id,men_text,phrase,men_href_title,men_href_title_id,emb_men_text,predicted_candidates_list
0,565003,"[15, lege, contencios, administrativ, număr, 5...","[obligație, fiscal, stabili, act, administrati...","[lege, număr, 554, 2, decembrie, 2004]",10083,"[-1.5641718, -0.6049332, -2.4777005, 0.7451549...","[655, 18028, 18498, 5340, 18021, 18285, 18208,..."
1,155608,"[legea-cadru, număr, 153/2017]","[29, însă, dată, 1, iulie, 2017, intra, vigoar...","[lege-cadru, număr, 153, 28, iunie, 2017]",10497,"[1.261335, 0.52220327, -4.000471, 4.380332, -3...","[502, 503, 543, 546, 729, 8675, 8101, 11811, 5..."
2,1320370,"[cod, deontologic]","[hotărâre, număr, 10, 21, aprilie, 2017, aprob...","[cod, deontologic, 14, noiembrie, 2015]",507,"[2.5022848, 1.6882548, 0.93107533, 1.908083, -...","[18351, 11221, 518, 519, 10456, 11223, 17454, ..."
3,992802,[3],"[65, articol, 220, alineat, 2, 3, modifica, ur...","[regulament, 5, martie, 2018]",18969,"[0.2195049, -3.3490834, -2.8692324, 0.79235923...","[677, 723, 656, 650, 720, 644, 526, 628, 655, ..."
4,1391928,"[articol, 20, alineat, 1, lege, număr, 213/1998]","[temei, articol, 108, constituție, România, re...","[lege, număr, 213, 17, noiembrie, 1998]",8906,"[0.17845795, -1.6992744, -2.5976417, -0.271391...","[677, 655, 526, 8756, 5141, 16543, 723, 483, 1..."


In [32]:
# Check the presence of the true label among its predicted candidates:
# check_in_list is applied to every row (axis=1) and yields one boolean per row.
loaded_df['presence']  = loaded_df.apply(check_in_list, axis=1)

In [33]:
# Preview the first rows, now including the boolean presence column.
loaded_df.head()

Unnamed: 0,men_id,men_text,phrase,men_href_title,men_href_title_id,emb_men_text,predicted_candidates_list,presence
0,565003,"[15, lege, contencios, administrativ, număr, 5...","[obligație, fiscal, stabili, act, administrati...","[lege, număr, 554, 2, decembrie, 2004]",10083,"[-1.5641718, -0.6049332, -2.4777005, 0.7451549...","[655, 18028, 18498, 5340, 18021, 18285, 18208,...",False
1,155608,"[legea-cadru, număr, 153/2017]","[29, însă, dată, 1, iulie, 2017, intra, vigoar...","[lege-cadru, număr, 153, 28, iunie, 2017]",10497,"[1.261335, 0.52220327, -4.000471, 4.380332, -3...","[502, 503, 543, 546, 729, 8675, 8101, 11811, 5...",False
2,1320370,"[cod, deontologic]","[hotărâre, număr, 10, 21, aprilie, 2017, aprob...","[cod, deontologic, 14, noiembrie, 2015]",507,"[2.5022848, 1.6882548, 0.93107533, 1.908083, -...","[18351, 11221, 518, 519, 10456, 11223, 17454, ...",False
3,992802,[3],"[65, articol, 220, alineat, 2, 3, modifica, ur...","[regulament, 5, martie, 2018]",18969,"[0.2195049, -3.3490834, -2.8692324, 0.79235923...","[677, 723, 656, 650, 720, 644, 526, 628, 655, ...",False
4,1391928,"[articol, 20, alineat, 1, lege, număr, 213/1998]","[temei, articol, 108, constituție, România, re...","[lege, număr, 213, 17, noiembrie, 1998]",8906,"[0.17845795, -1.6992744, -2.5976417, -0.271391...","[677, 655, 526, 8756, 5141, 16543, 723, 483, 1...",False


In [34]:
# Number of rows whose true label appears among the top k candidates
# (summing the boolean presence column counts its True values).
loaded_df['presence'].sum()

2514

In [35]:
# Persist the evaluated frame, including the embeddings, candidate lists and presence flag.
# NOTE(review): the file name hard-codes the counts shown in the outputs above
# (2514 hits, 266994 rows); it will go stale if the data or k changes.
loaded_df.to_pickle("top_k_presence-2514_outof_266994.pkl")