In [1]:
import pandas as pd
import pickle
import numpy as np
from gensim.models import Word2Vec
import annoy

In [2]:
# Load train data in chunks
def load_large_dataframe(filename):
    """Load a DataFrame stored in ``filename`` as consecutive pickled chunks.

    The file is read with repeated ``pickle.load`` calls until end-of-file,
    and every loaded chunk is concatenated into a single DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of the pickle file to read.

    Returns
    -------
    pandas.DataFrame
        All chunks concatenated with a fresh 0..n-1 index. An empty
        DataFrame is returned when the file contains no chunks.
    """
    chunks = []
    # SECURITY: pickle.load can execute arbitrary code while unpickling;
    # only use this on files from a trusted source.
    with open(filename, 'rb') as f:
        while True:
            try:
                chunks.append(pickle.load(f))
            except EOFError:
                # Nothing left to read: every chunk has been consumed.
                break
    if not chunks:
        # pd.concat([]) raises "No objects to concatenate"; give callers a
        # well-defined empty frame instead.
        return pd.DataFrame()
    return pd.concat(chunks, ignore_index=True)

In [3]:
# Read the chunked training pickle and report its dimensions
TRAIN_DATA_FILE = 'train_data-less_then_4_men_filtered.pkl'
loaded_df = load_large_dataframe(TRAIN_DATA_FILE)
print('Shape:', loaded_df.shape)

1
2
3
4
5
6
7
8
9
10
11
Shape: (1074964, 5)


In [4]:
# Keep only the title tokens and the title id. errors='ignore' makes this cell
# safe to re-run: without it a second execution raises KeyError because the
# columns have already been removed from loaded_df.
loaded_df = loaded_df.drop(columns=['men_text', 'men_id', 'phrase'], errors='ignore')

In [5]:
# Preview the first rows of the frame after the column drop
loaded_df.head()

Unnamed: 0,men_href_title,men_href_title_id
0,"[lege, număr, 187, 24, octombrie, 2011]",8678
1,"[lege, număr, 24, 27, martie, 2000, republicat]",9116
2,"[lege, număr, 47, 18, 1992, republicat]",9947
3,"[ordonanță, număr, 17, 15, iulie, 2015]",17618
4,"[decizie, număr, 802, 3, iulie, 2008]",4828


In [6]:
# Number of distinct values in the men_href_title_id column
loaded_df['men_href_title_id'].nunique()

19624

In [9]:
# Load the trained Word2Vec model that supplies the word embeddings.
# NOTE: this is an absolute, machine-specific path; adjust it when running elsewhere.
WORD2VEC_MODEL_PATH = r'C:\Users\Stefan\Desktop\licenta_mea_de_10\03_word_embadings\word2vec_embeddings-filtered_train_unique_phrases-ep20\word2vec_embeddings-filtered_train_unique_phrases-ep20.model'
word2vec_model = Word2Vec.load(WORD2VEC_MODEL_PATH)

In [10]:
# Calculate the mean embedding of a train title
def get_embeddings(word_list, model):
    """Return the mean embedding of the words in ``word_list``.

    Words missing from ``model.wv`` contribute a zero vector, so they still
    count towards the mean.

    Parameters
    ----------
    word_list : iterable of str
        Tokens of one title.
    model : object
        Must expose ``wv`` (supporting ``word in wv`` and ``wv[word]``) and
        ``vector_size``.

    Returns
    -------
    numpy.ndarray
        Vector of length ``model.vector_size``. An all-zero vector is returned
        for an empty ``word_list``.
    """
    # Built once and reused for every out-of-vocabulary word.
    zero_vector = np.zeros(model.vector_size)
    embeddings = [
        model.wv[word] if word in model.wv else zero_vector
        for word in word_list
    ]
    if not embeddings:
        # np.mean([]) would give a scalar NaN (plus a RuntimeWarning) instead
        # of a vector, which breaks any consumer expecting a fixed-size array.
        return zero_vector
    return np.mean(embeddings, axis=0)

In [11]:
# Mean word embedding of every title; apply forwards model= to get_embeddings
loaded_df['emb_men_href_title'] = loaded_df['men_href_title'].apply(get_embeddings, model=word2vec_model)

In [13]:
# Remove the model name from the namespace once the embedding column exists.
# NOTE: after this, re-running the embedding cell requires reloading the model first.
del word2vec_model

In [12]:
# Preview the frame including the new emb_men_href_title column
loaded_df.head()

Unnamed: 0,men_href_title,men_href_title_id,emb_men_href_title
0,"[lege, număr, 187, 24, octombrie, 2011]",8678,"[0.9943053, -1.2912544, 2.1456861, -2.0568573,..."
1,"[lege, număr, 24, 27, martie, 2000, republicat]",9116,"[-1.1107086, -1.5717981, 1.4432237, -1.4780799..."
2,"[lege, număr, 47, 18, 1992, republicat]",9947,"[-1.9419836, -3.0039403, 1.3080685, -0.1964377..."
3,"[ordonanță, număr, 17, 15, iulie, 2015]",17618,"[0.5815943, -0.6987576, 2.5268528, -1.1766078,..."
4,"[decizie, număr, 802, 3, iulie, 2008]",4828,"[0.76110214, 1.1239939, 1.0337374, -2.2991674,..."


In [14]:
# True if any entry of the embedding column is missing.
# NOTE(review): this presumably flags only entries that are themselves None/NaN,
# not NaN values inside an embedding array; confirm if that matters.
loaded_df['emb_men_href_title'].isnull().any()

False

In [15]:
# (rows, columns) of the frame after adding the embedding column
loaded_df.shape

(1074964, 3)

In [16]:
# Keep one row per title id; each of these rows becomes one item in the Annoy index
annoy_columns = ['men_href_title_id', 'men_href_title', 'emb_men_href_title']
unique_title_rows = loaded_df.drop_duplicates(subset='men_href_title_id')
annoy_df = unique_title_rows[annoy_columns]

In [17]:
# Preview the de-duplicated frame that feeds the Annoy index
annoy_df.head()

Unnamed: 0,men_href_title_id,men_href_title,emb_men_href_title
0,8678,"[lege, număr, 187, 24, octombrie, 2011]","[0.9943053, -1.2912544, 2.1456861, -2.0568573,..."
1,9116,"[lege, număr, 24, 27, martie, 2000, republicat]","[-1.1107086, -1.5717981, 1.4432237, -1.4780799..."
2,9947,"[lege, număr, 47, 18, 1992, republicat]","[-1.9419836, -3.0039403, 1.3080685, -0.1964377..."
3,17618,"[ordonanță, număr, 17, 15, iulie, 2015]","[0.5815943, -0.6987576, 2.5268528, -1.1766078,..."
4,4828,"[decizie, număr, 802, 3, iulie, 2008]","[0.76110214, 1.1239939, 1.0337374, -2.2991674,..."


In [18]:
# (rows, columns) of the de-duplicated frame
annoy_df.shape

(19624, 3)

In [None]:
# Build an Annoy index (angular distance, 10000 trees) with one item per title id
embedding_column = annoy_df['emb_men_href_title']

# Take the dimensionality from the stored embeddings rather than hard-coding it,
# so the index always matches the Word2Vec vector size. 100 is kept only as the
# fallback for an empty frame.
vector_length = len(embedding_column.iloc[0]) if len(embedding_column) else 100
metric = 'angular'
annoy_index = annoy.AnnoyIndex(vector_length, metric=metric)

# The title id is used directly as the Annoy item id.
# zip over the two columns avoids the per-row Series that iterrows would build.
for men_href_title_id, emb_men_href_title in zip(annoy_df['men_href_title_id'], embedding_column):
    # Cast to a plain Python int: the column may hold numpy integer scalars.
    annoy_index.add_item(int(men_href_title_id), emb_men_href_title)

num_trees = 10000
annoy_index.build(n_trees=num_trees)

In [23]:
# Persist the built index to disk; the cell displays the value returned by save()
ANNOY_INDEX_FILE = 'annoy_index-10ktrees.ann'
annoy_index.save(ANNOY_INDEX_FILE)

True