Code Reference: https://www.analyticsvidhya.com/blog/2020/08/information-retrieval-using-word2vec-based-vector-space-model/#h2_7

In [None]:
import os
import random
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from gensim.models.word2vec import Word2Vec as W2V
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

# Dataset Load

In [None]:
# Instantiate a Word2Vec model with default arguments (no corpus is passed).
# NOTE(review): `w2v_model` is not referenced anywhere else in the visible notebook — confirm this cell is still needed.
w2v_model = W2V()

Select which document and query text columns to work with

In [None]:
# Candidate text columns for documents and for queries; position 2 selects 'lem_text' / 'lem_q'.
df_col_names = ['stem_text', 'doc_text', 'lem_text']
df_query_col_names = ['cl_q', 'stem_q', 'lem_q']
df_col_select = df_col_names[2]
df_query_col_select = df_query_col_names[2]

# Test-split fraction; it is also embedded in the name of the split folder.
test_dim = 0.2

print(f"TEXT: {df_col_select}\nQUERY: {df_query_col_select}\nTEST_DIM: {test_dim}")


Configure the model parameters

In [None]:
# Embedding dimensionality: passed as `size` to the CADE aligner and used as the
# default vector length in get_embedding_w2v.
wv_model_size = 1000
# Passed as `min_count` to the CADE aligner.
wv_model_m_c = 0
# Passed as `window` to the CADE aligner.
wv_model_win = 10
# NOTE(review): not referenced elsewhere in the visible notebook; presumably the
# CBOW/skip-gram switch (`sg`) — confirm.
wv_model_type = 0

Dataframes Paths

In [None]:
# Folder with the raw Lucene retrievals and the ground-truth ranking CSV files.
path = "Docs/"
luc_retr = f"{path}raw_dev_Lucene_retrievals.csv"
g_truth_rank = f"{path}dev_data.csv"

In [None]:
# Pre-processed train/test splits live in a folder named after the test fraction.
# NOTE(review): the prefix contains a doubled slash ("ProcDocs//"); it is kept as-is
# here — confirm whether a single separator was intended.
path_cl = f"ProcDocs//Split_{test_dim}/"
docs_test_path = f"{path_cl}docs_test.csv"
docs_train_path = f"{path_cl}docs_train.csv"
queries_test_path = f"{path_cl}queries_test.csv"
queries_train_path = f"{path_cl}queries_train.csv"

In [None]:
# Presumably the folder for saved Word2Vec models.
# NOTE(review): `model_path` is not used in the visible part of the notebook — confirm it is still needed.
model_path = "ProcDocs/W2V/"

In [None]:
# Load the train and test document splits.
docs_train_df, docs_test_df = (
    pd.read_csv(csv_path) for csv_path in (docs_train_path, docs_test_path)
)

In [None]:
# Load the train and test query splits.
queries_train_df, queries_test_df = (
    pd.read_csv(csv_path) for csv_path in (queries_train_path, queries_test_path)
)

In [None]:
# Load the Lucene retrieval results and the ground-truth ranking.
luc_retr_df, g_truth_r = (
    pd.read_csv(csv_path) for csv_path in (luc_retr, g_truth_rank)
)

# W2V Train

In [None]:
# Combining corpus and queries for training.
# Training documents plus training AND test queries are stacked into a single
# shuffled Series of texts named 'text'.
combined_training = (
    pd.concat([
        docs_train_df[df_col_select],
        queries_train_df[df_query_col_select],
        queries_test_df[df_query_col_select],
    ])
    .rename('text')
    # Empty CSV fields are read back as NaN (a float), which cannot be written to a text file.
    .dropna()
    # Fixed seed so the shuffle is identical after Restart & Run All.
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

Create the compass file

In [None]:
# Write the combined training texts to the compass corpus file, one text per line.
# Mode 'w' creates the file or truncates a previous one, so no separate
# create-and-close step is needed before writing.
with open('compass.txt', 'w') as f:
    for x in combined_training:
        f.write(x)
        f.write('\n')
    

Create a dataframe with query_num and doc_num

In [None]:
# Right join keeps every test query; with no `on=` given, pandas joins on the
# columns the two frames have in common.
complete_query = luc_retr_df.merge(queries_test_df, how="right")

In [None]:
# Distinct query numbers present in the joined frame, in order of first appearance.
query_list = list(complete_query['Query_number'].unique())

In [None]:
# Trained slice models keyed by query number, and the per-query corpus file paths.
slice_dicts = {}
file_name_list = []

Save one text file per query (the documents retrieved for it) for slice training

In [None]:
# Write one corpus file per query: the text of every document retrieved for that
# query, one document per line, under "CADE/<query number>.txt".
os.makedirs("CADE", exist_ok=True)  # the output folder may not exist on a fresh run
file_name_list.clear()              # re-running this cell must not duplicate entries
for x in query_list:
    file_name = "CADE/"+str(x)+".txt"
    docs_list = list(complete_query[complete_query.Query_number == x].doc_number.unique())
    # Mode 'w' creates the file or truncates one left over from a previous run.
    with open(file_name, 'w') as f:
        for y in docs_list:
            # .item() requires exactly one matching row in docs_test_df for this doc_number.
            f.write(docs_test_df[docs_test_df.doc_number == y][df_col_select].item())
            f.write('\n')
    file_name_list.append(file_name)

In [None]:
from cade.cade import CADE

# Training

Instantiate the model parameters

In [None]:
# Build the CADE aligner from the notebook-level hyper-parameters.
# NOTE(review): `sg` is hard-coded to 0 while `wv_model_type` (also 0) is never
# used — confirm whether it was meant to be passed here.
aligner = CADE(
    size=wv_model_size,
    min_count=wv_model_m_c,
    window=wv_model_win,
    sg=0,
    workers=8,
)

Train the compass

In [None]:
# Train the compass model on the corpus stored in compass.txt.
# NOTE(review): with overwrite=False a previously trained compass may be reused
# rather than retrained — confirm against the CADE documentation.
aligner.train_compass("compass.txt", overwrite=False)

Train each slice

In [None]:
# Query numbers of the trained slices; filled in by the slice-training loop.
query_number_list = []

In [None]:
# Train one aligned slice per query file. The query number is recovered from
# the "CADE/<number>.txt" file name and used as the key in slice_dicts.
for slice_path in tqdm(file_name_list):
    q_num = int(slice_path.split('.')[0].split('/')[1])
    slice_dicts[q_num] = aligner.train_slice(slice_path)
    query_number_list.append(q_num)

# Test Dataset Vectorization

In [None]:
# Function returning vector representation of a document
def get_embedding_w2v(doc_tokens, model, wv_m_size = wv_model_size):
    """Return the mean word vector of a tokenised document.

    Parameters
    ----------
    doc_tokens : sequence of str
        Tokens of the document or query.
    model : object
        Model whose `.wv` attribute supports `token in model.wv` and
        `model.wv[token]`.
    wv_m_size : int
        Length of the zero vector returned when nothing can be embedded.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the in-vocabulary tokens, or a
        zero vector of length `wv_m_size` when `doc_tokens` is empty or none
        of its tokens is in the vocabulary.
    """
    # Membership is tested on the keyed vectors themselves rather than on the
    # `.vocab` attribute, which no longer exists in gensim 4.
    # Out-of-vocabulary tokens are skipped.
    embeddings = [model.wv[tok] for tok in doc_tokens if tok in model.wv]

    # Averaging an empty list would return NaN (with a RuntimeWarning) instead
    # of a vector, which breaks the cosine-similarity step downstream.
    if not embeddings:
        return np.zeros(wv_m_size)

    # mean the vectors of individual words to get the vector of the document
    return np.mean(embeddings, axis=0)

In [None]:
# Getting Word2Vec Vectors for Testing Corpus and Queries
# Documents are embedded with the compass model; each query is embedded with
# the slice model trained for that query number.
# Values are read by column label: positional `row[0]` / `row[1]` access on a
# label-indexed Series is deprecated in recent pandas versions.
docs_test_df['vector'] = docs_test_df[df_col_select].progress_apply(
    lambda text: get_embedding_w2v(str(text).split(), model=aligner.compass)
)
queries_test_df['vector'] = queries_test_df[['Query_number', df_query_col_select]].progress_apply(
    lambda row: get_embedding_w2v(
        str(row[df_query_col_select]).split(),
        model=slice_dicts[row['Query_number']],
    ),
    axis=1,
)

# Evaluate

In [None]:
#Re-rank documents for a query
def reorder_docs(q_num, lucene_res, top_N = 10):
    """Re-rank the documents retrieved for a query by cosine similarity.

    Parameters
    ----------
    q_num : query number; must be present in `queries_test_df` and be a key
        of `slice_dicts`.
    lucene_res : collection of doc_number values to re-rank.
    top_N : int
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Rows of `docs_test_df` whose doc_number is in `lucene_res`, with
        'vector' and 'similarity' columns, sorted by decreasing similarity,
        truncated to `top_N` rows and re-indexed from 0.

    Raises
    ------
    IndexError
        If `q_num` has no row in `queries_test_df`.
    """
    # Pre-computed query vector, reshaped once to the 2-D layout cosine_similarity expects.
    vector = queries_test_df[queries_test_df.Query_number == q_num]['vector'].values[0]
    query_matrix = np.array(vector).reshape(1, -1)

    # Candidate documents are re-embedded with the slice model of this query so
    # that document and query vectors come from the same model.
    documents = docs_test_df[docs_test_df.doc_number.isin(lucene_res)].copy()
    documents['vector'] = documents[df_col_select].apply(
        lambda text: get_embedding_w2v(str(text).split(), model=slice_dicts[q_num])
    )

    # ranking documents
    documents['similarity'] = documents['vector'].apply(
        lambda doc_vec: cosine_similarity(query_matrix, np.array(doc_vec).reshape(1, -1)).item()
    )
    return (
        documents
        .sort_values(by='similarity', ascending=False)
        .head(top_N)
        .reset_index(drop=True)
    )

In [None]:
#Re-rank documents for a query and returns the Recall@K
def get_reorder_recall(q_num, recall_at=10):
    """Re-rank the Lucene results of a query and score them against the ground truth.

    The score is the fraction of the first `recall_at` ground-truth rows whose
    doc_number equals the doc_number at the same rank of the re-ranked list.
    NOTE(review): this is a position-by-position agreement rate, not a
    set-based recall — confirm this is the intended metric.

    Parameters
    ----------
    q_num : query number.
    recall_at : int
        Number of leading ground-truth rows to compare; 0 means all of them.

    Returns
    -------
    int or float
        0 when nothing matches (or there is nothing to compare), otherwise
        matches divided by the number of ground-truth rows compared.
    """
    # Documents Lucene retrieved for this query
    lucene_query_doc = list(luc_retr_df[luc_retr_df.Query_number == q_num].doc_number)

    # Ground-truth rows of this query, in file order
    truth = g_truth_r[g_truth_r.Query_number == q_num].reset_index(drop=True)
    n_truth = len(truth)
    if recall_at == 0:
        recall_at = n_truth

    # Re-ranked documents (at most as many as there are ground-truth rows)
    ordered_res = reorder_docs(q_num, lucene_query_doc, n_truth)

    # Keep only the slice of the ground truth that is compared
    truth = truth.iloc[:recall_at]
    n_of_ret = len(truth)
    if n_of_ret == 0:
        return 0

    expected = truth['doc_number'].to_numpy()
    ranked = ordered_res['doc_number'].to_numpy()

    # The re-ranked list can be shorter than the ground truth (fewer documents
    # retrieved, or documents missing from the test set); ranks without a
    # re-ranked document count as misses instead of raising KeyError.
    overlap = min(len(expected), len(ranked))
    count_correct = int(np.sum(expected[:overlap] == ranked[:overlap]))

    if count_correct == 0:
        return 0
    return count_correct/n_of_ret

In [None]:
# Score every test query over the first 5 ground-truth rows; the cell output is the mean in percent.
queries_test_df['score'] = queries_test_df['Query_number'].progress_apply(
    lambda q_num: get_reorder_recall(q_num, recall_at=5)
)
100 * queries_test_df['score'].mean()

In [None]:
# Score every test query over the first 10 ground-truth rows; the cell output is the mean in percent.
queries_test_df['score'] = queries_test_df['Query_number'].progress_apply(
    lambda q_num: get_reorder_recall(q_num, recall_at=10)
)
100 * queries_test_df['score'].mean()

In [None]:
# Score every test query over the first 20 ground-truth rows; the cell output is the mean in percent.
queries_test_df['score'] = queries_test_df['Query_number'].progress_apply(
    lambda q_num: get_reorder_recall(q_num, recall_at=20)
)
100 * queries_test_df['score'].mean()

In [None]:
# Function for calculating average precision for a query
def average_precision(qid,qvector, only_relevant=False):
    """Average precision of the similarity ranking of a query's ground-truth documents.

    Parameters
    ----------
    qid : query number; must be a key of `slice_dicts`.
    qvector : array-like
        Embedding of the query.
    only_relevant : bool
        If true, the ranking is cut at the number of ground-truth rows with
        label == 1; otherwise at the total number of ground-truth rows.

    Returns
    -------
    int or float
        0 when no truthy label appears in the evaluated ranking, otherwise the
        mean of precision@i over the ranks i that hold a truthy label.
    """
    # Getting the ground truth and document vectors
    qresult = g_truth_r.loc[g_truth_r['Query_number'] == qid, ['doc_number', 'label']]
    n_of_rel = len(qresult[qresult['label'] == 1]) if only_relevant else len(qresult)

    qcorpus = docs_test_df[docs_test_df['doc_number'].isin(list(qresult['doc_number']))].reset_index(drop=True)
    # Documents are embedded with the slice model of this query; the text is
    # read by column label instead of by position within the row.
    qcorpus['vector'] = qcorpus[df_col_select].apply(
        lambda text: get_embedding_w2v(str(text).split(), model=slice_dicts[qid])
    )
    qresult = pd.merge(qresult, qcorpus[['doc_number', 'vector']], on='doc_number')

    # Ranking documents for the query
    query_matrix = np.array(qvector).reshape(1, -1)
    qresult['similarity'] = qresult['vector'].apply(
        lambda doc_vec: cosine_similarity(query_matrix, np.array(doc_vec).reshape(1, -1)).item()
    )
    # Taking the top n_of_rel documents for the evaluation
    ranking = (
        qresult
        .sort_values(by='similarity', ascending=False)
        .head(n_of_rel)['label']
        .values
    )

    # Calculating precision at every rank that holds a relevant document.
    # The range runs up to and including the last rank, and is bounded by the
    # actual ranking length: the merge can drop ground-truth documents that are
    # missing from docs_test_df, making the ranking shorter than n_of_rel.
    precision = [
        np.sum(ranking[:i]) / i
        for i in range(1, len(ranking) + 1)
        if ranking[i - 1]
    ]

    # If no relevant document in list then return 0
    if not precision:
        return 0

    return np.mean(precision)

In [None]:
# Average precision per test query, with the ranking cut at the number of documents labelled 1.
queries_test_df['AP'] = queries_test_df.progress_apply(
    lambda row: average_precision(row['Query_number'], row['vector'], only_relevant=True),
    axis=1,
)

In [None]:
# Mean Average Precision over all test queries, reported in percent
print('Mean Average Precision=>', 100 * queries_test_df['AP'].mean())