Code Reference: https://www.analyticsvidhya.com/blog/2020/08/information-retrieval-using-word2vec-based-vector-space-model/#h2_7

In [None]:
import os
import random
import re
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spacy
from gensim.models.word2vec import Word2Vec as W2V
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
tqdm.pandas()

# Dataset Load

In [None]:
# Placeholder model constructed without a corpus, so that the name `w2v_model`
# exists before any train/load cell runs (get_embedding_w2v uses it as a default
# argument value).
w2v_model = W2V()

Select which dataset columns (documents and queries) to work with

In [None]:
# Candidate text columns for documents and for queries; index 2 selects the
# 'lem_*' variant of each.
df_col_names = ['stem_text', 'doc_text', 'lem_text']
df_query_col_names = ['cl_q', 'stem_q', 'lem_q']
df_col_select = df_col_names[2]
df_query_col_select = df_query_col_names[2]
# Used to build the "Split_<test_dim>" data folder path (presumably the test split fraction).
test_dim = 0.2
print("TEXT: {}\nQUERY: {}\nTEST_DIM: {}".format(df_col_select, df_query_col_select, test_dim))


Configure the model parameters

In [None]:
# Hyper-parameters forwarded to the Word2Vec constructor in the training cells.
wv_model_size = 100  # passed as `size`; also the default zero-vector length in get_embedding_w2v
wv_model_m_c = 0  # passed as `min_count`
wv_model_win = 5  # passed as `window`
wv_model_type = 0  # passed as `sg` (gensim: 0 = CBOW, 1 = skip-gram)

Dataframes Paths

In [None]:
# Locations of the raw inputs: the Lucene retrievals and the dev data that is
# later loaded as the ground truth (g_truth_r).
path = "Docs/"
luc_retr = path+"raw_dev_Lucene_retrievals.csv"
g_truth_rank = path + "dev_data.csv"

In [None]:
# Folder holding the pre-processed train/test split for the chosen test_dim.
# (The original concatenation produced a doubled slash: "ProcDocs//Split_...".)
path_cl = "ProcDocs/Split_" + str(test_dim) + "/"
docs_test_path = path_cl + "docs_test.csv"
docs_train_path = path_cl + "docs_train.csv"
queries_test_path = path_cl + "queries_test.csv"
queries_train_path = path_cl + "queries_train.csv"

In [None]:
# Root folder under which trained Word2Vec models are saved and loaded.
model_path = "ProcDocs/W2V_c/"

In [None]:
# Load the train/test document splits.
docs_train_df = pd.read_csv(docs_train_path)
docs_test_df = pd.read_csv(docs_test_path)

In [None]:
# Load the train/test query splits.
queries_train_df = pd.read_csv(queries_train_path)
queries_test_df = pd.read_csv(queries_test_path)

In [None]:
# Load the Lucene retrievals (candidate documents per query) and the ground-truth table.
luc_retr_df = pd.read_csv(luc_retr)
g_truth_r = pd.read_csv(g_truth_rank)

# W2V Train

In [None]:
# Build the Word2Vec training corpus: training documents plus the train and test
# queries, each exposed as a 'text' series, then concatenated and shuffled.
combined_training = pd.concat([
    docs_train_df.rename(columns={df_col_select: 'text'})['text'],
    queries_train_df.rename(columns={df_query_col_select: 'text'})['text'],
    queries_test_df.rename(columns={df_query_col_select: 'text'})['text'],
])
combined_training = combined_training.sample(frac=1).reset_index(drop=True)

In [None]:
# Whitespace-tokenise every text of the shuffled corpus (tqdm shows progress).
train_data = [text.split() for text in tqdm(combined_training)]

In [None]:
# Training a word2vec model from the given data set.
# Hyper-parameters come from the configuration cell; tqdm only wraps the corpus to show progress.
w2v_model = W2V(tqdm(train_data), size=wv_model_size, min_count=wv_model_m_c, window=wv_model_win, sg=wv_model_type, workers=8)

In [None]:
# Sub-folder of model_path that receives the freshly trained model.
# (The name `dir` is kept because the save cell reads it, although it shadows the builtin.)
dir = "cl_text"
# makedirs creates missing parents and, with exist_ok, does not fail when the cell is re-run.
os.makedirs(model_path + dir, exist_ok=True)

In [None]:
# Persist the trained model inside the folder created in the previous cell.
w2v_model.save(model_path + dir +"/mod.model")

# Load Model

In [None]:
# Display the configured model root folder as the cell output.
model_path

In [None]:
# Load a previously saved model; this rebinds `w2v_model`, replacing any model trained above.
w2v_model = W2V.load(model_path + "SG_lem_text_lem_q/mod.model")

In [None]:
# Print the number of entries in the loaded model's vocabulary.
print(len(w2v_model.wv.vocab))

# Compute Vectors for the test DataSet

In [None]:
# Seed NumPy's global random generator.
# Note: in a top-to-bottom run the corpus shuffle in the "W2V Train" section
# executes before this cell.
np.random.seed(534)

In [None]:
# Function returning vector representation of a document
def get_embedding_w2v(doc_tokens, wv_m_size = wv_model_size, model = w2v_model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of a document.

    Args:
        doc_tokens: iterable of token strings.
        wv_m_size: length of the zero vector returned when nothing can be
            embedded; it should match the vector size of `model`.
        model: Word2Vec model whose `wv` vectors are averaged.

    Returns:
        The element-wise mean of the token vectors, or `np.zeros(wv_m_size)` when
        `doc_tokens` is empty or none of its tokens is in the model vocabulary.

    Note:
        Both defaults are captured when this cell is executed; re-run the cell (or
        pass `model` / `wv_m_size` explicitly) after training or loading another
        model.
    """
    vocab = model.wv.vocab
    embeddings = [model.wv.word_vec(tok) for tok in doc_tokens if tok in vocab]
    # Without this guard np.mean([]) evaluates to NaN, which would then flow into
    # the similarity computation.
    if not embeddings:
        return np.zeros(wv_m_size)
    # mean the vectors of individual words to get the vector of the document
    return np.mean(embeddings, axis=0)

In [None]:
# Getting Word2Vec vectors for the testing corpus and queries.
# Each text is cast to str and whitespace-tokenised before being embedded; the
# result is stored in a new 'vector' column of the respective frame.
docs_test_df['vector']=docs_test_df[df_col_select].progress_apply(lambda x :get_embedding_w2v(str(x).split()))
queries_test_df['vector']=queries_test_df[df_query_col_select].progress_apply(lambda x :get_embedding_w2v(str(x).split()))

# Evaluate

In [None]:
#Re-rank documents for a query
def reorder_docs(q_num, lucene_res, top_N = 10):
  """Rank the candidate documents of a query by cosine similarity to the query vector.

  Args:
    q_num: value matched against `queries_test_df.Query_number`.
    lucene_res: collection of doc_number values restricting the candidates.
    top_N: number of best-scoring rows to return.

  Returns:
    A copy of the matching `docs_test_df` rows with an added 'similarity'
    column, sorted by descending similarity, cut to `top_N` rows and re-indexed
    from 0.
  """
  # query vector as a single-row matrix, as expected by cosine_similarity
  query_rows = queries_test_df[queries_test_df.Query_number == q_num]
  query_vec = np.array(query_rows['vector'].values[0]).reshape(1, -1)

  # candidate documents (copied so the global frame is left untouched)
  candidates = docs_test_df[docs_test_df.doc_number.isin(lucene_res)].copy()

  # score and rank
  candidates['similarity'] = candidates['vector'].apply(
      lambda doc_vec: cosine_similarity(query_vec, np.array(doc_vec).reshape(1, -1)).item())
  ranked = candidates.sort_values(by='similarity', ascending=False)
  return ranked.head(top_N).reset_index(drop=True)

In [None]:
#Re-rank documents for a query and returns the Recall@K
def get_reorder_recall(q_num, recall_at=10):
    """Re-rank the Lucene candidates of a query and score them against the ground truth.

    The score is positional: position i counts as correct when the i-th
    ground-truth doc_number equals the i-th re-ranked doc_number.

    Args:
        q_num: query identifier, matched against the `Query_number` columns.
        recall_at: number of leading ground-truth rows to compare; 0 means all
            ground-truth rows of the query.

    Returns:
        0 when nothing matches (or nothing can be compared), otherwise the
        fraction of compared positions that match.
    """
    # Candidate documents retrieved by Lucene for this query
    lucene_query_doc = list(luc_retr_df[luc_retr_df.Query_number == q_num].doc_number)

    # Ground-truth slice for this query
    expected = g_truth_r[g_truth_r.Query_number == q_num].reset_index(drop=True)

    n_of_ret = len(expected)
    if recall_at == 0:
        recall_at = n_of_ret

    # W2V similarity ranking of the candidates
    ordered_res = reorder_docs(q_num, lucene_query_doc, n_of_ret)

    expected_docs = expected['doc_number'].iloc[:recall_at].tolist()
    ranked_docs = ordered_res['doc_number'].tolist()
    n_compared = len(expected_docs)
    if n_compared == 0:
        return 0

    # zip stops at the shorter list, so positions for which the re-ranking has no
    # document count as misses instead of raising a KeyError.
    count_correct = sum(1 for exp_doc, got_doc in zip(expected_docs, ranked_docs) if exp_doc == got_doc)
    if count_correct == 0:
        return 0
    return count_correct / n_compared

## Recall

In [None]:
# Score every test query with get_reorder_recall at cut-off 5 and show the mean as a percentage.
# Note: the 'score' column is overwritten by each of the cut-off cells.
queries_test_df['score'] = queries_test_df.Query_number.progress_apply(lambda x: get_reorder_recall(x,recall_at=5))
queries_test_df['score'].mean()*100

In [None]:
# Same evaluation at cut-off 10 (replaces the 'score' column).
queries_test_df['score'] = queries_test_df.Query_number.progress_apply(lambda x: get_reorder_recall(x,recall_at=10))
queries_test_df['score'].mean()*100

In [None]:
# Same evaluation at cut-off 20 (replaces the 'score' column).
queries_test_df['score'] = queries_test_df.Query_number.progress_apply(lambda x: get_reorder_recall(x,recall_at=20))
queries_test_df['score'].mean()*100

In [None]:
# Function for calculating average precision for a query
def average_precision(qid,qvector, only_relevant=False):
  """Average precision of the similarity ranking of a query's ground-truth documents.

  Args:
    qid: query identifier, matched against `g_truth_r['Query_number']`.
    qvector: vector of the query.
    only_relevant: when True the ranking is cut at the number of documents
      labelled 1 for the query; otherwise at the number of ground-truth rows.

  Returns:
    0 when no relevant document appears in the evaluated ranking, otherwise the
    mean of precision@i over the positions i that hold a relevant document.
  """
  # Getting the ground truth and document vectors
  qresult = g_truth_r.loc[g_truth_r['Query_number']==qid, ['doc_number','label']]
  n_of_rel = len(qresult)
  if only_relevant == True:
    n_of_rel = len(qresult[qresult['label'] == 1])
  qcorpus = docs_test_df[docs_test_df['doc_number'].isin(list(qresult['doc_number']))].reset_index(drop=True)
  qcorpus = qcorpus[['doc_number','vector']]

  qresult = pd.merge(qresult, qcorpus, on='doc_number')

  # Ranking documents for the query
  query_vec = np.array(qvector).reshape(1, -1)
  qresult['similarity'] = qresult['vector'].apply(lambda x: cosine_similarity(query_vec, np.array(x).reshape(1, -1)).item())
  qresult = qresult.sort_values(by='similarity', ascending=False)
  # Taking the top n_of_rel documents for the evaluation
  ranking = qresult.head(n_of_rel)['label'].values

  # Calculating precision at every position holding a relevant document.
  # The loop is bounded by len(ranking): it includes the last position (the
  # previous range(1, n_of_rel) skipped it) and cannot index past the ranking
  # when the merge yields fewer rows than n_of_rel.
  precision = []
  for i in range(1, len(ranking) + 1):
    if ranking[i-1]:
      precision.append(np.sum(ranking[:i])/i)

  # If no relevant document in list then return 0
  if not precision:
    return 0

  return np.mean(precision)

## Average Precision

In [None]:
# Average precision per test query, with the ranking cut at the number of documents labelled relevant.
queries_test_df['AP']=queries_test_df.progress_apply(lambda x: average_precision(x['Query_number'],x['vector'], only_relevant=True),axis=1)

In [None]:
# Finding Mean Average Precision: mean of the per-query 'AP' column, as a percentage.
print('Mean Average Precision=>',queries_test_df['AP'].mean()*100)

# Parameter Optimization

Run the "Dataset Load" stage first, then run the cells that define the functions under "Compute Vectors for the test DataSet" and "Evaluate".

This section trains models with different vector sizes and computes recall and average precision for each, to determine which parameter value fits the model best.

In [None]:
# Combining corpus and queries for training: training documents plus the train
# and test queries, each exposed as a 'text' series, concatenated and shuffled.
combined_training = pd.concat([
    docs_train_df.rename(columns={df_col_select: 'text'})['text'],
    queries_train_df.rename(columns={df_query_col_select: 'text'})['text'],
    queries_test_df.rename(columns={df_query_col_select: 'text'})['text'],
])
combined_training = combined_training.sample(frac=1).reset_index(drop=True)
# Whitespace-tokenise every text (tqdm shows progress).
train_data = [text.split() for text in tqdm(combined_training)]

In [None]:
# Containers filled by the sweep below, both keyed by the tested parameter value.
model_list_d = {}  # trained models
recall_dict = {}  # evaluation scores

In [None]:
# Values swept for the parameter under optimisation; the trailing comment keeps
# alternative value lists for switching by hand.
size_test = [100,300,500,800,1000] #[3,5,10,15] #[100,300,500,800,1000] #Different size or window to test

In [None]:
#Train the models, set x on the parameter you want to optimize
# One model per value in size_test, stored in model_list_d under that value.
# Note: `w2v_model` is rebound on every iteration and ends up as the last model trained.
for x in tqdm(size_test):
    w2v_model = W2V(train_data, size=x, min_count=wv_model_m_c, window=wv_model_win, sg=wv_model_type, workers=8)
    model_list_d[x] = w2v_model

Calculate the recall and the average precision for all the models

In [None]:
# For every swept value: re-embed the test documents and queries with that
# model, then compute the recall at cut-offs 5, 10 and 20 and the mean average
# precision. Results are stored as recall_dict[x] = [r5, r10, r20, avg_p].
# Note: the 'vector', 'score' and 'AP' columns of the global frames are
# overwritten on every iteration.
for x in tqdm(size_test):
    vect_size = x
    # the model is passed explicitly, so the default model of get_embedding_w2v is not used here
    docs_test_df['vector']=docs_test_df[df_col_select].apply(lambda y :get_embedding_w2v(str(y).split(), wv_m_size=int(vect_size), model = model_list_d[x]))
    queries_test_df['vector']=queries_test_df[df_query_col_select].apply(lambda y :get_embedding_w2v(str(y).split(), wv_m_size= int(vect_size), model = model_list_d[x]))
    queries_test_df['score'] = queries_test_df.Query_number.apply(lambda y: get_reorder_recall(y,recall_at=5))
    r5 = queries_test_df['score'].mean()
    queries_test_df['score'] = queries_test_df.Query_number.apply(lambda y: get_reorder_recall(y,recall_at=10))
    r10 = queries_test_df['score'].mean()
    queries_test_df['score'] = queries_test_df.Query_number.apply(lambda y: get_reorder_recall(y,recall_at=20))
    r20 = queries_test_df['score'].mean()
    # the lambda parameter `x` below is the data-frame row; it shadows the loop variable only inside the lambda
    queries_test_df['AP']=queries_test_df.apply(lambda x: average_precision(x['Query_number'],x['vector'], only_relevant=True),axis=1)
    avg_p = queries_test_df['AP'].mean()
    recall_dict[x] = [r5,r10,r20, avg_p]

In [None]:
# Print the recall dict: one header line per tested value, followed by its
# scores expressed as percentages.
for tested_value, scores in recall_dict.items():
    print("{}:".format(tested_value))
    for score in scores:
        print(score * 100)

In [None]:
# Avg Precision Plot
# Entry 3 of every recall_dict value is the average precision of that model.
ordered_avg_dict = [recall_dict[size][3]*100 for size in size_test]
plt.plot(size_test, ordered_avg_dict)
plt.yticks(np.arange(70,80,1))
plt.xlabel("Vec Size")
plt.ylabel("Avg Prec %")
plt.show()

In [None]:
# Recall Plot
# Entries 0 and 1 of every recall_dict value are the scores at cut-offs 5 and 10;
# the third curve is their mean.
ordered_recall_dict_5 = [recall_dict[size][0]*100 for size in size_test]
ordered_recall_dict_10 = [recall_dict[size][1]*100 for size in size_test]
ordered_recall_dict_mean = [
    np.mean([rec_5, rec_10])
    for rec_5, rec_10 in zip(ordered_recall_dict_5, ordered_recall_dict_10)
]
for curve in (ordered_recall_dict_5, ordered_recall_dict_10, ordered_recall_dict_mean):
    plt.plot(size_test, curve)
plt.yticks(np.arange(0,11,1))
plt.xlabel("Vec Size")
plt.ylabel("Rec %")
plt.legend(["Rec@5", "Rec@10", "Rec@Mean"], loc='best')
plt.show()

# Save Models

In [None]:
# Persist every model of the sweep as <model_path>param_opt_win/W2V_model_<x>/mod.model
for x in tqdm(size_test):
    end_fix = model_path+"param_opt_win"
    end_fix = end_fix+ "/W2V_model_"+str(x)
    # makedirs also creates the missing "param_opt_win" parent and, with
    # exist_ok, does not fail when the cell is re-run.
    os.makedirs(end_fix, exist_ok=True)
    model_list_d[x].save(end_fix + "/mod.model")

Load Models

In [None]:
# Reload one saved model per value in size_test into model_list_d.
# NOTE(review): this reads from "param_opt_lem_text_lem_q" whereas the save cell
# writes to "param_opt_win" — confirm the intended folder before running.
for x in tqdm(size_test):
    end_fix = model_path+"param_opt_lem_text_lem_q"
    end_fix = end_fix+ "/W2V_model_"+str(x)
    model_list_d[x] = W2V.load(end_fix + "/mod.model")

# Plot

TSNE - Plot a query in the vector space

In [None]:
from sklearn.manifold import TSNE
import numpy as np



def cluster(keys, model, topn = 15):
    """Collect the nearest neighbours of each key word and project them to 2-D with t-SNE.

    Args:
        keys: iterable of words; each must be known to the model, since
            `most_similar` is called on it.
        model: a Word2Vec model (its `wv` attribute is used) or an object that
            itself offers `most_similar` and item access by word.
        topn: number of neighbours retrieved per key word.

    Returns:
        A two-element list `[embeddings_2d, word_clusters]`: an array of shape
        (len(keys), topn, 2) with the projected neighbours, and the neighbour
        words as one list per key.
    """
    # Use the keyed vectors explicitly, as the rest of the notebook does
    # (model.wv.*), instead of the deprecated lookups on the model object itself.
    keyed_vectors = getattr(model, 'wv', model)

    embedding_clusters = []
    word_clusters = []
    for word in keys:
        words = [similar_word for similar_word, _ in keyed_vectors.most_similar(word, topn=topn)]
        embedding_clusters.append([keyed_vectors[similar_word] for similar_word in words])
        word_clusters.append(words)

    embedding_clusters = np.array(embedding_clusters)
    n, m, k = embedding_clusters.shape
    tsne_model_en_2d = TSNE(perplexity=15, n_components=2, init='pca', n_iter=3500, random_state=32)
    # t-SNE runs once on all neighbours together; the result is folded back per key
    embeddings_en_2d = np.array(tsne_model_en_2d.fit_transform(embedding_clusters.reshape(n * m, k))).reshape(n, m, 2)
    return [embeddings_en_2d, word_clusters]
# Example query: every word of it becomes one cluster of similar words.
keys = "sherwin williams phone number".split()
clust = cluster(keys=keys, model=w2v_model)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
#% matplotlib inline


def tsne_plot_similar_words(title, labels, embedding_clusters, word_clusters, a, filename=None):
    """Scatter-plot 2-D word clusters, one colour per label, annotating every point.

    Args:
        title: figure title.
        labels: one legend label per cluster.
        embedding_clusters: per cluster, an array indexed as [point, 0] / [point, 1]
            for the x / y coordinates.
        word_clusters: per cluster, the words annotating its points.
        a: alpha (opacity) of the scatter markers.
        filename: when given, the figure is also saved there as PNG.
    """
    plt.figure(figsize=(16*5/3, 9*5/3))
    colors = cm.rainbow(np.linspace(0, 1, len(labels)))
    for label, embeddings, words, color in zip(labels, embedding_clusters, word_clusters, colors):
        x = embeddings[:, 0]
        y = embeddings[:, 1]
        # `color=` (not `c=`): a single RGBA row passed as `c` is ambiguous and is
        # treated as values to colour-map when a cluster has exactly 4 points.
        plt.scatter(x, y, color=color, alpha=a, label=label, s = 50)
        for i, word in enumerate(words):
            plt.annotate(word, alpha=0.8, xy=(x[i], y[i]), xytext=(5, 2),
                         textcoords='offset points', ha='right', va='bottom', size=14)
    plt.legend(loc=4)
    plt.title(title)
    plt.grid(True)
    if filename:
        plt.savefig(filename, format='png', dpi=250, bbox_inches='tight')
    plt.show()


# Plot the clusters computed for the example query (marker alpha 0.7) and save
# the figure as 'similar_words.png'.
tsne_plot_similar_words('Similar words to a query', keys, clust[0], clust[1], 0.7,
                        'similar_words.png')

# Test Data Reordering - Executed on the best model

In [None]:
# Load the held-out test data: documents, queries and the Lucene retrievals for those queries.
test_data_doc = pd.read_csv("ProcDocs/test_data/test_data_doc.csv")
test_data_query = pd.read_csv("ProcDocs/test_data/test_data_query.csv")
test_data_lucene = pd.read_csv("ProcDocs/test_data/test_data_lucene.csv")

In [None]:
# Work on a copy so that test_data_doc itself stays unchanged.
test_data_doc_complete = test_data_doc.copy()

In [None]:
# Add the training documents that Lucene retrieved for the test queries, so that
# those candidates also have a row to vectorise.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); building from
# test_data_doc keeps the cell idempotent, so re-running it does not add the
# rows twice.
test_data_doc_complete = pd.concat([
    test_data_doc,
    docs_train_df[docs_train_df.doc_number.isin(list(test_data_lucene.doc_number))][['doc_number','doc_text','lem_text']],
]).reset_index(drop=True)

In [None]:
# Combining corpus and queries for training: the training documents, the train
# and test queries, and the held-out test documents and queries, each exposed as
# a 'text' series, concatenated and shuffled.
combined_training = pd.concat([
    docs_train_df.rename(columns={df_col_select: 'text'})['text'],
    queries_train_df.rename(columns={df_query_col_select: 'text'})['text'],
    queries_test_df.rename(columns={df_query_col_select: 'text'})['text'],
    test_data_doc.rename(columns={df_col_select: 'text'})['text'],
    test_data_query.rename(columns={df_query_col_select: 'text'})['text'],
])
combined_training = combined_training.sample(frac=1).reset_index(drop=True)

In [None]:
# Whitespace-tokenise every text of the shuffled corpus (tqdm shows progress).
train_data = [text.split() for text in tqdm(combined_training)]

In [None]:
# Training a word2vec model from the given data set.
# The hyper-parameters are hard-coded here (size=1000, min_count=0, window=10,
# sg=1) instead of being read from the configuration cell.
w2v_model = W2V(tqdm(train_data), size=1000, min_count=0, window=10, sg=1, workers=8)

In [None]:
# Sub-folder of model_path that receives the model trained with the test data.
# (The name `dir` is kept because the save cell reads it, although it shadows the builtin.)
dir = "test_data"
# makedirs creates missing parents and, with exist_ok, does not fail when the cell is re-run.
os.makedirs(model_path + dir, exist_ok=True)

In [None]:
# Persist the model inside the folder created in the previous cell.
w2v_model.save(model_path + dir +"/mod.model")

In [None]:
# Function returning vector representation of a document
def get_embedding_w2v_test_data(doc_tokens, wv_m_size = 1000, model = w2v_model):
    """Return the mean Word2Vec vector of the in-vocabulary tokens of a document.

    Args:
        doc_tokens: iterable of token strings.
        wv_m_size: length of the zero vector returned when nothing can be
            embedded; it should match the vector size of `model`.
        model: Word2Vec model whose `wv` vectors are averaged. The default is
            the `w2v_model` bound at the moment this cell is executed.

    Returns:
        The element-wise mean of the token vectors, or `np.zeros(wv_m_size)` when
        `doc_tokens` is empty or none of its tokens is in the model vocabulary.
    """
    vocab = model.wv.vocab
    embeddings = [model.wv.word_vec(tok) for tok in doc_tokens if tok in vocab]
    # Without this guard np.mean([]) evaluates to NaN, which would then flow into
    # the similarity computation.
    if not embeddings:
        return np.zeros(wv_m_size)
    # mean the vectors of individual words to get the vector of the document
    return np.mean(embeddings, axis=0)

In [None]:
# Getting Word2Vec vectors for the test-data corpus and queries.
# get_embedding_w2v_test_data is used on purpose: its default model is the one
# trained in this section, whereas get_embedding_w2v keeps the model (and vector
# size) that was bound when its own cell was executed.
test_data_doc_complete['vector']=test_data_doc_complete[df_col_select].progress_apply(lambda x :get_embedding_w2v_test_data(str(x).split()))
test_data_query['vector']=test_data_query[df_query_col_select].progress_apply(lambda x :get_embedding_w2v_test_data(str(x).split()))

In [None]:
#Re-rank documents for a query
def reorder_docs_test_data(q_num, top_N = 10):
  """Rank the Lucene candidates of a test-data query by cosine similarity to the query vector.

  Args:
    q_num: value matched against the `Query_number` columns of
      `test_data_lucene` and `test_data_query`.
    top_N: number of best-scoring rows to return.

  Returns:
    A copy of the matching `test_data_doc_complete` rows with an added
    'similarity' column, sorted by descending similarity, cut to `top_N` rows
    and re-indexed from 0.
  """
  # documents Lucene retrieved for this query
  lucene_rows = test_data_lucene[test_data_lucene.Query_number == q_num]
  lucene_docs_list = list(lucene_rows.doc_number)

  # query vector as a single-row matrix, as expected by cosine_similarity
  query_rows = test_data_query[test_data_query.Query_number == q_num]
  query_vec = np.array(query_rows['vector'].values[0]).reshape(1, -1)

  # candidate documents (copied so the global frame is left untouched)
  candidates = test_data_doc_complete[test_data_doc_complete.doc_number.isin(lucene_docs_list)].copy()

  # score and rank
  candidates['similarity'] = candidates['vector'].apply(
      lambda doc_vec: cosine_similarity(query_vec, np.array(doc_vec).reshape(1, -1)).item())
  ranked = candidates.sort_values(by='similarity', ascending=False)
  return ranked.head(top_N).reset_index(drop=True)

In [None]:
# Build the final ranking table: for every test-data query, one row per
# re-ranked document, in ranking order.
# Rows are collected in a list and the frame is created once; growing a
# DataFrame with .append inside the loop is quadratic, and DataFrame.append is
# removed in pandas 2.0.
rank_rows = []
for q_num in list(test_data_query.Query_number.unique()):
    result = reorder_docs_test_data(q_num)
    rank_rows.extend((q_num, doc_num) for doc_num in result['doc_number'])
dimensionality_reduction_rank = pd.DataFrame(rank_rows, columns=['Query_number', 'doc_number'])


In [None]:
# Display the resulting ranking table as the cell output.
dimensionality_reduction_rank

In [None]:
# Export the ranking to CSV without the index column.
dimensionality_reduction_rank.to_csv('dimensionality_reduction_rank.csv', index = False)