## Qualitative comparison of contextual vs word embedding approaches

In [27]:
import pandas as pd
import numpy as np
import os

import numpy as np
import pandas as pd
from pyserini.index import IndexReader

from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from numpy import dot
from numpy.linalg import norm

In [None]:
# --- Locations -------------------------------------------------------------
ROOT_FOLDER = '.'
# Path to the data files.
DATA_PATH = './data'
# Path to where the outputs will be saved.
OUTPUT_PATH = './outputs'

# --- Ranking files to compare ----------------------------------------------
# Rankings produced by the word embedding model.
WORD_RANKINGS_FILE = 'glove_euclidean_ranking.txt'
# Rankings produced by the contextual embedding model.
CONTEXTUAL_RANKINGS_FILE = 'distilbert_cosine_similarity.txt'

### Read data

In [28]:
# Read the relevance judgements, the test queries and the passage collection
# from DATA_PATH.
# The paths are built with os.path.join: DATA_PATH has no trailing separator,
# so plain string concatenation would point at './data2019qrels-pass.txt'
# instead of './data/2019qrels-pass.txt'.
qrels = pd.read_csv(
    os.path.join(DATA_PATH, '2019qrels-pass.txt'),
    sep=' ',
    header=None,
    names=["query_id", "Q0", "passage_id", "relevance"],
)
test_queries = pd.read_csv(
    os.path.join(DATA_PATH, 'msmarco-test2019-queries.tsv'),
    sep='\t',
    header=None,
    names=["query_id", "query"],
)
passages = pd.read_csv(
    os.path.join(DATA_PATH, 'collection.tsv'),
    sep='\t',
    header=None,
    names=["passage_id", "passage"],
)

In [29]:
# Load the rankings of both models. Each file is tab-separated, has no header
# row, and is read into the columns (query_id, passage_id, rank).
word_results = pd.read_table(
    WORD_RANKINGS_FILE,
    header=None,
    names=["query_id", "passage_id", "rank"],
)
cont_results = pd.read_table(
    CONTEXTUAL_RANKINGS_FILE,
    header=None,
    names=["query_id", "passage_id", "rank"],
)

In [30]:
# Attach the query text and the passage text to every relevance judgement.
# Left joins keep every qrels row even when no matching text is found.
qrels = (
    qrels
    .merge(test_queries, how='left', on='query_id')
    .merge(passages, how='left', on='passage_id')
)

### Find highly relevant passages returned by both the word and contextual embedding models

In [None]:
# Keep only the judgements marked as highly relevant (relevance >= 2).
# reset_index() stores the original qrels row labels in an 'index' column.
relevant = qrels[qrels['relevance'] >= 2].reset_index()

# Inner join each model's rankings with the highly relevant judgements, so
# only highly relevant passages that the model actually returned remain.
# The helper columns 'index' and 'Q0' are not needed afterwards.
cont_returned = cont_results.merge(
    relevant, how='inner', on=['query_id', 'passage_id']
).drop(columns=['index', 'Q0'])
word_returned = word_results.merge(
    relevant, how='inner', on=['query_id', 'passage_id']
).drop(columns=['index', 'Q0'])

### Find highly relevant passages that word methods ranked more than 800 positions lower than contextual methods

In [106]:
def _rank_gap_examples(word_df, cont_df, min_gap=800):
    """Return relevant passages the word model ranked far below the contextual model.

    Both inputs hold one row per returned query-passage pair with the columns
    ``query_id``, ``passage_id`` and ``rank``; ``cont_df`` additionally
    provides ``relevance``, ``query`` and ``passage``.

    Args:
        word_df: Relevant passages returned by the word embedding model.
        cont_df: Relevant passages returned by the contextual embedding model.
        min_gap: A pair is kept only when ``word_rank - cont_rank`` is
            strictly greater than this value.

    Returns:
        A DataFrame with the columns ``query``, ``passage``, ``cont_rank``,
        ``word_rank`` and ``relevance``, in the row order of ``word_df``.
    """
    keys = ['query_id', 'passage_id']

    # One row per query-passage pair; the first occurrence wins.
    word_ranks = (
        word_df[keys + ['rank']]
        .drop_duplicates(subset=keys)
        .rename(columns={'rank': 'word_rank'})
    )
    cont_rows = (
        cont_df
        .drop_duplicates(subset=keys)
        .rename(columns={'rank': 'cont_rank'})
    )

    # The inner join keeps only pairs returned by BOTH models, so a pair that
    # is missing from one ranking is skipped instead of raising. Merging by
    # key also avoids relying on index labels, which are no longer
    # consecutive after drop_duplicates().
    paired = pd.merge(word_ranks, cont_rows, how='inner', on=keys)

    gap = paired['word_rank'] - paired['cont_rank']
    columns = ['query', 'passage', 'cont_rank', 'word_rank', 'relevance']
    return paired.loc[gap > min_gap, columns].reset_index(drop=True)


# Highly relevant pairs where the word embedding ranking is more than 800
# positions lower than the corresponding contextual ranking.
df = _rank_gap_examples(word_returned, cont_returned)
df

10


### Save Results

In [None]:
# Write the comparison table as a list of JSON records inside OUTPUT_PATH.
# os.path.join supplies the separator that string concatenation omitted
# (OUTPUT_PATH has no trailing slash), and the directory is created first
# because to_json does not create missing directories.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_json(
    os.path.join(OUTPUT_PATH, "comparison_contextual_vs_word.json"),
    orient="records",
    indent=2,
)