In [None]:
import json
import os

import numpy as np
import pandas as pd
from numpy import dot
from numpy.linalg import norm

In [None]:
ROOT_FOLDER = '.'
# Directory prefixes end with '/' so that `DATA_PATH + 'file'` style concatenation
# yields a valid path inside the folder (os.path.join works with them as well).
DATA_PATH = './data/'  # folder containing the data files
OUTPUT_PATH = './outputs/'  # folder where the outputs will be saved
DISTILBERT_RANKINGS_FILE = 'distilbert_cosine_similarity.txt'  # file name of the DistilBERT rankings
BM25_PRF_RANKINGS_FILE = 'prf.txt'  # file name of the BM25+PRF rankings
DISTILBERT_EVAL_FILE = 'errors_distilbert_ranking.txt'  # file name of the nDCG errors of the DistilBERT rankings
PATH_TO_INDEXES = 'indexes/trec-19-dl'

# Behaviour on most difficult queries

### Read Data

In [None]:
# Read Data
# Paths are built with os.path.join so they resolve whether or not DATA_PATH ends with a separator
# (plain `DATA_PATH + name` produced e.g. './data2019qrels-pass.txt').
qrels = pd.read_csv(os.path.join(DATA_PATH, '2019qrels-pass.txt'), sep=' ', header=None, names=["query_id", "Q0", "passage_id", "relevance"])
test_queries = pd.read_csv(os.path.join(DATA_PATH, 'msmarco-test2019-queries.tsv'), sep='\t', header=None, names=["query_id", "query"])
results = pd.read_csv(os.path.join(DATA_PATH, DISTILBERT_RANKINGS_FILE), sep='\t', header=None, names=["query_id", "passage_id", "rank"])
errors = pd.read_csv(os.path.join(DATA_PATH, DISTILBERT_EVAL_FILE), sep='\t', header=None, names=["label", "query_id", "value"])
passages = pd.read_csv(os.path.join(DATA_PATH, 'collection.tsv'), sep='\t', header=None, names=["passage_id", "passage"])

In [None]:
# Attach the query text, then the passage text, to every relevance judgement.
# Left joins keep every judgement even when no matching text is found.
qrels = (
    qrels
    .merge(test_queries, how='left', on='query_id')
    .merge(passages, how='left', on='passage_id')
)

### Find most difficult queries

In [None]:
# Metric name, taken from the row with index label 1 of the errors table
ndcg = errors.loc[1, 'label']
# Examined queries: rows of that metric whose value is at most 0.3
is_low_score = errors['value'] <= 0.3
is_ndcg_row = errors['label'] == ndcg
queries = errors.loc[is_low_score & is_ndcg_row, 'query_id'].tolist()
queries

In [None]:
# Rankings produced for the examined queries (ids are compared as strings)
in_examined_queries = results['query_id'].astype(str).isin(queries)
returned = results.loc[in_examined_queries].reset_index()

### Find irrelevant passages returned for the most difficult queries

In [None]:
# Rankings produced for the examined queries (ids are compared as strings)
in_examined_queries = results['query_id'].astype(str).isin(queries)
returned = results.loc[in_examined_queries].reset_index()
# Judgements marked irrelevant (relevance == 0)
irrelevant = qrels[qrels['relevance'] == 0].reset_index()
# Inner join on (query, passage): pairs that were returned and are judged irrelevant.
# Both inputs carry an 'index' column from reset_index, hence the index_x / index_y suffixes.
irrelevant_returned = (
    returned
    .merge(irrelevant, how='inner', on=['query_id', 'passage_id'])
    .drop(columns=['index_x', 'index_y', 'Q0', 'relevance'])
)
irrelevant_returned

### Find relevant passages not returned for the most difficult queries

In [None]:
# (query, passage) pairs returned for the examined queries
returned_pairs = results.loc[
    results['query_id'].astype(str).isin(queries), ['query_id', 'passage_id']
].drop_duplicates()
# Find relevant (relevance >= 1) from examined queries
relevant = qrels.loc[(qrels['relevance'] >= 1) & (qrels['query_id'].astype(str).isin(queries))].reset_index()
# Anti-join on (query_id, passage_id): a passage counts as returned only if it was returned
# for the SAME query. Matching on passage_id alone would hide a relevant passage that
# happened to be returned for a different examined query.
flagged = relevant.merge(returned_pairs, how='left', on=['query_id', 'passage_id'], indicator=True)
relevant_not_returned = (
    flagged.loc[flagged['_merge'] == 'left_only']
    .drop(columns=['_merge', 'index', 'Q0'])
    .reset_index(drop=True)
)
relevant_not_returned

### Save files

In [None]:
# Create Output Files
# Make sure the output folder exists, and build paths with os.path.join so the files land
# inside OUTPUT_PATH whether or not it ends with a separator.
os.makedirs(OUTPUT_PATH, exist_ok=True)
irrelevant_returned.to_json(os.path.join(OUTPUT_PATH, "distilbert_irrelevant_returned.json"), orient="records", indent=2)
relevant_not_returned.to_json(os.path.join(OUTPUT_PATH, "distilbert_relevant_not_returned.json"), orient="records", indent=2)

### Find percentages of relevant, highly relevant, and perfectly relevant passages retrieved by the model for the most difficult queries

In [None]:
# For each relevance threshold (>= 1, >= 2, >= 3) compute, per examined query, the share of
# returned-and-judged passages whose relevance reaches the threshold, print the mean over
# queries and save the per-query rates as JSON.

# Judged passages (relevance <= 3). This does not depend on the threshold, so it is built once.
judged = qrels.loc[qrels['relevance'] <= 3, ['query_id', 'passage_id', 'relevance']]
# Inner join: returned passages that have a relevance judgement
judged_returned = pd.merge(returned, judged, how='inner', on=['query_id', 'passage_id'])
# Compare ids as strings (as the cells building `returned` do) so a non-numeric id cannot raise
judged_query_ids = judged_returned['query_id'].astype(str)

os.makedirs(OUTPUT_PATH, exist_ok=True)
for i in range(3):
    min_relevance = i + 1  # relevance > i is the same as relevance >= i + 1 for integer grades
    success_rates = {}
    for query in queries:
        grades = judged_returned.loc[judged_query_ids == str(query), 'relevance']
        n_judged = len(grades)
        # No judged passage was returned for this query: the rate is undefined.
        # Store None (JSON null) instead of dividing by zero.
        success_rates[query] = int((grades >= min_relevance).sum()) / n_judged if n_judged else None
    # Mean over the queries for which a rate is defined
    defined_rates = [rate for rate in success_rates.values() if rate is not None]
    print(np.mean(defined_rates) if defined_rates else float('nan'))

    # Create Output Files (named after this notebook's model, like the other outputs)
    out_file = os.path.join(OUTPUT_PATH, "distilbert_success_rates_greater_than_{}.json".format(min_relevance))
    with open(out_file, "w") as outfile:
        json.dump(success_rates, outfile, indent=2)

# Passage length analysis

In [None]:
# Read Data
# Reloaded from disk so this section starts from the raw tables.
# Paths are built with os.path.join so they resolve whether or not DATA_PATH ends with a separator.
qrels = pd.read_csv(os.path.join(DATA_PATH, '2019qrels-pass.txt'), sep=' ', header=None, names=["query_id", "Q0", "passage_id", "relevance"])
test_queries = pd.read_csv(os.path.join(DATA_PATH, 'msmarco-test2019-queries.tsv'), sep='\t', header=None, names=["query_id", "query"])
results = pd.read_csv(os.path.join(DATA_PATH, DISTILBERT_RANKINGS_FILE), sep='\t', header=None, names=["query_id", "passage_id", "rank"])
errors = pd.read_csv(os.path.join(DATA_PATH, DISTILBERT_EVAL_FILE), sep='\t', header=None, names=["label", "query_id", "value"])
passages = pd.read_csv(os.path.join(DATA_PATH, 'collection.tsv'), sep='\t', header=None, names=["passage_id", "passage"])

In [None]:
# Attach the query text, then the passage text, to every relevance judgement
qrels = (
    qrels
    .merge(test_queries, how='left', on='query_id')
    .merge(passages, how='left', on='passage_id')
)

# Long passages only: strictly more than 700 characters
mask = qrels['passage'].str.len().gt(700)
qres_large = qrels[mask].reset_index()
qres_large

In [None]:
# Rankings of both systems, read from the data folder.
# The DistilBERT rankings file is the same one loaded from DATA_PATH in the "Read Data" cells;
# reading it (and the BM25+PRF file) relative to the working directory was inconsistent with that.
results_embedd = pd.read_csv(os.path.join(DATA_PATH, DISTILBERT_RANKINGS_FILE), sep='\t', header=None, names=["query_id", "passage_id", "rank"])
results_prf = pd.read_csv(os.path.join(DATA_PATH, BM25_PRF_RANKINGS_FILE), sep='\t', header=None, names=["query_id", "passage_id", "rank"])

In [None]:
# Attach each system's rank to the long-passage judgements.
# Left joins keep every judgement; the rank is missing where a system did not return the pair.
pair_key = ['query_id', 'passage_id']
df = qres_large.merge(results_prf, how='left', on=pair_key).rename(columns={"rank": "PRF_rank"})
df = df.merge(results_embedd, how='left', on=pair_key).rename(columns={"rank": "embed_rank"})

### Keep only highly relevant passages that were ranked in the top 10 positions of BM25+PRF, and study the relation between passage length and the new rankings from the DistilBERT model

In [None]:
# Keep judgements with relevance >= 2 whose BM25+PRF rank is at most 10, then drop the
# bookkeeping columns that are not needed for the length analysis.
df = (
    df.loc[(df['relevance'] >= 2) & (df['PRF_rank'] <= 10)]
    .drop(columns=["index", "query_id", "Q0", "passage_id"])
    .reset_index(drop=True)
)
# Passage length in characters, computed in one vectorized pass instead of a row-by-row loop.
# The column is created even when no row survives the filter; it is cast to float so it keeps
# a floating-point dtype.
df["passage_length"] = df["passage"].str.len().astype(float)
df

In [None]:
### Save results
# Make sure the output folder exists, and build the path with os.path.join so the file lands
# inside OUTPUT_PATH whether or not it ends with a separator.
os.makedirs(OUTPUT_PATH, exist_ok=True)
df.to_csv(os.path.join(OUTPUT_PATH, "distilbert_passage_length_analysis.txt"), header=True, sep=" ", index=False)