# Experiment with Query Expansion Using Word Embeddings for Ad Hoc Search

Imports for the project

In [None]:
%%capture
!pip install pyserini
!pip install ir_datasets
!pip install gensim
!pip install faiss-cpu --no-cache
import ir_datasets
import gensim
from gensim.parsing.preprocessing import remove_stopwords
import os
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
from multiprocessing import Pool
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

Download required files for our experiment.

In [None]:
# Download the qrels (relevance judgements); -c continues a partial download
!wget -c https://trec.nist.gov/data/robust/qrels.robust2004.txt

# Download the pre-trained Google News word2vec model (gzip-compressed)
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

# Unzip the Google News word2vec pre-trained model
!gzip -d GoogleNews-vectors-negative300.bin.gz

We downloaded the qrels from https://trec.nist.gov/data/t13_robust.html; they are used as the ground truth to evaluate our technique.

In [None]:
# Function to load the qrels text file into a dataframe
def get_ground_truth(path='qrels.robust2004.txt'):
  """Read the relevance judgements (qrels) into a dataframe.

  Args:
    path: Location of the whitespace-separated, header-less qrels file.
      Defaults to the file name the download cell saves
      (`qrels.robust2004.txt`).

  Returns:
    DataFrame whose columns 0, 2 and 3 are renamed to `query_id`, `doc_id`
    and `relevance`; column 1 keeps its positional label.
  """
  # Split on any run of whitespace so tabs or repeated spaces do not create
  # spurious empty columns.
  ground_truth = pd.read_csv(path, sep=r'\s+', header=None)
  ground_truth = ground_truth.rename(columns={0: 'query_id', 2: 'doc_id', 3: 'relevance'})
  return ground_truth

## Load Dataset

In [None]:
dataset = ir_datasets.load("trec-robust04")
# One row per query record yielded by the dataset
query_df = pd.DataFrame(list(dataset.queries_iter()))
score_df = get_ground_truth()

# Map relevance grade 2 onto 1; every other grade is left untouched
score_df['relevance'] = score_df['relevance'].replace({2: 1})

## Pre-processing

In [None]:
# Strip stopwords from the title (in place), then tokenise it for the title-only model
query_df["title"] = query_df["title"].map(remove_stopwords)
query_df["query_exp_title"] = query_df["title"].map(gensim.utils.simple_preprocess)

In [None]:
# Build the title + description text, strip stopwords, then tokenise it
query_df["soup"] = (
    query_df["title"]
    .str.cat(query_df["description"], sep=' ')
    .map(remove_stopwords)
)
query_df["query_exp_title_description"] = query_df["soup"].map(gensim.utils.simple_preprocess)

In [None]:
# Build the title + description + narrative text, strip stopwords, then tokenise it
query_df["soup"] = (
    query_df["title"]
    .str.cat([query_df["description"], query_df["narrative"]], sep=' ')
    .map(remove_stopwords)
)
query_df["query_exp_all"] = query_df["soup"].map(gensim.utils.simple_preprocess)

## Training Model

In [None]:
# Hyper-parameters shared by the three Word2Vec training cells
size = 100       # passed as `vector_size`
sg = 1           # passed as `sg`
window = 2       # passed as `window`
min_count = 1    # passed as `min_count`
iter = 100       # passed as `epochs`; shadows the builtin, name kept because the training cells reference it
# Ask the OS for the CPU count directly rather than spawning a multiprocessing
# Pool (which was never closed) just to read its private `_processes` field.
workers = os.cpu_count() or 1

In [None]:
# Training model with only title
title_vector_model = Word2Vec(
    sentences = query_df["query_exp_title"],
    vector_size = size,
    sg = sg,
    window = window,
    min_count = min_count,
    epochs = iter,
    workers = workers,
)
# No init_sims() call: it is obsolete in gensim 4, where similarity queries
# compute the vector norms they need themselves.
title_vector_model.save('word2vec_title_vector_model')

In [None]:
# Training model with title and description
title_description_vector_model = Word2Vec(
    sentences = query_df["query_exp_title_description"],
    vector_size = size,
    sg = sg,
    window = window,
    min_count = min_count,
    epochs = iter,
    workers = workers,
)
# No init_sims() call: it is obsolete in gensim 4, where similarity queries
# compute the vector norms they need themselves.
title_description_vector_model.save('word2vec_title_description_vector_model')

In [None]:
# Training model with title, description and narrative
all_vector_model = Word2Vec(
    sentences = query_df["query_exp_all"],
    vector_size = size,
    sg = sg,
    window = window,
    min_count = min_count,
    epochs = iter,
    workers = workers,
)
# No init_sims() call: it is obsolete in gensim 4, where similarity queries
# compute the vector norms they need themselves.
all_vector_model.save('word2vec_all_vector_model')

In [None]:
# Load the unzipped Google News vectors (word2vec binary format) as KeyedVectors.
# `model` is the name the optional Google-based query expansion passes to expand_query.
from gensim import models

model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
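# You can also uncomment this alternative query expansion method to test and play around with it.
# Note: it uses the gensim 3 `model.wv.vocab` API; on gensim 4 replace it with `model.wv.key_to_index`.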


# from operator import itemgetter
# def expand_query(text, model):
#   final_query = gensim.utils.simple_preprocess(text)
#   token = remove_stopwords(text)
#   token = gensim.utils.simple_preprocess(token)
#   word_to_add = []
#   for t in token:
#     if t in model.wv.vocab:
#       sim = model.wv.most_similar(t.lower(), topn=5)
#       if len(sim) > 0:
#         word_to_add.append(sim[0][0])
#       if len(sim) > 1:
#         word_to_add.append(sim[1][0])
#       if len(sim) > 2:
#         word_to_add.append(sim[2][0])
#       if len(sim) > 3:
#         word_to_add.append(sim[3][0])
#       if len(sim) > 4:
#         word_to_add.append(sim[4][0])

#   for o in word_to_add:
#     o = o.replace('_', ' ')
#     final_query.insert(0, o)

#   return ' '.join(final_query)

## Query Expansion

In [None]:
def expand_query(text, model, min_similarity=.1, max_expansions=1):
  """Expand a query with embedding neighbours of its first and last content words.

  Args:
    text: Raw query string.
    model: Either a trained Word2Vec-style model exposing its vectors under
      `.wv`, or a bare gensim `KeyedVectors` instance.
    min_similarity: A neighbour is kept only if its similarity is strictly
      greater than this value.
    max_expansions: Maximum number of neighbours added per anchor word.

  Returns:
    The tokenised query joined with spaces. Neighbours of the first content
    word are inserted at the front; neighbours of the last content word are
    inserted just before the final query term (`list.insert(-1, ...)`).
    If the text has no content words left after stopword removal, the
    tokenised query is returned unexpanded.
  """
  final_query = gensim.utils.simple_preprocess(text)
  tokens = gensim.utils.simple_preprocess(remove_stopwords(text))
  if not tokens:
    # Nothing to anchor the expansion on (empty or stopword-only query).
    return ' '.join(final_query)

  # isinstance also accepts KeyedVectors subclasses, unlike comparing str(type(...)).
  vectors = model if isinstance(model, gensim.models.KeyedVectors) else model.wv

  def _neighbours(word):
    """Return up to `max_expansions` neighbours of `word` above the threshold."""
    # key_to_index is a dict, so the membership test does not scan the vocabulary.
    if word not in vectors.key_to_index:
      return []
    accepted = [cand for cand, sim in vectors.most_similar(word) if sim > min_similarity]
    return accepted[:max_expansions]

  for neighbour in _neighbours(tokens[0].lower()):
    final_query.insert(0, neighbour)

  # For a single-word query the first and last anchors coincide, so the same
  # neighbour is added twice (unchanged from the previous behaviour).
  for neighbour in _neighbours(tokens[-1].lower()):
    final_query.insert(-1, neighbour)

  return ' '.join(final_query)

# Expand the (stopword-stripped) title with each of the three trained models.
# These assignments overwrite the token-list columns the models were trained on.
query_df['query_exp_title'] = query_df.title.apply(lambda x: expand_query(x, title_vector_model))
query_df['query_exp_title_description'] = query_df.title.apply(lambda x: expand_query(x, title_description_vector_model))
query_df['query_exp_all'] = query_df.title.apply(lambda x: expand_query(x, all_vector_model))

# The Google News expansion has to run too: the ranking cell reads the
# `query_exp_title_google` column and raises KeyError when it is missing.
query_df['query_exp_title_google'] = query_df.title.apply(lambda x: expand_query(x, model))
query_df


## Document Ranking 

In [None]:
from pyserini.search import SimpleSearcher

# Shared searcher over the prebuilt 'robust04' index; the ranking functions below
# read it as a module-level global.
searcher = SimpleSearcher.from_prebuilt_index('robust04')

def get_ranking(dataframe, query):
  """Search the index once per row and keep at most ten hits per query.

  Args:
    dataframe: Frame with a `query_id` column and the query-text column
      named by `query`.
    query: Name of the column holding the query text to search with.

  Returns:
    Dict mapping each row's `query_id` to a list of
    `{'docid': ..., 'score': ...}` dicts in the order the searcher returned them.
  """
  rankings = {}
  for _, record in dataframe.iterrows():
    hits = searcher.search(record[query])
    depth = min(10, len(hits))
    rankings[record['query_id']] = [
        {'docid': hits[pos].docid, 'score': hits[pos].score}
        for pos in range(depth)
    ]
  return rankings

# Rank documents for each expanded-query column of query_df.
r_title = get_ranking(query_df, 'query_exp_title')
r_title_description = get_ranking(query_df, 'query_exp_title_description')
r_title_all = get_ranking(query_df, 'query_exp_all')
# Requires query_df to already contain a `query_exp_title_google` column.
r_title_title_google = get_ranking(query_df, 'query_exp_title_google')

## Result Match

In [None]:
def get_match(results, qrels=None):
  """Turn ranked hits into per-query relevance label lists of length >= 10.

  Args:
    results: Dict mapping a query id (anything `int()` accepts) to a list of
      hit dicts that carry a `docid` key.
    qrels: Relevance judgements with `query_id`, `doc_id` and `relevance`
      columns. Defaults to the module-level `score_df`.

  Returns:
    List of `{'query_id': key, 'hits': labels}` dicts, one per query, where
    `labels[i]` is the judged relevance of the i-th hit (0 when the pair is
    not judged). Label lists shorter than ten are padded with zeros.
  """
  if qrels is None:
    qrels = score_df

  # Index the judgements once instead of filtering the whole frame per hit.
  # setdefault keeps the first judgement if a (query, doc) pair is repeated.
  judged = {}
  for qid, doc_id, rel in zip(qrels['query_id'].astype(int), qrels['doc_id'], qrels['relevance']):
    judged.setdefault((qid, doc_id), rel)

  ground_rank = []
  for key, value in results.items():
    qid = int(key)
    ranks = [judged.get((qid, hit['docid']), 0) for hit in value]
    # Pad short rankings with non-relevant labels up to ten entries.
    ranks.extend([0] * (10 - len(ranks)))
    ground_rank.append({'query_id': key, 'hits': ranks})
  return ground_rank

# Convert each ranking into per-query relevance label lists.
title_result = get_match(r_title)
title_description_result = get_match(r_title_description)
all_result = get_match(r_title_all)
google = get_match(r_title_title_google)
# google
# Display the labels obtained with the title + description + narrative model
all_result

## Calculate evaluation metrics

In [None]:
def _mean_over_queries(per_query_scores):
    """Average a list of per-query scores; an empty list scores 0.0."""
    if not per_query_scores:
        return 0.0
    return sum(per_query_scores) / len(per_query_scores)


def _precision_at_k(label, k):
    """Precision@k of a single label list: sum of the first k labels over k."""
    return np.sum(label[:k]) / k


def _recall_at_k(label, k):
    """Recall@k of a single label list, relative to the labels present in it."""
    # NOTE(review): the denominator only counts relevant labels inside `label`.
    # If `label` holds just the retrieved top-k, this is 1.0 whenever anything
    # relevant was retrieved -- confirm that is the intended definition.
    relevant_total = np.sum(label)
    return np.sum(label[:k]) / relevant_total if relevant_total else 0.0


def _average_precision(label):
    """Average precision of one label list (0.0 when it has no relevant label)."""
    relevant_total = np.sum(label)
    if not relevant_total:
        return 0.0
    accumulated = 0
    for i in range(len(label)):
        if label[i]:
            accumulated += np.sum(label[:(i + 1)]) / (i + 1)
    return accumulated / relevant_total


def precision(query_relevancy_labels, k):
    """Mean precision@k over all queries.

    The number of queries is taken from `query_relevancy_labels` itself, so
    the function no longer needs the `total_query` global to be set first.
    Returns 0.0 for an empty list of queries.
    """
    return _mean_over_queries([_precision_at_k(label, k) for label in query_relevancy_labels])


def recall(query_relevancy_labels, k):
    """Mean recall@k over all queries; queries without relevant labels add 0."""
    return _mean_over_queries([_recall_at_k(label, k) for label in query_relevancy_labels])


def F_score(query_relevancy_labels, k):
    """Mean over queries of the harmonic mean of precision@k and recall@k.

    A query whose precision and recall are both zero contributes 0.
    """
    per_query = []
    for label in query_relevancy_labels:
        precision_value = _precision_at_k(label, k)
        recall_value = _recall_at_k(label, k)
        sum_of_both = recall_value + precision_value
        per_query.append((2.0 * precision_value * recall_value) / sum_of_both if sum_of_both else 0.0)
    return _mean_over_queries(per_query)


def DCG(query_relevancy_labels, k):
    """Mean DCG@k with binary gains and a log2 discount.

    Any truthy label at zero-based position i contributes 1 / log2(i + 2).
    """
    per_query = []
    for label in query_relevancy_labels:
        # i + 2 because positions are zero-based and rank 1 is discounted by log2(2)
        per_query.append(sum(1 / np.log2(i + 2) for i, rel in enumerate(label[:k]) if rel))
    return _mean_over_queries(per_query)


def NDCG(query_relevancy_labels, k):
    """Mean NDCG@k; the ideal ranking is the label list sorted descending.

    Queries whose ideal DCG is zero contribute 0.
    """
    per_query = []
    for label in query_relevancy_labels:
        max_dcg = DCG([sorted(label, reverse=True)], k)
        per_query.append(DCG([label], k) / max_dcg if max_dcg else 0.0)
    return _mean_over_queries(per_query)


def MAP(query_relevancy_labels):
    """Mean average precision over all queries."""
    return _mean_over_queries([_average_precision(label) for label in query_relevancy_labels])


def MRR(query_relevancy_labels):
    """Mean reciprocal rank of the first truthy label (0 when there is none)."""
    per_query = [
        next((1 / (i + 1) for i, rel in enumerate(label) if rel), 0.0)
        for label in query_relevancy_labels
    ]
    return _mean_over_queries(per_query)

In [None]:
def evaluate(result):
  """Compute every evaluation metric for one set of matched results.

  Args:
    result: Iterable of dicts, each with a `hits` entry holding the
      relevance labels of one query.

  Returns:
    Dict with the keys `precision@10`, `recall@10`, `F-score@10`, `DCG@10`,
    `NDCG@10`, `MAP` and `MRR`, in that order.
  """
  global total_query
  query_labels = [entry['hits'] for entry in result]

  # Published as a module-level global: the number of evaluated queries.
  total_query = len(query_labels)

  scores = {}
  scores['precision@10'] = precision(query_labels, 10)
  scores['recall@10'] = recall(query_labels, 10)
  scores['F-score@10'] = F_score(query_labels, 10)
  scores['DCG@10'] = DCG(query_labels, 10)
  scores['NDCG@10'] = NDCG(query_labels, 10)
  scores['MAP'] = MAP(query_labels)
  scores['MRR'] = MRR(query_labels)
  return scores

In [None]:
# Score each query-expansion variant with the full metric set.
title_scores = evaluate(title_result)
title_description_scores = evaluate(title_description_result)
all_score = evaluate(all_result)
google_score = evaluate(google)

# Show the four metric dicts side by side
title_scores, title_description_scores, all_score, google_score

## Calculate Baseline

In [None]:
def get_expansion_ranking(dataframe, query, fb_terms=10, fb_docs=10, original_query_weight=0.5):
  """Rank documents for every row with RM3 enabled on the shared searcher.

  Args:
    dataframe: Frame with a `query_id` column and the query-text column
      named by `query`.
    query: Name of the column holding the query text to search with.
    fb_terms, fb_docs, original_query_weight: Values passed positionally to
      `searcher.set_rm3`; the defaults reproduce the previous hard-coded
      `set_rm3(10, 10, 0.5)` call.

  Returns:
    Dict mapping each row's `query_id` to at most ten
    `{'docid': ..., 'score': ...}` dicts.
  """
  results = {}
  searcher.set_rm3(fb_terms, fb_docs, original_query_weight)
  try:
    for index, row in dataframe.iterrows():
      hits = searcher.search(row[query])
      final_hits = []
      for i in range(0, min(10, len(hits))):
        final_hits.append({'docid': hits[i].docid, 'score': hits[i].score})
      results[row['query_id']] = final_hits
  finally:
    # `searcher` is shared with get_ranking; switch RM3 off again so later
    # searches are not silently run with RM3 enabled.
    searcher.unset_rm3()
  return results

# Baseline: search with the `title` column (stopwords already removed by the
# pre-processing cell) through the RM3-enabled ranking, then match and score it.
baseline_evaluation = evaluate(get_match(get_expansion_ranking(query_df, 'title')))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Materialise the dict views so the plotted data is a plain list
x = list(baseline_evaluation.keys())
y = list(baseline_evaluation.values())
z = list(title_scores.values())

X_axis = np.arange(len(x))

plt.figure(figsize=(10, 6), dpi=80)

plt.xticks(X_axis, x)
plt.bar(X_axis-0.2, y, label='Baseline Evaluation', width=0.5)
plt.bar(X_axis+0.3, z, label='TitleQE Evaluation', width=0.5)

# Title and axis label so the figure can be read on its own
plt.title('Baseline vs. title-only query expansion')
plt.ylabel('Score')
plt.legend();

In [None]:
# Materialise the dict views so the plotted data is a plain list
x = list(baseline_evaluation.keys())
y = list(baseline_evaluation.values())
z = list(title_description_scores.values())

X_axis = np.arange(len(x))

plt.figure(figsize=(10, 6), dpi=80)

plt.xticks(X_axis, x)
plt.bar(X_axis-0.2, y, label='Baseline Evaluation', width=0.5)
plt.bar(X_axis+0.3, z, label='Title description QE Evaluation', width=0.5)

# Title and axis label so the figure can be read on its own
plt.title('Baseline vs. title + description query expansion')
plt.ylabel('Score')
plt.legend();

In [None]:
# Build labelled copies instead of adding a 'name' key to the score dicts in
# place: the plotting cells iterate over those dicts' values and would break
# on a string entry if they were re-run afterwards.
summary_rows = [
    # The baseline ranking runs with RM3 enabled, so it is labelled as such.
    {**baseline_evaluation, 'name': 'RM3 Baseline (No Embedding Expansion)'},
    {**title_scores, 'name': 'Query Expansion With Title'},
    {**title_description_scores, 'name': 'Query Expansion With Title and Description'},
    {**all_score, 'name': 'Query Expansion With Title, Description and Narrative'},
    {**google_score, 'name': 'Query Expansion With google'},
]

pd.DataFrame(summary_rows)