# Team 13 MS MARCO Document re-ranking task

Here you will find our implementation for the MS MARCO re-ranking 2020 task. This project was part of our evaluation and project work in DAT640-1 20H Information Retrieval and Text Mining. This implementation are using a feature based traditional machine learning approach to this problem.

## Task

We were to re-rank the top 100 documents retrieved from queries (from Indri model) in the MS MARCO dataset.
We have implemented a both a baseline and an advanced model to tackle this problem.
Our models are feature based traditional machine learning models with a pointwise learning to rank approach.

## Environment

Your environment is expected to be an Anaconda base environment with Python version 3.6+.
You must have elasticsearch library installed with version 7.9.1+ and have a local instance of it running on your machine.

In [1]:
import csv
from elasticsearch import Elasticsearch
from collections import defaultdict
import sys
import random
import os
import random
from collections import defaultdict
import pandas as pd
from elasticsearch import helpers
from nltk import sent_tokenize, word_tokenize
import nltk
from scipy import spatial
import numpy as np
import math
from nltk.util import ngrams
import concurrent.futures
import more_itertools as mit
from sklearn.ensemble import RandomForestRegressor
import pickle
import numpy as np
#from LambdaRankNN import LambdaRankNN

In [5]:
#To deal with csv error. https://stackoverflow.com/questions/15063936/csv-error-field-larger-than-field-limit-131072
maxInt = sys.maxsize
while True:
    try:
        csv.field_size_limit(maxInt)
        break
    except OverflowError:
        maxInt = int(maxInt/10)

## Indexing

In [6]:
# set index to True to index the dataset. Requiers msmarco-docs.tsv and elasticsearch.
index = False

es = Elasticsearch()
es.info()

{'name': 'DESKTOP-E2UM8IU',
 'cluster_name': 'elasticsearch',
 'cluster_uuid': '0MFc_C9cRA27Grr9lSUCYA',
 'version': {'number': '7.9.1',
  'build_flavor': 'default',
  'build_type': 'zip',
  'build_hash': '083627f112ba94dffc1232e8b42b73492789ef91',
  'build_date': '2020-09-01T21:22:21.964974Z',
  'build_snapshot': False,
  'lucene_version': '8.6.2',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [7]:
INDEX_NAME ='ms_marco_index'


INDEX_SETTINGS = {
    "settings":{
        "analysis":{
       
        "analyzer": {
            "my_analyzer":{
                "tokenizer":"standard",
                "filter": [ "lowercase","kstem","stop" ]     
            }         
        }       
            
        }    
        
    },
    'mappings': {
            'properties': {
                'url': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'english'
                },
                'title': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'my_analyzer'
                },
                'body': {
                    'type': 'text',
                    'term_vector': 'yes',
                    'analyzer': 'my_analyzer'
                }
                
                
            }
        }
    }

if index:
    es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

The following cell defines a function for bulk indexing

In [8]:
def get_documents(file="msmarco-docs.tsv"):
    """Create generator object containg document dictionaries for bulk indexing
    Arguments:
        file(string): name of file to index. 
    Returns:
        Generator object 
    """
    with open(file,encoding='utf8') as f:
        tsv = csv.reader(f, delimiter="\t")
        for [doc, url, title,body] in tsv:
            yield {'_id':doc,'url':url,'title':title,'body':body}

if index:
    helpers.bulk(es, get_documents(), index=INDEX_NAME)

## Trainingdata

In this section we will retrieve our downloaded datasets into our code and putting them into pandas dataframes for further use. Here we will also modify the dataframes to only get the columns we need to use for our implementation

In [9]:
df_querys_train = pd.read_csv("queries.doctrain.tsv",encoding="utf8",delimiter="\t",header=None)
df_querys_train.rename( columns ={0: 'id', 1: 'query'}, inplace = True )
df_querys_train = df_querys_train.astype('str')
df_querys_train.set_index('id',inplace = True)
df_querys_train.head()

Unnamed: 0_level_0,query
id,Unnamed: 1_level_1
1185869,)what was the immediate impact of the success ...
1185868,_________ justice is designed to repair the ha...
1183785,elegxo meaning
645590,what does physical medicine do
186154,feeding rice cereal how many times per day


In [10]:
df_qrels_train = pd.read_csv("msmarco-doctrain-qrels.tsv",encoding="utf8",delimiter="\t",header=None) 
df_qrels_train.rename( columns ={0: 'test'}, inplace = True )
df_qrels_train = pd.DataFrame( df_qrels_train.test.str.split(' ',3).tolist(),columns = ['query_id','i1','doc_id','i2'])
df_qrels_train.set_index(['query_id','doc_id'],inplace = True)
df_qrels_train.drop(['i1', 'i2'], axis=1,inplace = True)
df_qrels_train.head()

query_id,doc_id
3,D312959
5,D140227
12,D213890
15,D1033338
16,D508131


In [11]:
df_top100_train = pd.read_csv("msmarco-doctrain-top100.tsv",encoding="utf8",delimiter="\t",header=None)#,nrows=1000)
df_top100_train.rename( columns ={0: 'test'}, inplace = True )
df_top100_train = pd.DataFrame( df_top100_train.test.str.split(' ',5).tolist(),columns = ['query_id','i1','doc_id','i2','score','i3'])
df_top100_train.set_index(['query_id','doc_id'],inplace = True)
df_top100_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,i1,i2,score,i3
query_id,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1185869,D59221,Q0,1,-4.80433,IndriQueryLikelihood
1185869,D59220,Q0,2,-4.92127,IndriQueryLikelihood
1185869,D2192591,Q0,3,-5.05215,IndriQueryLikelihood
1185869,D2777518,Q0,4,-5.05486,IndriQueryLikelihood
1185869,D2371978,Q0,5,-5.07048,IndriQueryLikelihood


# Dev and Testdata

In this section we will retrieve our downloaded development and training datasets into our code and putting them into pandas dataframes for further use. Here we will also modify the dataframes to only get the columns we need to use for our implementation like we did in the previous section.

In [12]:
df_querys_devtest = pd.read_csv("queries.docdev.tsv",encoding="utf8",delimiter="\t",header=None)
df_querys_devtest.rename( columns ={0: 'id', 1: 'query'}, inplace = True )
df_querys_devtest = df_querys_devtest.astype('str')
df_querys_devtest.set_index('id',inplace = True)
df_querys_devtest.head()

Unnamed: 0_level_0,query
id,Unnamed: 1_level_1
174249,does xpress bet charge to deposit money in you...
320792,how much is a cost to run disneyland
1090270,botulinum definition
1101279,do physicians pay for insurance from their sal...
201376,here there be dragons comic


In [13]:
df_top100_devtest = pd.read_csv("docdev-stopstem.xml_1.out",encoding="utf8",delimiter="\t",header=None)#,nrows=1000)
df_top100_devtest.rename( columns ={0: 'test'}, inplace = True )
df_top100_devtest = pd.DataFrame( df_top100_devtest.test.str.split(' ',5).tolist(),columns = ['query_id','i1','doc_id','i2','score','i3'])
df_top100_devtest.set_index(['query_id','doc_id'],inplace = True)
#df_qrels.drop(['i1', 'i2'], axis=1,inplace = True)
df_top100_devtest.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,i1,i2,score,i3
query_id,doc_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
174249,D3126539,Q0,1,-5.99003,IndriQueryLikelihood
174249,D978773,Q0,2,-6.18444,IndriQueryLikelihood
174249,D399803,Q0,3,-6.20982,IndriQueryLikelihood
174249,D2204704,Q0,4,-6.24312,IndriQueryLikelihood
174249,D3126541,Q0,5,-6.24726,IndriQueryLikelihood


In [14]:
df_qrels_devtest = pd.read_csv("msmarco-docdev-qrels.tsv",encoding="utf8",delimiter="\t",header=None) 
df_qrels_devtest.rename( columns ={0: 'test'}, inplace = True )
df_qrels_devtest = pd.DataFrame( df_qrels_devtest.test.str.split(' ',3).tolist(),columns = ['query_id','i1','doc_id','i2'])
df_qrels_devtest.set_index(['query_id','doc_id'],inplace = True)
df_qrels_devtest.drop(['i1', 'i2'], axis=1,inplace = True)
df_qrels_devtest.head()

query_id,doc_id
2,D1650436
1215,D1202771
1288,D1547717
1576,D1313702
2235,D2113408


In [100]:
df_querys_dev = df_querys_devtest[500:]#.sample(frac=.95,random_state=2)
df_querys_test = df_querys_devtest.drop(df_querys_dev.index)
df_top100_dev = df_top100_devtest.loc[df_querys_dev.index] 
df_top100_test =df_top100_devtest.loc[df_querys_test.index]
df_qrels_dev = df_qrels_devtest.loc[df_querys_dev.index]
df_qrels_test = df_qrels_devtest.loc[df_querys_test.index]

print('Dev: ',len(df_querys_dev))
print('Test: ',len(df_querys_test))

Dev:  4693
Test:  500


## Evaluate 

In this section we will implement the function to rank the MRR@100 score for our implementations. The function is gathered from the official MS MARCO website.

In [16]:
#Official scoring function from: https://github.com/microsoft/MSMARCO-Document-Rankin,ms_marco_eval.py 
def compute_metrics(qids_to_relevant_documentids, qids_to_ranked_candidate_documents):
    """Compute MRR metric
    Args:    
    p_qids_to_relevant_documentids (dict): dictionary of query-document mapping
        Dict as read in with load_reference or load_reference_from_stream
    p_qids_to_ranked_candidate_documents (dict): dictionary of query-document candidates
    Returns:
        dict: dictionary of metrics {'MRR': <MRR Score>}
    """
    all_scores = {}
    MRR = 0
    qids_with_relevant_documents = 0
    ranking = []
    
    for qid in qids_to_ranked_candidate_documents:
        if qid in qids_to_relevant_documentids:
            ranking.append(0)
            target_pid = qids_to_relevant_documentids[qid]
            candidate_pid = qids_to_ranked_candidate_documents[qid]
            for i in range(0,len(candidate_pid)):
                if candidate_pid[i] in target_pid:
                    MRR += 1/(i + 1)
                    ranking.pop()
                    ranking.append(i+1)
                    break
    if len(ranking) == 0:
        raise IOError("No matching QIDs found. Are you sure you are scoring the evaluation set?")
    
    MRR = MRR/len(qids_to_relevant_documentids)
    all_scores['MRR @100'] = MRR
    all_scores['QueriesRanked'] = len(set(qids_to_ranked_candidate_documents))
    return all_scores


In [101]:
truth_test = {}
rank_indri ={}
for query in df_top100_test.index.get_level_values(0).unique():
    truth_test[query] = df_qrels_test.loc[query].index.tolist()
    rank_indri[query] = df_top100_test.loc[query].index.tolist()

result = compute_metrics(truth_test,rank_indri)
print('Indri:',result  )  

Indri: {'MRR @100': 0.22037977500707348, 'QueriesRanked': 500}


# Baseline model

## Functions from Assignment A5 DAT640 University of Savanger fall 2020:

Here we will implement the functions used to retrieve the features for our baseline model and functions for ranking. Many of these functions are taken from Assignment: A5 in the course DAT640-1 20H Information Retrieval and Text Mining by Krisztian Balog. The functions taken from this assignment is marked with '#A5 DAT640 UiS 2020'

In [88]:
#A5 DAT640 UiS 2020
def analyze_query(es, query, field, index=INDEX_NAME):
    """Analyzes a query with respect to the relevant index. 
    
    Arguments:
        es: Elasticsearch object instance.
        query: String of query terms.
        field: The field with respect to which the query is analyzed. 
        index: Name of the index with respect to which the query is analyzed.  
    
    Returns:
        A list of query terms that exist in the specified field among the documents in the index. 
    """
    tokens = es.indices.analyze(index=index, body={'text': query,"tokenizer": "standard","filter": [ "lowercase","kstem","stop" ]})['tokens']
    query_terms = []
    for t in sorted(tokens, key=lambda x: x['position']):
        ## Use a boolean query to find at least one document that contains the term.
        hits = es.search(index=index, body={'query': {'match': {field: t['token']}}}, 
                                   _source=False, size=1).get('hits', {}).get('hits', {})
        doc_id = hits[0]['_id'] if len(hits) > 0 else None
        if doc_id is None:
            continue
        query_terms.append(t['token'])
    return query_terms

#A5 DAT640 UiS 2020
def extract_doc_features(doc_id, es, index=INDEX_NAME):
    """Extracts features of a document.
    
        Arguments:
            doc_id: Document identifier of indexed document.
            es: Elasticsearch object instance.
            index: Name of relevant index on the running Elasticsearch service. 

        Returns:
            Dictionary with keys 'doc_length_title', 'doc_length_body'.
    """
    doc_features = {}
    
    vec = es.termvectors(index=index, id=doc_id,term_statistics=True)['term_vectors']
    title = vec['title']['terms'] if 'title' in vec.keys() else 0
    body = vec['body']['terms'] if 'body' in vec.keys() else 0
    
    doc_features['doc_length_title'] = sum([int(title[term]['term_freq']) for term in title ]) if title!=0 else 0
    doc_features['doc_length_body'] = sum([int(body[term]['term_freq']) for term in body ]) if body!=0 else 0

    return doc_features

#A5 DAT640 UiS 2020
def rerank(ltr,df_ranked,querys,es =es,index =INDEX_NAME,  fields = ['title','body'],df =df_querys_test):

    
    test_rankings={}
    for i,query in enumerate(querys):
            
            X=[]
            for doc in df_ranked.loc[query].index.tolist():
                f = get_features_baseline(df_querys_test.loc[query]['query'], doc, es, index,fields)
                X.append(f)

            
            r = ltr.rank(X, df_ranked.loc[query].index.tolist())
            
            test_rankings[query] = [i[0] for i in r]
            
            if i%2==0:
                    print(round((i/len(querys))*100,3),'%',end='\r')
    
    print(round((i/len(querys))*100,3),'%',end='\r')               
    return test_rankings

#A5 DAT640 UiS 2020
class PointWiseLTRModel(object):
    def __init__(self, regressor):
        """
        Arguments:
            classifier: An instance of scikit-learn regressor.
        """
        self.regressor = regressor

    def _train(self, X, y):
        """Trains an LTR model.
        
        Arguments:
            X: Features of training instances.
            y: Relevance assessments of training instances.
        """
        assert self.regressor is not None
        self.model = self.regressor.fit(X, y)

    def rank(self, ft, doc_ids):
        """Predicts relevance labels and rank documents for a given query.
        
        Arguments:
            ft: A list of feature vectors for query-document pairs.
            doc_ids: A list of document ids.
        Returns:
            List of tuples, each consisting of document ID and predicted relevance label.
        """
        assert self.model is not None
        rel_labels = self.model.predict(ft)
        
        sort_indices = np.argsort(rel_labels)[::-1]

        results = []
        for i in sort_indices:
            results.append((doc_ids[i], rel_labels[i]))
        return results

In [89]:

def get_unordered_bigram_matches(query,text):
        """Count number of sentences with more then two query matches
        Args:    
        query (list):  list of query tokens
        text (string) text to 
        Returns:
            dict: dictionary of metrics {'hits':number ,'length':number of sentences }
        """
        t_sentence = list(sent_tokenize(text))
        sentences = [get_tokenized(sentence) for sentence in t_sentence]
        sent = sum([1 for sentence in sentences if len(set(sentence).intersection(query))>=2 ])
        return {'hits':sent,'length':len(t_sentence)} 
    

def get_tokenized(text,limit=40000):
        """ 
        Args:    
        text (string): text to be tokenized  
        limit(int): cap the text length used
        Returns:
            list contaning tokens
        """
        
        tokens = es.indices.analyze(index=INDEX_NAME, body={'text': text[:limit],"tokenizer": "standard","filter": [ "lowercase","kstem","stop" ]})['tokens']
        return [ token['token'] for token in  sorted(tokens, key=lambda x: x['position']) ]

def get_ngram_matches(query,text,n_grams):
        """ 
        Args:    
        query (list): query tokens  
        text(string): text
        n_gram: number of n gram words to use.
        Returns:
             dict: dictionary of metrics{'unique_query_terms':int,'sum_TF':int,'max_TF':int,'avg_TF':int,'len':int }
        """
    
        features ={'unique_query_terms':0,'sum_TF':0,'max_TF':0,'avg_TF':0,'len':0 }
        
        if len(query)<n_grams:
            return features
        
        q_ngrams = list(ngrams(query,n_grams))
        t_ngrams = list(ngrams(text,n_grams))
        
        terms = [ t_ngrams.count(query_term) if query_term in t_ngrams else 0 for query_term in q_ngrams ]
        features['unique_query_terms'] = len( set([t  for t in  q_ngrams if t in t_ngrams]) )
        features['sum_TF'] =sum(terms)
        features['max_TF'] = max(terms) if sum(terms)!=0 else 0
        features['avg_TF'] = sum(terms)/len(terms) if sum(terms)!=0 else 0
        features['len'] = len(t_ngrams)
        
        return features    
            
def get_unigrams(query_terms, doc_id,field ,es, index=INDEX_NAME):
        """ 
        Args:    
        query_terms (list): query tokens  
        doc_id(string): id of document
        field(string): name of field to find matches
        es(): elasticsearch obj
        index(string): name of index
        Returns:
             dict: dictionary of metrics{'unique_query_terms':int,'sum_TF':int,'max_TF':int,'avg_TF':int,'len':int }
        """

        features = {'unique_query_terms':0,'sum_TF':0,'max_TF':0,'avg_TF':0,'len':0 }

        tv = es.termvectors(index=index,id=doc_id,field_statistics=True,term_statistics=True)['term_vectors']

        if field not in tv.keys():
            return features

        terms =  [tv[field]['terms'][t]['term_freq'] if t in tv[field]['terms'].keys() else 0  for t in  query_terms[field]  ]
        features['unique_query_terms'] = len( set([t  for t in  query_terms[field] if t in tv[field]['terms'].keys() ]) )
        features['sum_TF'] =sum(terms)
        features['max_TF'] = max(terms) if sum(terms)!=0 else 0
        features['avg_TF'] = sum(terms)/len(terms) if sum(terms)!=0 else 0
        features['len'] = int(sum([int(tv[field]['terms'][term]['term_freq']) for term in tv[field]['terms'].keys() ]) if terms!=0 else 0)

        return features


In [90]:
def get_feature_vectors_baseline(query,query_id,top_100_docs,elastic=es, index=INDEX_NAME, fields = ['title','body']):
        """ 
        Args:    
        query (string): query string  
        query_id(string): query id
        top_100_docs(list): list of document ids
        elastic(): elastic obj
        index(string):index name 
        Returns:
             numpy array: feauter vectors
             tuple: (query_id,doc_id) 
        """
    
        X = []
        ids = []
        for doc in top_100_docs:
            X.append( get_features_baseline(str(query),str(doc), elastic, index,))
            ids.append((str(query_id),str(doc)))
        
        return np.array(X),ids 
    
    

def get_features_baseline(query, doc_id, es, index='ms_marco_index', fields =['title','body']):
        """ 
        Args:    
        query (string): query string  
        query_id(string): query id
        elastic(): elastic obj
        index(string):index name 
        Returns:
             numpy array: feauter vector 
        """
        
        #print(doc_id)
        doc_orig = es.get(id= doc_id,index=INDEX_NAME)['_source']
       
    
        #Query Document features
        query_terms = {field: get_tokenized(analyze_query(es,str(query),field,index)) for field in fields}
        terms =  { field: get_tokenized(doc_orig[field]) for field in fields }
        unigram = {field:get_unigrams(query_terms, doc_id,field ,es, index ) for field in fields}
        bigram = {field:get_ngram_matches(query_terms[field],terms[field],2) for field in fields}
        trigram = {field:get_ngram_matches(query_terms[field],terms[field],3) for field in fields}  
        unordered_bigram_body = get_unordered_bigram_matches(query_terms['body'],doc_orig['body']) 
        
        
        #Query features
        q= np.array([ len(query_terms[field]) for field in fields] ).flatten()
       
    
        #document feautures
        d = extract_doc_features(doc_id,es)
        
        doc = np.array([d[val] for val in d]).flatten()
        uni = np.array([ [ unigram[field][val_type] for val_type in unigram[field]] for field in unigram ]).flatten()
        bi = np.array([ [ bigram[field][val_type] for val_type in bigram[field]] for field in bigram ]).flatten()
        tri = np.array([ [ trigram[field][val_type] for val_type in trigram[field]] for field in trigram ]).flatten()
        unor = np.array( [unordered_bigram_body[i] for i in  unordered_bigram_body ]  ).flatten()
       
        return np.concatenate(( q,doc,uni,bi,tri,unor), axis=None) 
    
    
    
    
def get_trainingdata_baseline(df_querys,df_top100,df_qrels,number_of_neg_samples=1, es=es, index=INDEX_NAME):
            """ 
            Args:    
            df_querys (pandas dataframe): 
            df_top100 (pandas dataframe): 
            df_qrels (pandas dataframe):
            number of neg samples(int): How many negative samples to include 
            es(): elastic obj
            index(string):index name 
            Returns:
                 numpy array: feauter vector X
                 list : lables y
                 tuple: (query_id,doc_id)
            """
            
            X = []
            y = []
            q = []
            ids = [] 
            for i,query in enumerate(df_querys.index.tolist()):
                
                top_100   = df_top100.loc[query].index.tolist()
                relevant  = df_qrels.loc[query].index.tolist()
                
                pos_all = [e for i,e in enumerate(top_100)  if e in relevant]    
                neg_all = [e for i,e in enumerate(top_100)  if e not in relevant]
                
                if not pos_all:
                    continue
                    
                query_text = df_querys.loc[query]['query']
                
                neg_samples = neg_all[-number_of_neg_samples:] #random.sample(neg_all, number_of_neg_samples)
                features_neg,dq_ids  = get_feature_vectors_baseline(query_text,query,neg_samples)
                
                for vec,id_ in zip(features_neg,dq_ids):
                    X.append(vec)
                    y.append(0)
                    ids.append(id_ )
                    
                    
                features_pos,dq_ids  = get_feature_vectors_baseline(query_text,query,pos_all)
                
                for vec,id_ in zip(features_pos,dq_ids):
                    X.append(vec)
                    y.append(1)
                    ids.append(id_)
                
                if i%10==0:
                    print(round((i/len(training_querys))*100,3),'%',end='\r')
            
            print(round((i/len(training_querys))*100,3),'%',end='\r')        
            return X,y,ids

## Training

In this section we will use the functions we have created earlier in this code to train our data using the RandomForestRegressor and PointWiseLTRModel.

In [91]:
training_querys = df_querys_train.sample(n=10000)#.index.get_level_values(0).tolist()
X_train_baseline,y_train_baseline,ids_baseline = get_trainingdata_baseline(training_querys,df_top100_train,df_qrels_train,number_of_neg_samples=2)

99.99 %

In [102]:
clf =RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
ltr = PointWiseLTRModel(clf)
ltr._train(X_train_baseline, y_train_baseline)

In [103]:
reranked_baseline = rerank(ltr,df_top100_test,df_querys_test.index.tolist(),es =es,index =INDEX_NAME,  fields = ['title','body'],df =df_querys_test)
result = compute_metrics(truth_test,reranked_baseline)
print('Baseline:',result  ) 

Baseline: {'MRR @100': 0.22659855588013, 'QueriesRanked': 500}


## Advanced Model

In this section we will add functions to add additional features to our baseline model. Lastly we will rank this advanced model and evaluate it, to see how it scored compared to the baseline

In [62]:
def url_match(url,query):
        url = url.replace('com','').replace('www','').replace('html','').replace('en','').split('/')
        test = ' '.join([ word.replace('.',' ').replace('_',' ') +' ' for word in url[1:]])
        url_tokens = get_tokenized(test)
        
        return len(set(url_tokens).intersection(query))

    
def get_feature_vectors_advanced(query_id,df_querys,df_top100,elastic=es, index=INDEX_NAME, fields = ['title','body']):
        """ 
        Args:    
        query (string): query string  
        query_id(string): query id
        top_100_docs(list): list of document ids
        elastic(): elastic obj
        index(string):index name 
        Returns:
             numpy array: feauter vectors
             tuple: (query_id,doc_id) 
        """
    
        X = []
        ids = []
        #print(df_top100.head())
        for doc in df_top100.index.tolist():
            X.append( get_features_advanced(doc,query_id,df_querys,df_top100, elastic, index))
            ids.append((str(query_id),str(doc)))
 
        return np.array(X),ids    

def get_features_advanced(doc_id,query_id,df_querys,df_top100, es, index=INDEX_NAME, fields =['title','body']):
        
        #print(doc_id)
        doc_orig = es.get(id= doc_id,index=INDEX_NAME)['_source']
       
        score = float(df_top100.loc[doc_id]['score'])
        
        #Query Document features
        query_terms = {field: get_tokenized(analyze_query(es,str(df_querys.loc[query_id]['query']),field,index)) for field in fields}
        terms =  { field: get_tokenized(doc_orig[field]) for field in fields }
        unigram = {field:get_unigrams(query_terms, doc_id,field ,es, index ) for field in fields}
        bigram = {field:get_ngram_matches(query_terms[field],terms[field],2) for field in fields}
        trigram = {field:get_ngram_matches(query_terms[field],terms[field],3) for field in fields} 
        match = {field:get_ngram_matches(query_terms[field],terms[field],len(query_terms[field])) for field in fields}
        unordered_bigram_body = get_unordered_bigram_matches(query_terms['body'],doc_orig['body']) 
        
        
        #Query features
        q= np.array([ len(query_terms[field]) for field in fields] ).flatten()
       
    
        #document feautures
        d = extract_doc_features(doc_id,es)
        
        mat = np.array([ [ match[field][val_type] for val_type in match[field]] for field in match ]).flatten()
        doc = np.array([d[val] for val in d]).flatten()
        uni = np.array([ [ unigram[field][val_type] for val_type in unigram[field]] for field in unigram ]).flatten()
        bi = np.array([ [ bigram[field][val_type] for val_type in bigram[field]] for field in bigram ]).flatten()
        tri = np.array([ [ trigram[field][val_type] for val_type in trigram[field]] for field in trigram ]).flatten()
        unor = np.array( [unordered_bigram_body[i] for i in  unordered_bigram_body ]  ).flatten()
        url = url_match(doc_orig['url'],query_terms)
        
       
        return np.concatenate(( score,q,doc,uni,bi,tri,url,unor,mat), axis=None) 
    
    
def get_trainingdata_advanced(df_querys,df_top100,df_qrels,number_of_neg_samples=1, elastic=es, index=INDEX_NAME):
            """ 
            Args:    
            df_querys (pandas dataframe): 
            df_top100 (pandas dataframe): 
            df_qrels (pandas dataframe):
            number of neg samples(int): How many negative samples to include 
            es(): elastic obj
            index(string):index name 
            Returns:
                 numpy array: feauter vector X
                 list : lables y
                 tuple: (query_id,doc_id)
            """
            
            X = []
            y = []
            q = []
            ids = [] 
            for i,query in enumerate(df_querys.index.tolist()):
                #print(query)
                top_100   = df_top100.loc[query].index.tolist()
                relevant  = df_qrels.loc[query].index.tolist()
                
                pos_all = [e for i,e in enumerate(top_100)  if e in relevant]    
                neg_all = [e for i,e in enumerate(top_100)  if e not in relevant]
                
                if not pos_all:
                    continue
                    
                query_text = df_querys.loc[query]['query']
                
                neg_samples = neg_all[-number_of_neg_samples:] # query,df_querys,df_top100,elasitic,index
                #print(df_top100.loc[query].loc[neg_samples])
                
                features_neg,dq_ids  = get_feature_vectors_advanced(query,df_querys,df_top100.loc[query].loc[neg_samples],elastic,index)
                
                for vec,id_ in zip(features_neg,dq_ids):
                    X.append(vec)
                    y.append(0)
                    ids.append(id_ )
                    
                    
                features_pos,dq_ids  = get_feature_vectors_advanced(query,df_querys,df_top100.loc[query].loc[pos_all],elastic,index)
                
                for vec,id_ in zip(features_pos,dq_ids):
                    X.append(vec)
                    y.append(1)
                    ids.append(id_)
                
                if i%10==0:
                    print(round((i/len(training_querys))*100,3),'%',end='\r')
            
            print(round((i/len(training_querys))*100,3),'%',end='\r')        
            return X,y,ids  
        
        
def rerank_advanced(ltr,df_ranked,querys,elasitic =es,index =INDEX_NAME,  fields = ['title','body'],df =df_querys_test):

    
    test_rankings={}
    for i,query in enumerate(querys):
            
            X=[]
            for doc in df_ranked.loc[query].index.tolist():
                f = get_features_baseline(df_querys_test.loc[query]['query'], doc, elasitic, index,fields)
                X.append(f)
            
            X, _ =get_feature_vectors_advanced(query,df,df_ranked.loc[query],elasitic,index)
            
            r = ltr.rank(X, df_ranked.loc[query].index.tolist())
            
            test_rankings[query] = [i[0] for i in r]
            
            if i%2==0:
                    print(round((i/len(querys))*100,3),'%',end='\r')
    
    print(round((i/len(querys))*100,3),'%',end='\r')               
    return test_rankings

In [54]:
training_querys = df_querys_train.sample(n=10000)#.index.get_level_values(0).tolist()
X_train,y_train,pair = get_trainingdata_advanced(training_querys,df_top100_train,df_qrels_train,number_of_neg_samples=2)

99.99 %

In [104]:
clf =RandomForestRegressor(max_depth=5, random_state=0, n_estimators=100)
ltr = PointWiseLTRModel(clf)
ltr._train(X_train, y_train)

In [105]:
reranked_advanced = rerank_advanced(ltr,df_top100_test,df_querys_test.index.tolist(),elasitic =es,index =INDEX_NAME,  fields = ['title','body'],df =df_querys_test)
result = compute_metrics(truth_test,reranked_advanced)
print('Advanced:',result  ) 

Advanced: {'MRR @100': 0.24852538544781397, 'QueriesRanked': 500}
