In [1]:
# setup
import os
import pandas as pd
import itertools
import numpy as np
import json
from pyserini.index import IndexReader
from pyserini.search import SimpleSearcher

**Note**: The query id handling in PLN is implemented in a slightly redundant way; the code for Task 3 and Task 4 is very similar but improved. For PLN, I did not consider the document title, only the document description. For the other tasks I merged the text in Title and Description. I used a separate Python file to convert the CSV files to JSON, and the CSV columns Title and Description (for documents) are merged there.

# Task 2: Implement Pivoted Length Normalization

### Data Collection

In [2]:
# Load the documents (rows without a description are dropped) and the queries.
documents_df = pd.read_csv('hw2/documents.csv').dropna(subset=['Document Description'])
queries = pd.read_csv('hw2/query.csv')
queries.head(), documents_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59887 entries, 0 to 59886
Data columns (total 4 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DocumentId            59887 non-null  int64 
 1   Title                 59724 non-null  object
 2   Date                  59879 non-null  object
 3   Document Description  48757 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.8+ MB


In [5]:
# doc and query ids as lists of strings
document_id = documents_df['DocumentId'].astype(str).tolist()
query_id = queries['QueryId'].astype(str).tolist()

#### This section precalculates the length of the documents and stores it in a file

In [10]:
# loop over the documents and get their length
# (one-off precomputation, kept commented out; the result is cached in the json file
#  read below. It needs `index_reader`, which is created in the "Get Index reader" section.)
# document_length = {}
# for docid in document_id:
#     doc_vector = index_reader.get_document_vector(docid)
#     document_length[docid] = sum(doc_vector.values())

In [11]:
# path of the json file that caches the length of each document
filename= 'hw2/document_length'
# one-off write of the precomputed lengths (kept commented out)
# with open(filename, 'w') as outfile:
#     json.dump(document_length, outfile)

In [12]:
# read the cached document lengths back from the json file
with open(filename) as length_file:
    document_length = json.load(length_file)


#### Get Index reader

In [7]:
# read the index files for the documents and the queries
index_reader = IndexReader('hw2/documents')
query_index_reader = IndexReader('hw2/query')

In [13]:
# number of non-empty documents and average document length (in terms)
index_stats = index_reader.stats()
N = index_stats.get('non_empty_documents')
avg_dl = index_stats.get('total_terms') / N
avg_dl, N

(145.28287220296573, 48757)

In [18]:
def PivotedLengthNormalizationRanker(query_id, b=0.3):
    ''' Implements the PLN formula to score relevance

    For every term w shared by the query q and a document d the contribution is
        c(w,q) * (1 + ln(1 + ln(c(w,d)))) / (1 - b + b * |d| / avdl) * ln((N + 1) / df(w))
    and the score of a document is the sum of the contributions of all shared terms.

    Input:
        query_id: id of the query in the query index
        b (hyperparameter): length normalization strength - default value = 0.3

    Output:
        common_term: A dict with doc_id as key and the PLN score of that
            document for this query as value
            Sample: {'doc_id1': 5.3, 'doc_id2': 2.1}
    '''
    # get the query contents
    raw_query = query_index_reader.doc(query_id).raw()
    query = json.loads(raw_query)['contents']
    query_vector = query_index_reader.get_document_vector(query_id)

    # get analyzed terms; dict.fromkeys drops repeats while keeping order, because
    # the query term frequency c(w,q) already accounts for repeated terms
    analyzed_query = dict.fromkeys(index_reader.analyze(query))

    # the accumulator must live outside the term loop, otherwise only the
    # scores of the last query term would survive
    common_term = {}

    # loop over the analyzed terms
    for term in analyzed_query:
        # get the documents which contain the query term
        postings_list = index_reader.get_postings_list(term, analyzer=None)
        # find the tf of this term in query_vector
        tf_query = query_vector.get(term)

        # skip terms that occur in no document or are missing from the query vector
        if not postings_list or tf_query is None:
            continue

        df_word = len(postings_list)  # the no.of documents that contain the word
        idf = np.log((N + 1) / df_word)  # identical for every posting of this term

        # calculate the score for each document and word
        for posting in postings_list:
            # get the doc_id corresponding to lucene index
            doc_id = index_reader.convert_internal_docid_to_collection_docid(posting.docid)
            doc_length = document_length.get(doc_id)  # get doc length
            if doc_length is None:
                # no cached length for this document: it cannot be normalized
                continue

            # calculate the PLN score for one word in query; posting.tf is c(w,d)
            normalized_tf = (1 + np.log(1 + np.log(posting.tf))) / (1 - b + b * doc_length / avg_dl)
            score = tf_query * normalized_tf * idf

            # summation of each word in query for a doc_id
            common_term[doc_id] = common_term.get(doc_id, 0) + score

    return common_term
    

In [19]:
final_rank = []
for q_id in query_id:
    # score every candidate document for this query with the PLN ranker
    result = PivotedLengthNormalizationRanker(q_id)

    # keep the 10 best-scoring documents, best first
    best_first = sorted(result.items(), key=lambda item: item[1], reverse=True)
    documents = dict(best_first[:10]).keys()

    # pair each document id with the query id
    final_rank.extend([q_id, doc_id] for doc_id in documents)

In [20]:
# convert list of lists to df (one row per query/document pair) and save as csv
df =pd.DataFrame(final_rank, columns= ['QueryId', 'DocumentId'])
df.to_csv('hw2/seffendi_pln.csv', index=False)

In [140]:
# load the sample submission to inspect its columns and dtypes
sample =pd.read_csv('hw2/sample_submission.csv')
sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   QueryId     350 non-null    int64
 1   DocumentId  350 non-null    int64
dtypes: int64(2)
memory usage: 5.6 KB


# Task 3: Implement and Tune BM25

### Data Collection

In [88]:
# Load the gaming documents (rows without a description are dropped) and queries.
documents_gaming = pd.read_csv('hw2/documents_gaming.csv').dropna(subset=['Document Description'])
queries_gaming = pd.read_csv('hw2/query_gaming.csv')
queries_gaming.info(), documents_gaming.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1595 entries, 0 to 1594
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   QueryId            1595 non-null   int64 
 1   Query Description  1595 non-null   object
dtypes: int64(1), object(1)
memory usage: 25.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 45300 entries, 0 to 45300
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DocumentId            45300 non-null  int64 
 1   Document Title        45300 non-null  object
 2   Document Description  45300 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.4+ MB


(None, None)

In [91]:
# doc and query ids as lists of strings
document_id_gaming = documents_gaming['DocumentId'].astype(str).tolist()
query_id_gaming = queries_gaming['QueryId'].astype(str).tolist()

#### Get Index reader

In [92]:
# read the index files for the gaming documents and the queries
index_reader_gaming = IndexReader('hw2/gaming_documents2')
query_index_reader_gaming = IndexReader('hw2/gaming_query')

#### This section gets the sample query ids and then saves the words in each query and its tf 

In [93]:
# distinct query ids that appear in the sample submission file
queries_gaming_sample = pd.read_csv('hw2/gaming_query_sample_submission.csv')
sample_query_ids = queries_gaming_sample['QueryId'].tolist()
queries_from_sample = list(set(sample_query_ids))
len(queries_from_sample)

33

In [95]:
# map each sample query id (as a string) to its term -> tf vector from the query index
all_queries = {
    str(queryid): query_index_reader_gaming.get_document_vector(str(queryid))
    for queryid in queries_from_sample
}

len(all_queries.keys())

33

#### This section precalculates the length of the documents and stores it in a file

In [96]:
# loop over the documents and get their length
# document_length_gaming = {}
# for docid in document_id_gaming:
#     doc_vector = index_reader_gaming.get_document_vector(docid)
#     document_length_gaming[docid] = sum(doc_vector.values())

In [97]:
# path of the json file that caches the length of each gaming document
filename= 'hw2/document_length_gaming'
# one-off write of the precomputed lengths (kept commented out)
# with open(filename, 'w') as outfile:
#     json.dump(document_length_gaming, outfile)

In [99]:
# read the cached gaming document lengths back from the json file
with open(filename) as length_file:
    document_length_gaming = json.load(length_file)


#### Calculate no.of documents and the average length of documents in the file

In [100]:
# number of non-empty documents and average document length (in terms)
gaming_stats = index_reader_gaming.stats()
N_gaming = gaming_stats.get('non_empty_documents')
avg_dl_gaming = gaming_stats.get('total_terms') / N_gaming
avg_dl_gaming, N_gaming

(87.8650360919185, 45301)

In [101]:
def BM25Ranker(words_dict, k1=1.2, b=0.75, k3=1.2):
    ''' Implements the bm25 formula to score relevance

    Per query term w and document d containing it, the contribution is
        ln((N - df + 0.5) / (df + 0.5))
        * ((k1 + 1) * c(w,d)) / (k1 * (1 - b + b * |d| / avdl) + c(w,d))
        * ((k3 + 1) * c(w,q)) / (k3 + c(w,q))
    and the score of a document is the sum over the query terms it contains.

    Input:
        words_dict: It is a dict containing analyzed terms of a query as keys
                    and its term freq in the query as value
        hyperparameters: default values given
                k1 : 1.2
                b  : 0.75
                k3 : 1.2
    Output:
        common_term_bm25: A dict with doc_id as key and the BM25 score of that
                        document for the query as value
                        Sample: {'doc_id1': 5.3, 'doc_id2': 2.1}
    '''
    common_term_bm25 = {}
    # loop over the words in the words_dict
    for word, qtf in words_dict.items():
        # get the documents which contain the query term
        postings_list = index_reader_gaming.get_postings_list(word, analyzer=None)
        if postings_list is None:
            continue

        word_df = len(postings_list)
        # query-side factors depend only on the term, so compute them once per term
        # instead of once per posting
        normalized_qtf = ((k3 + 1) * qtf) / (k3 + qtf)
        # note: this idf variant is negative for terms found in more than half the documents
        variant_idf = np.log((N_gaming - word_df + 0.5) / (word_df + 0.5))

        for posting in postings_list:
            doc_tf = posting.tf  # c(w,d)
            # get the doc_id corresponding to lucene index
            doc_id = index_reader_gaming.convert_internal_docid_to_collection_docid(posting.docid)
            doc_len_gaming = document_length_gaming.get(doc_id)  # get doc length

            # documents with no cached (or zero) length are not scored
            if not doc_len_gaming:
                continue

            # calculate the BM25 score for one word in query
            length_norm = k1 * (1 - b + b * (doc_len_gaming / avg_dl_gaming))
            normalized_tf = ((k1 + 1) * doc_tf) / (length_norm + doc_tf)
            score = variant_idf * normalized_tf * normalized_qtf

            # summation of each word in query for a doc_id
            common_term_bm25[doc_id] = common_term_bm25.get(doc_id, 0) + score
    return common_term_bm25
       

In [105]:
final_rank_gaming = []
for qid, words in all_queries.items():
    # score every candidate document with the tuned BM25 hyperparameters
    result = BM25Ranker(words, k1=1.2, b=0.76, k3=1.25)

    # keep the 5 best-scoring documents, best first
    best_first = sorted(result.items(), key=lambda item: item[1], reverse=True)
    documents_gaming_rank = dict(best_first[:5]).keys()

    # pair each document id with the query id
    final_rank_gaming.extend([qid, doc_id] for doc_id in documents_gaming_rank)

In [106]:
# convert list of lists to df (one row per query/document pair) and save as csv
df_bm25 =pd.DataFrame(final_rank_gaming, columns= ['QueryId', 'DocumentId'])
df_bm25.to_csv('hw2/seffendi_bm25_3.csv', index=False)

In [107]:
# inspect the ranking frame that was just written out
df_bm25.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 165 entries, 0 to 164
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   QueryId     165 non-null    object
 1   DocumentId  165 non-null    object
dtypes: object(2)
memory usage: 2.7+ KB


# Task4: Design Your Own Scoring Function

### Data Collection

In [2]:
# load the android documents, queries and the sample submission
documents_android = pd.read_csv('hw2/ranker/documents_android.csv')#, index_col= 'DocumentId'
# NOTE: unlike the earlier tasks, no null-description filter is applied to these documents
queries_android = pd.read_csv('hw2/ranker/query_android.csv')
queries_android_sample = pd.read_csv('hw2/ranker/android_query_sample_submission.csv')
queries_android.info(), documents_android.info(), queries_android_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   QueryId            699 non-null    int64 
 1   Query Description  699 non-null    object
dtypes: int64(1), object(1)
memory usage: 11.0+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22998 entries, 0 to 22997
Data columns (total 3 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   DocumentId            22998 non-null  int64 
 1   Document Title        22998 non-null  object
 2   Document Description  22998 non-null  object
dtypes: int64(1), object(2)
memory usage: 539.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype
---  ------      --------------  -----
 0   QueryId     255 non-null    int64
 1   DocumentId  255 non-n

(None, None, None)

### Get the Index reader for documents and queries

In [18]:
# read the index files for the android documents and the queries
index_reader_android = IndexReader('hw2/ranker/android_documents')
query_index_reader_android = IndexReader('hw2/ranker/android_query')

### Calculations to make processing faster

In [19]:
# doc and query ids as lists of strings
document_id_android = documents_android['DocumentId'].astype(str).tolist()
query_id_android = queries_android['QueryId'].astype(str).tolist()

#### Pre-calculate the length of each document and store it in json

In [20]:
# loop over the documents and get their length
# document_length_android = {}
# for docid in document_id_android:
#     doc_vector = index_reader_android.get_document_vector(docid)
#     document_length_android[docid] = sum(doc_vector.values())

In [21]:
# path of the json file that caches the length of each android document
filename= 'hw2/ranker/document_length_android'
# one-off write of the precomputed lengths (kept commented out)
# with open(filename, 'w') as outfile:
#     json.dump(document_length_android, outfile)

In [22]:
# read the cached android document lengths back from the json file
with open(filename) as length_file:
    document_length_android = json.load(length_file)

In [23]:
# sanity check: number of cached document lengths
len(document_length_android)

22998

### Pre- calculate the average frequency of nouns in the queries and the documents

#### This section gets the sample query ids and then saves the words in each query and their tf

In [13]:
# distinct query ids that appear in the sample submission file
sample_query_ids = queries_android_sample['QueryId'].tolist()
queries_android_sample_list = list(set(sample_query_ids))
len(queries_android_sample_list)

51

In [30]:
# map each sample query id (as a string) to its term -> tf vector from the query index
all_queries_android = {
    str(queryid): query_index_reader_android.get_document_vector(str(queryid))
    for queryid in queries_android_sample_list
}

len(all_queries_android.keys())

51

#### Calculate average noun frequency for each query and document

In [25]:
# NOTE(review): this import sits mid-notebook; the other imports are in the setup cell at the top
import spacy
# spaCy English pipeline; get_avg_noun_freq reads the part-of-speech tag of each token it produces
nlp = spacy.load('en_core_web_sm')

def get_avg_noun_freq(text_id, searcher):
    '''Calculate average noun frequency of one indexed text.

    Input:
        text_id: id of the text (query or document) in the searcher's index
        searcher: object whose .doc(text_id).raw() returns a JSON string
                  with a 'contents' field

    Output:
        Fraction of the tokens produced by the spaCy pipeline `nlp` that are
        tagged 'NOUN'; 0.0 when the text has no tokens.
    '''
    # Fetch raw text of a document given its docid
    raw = searcher.doc(text_id).raw()
    # Get actual content from raw
    content = json.loads(raw)['contents']
    doc = nlp(content)

    token_count = len(doc)
    if token_count == 0:
        # empty contents: avoid dividing by zero, there are no nouns to count
        return 0.0

    text_noun_count = sum(1 for token in doc if token.pos_ == 'NOUN')
    return text_noun_count / token_count

In [26]:
# average noun frequency of every sample query, keyed by the query id as a string
query_searcher = SimpleSearcher('hw2/ranker/android_query')

q_noun_freq = {
    str(queryid): get_avg_noun_freq(str(queryid), query_searcher)
    for queryid in queries_android_sample_list
}

In [27]:
# number of android document ids
len(document_id_android)

22998

#### Pre-calculate the average noun frequency of each document and store it in a file

In [28]:
# getting the average noun frequency for the documents
# doc_searcher = SimpleSearcher('hw2/ranker/android_documents')
# d_noun_freq ={}
# for docid in document_id_android:
#     d_noun_freq[str(docid)] = get_avg_noun_freq(str(docid), doc_searcher)


In [29]:
# json file that caches the average noun frequency of each document
filename = 'hw2/ranker/doc_noun_freq'
# one-off write of the precomputed frequencies (kept commented out)
# with open(filename, 'w') as outfile:
#     json.dump(d_noun_freq, outfile)

# read the document noun frequencies back from the file
with open(filename) as noun_freq_file:
    d_noun_freq = json.load(noun_freq_file)

## Implementing the ranker

In [31]:
# number of non-empty documents and average document length (in terms)
android_stats = index_reader_android.stats()
N_android = android_stats.get('non_empty_documents')
avg_dl_android = android_stats.get('total_terms') / N_android
avg_dl_android, N_android

(76.56148360727019, 22998)

In [35]:
def CustomRanker(query_id, words_dict, k1=1.2, b=0.75, k3=1.2, k4=1):
    ''' Implements the custom ranker formula to score relevance

    The per-term score is the BM25 term score multiplied by a "context" factor
        context = k4 * (q_noun_f + 1) / (d_noun_f + 1)
    where q_noun_f / d_noun_f are the precomputed average noun frequencies of the
    query and of the document.

    Input:
        query_id: key of the query in q_noun_freq
        words_dict: It is a dict containing analyzed terms of a query as keys
                    and its term freq in the query as value
        hyperparameters: default values given
                k1 : 1.2
                b  : 0.75
                k3 : 1.2
                k4 : 1 [should be <= 1]
    Output:
        common_term_ranker: A dict with doc_id as key and the summed score of
                        that document for the query as value
                        Sample: {'doc_id1': 5.3, 'doc_id2': 2.1}
    '''
    common_term_ranker = {}
    q_noun_f = q_noun_freq[query_id]
    for word, qtf in words_dict.items():

        # get the documents which contain the query term
        postings_list = index_reader_android.get_postings_list(word, analyzer=None)
        if postings_list is None:
            continue

        word_df = len(postings_list)
        # query-side factors depend only on the term, so compute them once per term
        normalized_qtf = ((k3 + 1) * qtf) / (k3 + qtf)
        variant_idf = np.log((N_android - word_df + 0.5) / (word_df + 0.5))

        for posting in postings_list:
            doc_tf = posting.tf  # c(w,d)
            doc_id = index_reader_android.convert_internal_docid_to_collection_docid(posting.docid)
            doc_len_android = document_length_android.get(doc_id)  # get doc length

            # documents with no cached (or zero) length are not scored
            if not doc_len_android:
                continue

            # precomputed average noun frequency of the document; no searcher is
            # needed here (one used to be opened per posting and never used)
            # NOTE(review): assumes d_noun_freq has an entry for every scored doc_id
            d_noun_f = d_noun_freq.get(doc_id)

            # calculate the CustomRanker score for one word in query
            length_norm = k1 * (1 - b + b * (doc_len_android / avg_dl_android))
            normalized_tf = ((k1 + 1) * doc_tf) / (length_norm + doc_tf)
            context = k4 * (q_noun_f + 1) / (d_noun_f + 1)
            score = variant_idf * normalized_tf * normalized_qtf * context

            # summation of each word in query for a doc_id
            common_term_ranker[doc_id] = common_term_ranker.get(doc_id, 0) + score
    return common_term_ranker

In [36]:
final_rank_android = []
for qid, words in all_queries_android.items():
    # score every candidate document with the custom ranker (default hyperparameters)
    result = CustomRanker(str(qid), words)

    # keep the 5 best-scoring documents, best first
    best_first = sorted(result.items(), key=lambda item: item[1], reverse=True)
    documents_android_rank = dict(best_first[:5]).keys()

    # pair each document id with the query id
    final_rank_android.extend([qid, doc_id] for doc_id in documents_android_rank)

In [37]:
# convert list of lists to df (one row per query/document pair)
df_ranker =pd.DataFrame(final_rank_android, columns= ['QueryId', 'DocumentId'])

# df_ranker.groupby('QueryId').size()
df_ranker.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255 entries, 0 to 254
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   QueryId     255 non-null    object
 1   DocumentId  255 non-null    object
dtypes: object(2)
memory usage: 4.1+ KB


In [38]:
# save the custom ranker results as csv
df_ranker.to_csv('hw2/ranker/seffendi_ranker_5.csv', index=False)

I approached the tf-idf part of the score the same way as in BM25; that is, I reused that portion with k1=1.2, b=0.75 and k3=1.2, because I wanted to start from the baseline score. After that I noticed that, for the common terms, the context in which the query and the documents use those words can differ. Since pyserini uses a stemmer, we lose the context in which a word is being used. A word could be a noun in the query and a verb in the document, which yields a high relevance score when it actually shouldn't. So, to match the context of the query and the documents, I calculated the average noun frequency in the query and the average noun frequency in the document. If a query and a document share terms and also have similar average noun frequencies, then the context is probably the same. If the average noun frequency is low while tf is high, the term may be being used as a verb, in which case the document is not very relevant to the query.

To calculate the score I took the ratio of the average frequency of noun in query with the average frequency of noun in document:

`context = k4 * (q_noun_f + 1) / (d_noun_f + 1)`

I added 1 to both the numerator and the denominator to avoid division by 0 and to keep the context factor from becoming 0. The value of k4 is 1 by default, but it can be tuned to give less weight to the average noun frequency of the query, since queries are short and can have a higher average noun frequency.

For each query, document pair,

`score = variant_idf * normalized_tf * normalized_qtf * context`

Improvements: I think I could add terms that penalize the length of the document, which can result in a higher average frequency of nouns, but I could not arrive at a formula that beat the baseline.
 