# Assignment 2A, Part 2: Retrieval

Implement BM25 and LM retrieval.

In [None]:
QUERY_FILE = "data/queries.txt"  # make sure the query file exists on this location
OUTPUT_FILE = "data/output.txt"  # output the ranking

In [2]:
import re
import gzip
from bs4 import BeautifulSoup
import hashedindex
from hashedindex import textparser
import glob
import pickle
import math
import pandas as pd

import nltk

from IPython.display import clear_output # Using IPython.display.clear_output to clear the output of a cell.

nltk_stopwords = set(nltk.corpus.stopwords.words('english'))

## Load index

In [3]:
# TODO: place the indexing related code here. This may be copy-pasted from Part 1.
def add_docs_bulk(docs, section):
    indexes = hashedindex.HashedIndex()
    doclen = {}
    tC = {}
    total_term_count = 0
    
    for doc_id, doc in docs.items():
        # TODO: complete
#         print("Indexing document {}".format(doc_id))
        
        terms = list(textparser.word_tokenize((doc[section]).lower(), stopwords = nltk_stopwords))
        doclen[doc_id] = len(terms)
        
        for term in terms:
            indexes.add_term_occurrence(term[0], doc_id)
        
    indexes = indexes.items()
    for term, doc_freq_pair in indexes.items():
        doc_freq_pair = dict(doc_freq_pair)
        indexes[term] = doc_freq_pair
        term_count = sum(list(doc_freq_pair.values()))
        
        tC[term] = term_count
        total_term_count += term_count
        
    return (indexes, doclen, tC, total_term_count)


def combine_indexes(prev_indexes, new_indexes):
    for k, val in new_indexes.items():
        if k in prev_indexes.keys():
            prev_indexes[k].update(new_indexes[k])
        else:
            prev_indexes[k] = new_indexes[k]
    
    return prev_indexes


def index_file(file_names, section):
    doc_len = {}
    indexes = {}
    P_tc = {}
    total_term_count = 0
    
    total_files_indexed = 0
    gz_files_read = 0
    for file_name in file_names:
        gz_files_read += 1
        clear_output()
        print("Processing", file_name)
        docs = {}
        with gzip.open(file_name, "rt") as fin:
            is_body = False
            doc_id, body = None, None
            
            for line in fin:
                line = line.strip()
                if line.startswith("<DOCNO>"):  # get doc id
                    doc_id = re.sub("<DOCNO> | </DOCNO>", "", line)
                elif line.startswith("<BODY>"):  # start to parse body
                    is_body = True
                    body = []
                elif line.startswith("</BODY>"):  # finished reading body
                    soup = BeautifulSoup("\n".join(body), "lxml")
                    headline = soup.find("headline")
                    text = soup.find("text")
                    docs[doc_id] = {
                        "title": headline.text if headline is not None else "",  # use an empty string if no <HEADLINE> found
                        "content": text.text if text is not None else ""  # everything inside <TEXT> is indexed as content
                    }
                    # get ready for next document
                    doc_id = None
                    is_body = False
                elif is_body:  # accumulate body content
                    body.append(line)

            # bulk index the collected documents
            total_files_indexed += len(docs)
            print("Bulk indexed:", len(docs), "documents.")
            print("Total files indexed so far:", total_files_indexed)
            print(gz_files_read,"/",len(file_names), "gz files finished reading.")
            new_indexes, doclen, tC, total_tc = add_docs_bulk(docs, section)
            
            # Concatanate and combine the indexes
            indexes = combine_indexes(indexes, new_indexes)
            
            # Concatenate the new document lengths
            doc_len.update(doclen)
            
            # Sum all term counts
            for term, count in tC.items():
                if term in P_tc.keys():
                    P_tc[term] = P_tc[term] + count
                else:
                    P_tc[term] = count
                    
            # Add the total term count
            total_term_count += total_tc
    
    # Calculate P(t|C) needed for Language Model
    for term, count in P_tc.items():
        P_tc[term] = P_tc[term]/total_term_count
        
    clear_output()
    print("Finished indexing", total_files_indexed, "files in", len(file_names), "gz files.")
    return (indexes, doc_len, P_tc)

In [4]:
# Create Indexes and doc_len
# indexes, doc_len, P_tC = index_file(glob.glob("data/aquaint/**/*.gz", recursive=True), section = 'content')
# print(len(indexes), len(doc_len), len(P_tC))

### Note
I have already calculated the indexes, the document length and the P(t|C) in '1_Indexer' so I am only loading that data here.

In [5]:
# Load indexes, doc_len and P(t|C) for content data
inv_idx = pickle.load(open("data/indexes_content.p", "rb" ))
doc_len = pickle.load(open("data/doc_len_content.p", "rb" ))
P_tC = pickle.load(open("data/P_tC_content.p", "rb" ))
NUM_DOCS = len(doc_len)
avg_dl = sum(list(doc_len.values()))/NUM_DOCS

### Load the queries from the file

See the assignment description for the format of the query file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a).

In [6]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [7]:
queries = load_queries(QUERY_FILE)

## Retrieval models

In [1]:
# TODO write your scoring code here
def get_BM25_score_for(q_id, query, k1, b):    
    print("Ranking documents using BM25 for [%s] '%s'" % (q_id, query))
    scores = {}
    q_words = query.lower().replace("-", " ").replace(",", "").split()
    
    # TODO generate ranking
    for term in list(set(q_words)):
        if term in list(inv_idx.keys()):
            for (doc_id, f_td) in inv_idx[term].items():
                n = len(inv_idx[term])
                idf = math.log((NUM_DOCS - n + 0.5) / (n + 0.5))
                
                score_for_term =  round((idf*(f_td * (k1 + 1)) / (f_td + k1*(1 - b + b*(doc_len[doc_id]/avg_dl)))), 3)
                
                scores[doc_id] = scores.get(doc_id, 0) + score_for_term
                
    scores = sorted(scores.items(), key=lambda score: score[1], reverse = True)[:100]
    
    return scores

def get_LM_score_for(q_id, query, lmbda):
    print("Ranking documents using Jelinek-Mercer smoothing for [%s] '%s'" % (q_id, query))
    
    scores = {}
    q_words = query.lower().replace("-", " ").replace(",", "").split()
    
    for term in list(set(q_words)):
        f_tq = q_words.count(term)
        
        if term in list(inv_idx.keys()):
            for (doc_id, f_td) in inv_idx[term].items():
                score_for_term =  round(math.log((1-lmbda)*(f_td / doc_len[doc_id]) + (lmbda*P_tC[term])), 3)
    
                scores[doc_id] = scores.get(doc_id, 0) + score_for_term
            scores[doc_id] = f_tq * scores[doc_id]
    
    scores = sorted(scores.items(), key=lambda score: score[1], reverse = True)[:100]
    
    return scores

### Perform retrieval

**TODO** Generate a ranking for each query and output the results to `OUTPUT_FILE`

See the assignment description for the format of the output file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a).

In [11]:
BM25_data = pd.DataFrame(columns=['QueryId', 'DocumentId'])
LM_data = pd.DataFrame(columns=['QueryId', 'DocumentId'])

for q_id, query in queries.items():
    # TODO generate ranking
    BM_25_rankings = get_BM25_score_for(q_id, query, k1 = 1.2, b = 0.7)
    LM_rankings = get_LM_score_for(q_id, query, lmbda = 0.8)
    
    
    # TODO write results to file
    for score in BM_25_rankings:
        BM25_data = BM25_data.append(pd.DataFrame([[q_id, score[0]]], columns=['QueryId', 'DocumentId']))
        
    for score in LM_rankings:
        LM_data = LM_data.append(pd.DataFrame([[q_id, score[0]]], columns=['QueryId', 'DocumentId']))
    
    clear_output()

# Save scores to files 
BM25_data.to_csv("bm25_singlefield.csv", index = False)
LM_data.to_csv("lm_singlefield.csv", index = False)
print("Done")

Done


## Evaluation

Report on the evaluation results (using the [Evaluation notebook](1_Evaluation.ipynb)) here.

Describe the parameter settings used for the two methods: 

Parameters for BM25: k1 = 1.2, b = 0.7
Parameters for LM: lambda = 0.8

Write the name of the corresponding output file in the table. These files should be pushed to your repository.

| **Method** | **Output file** | **P@10** | **MAP** | **MRR** |
| -- | -- | -- | -- | -- |
| BM25 | `bm25_singlefield.csv` | 0.172 | 0.063 | 0.292 |
| LM | `lm_singlefield.csv` | 0.010 | 0.002 | 0.022 |