# Assignment 2A, Part 2: Retrieval

Implement BM25 and LM retrieval.

In [1]:
QUERY_FILE = "data/queries.txt"  # make sure the query file exists on this location
OUTPUT_FILE = "data/output.txt"  # output the ranking

In [2]:
import pickle
import nltk
import math
import pandas as pd

In [3]:
content_indx = pickle.load(open("data/content_indx.p", "rb" ))
content_d_len = pickle.load(open("data/content_d_len.p", "rb" ))
content_PtC = pickle.load(open("data/content_PtC.p", "rb" ))
NUM_DOCS = len(content_d_len)
avg_len = sum(list(content_d_len.values()))/NUM_DOCS

### Load the queries from the file

See the assignment description for the format of the query file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a).

In [4]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [5]:
queries = load_queries(QUERY_FILE)

## Retrieval models

In [6]:
def calc_idf(word):
    n = len(content_indx[word])
    return math.log((NUM_DOCS - n + 0.5) / (n + 0.5))


# TODO write your scoring code here
def BM25(q_id, query, k1, b):
    scores = {}
    words = nltk.word_tokenize(query.lower())
    
    for word in set(words):
        if word in list(content_indx.keys()):
            for (doc_id, freq) in content_indx[word].items():
                idf = calc_idf(word)
                
                score =  idf*(freq * (k1 + 1)) / (freq + k1*(1 - b + b*(content_d_len[doc_id]/avg_len)))
                score = round(score, 3)
                
                scores[doc_id] = scores.get(doc_id, 0) + score
                
    scores = sorted(scores.items(), key=lambda x: x[1], reverse = True)
    scores = scores[:100]
    
    return scores

def LM(q_id, query, l):    
    scores = {}
    words = nltk.word_tokenize(query.lower())
    
    for word in set(words):
        ftq = words.count(word)
        
        if word in list(content_indx.keys()):
            for (doc_id, freq) in content_indx[word].items():
                score =  round(math.log((1-l)*(freq / content_d_len[doc_id]) + (l*content_PtC[word])), 3)
    
                scores[doc_id] = scores.get(doc_id, 0) + score
            scores[doc_id] = ftq * scores[doc_id]
    
    scores = sorted(scores.items(), key=lambda x: x[1], reverse = True)
    scores = scores[:100]
    
    return scores

### Perform retrieval

**TODO** Generate a ranking for each query and output the results to `OUTPUT_FILE`

See the assignment description for the format of the output file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a).

In [7]:
BM25_QueryId = []
BM25_DocumentId = []
LM_QueryId = []
LM_DocumentId = []

for q_id, query in queries.items():
    # TODO generate ranking
    
    BM_25_res = BM25(q_id,query,k1=1.2,b=0.7)
    for score in BM_25_res:
        BM25_QueryId.append(q_id)
        BM25_DocumentId.append(score[0])
    
    LM_res = LM(q_id, query, l=0.8)
    for score in LM_res:
        LM_QueryId.append(q_id)
        LM_DocumentId.append(score[0])

In [8]:
# Save scores to files
data = pd.DataFrame()
data['QueryId'] = BM25_QueryId
data['DocumentId'] = BM25_DocumentId
data.to_csv("bm25_singlefield.csv", index = False)

In [9]:
data = pd.DataFrame()
data['QueryId'] = LM_QueryId
data['DocumentId'] = LM_DocumentId
data.to_csv("lm_singlefield.csv", index = False)

## Evaluation

Report on the evaluation results (using the [Evaluation notebook](1_Evaluation.ipynb)) here.

Describe the parameter settings used for the two methods: 

Parameters for BM25: k1 = 1.2, b = 0.7

Parameters for LM: lambda = 0.8

Write the name of the corresponding output file in the table. These files should be pushed to your repository.

| **Method** | **Output file** | **P@10** | **MAP** | **MRR** |
| -- | -- | -- | -- | -- |
| BM25 | `bm25_singlefield.csv` | 0.184 | 0.067 | 0.319 |
| LM | `lm_singlefield.csv` | 0.036 | 0.012 | 0.084 |