# Assignment 2A, Part 3: Multifield retrieval

Implement BM25F and the Mixture of Language Models (MLM). Use two fields: title and content.

In [1]:
QUERY_FILE = "data/queries.txt"  # make sure the query file exists on this location
OUTPUT_FILE = "data/output.txt"  # output the ranking

In [2]:
import pickle
import nltk
import math
import pandas as pd

## Load index

In [3]:
# TODO: place the indexing related code here. This may be copy-pasted from Part 1.
indx, d_len, PtC, avg_len = [], [], [], [0, 0] # one index for each parameters

In [4]:
indx.append(pickle.load(open("data/title_indx.p", "rb")))
d_len.append(pickle.load(open("data/title_d_len.p", "rb")))
PtC.append(pickle.load(open("data/title_PtC.p", "rb")))
NUM_DOCS = len(d_len[0])
avg_len[0] = sum(list(d_len[0].values()))/NUM_DOCS

In [5]:
indx.append(pickle.load(open("data/content_indx.p", "rb")))
d_len.append(pickle.load(open("data/content_d_len.p", "rb")))
PtC.append(pickle.load(open("data/content_PtC.p", "rb")))
avg_len[1] = sum(list(d_len[1].values()))/NUM_DOCS

### Load the queries from the file

See the assignment description for the format of the query file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#queries).

In [6]:
def load_queries(query_file):
    queries = {}
    with open(query_file, "r") as fin:
        for line in fin.readlines():
            qid, query = line.strip().split(" ", 1)
            queries[qid] = query
    return queries

In [7]:
queries = load_queries(QUERY_FILE)

## Retrieval models

In [8]:
def calc_idf(word, n):
    return math.log(NUM_DOCS / n)

# TODO write your scoring code here
def BM25f(q_id, query, k1, b, weights):
    scores = {}
    
    words = nltk.word_tokenize(query.lower())
    
    for word in list(set(words)):
        for i, w_i in enumerate(weights):
            if word in list(indx[i].keys()):
                f_td = {}
                
                idf = calc_idf(word, len(indx[i][word]))
                
                for (doc_id, freq) in indx[i][word].items():                    
                    B_i = 1 - b[i] + b[i]*(d_len[i][doc_id]/avg_len[i])
                    
                    f_td[doc_id] = f_td.get(doc_id, 0) + (w_i*(freq/B_i))
                    
                    score = round(idf*(f_td[doc_id]/(k1 + f_td[doc_id])), 3)
                    scores[doc_id] = scores.get(doc_id, 0) + score

    
    
    scores = sorted(scores.items(), key=lambda x: x[1], reverse = True)
    scores = scores[:100]
    
    return scores

def MLM(q_id, query, l, weights):    
    scores = {}
    words = nltk.word_tokenize(query.lower())
    
    for word in list(set(words)):
        for i, w_i in enumerate(weights):
            if (word in list(indx[i].keys())):
                for (doc_id, freq) in indx[i][word].items():
                    score = w_i*((1-l[i])*(freq / d_len[i][doc_id]) + (l[i]*PtC[i][word]))
                    scores[doc_id] = scores.get(doc_id, 0) + score

    for doc_id, score in scores.items():
        scores[doc_id] = math.log(scores[doc_id])
    
    scores = sorted(scores.items(), key=lambda x: x[1], reverse = True)
    scores = scores[:100]
    
    return scores

### Perform retrieval

**TODO** Generate a ranking for each query and output the results to `OUTPUT_FILE`

See the assignment description for the format of the output file [here](https://github.com/kbalog/uis-dat640-fall2019/tree/master/assignments/assignment-2a#output-file-format).

In [9]:
BM25f_QueryId = []
BM25f_DocumentId = []
MLM_QueryId = []
MLM_DocumentId = []

for q_id, query in queries.items():
    # TODO generate ranking
    
    BM25f_res = BM25f(q_id,query,k1=1.2,b=[0.3, 0.2],weights=[0.1, 0.9])
    for score in BM25f_res:
        BM25f_QueryId.append(q_id)
        BM25f_DocumentId.append(score[0])
    
    MLM_res = MLM(q_id, query,l=[0.8,0.8],weights=[0.1, 0.9])
    for score in MLM_res:
        MLM_QueryId.append(q_id)
        MLM_DocumentId.append(score[0])

In [10]:
# Save scores to files
data = pd.DataFrame()
data['QueryId'] = BM25f_QueryId
data['DocumentId'] = BM25f_DocumentId
data.to_csv("bm25f_multifield.csv", index = False)

In [11]:
data = pd.DataFrame()
data['QueryId'] = MLM_QueryId
data['DocumentId'] = MLM_DocumentId
data.to_csv("mlm_multifield.csv", index = False)

## Evaluation

Report on the evaluation results (using the [Evaluation notebook](1_Evaluation.ipynb)) here.

I have found my parameters by trying different values.


| **Method** | **Parameter settings** | **Output file** | **P@10** | **MAP** | **MRR** |
| -- | -- | -- | -- | -- | -- |
| BM25F | k1: 1.2, b: [0.3, 0.2], $w_{title}$: 0.1, $w_{content}$: 0.9 | `data/bm25f_multifield.csv` | 0.193 | 0.073 | 0.313 |
| MLM | Smoothing method: Jelinek-Mercer, smoothing param: lambda = [0.8,0.8], $w_{title}$: 0.9, $w_{content}$: 0.1 | `data/mlm_multifield.csv` | 0.144 | 0.051 | 0.245 |