## Contents
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [BM25](#2.-BM25)  

## 1. Loading preprocessed data

In [1]:
import json
import pandas as pd

from input_output import load_data, write_results
from pprint import pprint
from tqdm import tqdm

# Directory holding the SQuAD JSON files. File names are appended to it with
# plain string concatenation, so the trailing slash is required.
IN_PATH = '../data/squad/'

In [2]:
# Read the preprocessed SQuAD v1.1 train and dev splits (in that order).
train, dev = (
    load_data(IN_PATH + split_file)
    for split_file in ('train-v1.1-preprocessed.json',
                       'dev-v1.1-preprocessed.json')
)

## 2. BM25

In [3]:
import evaluation
import numpy as np

from gensim.summarization.bm25 import BM25

In [4]:
def get_bm25_scores(dataset, average_idf=1):
    """Rank each paragraph's sentences against its questions with BM25.

    For every question in ``dataset`` the question is scored against the
    candidate sentences of the paragraph it belongs to, and the sentence
    indices are ordered from the highest to the lowest score.

    Args:
        dataset: Nested dict read as ``dataset['data'][i]['paragraphs'][j]``.
            Each paragraph provides ``'context_sentences_words'`` (handed to
            ``BM25`` as its corpus; presumably one token list per sentence --
            confirm against the preprocessing step) and ``'qas'``. Each qas
            entry provides ``'question_words'`` and ``'answers'``, and each
            answer provides ``'answer_label'``.
        average_idf: Forwarded unchanged as the second argument of
            ``BM25.get_scores``. Defaults to ``1``, the value that used to
            be hard-coded here.

    Returns:
        A ``(rankings, labels)`` tuple of two parallel lists with one entry
        per question. ``rankings[k]`` is the list of sentence indices sorted
        by descending score (equal scores keep ascending index order) and
        ``labels[k]`` is the set of ``'answer_label'`` values of the
        question's answers.
    """
    rankings = []
    labels = []

    for article in tqdm(dataset['data']):
        for paragraph in article['paragraphs']:
            questions = paragraph['qas']
            if not questions:
                # Nothing to rank; also avoids building an unused index.
                continue

            # The index depends only on the paragraph, so build it once and
            # reuse it for every question of that paragraph (assumes
            # get_scores does not mutate the index).
            bm25 = BM25(paragraph['context_sentences_words'])

            for qas in questions:
                labels.append({answer['answer_label']
                               for answer in qas['answers']})

                scores = bm25.get_scores(qas['question_words'], average_idf)

                # Argsort by descending score. sorted() is stable even with
                # reverse=True, so tied sentences stay in index order.
                rankings.append(sorted(range(len(scores)),
                                       key=scores.__getitem__,
                                       reverse=True))

    return (rankings, labels)

In [5]:
# Rank the candidate sentences for every question of each split.
train_scores_id_sorted, train_labels = get_bm25_scores(train)
dev_scores_id_sorted, dev_labels = get_bm25_scores(dev)

100%|██████████| 442/442 [00:11<00:00, 36.84it/s]
100%|██████████| 48/48 [00:01<00:00, 32.78it/s]


In [6]:
# Evaluate the rankings against the answer labels. The last argument is the
# name that shows up after "Method:" in the report printed further down.
train_results = evaluation.get_results(
    train_scores_id_sorted,
    train_labels,
    'train [bm25]'
)
dev_results = evaluation.get_results(
    dev_scores_id_sorted,
    dev_labels,
    'dev [bm25]'
)

In [7]:
# Report both result sets. The saved cell output shows write_results printing
# the method name plus AvgPrec@1 and MAP (each with its std) per call.
# NOTE(review): whether it also writes to disk is not visible here -- confirm
# in input_output.
write_results(train_results)
write_results(dev_results)

Method: train [bm25]
AvgPrec@1: 0.755316841516 (std = 0.429899186366)
MAP: 0.855130002137 (std = 0.261382586907)


Method: dev [bm25]
AvgPrec@1: 0.786754966887 (std = 0.409599302936)
MAP: 0.869082662712 (std = 0.241973363739)


