## Content
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [Loading pretrained word embeddings](#2.-Loading-pretrained-word-embeddings)  
3. [Classification](#3.-Classification)

## 1. Loading preprocessed data

In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm

IN_PATH = '../data/squad/'


def load_data(filename):
    data_frame = pd.read_json(filename)
    data = data_frame.to_dict(orient='list')
    
    return data

In [2]:
train = load_data(IN_PATH + 'train-v1.1-preprocessed.json')
dev = load_data(IN_PATH + 'dev-v1.1-preprocessed.json')

## 2. Loading pretrained word embeddings

In [3]:
import gensim
import numpy as np

WORD2VEC_PATH = '../word_embeddings/GoogleNews-vectors-negative300.bin'

word2vec_model = None

In [4]:
def get_pretrained_word_embeddings(sentence, word_emb_dims=300):
    global word2vec_model
    
    if word2vec_model is None:
        print('load word2vec_model')
        word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

    word_embeddings = np.zeros(word_emb_dims)
    word_emb_nr = 0

    for word in sentence:
        if word2vec_model.vocab.has_key(word):
            word_embeddings += word2vec_model.word_vec(word)
            word_emb_nr += 1

    # a sentence is represented by the average of word_embbedings
    if word_emb_nr != 0:
        word_embeddings = word_embeddings / word_emb_nr

    return word_embeddings

In [5]:
get_pretrained_word_embeddings(['today', 'is', 'wednesday']);

load word2vec_model


In [6]:
def get_features(dataset):
    train_x = list()
    train_y = list()
    train_steps = list()
    
    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            for qas in qas_context['qas']:
                question_words = qas['question_words']
                question_embedding = get_pretrained_word_embeddings(question_words)
                
                candidate_sentences_embeddings = list()
                for sentence in qas_context['context_sentences_words']:
                    candidate_sentences_embeddings.append(get_pretrained_word_embeddings(sentence))
                    
                labels = set([d['answer_label'] for d in qas['answers']])
                
                train_steps.append(len(candidate_sentences_embeddings))
                
                for i in range(len(candidate_sentences_embeddings)):
                    train_x.append(question_embedding + candidate_sentences_embeddings[i])
                    label = i in labels
                    train_y.append(label)
                    
    return (train_x, train_y, train_steps)

In [7]:
(train_x, train_y, _) = get_features(train)
(dev_x, dev_y, dev_steps) = get_features(dev)

100%|██████████| 442/442 [00:28<00:00, 15.25it/s]
100%|██████████| 48/48 [00:03<00:00, 12.75it/s]


In [8]:
print((len(train_x)))
print(np.sum(train_y == np.array([1]*len(train_y))))
print(np.sum(train_y == np.array([0]*len(train_y))))

447101
87599
359502


In [9]:
print((len(dev_x)))
print(np.sum(dev_y == np.array([1]*len(dev_y))))
print(np.sum(dev_y == np.array([0]*len(dev_y))))

53967
11436
42531


## 3. Classification

In [10]:
from sklearn.linear_model import LogisticRegression

In [11]:
def logistic_regression(train_x, train_y, test_x):
    pred = np.zeros(len(test_x))

    lr_model = LogisticRegression(C=4, dual=True, class_weight='balanced')
    lr_model.fit(train_x, train_y)
    pred = lr_model.predict_proba(test_x)

    return pred

In [12]:
pred = logistic_regression(train_x, train_y, dev_x)

In [13]:
pred[0:10]

array([[ 0.40176146,  0.59823854],
       [ 0.44002845,  0.55997155],
       [ 0.44106264,  0.55893736],
       [ 0.43749417,  0.56250583],
       [ 0.41188011,  0.58811989],
       [ 0.4503853 ,  0.5496147 ],
       [ 0.45142421,  0.54857579],
       [ 0.44783908,  0.55216092],
       [ 0.49391697,  0.50608303],
       [ 0.53313913,  0.46686087]])

In [14]:
def extract_scores(y_pred, steps_lengths):
    pred_scores = list()
    steps_lengths_sum = 0
    
    for step_length in steps_lengths:
        scores = {}
        for idx in range(step_length):
            scores[idx] = y_pred[steps_lengths_sum + idx]
        
        sorted_scores = sorted(scores, key=scores.get, reverse=True)
        pred_scores.append(sorted_scores)
        
        steps_lengths_sum += step_length
        
    return pred_scores

In [15]:
pred_scores = extract_scores(dev_y, dev_steps)

In [16]:
print(len(pred_scores))

10570


In [17]:
import evaluation

def get_results(pred_scores, y, method):
    results = {'Method': method, 'Prec@1': [], 'Prec@5': [], 'Prec@10': [],
           'AvgPrec': [], 'MAP': 0} 
    
    for i in range(len(pred_scores)):
        y_pred = pred_scores[i]
        results['Prec@1'].append(evaluation.precision_at_k(y_pred, y, k=1))
        results['AvgPrec'].append(evaluation.average_precision(y_pred, y))
    
    # evaluation (MAP - mean average precision)
    results['MAP'] = np.mean(results['AvgPrec'])
    results['StdAP'] = np.std(results['AvgPrec'])
    results['AvgPrec@1'] = np.mean(results['Prec@1'])
    results['StdPrec@1'] = np.std(results['Prec@1'])
    
    return results

In [18]:
dev_results = get_results(pred_scores, dev_y, 'dev')

In [19]:
def write_results(results):
    print('Method: {}'.format(results['Method']))
    print('AvgPrec@1: {} (std = {})'.format(results['AvgPrec@1'], results['StdPrec@1']))
    print('MAP: {} (std = {})'.format(results['MAP'], results['StdAP']))
    print('\n')

In [20]:
write_results(dev_results)

Method: dev
AvgPrec@1: 0.576348155156 (std = 0.494136579504)
MAP: 0.813787448754 (std = 0.209770386056)


