## Contents
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [Loading pretrained word embeddings](#2.-Loading-pretrained-word-embeddings)  
3. [Classification](#3.-Classification)

## 1. Loading preprocessed data

In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm

# Directory prefix of the preprocessed SQuAD JSON files loaded below.
IN_PATH = '../data/squad/'


def load_data(filename):
    """Read a JSON file into a plain dictionary of column lists.

    The file is parsed with ``pandas.read_json`` and the resulting frame is
    converted with ``to_dict(orient='list')``, so every column name maps to
    the list of that column's values.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        dict: ``{column_name: [value, ...]}``.
    """
    return pd.read_json(filename).to_dict(orient='list')

In [2]:
# Load the preprocessed SQuAD v1.1 train and dev splits as plain dicts.
train = load_data(IN_PATH + 'train-v1.1-preprocessed.json')
dev = load_data(IN_PATH + 'dev-v1.1-preprocessed.json')

## 2. Loading pretrained word embeddings

In [3]:
import gensim
import numpy as np

# Location of the binary word2vec vectors read by load_word2vec_format.
WORD2VEC_PATH = '../word_embeddings/GoogleNews-vectors-negative300.bin'

# Cache for the loaded model; get_pretrained_word_embeddings fills it on its
# first call so the file is read only once per kernel session.
word2vec_model = None

In [4]:
def get_pretrained_word_embeddings(sentence, word_emb_dims=300):
    """Represent a tokenised sentence as the mean of its word2vec vectors.

    The word2vec model is loaded from ``WORD2VEC_PATH`` on the first call and
    cached in the module-level ``word2vec_model`` for all later calls.

    Args:
        sentence: Iterable of word tokens.
        word_emb_dims: Length of the returned vector. It is expected to match
            the dimensionality of the loaded vectors, because each word
            vector is added to an accumulator of this length.

    Returns:
        numpy.ndarray: Average of the vectors of the in-vocabulary words, or
        an all-zero vector when no word of ``sentence`` is in the vocabulary.
    """
    global word2vec_model

    if word2vec_model is None:
        print('load word2vec_model')
        word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

    word_embeddings = np.zeros(word_emb_dims)
    word_emb_nr = 0

    for word in sentence:
        # Test membership and index on the KeyedVectors object itself instead
        # of calling `vocab.has_key(word)`: dict.has_key exists only in
        # Python 2, whereas `in` / `[]` work on every interpreter version.
        if word in word2vec_model:
            word_embeddings += word2vec_model[word]
            word_emb_nr += 1

    # A sentence is represented by the average of its word embeddings;
    # out-of-vocabulary words are skipped and do not count in the divisor.
    if word_emb_nr != 0:
        word_embeddings = word_embeddings / word_emb_nr

    return word_embeddings

In [5]:
# Warm-up call: the first invocation loads the word2vec model (see the
# 'load word2vec_model' message); the trailing ';' hides the returned vector.
get_pretrained_word_embeddings(['today', 'is', 'wednesday']);

load word2vec_model


In [6]:
def get_features(dataset, q_a_combination_method):
    """Build one feature vector per (question, candidate sentence) pair.

    For every question, its averaged word embedding is combined with the
    averaged embedding of each sentence of the surrounding paragraph.

    Args:
        dataset: Dict with a ``'data'`` list of articles. Each article has a
            ``'paragraphs'`` list; each paragraph has
            ``'context_sentences_words'`` (a list of token lists) and ``'qas'``
            (a list of questions with ``'question_words'`` and ``'answers'``,
            where every answer carries an ``'answer_label'``).
        q_a_combination_method: How the question and sentence embeddings are
            merged. One of ``'concatenation'``, ``'diff_abs'``, ``'diff_sqr'``,
            ``'sum'``, ``'dot_product'`` (element-wise product, not a scalar),
            ``'min'`` or ``'max'``.

    Returns:
        tuple: ``(train_x, train_y, train_steps, train_labels)`` where
        ``train_x`` is the flat list of feature vectors, ``train_y`` the
        matching booleans (True when the sentence index is an answer label),
        ``train_steps`` the number of candidate sentences per question and
        ``train_labels`` the set of answer labels per question.

    Raises:
        ValueError: If ``q_a_combination_method`` is not a known method.
    """
    combiners = {
        'concatenation': lambda q, s: np.concatenate((q, s)),
        'diff_abs': lambda q, s: np.abs(q - s),
        'diff_sqr': lambda q, s: np.square(q - s),
        'sum': lambda q, s: q + s,
        'dot_product': lambda q, s: q * s,
        'min': lambda q, s: np.minimum(q, s),
        'max': lambda q, s: np.maximum(q, s),
    }
    # Fail fast on a misspelt method name; otherwise every feature vector
    # would silently be an empty list and the error would surface much later.
    if q_a_combination_method not in combiners:
        raise ValueError('unknown q_a_combination_method {!r}; expected one of {}'.format(
            q_a_combination_method, sorted(combiners)))
    combine = combiners[q_a_combination_method]

    train_x = []
    train_y = []
    train_steps = []
    train_labels = []

    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            # The candidate sentences belong to the paragraph, not to a single
            # question, so embed them once and reuse them for every question.
            candidate_sentences_embeddings = [
                get_pretrained_word_embeddings(sentence)
                for sentence in qas_context['context_sentences_words']
            ]

            for qas in qas_context['qas']:
                question_embedding = get_pretrained_word_embeddings(qas['question_words'])

                labels = {d['answer_label'] for d in qas['answers']}
                train_labels.append(labels)
                train_steps.append(len(candidate_sentences_embeddings))

                for i, sentence_embedding in enumerate(candidate_sentences_embeddings):
                    train_x.append(combine(question_embedding, sentence_embedding))
                    # A sentence is a positive example when its index is one
                    # of the question's answer labels.
                    train_y.append(i in labels)

    return (train_x, train_y, train_steps, train_labels)

In [7]:
# Build features with the element-wise absolute difference. The per-question
# step counts are only needed for the dev set, so the train ones are dropped.
(train_x, train_y, _, train_labels) = get_features(train, 'diff_abs')
(dev_x, dev_y, dev_steps, dev_labels) = get_features(dev, 'diff_abs')

100%|██████████| 442/442 [00:29<00:00, 14.81it/s]
100%|██████████| 48/48 [00:03<00:00, 13.02it/s]


In [8]:
# Length of a single feature vector.
print(len(train_x[0]))

300


In [9]:
# Training-set size and class balance. Count the True labels once and derive
# the number of False labels from it, instead of comparing the labels against
# two freshly allocated full-length constant arrays.
n_train_positive = np.count_nonzero(train_y)
print((len(train_x)))
print(n_train_positive)
print(len(train_y) - n_train_positive)

447101
87599
359502


In [10]:
# Dev-set size and class balance. Count the True labels once and derive the
# number of False labels from it, instead of comparing the labels against two
# freshly allocated full-length constant arrays.
n_dev_positive = np.count_nonzero(dev_y)
print((len(dev_x)))
print(n_dev_positive)
print(len(dev_y) - n_dev_positive)

53967
11436
42531


## 3. Classification

In [11]:
import evaluation

from sklearn.linear_model import LogisticRegression

In [12]:
def logistic_regression(train_x, train_y, test_x):
    """Fit a logistic regression on the training pairs and score ``test_x``.

    Args:
        train_x: Sequence of training feature vectors.
        train_y: Sequence of training labels, aligned with ``train_x``.
        test_x: Sequence of feature vectors to score.

    Returns:
        tuple: ``(lr_model, pred)`` where ``lr_model`` is the fitted
        ``LogisticRegression`` and ``pred`` is column 1 of
        ``predict_proba(test_x)``, i.e. the predicted probability of the
        second class for every row of ``test_x``.
    """
    # scikit-learn implements the dual formulation only for the liblinear
    # solver. Name it explicitly rather than relying on the default solver,
    # which newer releases changed to one that rejects dual=True.
    # NOTE(review): with far more samples than features the scikit-learn docs
    # recommend dual=False; kept as-is so the recorded results stay comparable.
    lr_model = LogisticRegression(C=1, dual=True, solver='liblinear')
    lr_model.fit(train_x, train_y)
    pred = lr_model.predict_proba(test_x)[:, 1]

    return (lr_model, pred)

In [13]:
# Train on the train features and score every dev (question, sentence) pair;
# the fitted model itself is not kept.
(_, dev_preds) = logistic_regression(train_x, train_y, dev_x)

In [14]:
# Peek at the first ten predicted probabilities.
dev_preds[0:10]

array([ 0.35104771,  0.18424411,  0.14537631,  0.23334672,  0.38269845,
        0.20189636,  0.17294577,  0.24725637,  0.05252859,  0.05112427])

In [15]:
# Sanity check: container type and shape of the prediction vector.
print(type(dev_preds))
print(dev_preds.shape)

<type 'numpy.ndarray'>
(53967,)


In [16]:
# Sanity check: type of a single prediction.
print(type(dev_preds[0]))

<type 'numpy.float64'>


In [17]:
def extract_scores(y_pred, steps_lengths):
    """Turn a flat score sequence into one ranking of local indices per group.

    ``y_pred`` is read as consecutive groups whose sizes are given by
    ``steps_lengths``. Within each group the positions ``0 .. size - 1`` are
    ordered by descending score; positions with equal scores keep their
    ascending order.

    Args:
        y_pred: Indexable sequence of scores covering all groups back to back.
        steps_lengths: Iterable with the size of each consecutive group.

    Returns:
        list: One list of group-local indices per group, best score first.
    """
    rankings = []
    offset = 0

    for group_size in steps_lengths:
        # Bind the current offset as a default so the key reads this group's
        # slice of y_pred.
        ranked = sorted(range(group_size),
                        key=lambda idx, base=offset: y_pred[base + idx],
                        reverse=True)
        rankings.append(ranked)
        offset += group_size

    return rankings

In [18]:
# Rank the candidate sentences of every dev question by predicted probability.
pred_scores_id = extract_scores(dev_preds, dev_steps)

In [19]:
# Sanity check: there is one ranking and one label set per question, whereas
# dev_y holds one label per (question, sentence) pair.
print(len(pred_scores_id))
print(len(dev_y))
print(len(dev_labels))

10570
53967
10570


In [20]:
# Peek at the rankings of the first five questions.
print(pred_scores_id[0:5])
print(type(pred_scores_id[0:5]))

[[0, 3, 1, 2], [0, 3, 1, 2], [3, 2, 0, 1], [0, 1, 3, 2], [3, 2, 0, 1]]
<type 'list'>


In [21]:
# Peek at the first five pair-level labels.
print(dev_y[0:5])
print(type(dev_y[0:5]))

[False, True, False, False, False]
<type 'list'>


In [22]:
# Peek at the answer-label sets of the first five questions.
print(dev_labels[0:5])
print(type(dev_labels[0:5]))

[set([1]), set([1]), set([2]), set([1]), set([3])]
<type 'list'>


In [23]:
# Score the rankings against the answer-label sets. The result is later read
# by write_results through the keys 'Method', 'AvgPrec@1', 'StdPrec@1', 'MAP'
# and 'StdAP'; 'dev' is the name reported as the method.
dev_results = evaluation.get_results(pred_scores_id, dev_labels, 'dev')

In [24]:
def write_results(results):
    """Print a short, human-readable report of an evaluation result.

    Args:
        results: Mapping providing the keys ``'Method'``, ``'AvgPrec@1'``,
            ``'StdPrec@1'``, ``'MAP'`` and ``'StdAP'``.

    Returns:
        None. Three report lines followed by blank lines go to stdout.
    """
    print('Method: {}'.format(results['Method']))

    avg_prec, std_prec = results['AvgPrec@1'], results['StdPrec@1']
    print('AvgPrec@1: {} (std = {})'.format(avg_prec, std_prec))

    mean_ap, std_ap = results['MAP'], results['StdAP']
    print('MAP: {} (std = {})'.format(mean_ap, std_ap))

    # Visual separator between consecutive reports.
    print('\n')

In [25]:
# Report the dev-set metrics for the 'diff_abs' features built above.
write_results(dev_results)

Method: dev
AvgPrec@1: 0.734720908231 (std = 0.441481704309)
MAP: 0.832476062341 (std = 0.273161211053)




In [26]:
# Drop the feature lists built for the single 'diff_abs' run before the
# comparison below builds its own features for each combination method.
del train_x, train_y, dev_x, dev_y

In [27]:
q_a_combination_method = [
    'concatenation',
    'diff_abs',
    'diff_sqr',
    'sum',
    'dot_product',
    'min',
    'max',
]


def run_LR(q_a_combination_method):
    """Train and evaluate a logistic regression for each combination method.

    For every method name, features are rebuilt for the module-level ``train``
    and ``dev`` data, a model is fitted on the train features, the dev
    sentences are ranked per question and the resulting metrics are printed.

    Args:
        q_a_combination_method: Iterable of method names accepted by
            ``get_features``.

    Returns:
        None. Progress and results are printed.
    """
    for q_a_comb in q_a_combination_method:
        # Feature extraction; only the pieces used below are kept.
        train_x, train_y, _, _ = get_features(train, q_a_comb)
        dev_x, _, dev_steps, dev_labels = get_features(dev, q_a_comb)
        print(q_a_comb, len(train_x[0]))

        # Fit on train, then rank the dev sentences of every question.
        _, dev_preds = logistic_regression(train_x, train_y, dev_x)
        ranked_ids = extract_scores(dev_preds, dev_steps)

        run_name = 'dev [' + q_a_comb + ']'
        write_results(evaluation.get_results(ranked_ids, dev_labels, run_name))

In [28]:
# Compare all question/sentence combination methods on the dev set.
run_LR(q_a_combination_method)

100%|██████████| 442/442 [00:29<00:00, 15.02it/s]
100%|██████████| 48/48 [00:03<00:00, 13.01it/s]


('concatenation', 600)


  0%|          | 0/442 [00:00<?, ?it/s]

Method: dev [concatenation]
AvgPrec@1: 0.326584673605 (std = 0.46896388408)
MAP: 0.557686249614 (std = 0.317339845347)




100%|██████████| 442/442 [00:35<00:00, 12.33it/s]
100%|██████████| 48/48 [00:03<00:00, 13.42it/s]


('diff_abs', 300)


  0%|          | 1/442 [00:00<00:50,  8.75it/s]

Method: dev [diff_abs]
AvgPrec@1: 0.734720908231 (std = 0.441481704309)
MAP: 0.832476062341 (std = 0.273161211053)




100%|██████████| 442/442 [00:29<00:00, 14.93it/s]
100%|██████████| 48/48 [00:03<00:00, 13.21it/s]


('diff_sqr', 300)


  0%|          | 1/442 [00:00<00:49,  8.98it/s]

Method: dev [diff_sqr]
AvgPrec@1: 0.753169347209 (std = 0.431167347597)
MAP: 0.842604529743 (std = 0.268318625504)




100%|██████████| 442/442 [00:28<00:00, 15.71it/s]
100%|██████████| 48/48 [00:03<00:00, 13.76it/s]


('sum', 300)


  0%|          | 1/442 [00:00<00:54,  8.15it/s]

Method: dev [sum]
AvgPrec@1: 0.314191106906 (std = 0.464192907364)
MAP: 0.547317845517 (std = 0.316215470134)




100%|██████████| 442/442 [00:28<00:00, 15.70it/s]
100%|██████████| 48/48 [00:03<00:00, 13.86it/s]


('dot_product', 300)


  0%|          | 1/442 [00:00<00:50,  8.73it/s]

Method: dev [dot_product]
AvgPrec@1: 0.676253547777 (std = 0.467904570288)
MAP: 0.801636720479 (std = 0.282876403657)




100%|██████████| 442/442 [00:28<00:00, 15.43it/s]
100%|██████████| 48/48 [00:03<00:00, 13.38it/s]


('min', 300)


  0%|          | 1/442 [00:00<00:50,  8.70it/s]

Method: dev [min]
AvgPrec@1: 0.721097445601 (std = 0.448459495996)
MAP: 0.826097043133 (std = 0.274701457339)




100%|██████████| 442/442 [00:28<00:00, 15.46it/s]
100%|██████████| 48/48 [00:03<00:00, 13.49it/s]


('max', 300)
Method: dev [max]
AvgPrec@1: 0.720340586566 (std = 0.44883184592)
MAP: 0.825343018522 (std = 0.275648882826)


