## Contents
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [Loading pretrained word embeddings](#2.-Loading-pretrained-word-embeddings)  
3. [Classification](#3.-Classification)

## 1. Loading preprocessed data

In [1]:
import json
import pandas as pd

from input_output import load_data, write_results
from pprint import pprint
from tqdm import tqdm

IN_PATH = '../data/squad/'

In [2]:
train = load_data(IN_PATH + 'train-v1.1-preprocessed.json')
dev = load_data(IN_PATH + 'dev-v1.1-preprocessed.json')

## 2. Loading pretrained word embeddings

In [3]:
import gensim
import numpy as np

WORD2VEC_PATH = '../word_embeddings/GoogleNews-vectors-negative300.bin'

word2vec_model = None

In [4]:
def get_pretrained_word_embeddings(sentence, word_emb_dims=300):
    """Return the average word2vec vector of the known words in `sentence`.

    The word2vec model is loaded from WORD2VEC_PATH on the first call and
    cached in the module-level `word2vec_model`, so later calls reuse it.

    Parameters
    ----------
    sentence : iterable of str
        Tokens to embed. Tokens missing from the model vocabulary are skipped.
    word_emb_dims : int, optional
        Length of the returned vector (default 300). It has to be compatible
        with the vector size of the loaded model, because the model vectors
        are accumulated into an array of this length.

    Returns
    -------
    numpy.ndarray
        Array of length `word_emb_dims` holding the mean of the vectors of
        all tokens found in the vocabulary, or all zeros when none is found.
    """
    global word2vec_model

    if word2vec_model is None:
        print('load word2vec_model')
        word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

    word_embeddings = np.zeros(word_emb_dims)
    word_emb_nr = 0

    for word in sentence:
        # Membership test and indexing on the KeyedVectors object itself work
        # on both Python 2 and 3, unlike the dict-only `vocab.has_key(word)`.
        if word in word2vec_model:
            word_embeddings += word2vec_model[word]
            word_emb_nr += 1

    # A sentence is represented by the average of its word embeddings; with no
    # known word the zero vector is returned unchanged (avoids dividing by 0).
    if word_emb_nr != 0:
        word_embeddings = word_embeddings / word_emb_nr

    return word_embeddings

In [5]:
get_pretrained_word_embeddings(['today', 'is', 'wednesday']);

load word2vec_model


In [6]:
# Supported ways of merging a question embedding `q` and a candidate sentence
# embedding `s` into a single feature vector, keyed by method name.
_Q_A_COMBINERS = {
    'concatenation': lambda q, s: np.concatenate((q, s)),
    'diff_abs': lambda q, s: np.abs(q - s),
    'diff_sqr': lambda q, s: np.square(q - s),
    'sum': lambda q, s: q + s,
    # Element-wise product (not a scalar dot product), as in the original code.
    'dot_product': lambda q, s: q * s,
    'min': np.minimum,
    'max': np.maximum,
}


def get_features(dataset, q_a_combination_method):
    """Build one feature vector per (question, candidate sentence) pair.

    Parameters
    ----------
    dataset : dict
        Preprocessed dataset. The function reads `dataset['data']`, each
        article's `'paragraphs'`, each paragraph's `'qas'` and
        `'context_sentences_lemmas_without_stopwords'`, and each question's
        `'question_lemmas_without_stopwords'` and `'answers'` (whose items
        provide `'answer_label'`).
    q_a_combination_method : str
        Name of the combination to apply; one of 'concatenation', 'diff_abs',
        'diff_sqr', 'sum', 'dot_product', 'min', 'max'.

    Returns
    -------
    tuple
        `(x, y, steps, labels)` where `x` is the list of feature vectors,
        `y` the list of booleans telling whether the sentence index is one of
        the question's answer labels, `steps` the number of candidate
        sentences per question, and `labels` the set of answer labels per
        question.

    Raises
    ------
    ValueError
        If `q_a_combination_method` is not a supported name. Previously an
        unknown name silently produced empty feature vectors.
    """
    if q_a_combination_method not in _Q_A_COMBINERS:
        raise ValueError('unknown q_a_combination_method %r; expected one of: %s'
                         % (q_a_combination_method, ', '.join(sorted(_Q_A_COMBINERS))))
    combine = _Q_A_COMBINERS[q_a_combination_method]

    features = list()
    targets = list()
    steps = list()
    answer_labels = list()

    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            # The sentence embeddings depend only on the paragraph, so they
            # are computed once here instead of once per question.
            candidate_sentences_embeddings = [
                get_pretrained_word_embeddings(sentence)
                for sentence in qas_context['context_sentences_lemmas_without_stopwords']
            ]

            for qas in qas_context['qas']:
                question_words = qas['question_lemmas_without_stopwords']
                question_embedding = get_pretrained_word_embeddings(question_words)

                labels = {answer['answer_label'] for answer in qas['answers']}
                answer_labels.append(labels)

                steps.append(len(candidate_sentences_embeddings))

                for idx, sentence_embedding in enumerate(candidate_sentences_embeddings):
                    features.append(combine(question_embedding, sentence_embedding))
                    # Positive example when this sentence index is an answer label.
                    targets.append(idx in labels)

    return (features, targets, steps, answer_labels)

In [7]:
(train_x, train_y, _, train_labels) = get_features(train, 'diff_abs')
(dev_x, dev_y, dev_steps, dev_labels) = get_features(dev, 'diff_abs')

100%|██████████| 442/442 [00:19<00:00, 22.19it/s]
100%|██████████| 48/48 [00:02<00:00, 19.10it/s]


In [8]:
print(len(train_x[0]))

300


In [9]:
# Number of (question, sentence) pairs, then how many of them are positive
# and negative. train_y holds booleans, so counting the non-zero entries
# gives the positives without building throw-away comparison arrays.
print(len(train_x))
print(np.count_nonzero(train_y))
print(len(train_y) - np.count_nonzero(train_y))

447101
87599
359502


In [10]:
# Number of (question, sentence) pairs, then how many of them are positive
# and negative. dev_y holds booleans, so counting the non-zero entries
# gives the positives without building throw-away comparison arrays.
print(len(dev_x))
print(np.count_nonzero(dev_y))
print(len(dev_y) - np.count_nonzero(dev_y))

53967
11436
42531


## 3. Classification

In [11]:
import evaluation

from sklearn.linear_model import LogisticRegression

In [12]:
def logistic_regression(train_x, train_y, test_x):
    """Fit a logistic regression on the training data and score the test data.

    Parameters
    ----------
    train_x : sequence of feature vectors
        Training features, passed to `LogisticRegression.fit`.
    train_y : sequence
        Training targets, one per row of `train_x`.
    test_x : sequence of feature vectors
        Rows to score with the fitted model.

    Returns
    -------
    tuple
        `(lr_model, pred)`: the fitted model and, for each row of `test_x`,
        the second column of `predict_proba` (probability of the model's
        second class).
    """
    # The dual formulation is only implemented by the liblinear solver, so
    # the solver is pinned explicitly instead of relying on the library
    # default, which is no longer liblinear in recent scikit-learn releases.
    # NOTE(review): scikit-learn recommends dual=False when
    # n_samples > n_features; dual=True is kept to leave the reported scores
    # untouched.
    lr_model = LogisticRegression(C=1, dual=True, solver='liblinear')
    lr_model.fit(train_x, train_y)
    pred = lr_model.predict_proba(test_x)[:, 1]

    return (lr_model, pred)

In [13]:
(_, dev_preds) = logistic_regression(train_x, train_y, dev_x)

In [14]:
dev_preds[0:10]

array([ 0.38079386,  0.26921253,  0.14783836,  0.20823277,  0.41239707,
        0.27310828,  0.15625174,  0.20678496,  0.15200572,  0.19978923])

In [15]:
print(type(dev_preds))
print(dev_preds.shape)

<type 'numpy.ndarray'>
(53967,)


In [16]:
print(type(dev_preds[0]))

<type 'numpy.float64'>


In [17]:
def extract_scores(y_pred, steps_lengths):
    """Turn a flat score sequence into one ranking per group.

    `y_pred` stores the scores of consecutive groups back to back and
    `steps_lengths` gives the size of each group, in the same order. For
    every group the local positions `0..size-1` are returned ordered by
    descending score; equal scores keep ascending position order.

    Returns a list with one list of positions per entry of `steps_lengths`.
    """
    rankings = []
    offset = 0

    for group_size in steps_lengths:
        # Scores of the current group, addressed by local position.
        group_scores = [y_pred[offset + pos] for pos in range(group_size)]
        ranking = sorted(range(group_size),
                         key=group_scores.__getitem__,
                         reverse=True)
        rankings.append(ranking)
        offset += group_size

    return rankings

In [18]:
pred_scores_id = extract_scores(dev_preds, dev_steps)

In [19]:
print(len(pred_scores_id))
print(len(dev_y))
print(len(dev_labels))

10570
53967
10570


In [20]:
print(pred_scores_id[0:5])
print(type(pred_scores_id[0:5]))

[[0, 1, 3, 2], [0, 1, 3, 2], [3, 1, 0, 2], [0, 1, 3, 2], [3, 2, 1, 0]]
<type 'list'>


In [21]:
print(dev_y[0:5])
print(type(dev_y[0:5]))

[False, True, False, False, False]
<type 'list'>


In [22]:
print(dev_labels[0:5])
print(type(dev_labels[0:5]))

[set([1]), set([1]), set([2]), set([1]), set([3])]
<type 'list'>


In [23]:
dev_results = evaluation.get_results(pred_scores_id, dev_labels, 'dev')

In [24]:
write_results(dev_results)

Method: dev
AvgPrec@1: 0.752696310312 (std = 0.431444752842)
MAP: 0.844472507558 (std = 0.264608727603)




In [25]:
del train_x
del train_y
del dev_x
del dev_y

In [26]:
q_a_combination_method = [#'concatenation', 
                          'diff_abs', 'diff_sqr', 
                          'sum', 'dot_product', 'min', 'max']

def run_LR(q_a_combination_method):
    """Train and evaluate a logistic regression for each combination method.

    For every name in `q_a_combination_method`, features are extracted from
    the module-level `train` and `dev` datasets, a model is fit on the train
    features, and the ranking results for dev and then train are handed to
    `write_results`.
    """
    for method in q_a_combination_method:
        # Feature extraction for both splits with the current method.
        x_train, y_train, steps_train, labels_train = get_features(train, method)
        x_dev, y_dev, steps_dev, labels_dev = get_features(dev, method)
        print(method, len(x_train[0]))

        # Dev split: fit on train, rank the dev candidates and report.
        model, scores_dev = logistic_regression(x_train, y_train, x_dev)
        ranking_dev = extract_scores(scores_dev, steps_dev)
        results_dev = evaluation.get_results(ranking_dev, labels_dev,
                                             'dev [' + method + ']')
        write_results(results_dev)

        # Train split: same report, included out of curiosity.
        scores_train = model.predict_proba(x_train)[:, 1]
        ranking_train = extract_scores(scores_train, steps_train)
        results_train = evaluation.get_results(ranking_train, labels_train,
                                               'train [' + method + ']')
        write_results(results_train)

In [27]:
run_LR(q_a_combination_method)

100%|██████████| 442/442 [00:19<00:00, 23.16it/s]
100%|██████████| 48/48 [00:02<00:00, 20.70it/s]


('diff_abs', 300)
Method: dev [diff_abs]
AvgPrec@1: 0.752696310312 (std = 0.431444752842)
MAP: 0.844472507558 (std = 0.264608727603)




  0%|          | 0/442 [00:00<?, ?it/s]

Method: train [diff_abs]
AvgPrec@1: 0.730282309159 (std = 0.443813089136)
MAP: 0.835169204597 (std = 0.27891289984)




100%|██████████| 442/442 [00:19<00:00, 22.80it/s]
100%|██████████| 48/48 [00:02<00:00, 20.26it/s]


('diff_sqr', 300)
Method: dev [diff_sqr]
AvgPrec@1: 0.752696310312 (std = 0.431444752842)
MAP: 0.844228057735 (std = 0.264833626076)




  0%|          | 0/442 [00:00<?, ?it/s]

Method: train [diff_sqr]
AvgPrec@1: 0.727211497848 (std = 0.44539301212)
MAP: 0.833423469318 (std = 0.279731636706)




100%|██████████| 442/442 [00:18<00:00, 23.40it/s]
100%|██████████| 48/48 [00:02<00:00, 19.83it/s]


('sum', 300)
Method: dev [sum]
AvgPrec@1: 0.294701986755 (std = 0.45590868138)
MAP: 0.535056349173 (std = 0.312012221238)




  0%|          | 0/442 [00:00<?, ?it/s]

Method: train [sum]
AvgPrec@1: 0.280242925148 (std = 0.449117833149)
MAP: 0.524755196253 (std = 0.316029472782)




100%|██████████| 442/442 [00:18<00:00, 23.84it/s]
100%|██████████| 48/48 [00:02<00:00, 20.70it/s]


('dot_product', 300)
Method: dev [dot_product]
AvgPrec@1: 0.677388836329 (std = 0.467475347741)
MAP: 0.802098772533 (std = 0.282605987679)




  0%|          | 0/442 [00:00<?, ?it/s]

Method: train [dot_product]
AvgPrec@1: 0.655852235756 (std = 0.475089550095)
MAP: 0.793331664404 (std = 0.293780571785)




100%|██████████| 442/442 [00:19<00:00, 22.63it/s]
100%|██████████| 48/48 [00:02<00:00, 17.95it/s]


('min', 300)
Method: dev [min]
AvgPrec@1: 0.739262062441 (std = 0.439037202839)
MAP: 0.836777015164 (std = 0.269044572425)




  0%|          | 0/442 [00:00<?, ?it/s]

Method: train [min]
AvgPrec@1: 0.714357469834 (std = 0.451719907826)
MAP: 0.825597981509 (std = 0.283662527447)




100%|██████████| 442/442 [00:21<00:00, 20.35it/s]
100%|██████████| 48/48 [00:02<00:00, 16.71it/s]


('max', 300)
Method: dev [max]
AvgPrec@1: 0.739924314096 (std = 0.438675647267)
MAP: 0.836958444919 (std = 0.269134960561)


Method: train [max]
AvgPrec@1: 0.713284398224 (std = 0.452227559393)
MAP: 0.825082022088 (std = 0.283788186065)


