## Contents
1. [Loading preprocessed data](#1.-Loading-preprocessed-data)  
2. [Loading pretrained word embeddings](#2.-Loading-pretrained-word-embeddings)  
3. [Classification](#3.-Classification)

## 1. Loading preprocessed data

In [1]:
import json
import pandas as pd
from pprint import pprint
from tqdm import tqdm

# Directory prefix that is prepended to the preprocessed SQuAD file names below.
IN_PATH = '../data/squad/'


def load_data(filename):
    """Read a JSON file with pandas and return it as a plain dictionary.

    Parameters
    ----------
    filename : str
        Location of the JSON file; handed to ``pandas.read_json`` unchanged.

    Returns
    -------
    dict
        Maps every column label of the parsed frame to the list of values
        stored in that column (``DataFrame.to_dict(orient='list')``).
    """
    return pd.read_json(filename).to_dict(orient='list')

In [2]:
# Read the preprocessed train and dev splits (train first, then dev).
train, dev = (
    load_data(IN_PATH + 'train-v1.1-preprocessed.json'),
    load_data(IN_PATH + 'dev-v1.1-preprocessed.json'),
)

## 2. Loading pretrained word embeddings

In [3]:
import gensim
import numpy as np

# Binary word2vec file that gensim reads the first time an embedding is requested.
WORD2VEC_PATH = '../word_embeddings/GoogleNews-vectors-negative300.bin'

# Module-level cache for the loaded model; it stays None until
# get_pretrained_word_embeddings fills it on its first call.
word2vec_model = None

In [4]:
def get_pretrained_word_embeddings(sentence, word_emb_dims=300):
    """Embed a tokenised sentence as the mean of its word2vec vectors.

    The word2vec model is loaded from ``WORD2VEC_PATH`` on the first call and
    cached in the module-level ``word2vec_model`` for all later calls.

    Parameters
    ----------
    sentence : iterable of str
        Tokens of the sentence; tokens unknown to the model are skipped.
    word_emb_dims : int, optional
        Length of the returned vector. It must equal the dimensionality of
        the vectors stored in the loaded model, otherwise the accumulation
        below fails.

    Returns
    -------
    numpy.ndarray
        Average of the vectors of all known tokens, or an all-zero vector
        when no token is known (which includes an empty sentence).
    """
    global word2vec_model

    if word2vec_model is None:
        print('load word2vec_model')
        word2vec_model = gensim.models.KeyedVectors.load_word2vec_format(WORD2VEC_PATH, binary=True)

    word_embeddings = np.zeros(word_emb_dims)
    word_emb_nr = 0

    for word in sentence:
        # Membership test and indexing on the KeyedVectors object work on
        # Python 2 and 3 and across gensim releases, whereas
        # ``vocab.has_key`` exists on Python 2 only and the ``vocab``
        # attribute was removed in gensim 4.
        if word in word2vec_model:
            word_embeddings += word2vec_model[word]
            word_emb_nr += 1

    # a sentence is represented by the average of its word embeddings
    if word_emb_nr != 0:
        word_embeddings = word_embeddings / word_emb_nr

    return word_embeddings

In [5]:
# Warm-up call: triggers the one-time model load; the trailing ';' hides the returned vector.
get_pretrained_word_embeddings(['today', 'is', 'wednesday']);

load word2vec_model


In [6]:
def get_features(dataset, q_a_combination_method):
    """Build one feature vector and label per (question, candidate sentence) pair.

    Parameters
    ----------
    dataset : dict
        ``dataset['data']`` is iterated as articles. Each article provides
        ``'paragraphs'``; each paragraph provides ``'context_sentences_words'``
        (one token list per candidate sentence) and ``'qas'``; each entry of
        ``'qas'`` provides ``'question_words'`` and ``'answers'``, whose items
        carry an ``'answer_label'`` that is matched against the candidate
        sentence index.
    q_a_combination_method : str
        How the question embedding and a sentence embedding are merged:
        ``'concatenation'``, ``'diff_abs'``, ``'diff_sqr'`` or ``'sum'``.

    Returns
    -------
    tuple
        ``(train_x, train_y, train_steps)`` where ``train_x`` is the list of
        feature vectors, ``train_y`` the parallel list of booleans (True when
        the sentence index is one of the question's answer labels) and
        ``train_steps`` the number of candidate sentences of each question,
        in processing order.

    Raises
    ------
    ValueError
        If ``q_a_combination_method`` is not one of the supported names
        (previously this silently produced empty feature vectors).
    """
    combine_by_method = {
        'concatenation': lambda question, sentence: np.concatenate((question, sentence)),
        'diff_abs': lambda question, sentence: np.abs(question - sentence),
        'diff_sqr': lambda question, sentence: np.square(question - sentence),
        'sum': lambda question, sentence: question + sentence,
    }
    if q_a_combination_method not in combine_by_method:
        raise ValueError('unknown q_a_combination_method {!r}; expected one of {}'.format(
            q_a_combination_method, sorted(combine_by_method)))
    combine = combine_by_method[q_a_combination_method]

    train_x = list()
    train_y = list()
    train_steps = list()

    for article in tqdm(dataset['data']):
        for qas_context in article['paragraphs']:
            # The candidate sentences belong to the paragraph, not to a single
            # question, so they are embedded once and reused for every question.
            candidate_sentences_embeddings = [
                get_pretrained_word_embeddings(sentence)
                for sentence in qas_context['context_sentences_words']
            ]

            for qas in qas_context['qas']:
                question_embedding = get_pretrained_word_embeddings(qas['question_words'])
                labels = set(d['answer_label'] for d in qas['answers'])

                train_steps.append(len(candidate_sentences_embeddings))

                for i, sentence_embedding in enumerate(candidate_sentences_embeddings):
                    # Every combiner returns a fresh array, so the shared
                    # sentence embeddings are never aliased in train_x.
                    train_x.append(combine(question_embedding, sentence_embedding))
                    train_y.append(i in labels)

    return (train_x, train_y, train_steps)

In [7]:
# Absolute-difference features for both splits; the per-question candidate
# counts are only kept for dev, where they are needed to rebuild the rankings.
train_x, train_y, _ = get_features(train, 'diff_abs')
dev_x, dev_y, dev_steps = get_features(dev, 'diff_abs')

100%|██████████| 442/442 [00:27<00:00, 15.94it/s]
100%|██████████| 48/48 [00:03<00:00, 14.03it/s]


In [8]:
# Length of a single feature vector.
print(len(train_x[0]))

300


In [9]:
# Number of training samples, then how many of them are positive / negative.
# Labels are booleans, so counting the truthy entries directly avoids building
# two full-length constant arrays just for an element-wise comparison.
print(len(train_x))
print(np.count_nonzero(train_y))
print(len(train_y) - np.count_nonzero(train_y))

447101
87599
359502


In [10]:
# Number of dev samples, then how many of them are positive / negative.
# Labels are booleans, so counting the truthy entries directly avoids building
# two full-length constant arrays just for an element-wise comparison.
print(len(dev_x))
print(np.count_nonzero(dev_y))
print(len(dev_y) - np.count_nonzero(dev_y))

53967
11436
42531


## 3. Classification

In [11]:
from sklearn.linear_model import LogisticRegression

In [12]:
def logistic_regression(train_x, train_y, test_x):
    """Fit a class-balanced logistic regression and score the test samples.

    Parameters
    ----------
    train_x : sequence of feature vectors
        Training samples passed to ``fit``.
    train_y : sequence
        Training labels, parallel to ``train_x``.
    test_x : sequence of feature vectors
        Samples to score with the fitted model.

    Returns
    -------
    tuple
        ``(lr_model, pred)``: the fitted estimator and, for every test sample,
        the probability in column 1 of ``predict_proba`` (the class listed
        second in ``lr_model.classes_``).
    """
    # The dual formulation is only implemented by the liblinear solver, so the
    # solver is named explicitly. scikit-learn releases that default to lbfgs
    # would otherwise reject dual=True; releases that defaulted to liblinear
    # behave exactly as before.
    lr_model = LogisticRegression(C=4, dual=True, class_weight='balanced',
                                  solver='liblinear')
    lr_model.fit(train_x, train_y)
    pred = lr_model.predict_proba(test_x)[:, 1]

    return (lr_model, pred)

In [13]:
# Only the dev-set probabilities are needed here; the fitted model is discarded.
_, dev_preds = logistic_regression(train_x, train_y, dev_x)

In [14]:
# Peek at the first ten predicted probabilities.
dev_preds[0:10]

array([ 0.6701109 ,  0.46673225,  0.42017233,  0.56337556,  0.70080548,
        0.49292632,  0.46756613,  0.57458492,  0.21215984,  0.20779809])

In [15]:
# Sanity check: container type and shape of the prediction vector.
print(type(dev_preds))
print(dev_preds.shape)

<type 'numpy.ndarray'>
(53967,)


In [16]:
# Sanity check: type of a single prediction.
print(type(dev_preds[0]))

<type 'numpy.float64'>


In [17]:
def extract_scores(y_pred, steps_lengths):
    """Turn a flat score sequence into one ranking per consecutive group.

    Parameters
    ----------
    y_pred : indexable sequence of scores
        Scores of all groups laid out back to back.
    steps_lengths : iterable of int
        Size of each group, in the order the groups appear in ``y_pred``.

    Returns
    -------
    list of list of int
        For every group, its local positions ``0 .. size-1`` ordered from the
        highest to the lowest score; equal scores keep ascending position.
    """
    rankings = []
    offset = 0

    for group_size in steps_lengths:
        # Collect this group's scores, then order the local positions by them.
        group_scores = [y_pred[offset + position] for position in range(group_size)]
        ranking = sorted(range(group_size), key=group_scores.__getitem__, reverse=True)
        rankings.append(ranking)

        offset += group_size

    return rankings

In [18]:
# Regroup the flat dev probabilities into one ranking per entry of dev_steps.
pred_scores = extract_scores(dev_preds, dev_steps)

In [19]:
# Number of rankings produced (one per entry of dev_steps).
print(len(pred_scores))

10570


In [20]:
import evaluation

def get_results(pred_scores, y, method):
    """Score every ranking and summarise the scores.

    Parameters
    ----------
    pred_scores : iterable
        One ranking per query; each is handed to the ``evaluation`` helpers.
    y : sequence
        Labels, forwarded unchanged to the ``evaluation`` helpers.
    method : str
        Name stored under ``'Method'`` in the result.

    Returns
    -------
    dict
        ``'Method'``, the per-ranking lists ``'Prec@1'`` and ``'AvgPrec'``,
        the (never filled) lists ``'Prec@5'`` and ``'Prec@10'``, and the
        aggregates ``'MAP'``/``'StdAP'`` (mean and standard deviation of
        ``'AvgPrec'``) and ``'AvgPrec@1'``/``'StdPrec@1'`` (same for
        ``'Prec@1'``).
    """
    # NOTE(review): the same full ``y`` is forwarded for every ranking, while
    # the rankings built by extract_scores hold group-local positions; confirm
    # that the evaluation helpers expect exactly this.
    precision_at_1 = []
    average_precisions = []

    for ranking in pred_scores:
        precision_at_1.append(evaluation.precision_at_k(ranking, y, k=1))
        average_precisions.append(evaluation.average_precision(ranking, y))

    return {
        'Method': method,
        'Prec@1': precision_at_1,
        'Prec@5': [],
        'Prec@10': [],
        'AvgPrec': average_precisions,
        # evaluation (MAP - mean average precision)
        'MAP': np.mean(average_precisions),
        'StdAP': np.std(average_precisions),
        'AvgPrec@1': np.mean(precision_at_1),
        'StdPrec@1': np.std(precision_at_1),
    }

In [21]:
# Score the dev rankings against the dev labels.
dev_results = get_results(pred_scores, dev_y, 'dev')

In [22]:
def write_results(results):
    """Print the method name and the aggregate scores of a results dictionary.

    Parameters
    ----------
    results : dict
        Must provide ``'Method'``, ``'AvgPrec@1'``, ``'StdPrec@1'``, ``'MAP'``
        and ``'StdAP'``.

    Three report lines are written to standard output, followed by blank lines.
    """
    layout = (
        ('Method: {}', ('Method',)),
        ('AvgPrec@1: {} (std = {})', ('AvgPrec@1', 'StdPrec@1')),
        ('MAP: {} (std = {})', ('MAP', 'StdAP')),
    )
    # Values are looked up line by line, right before the line is printed.
    for template, keys in layout:
        print(template.format(*[results[key] for key in keys]))
    print('\n')

In [23]:
# Print the dev-set summary.
write_results(dev_results)

Method: dev
AvgPrec@1: 0.545789971618 (std = 0.497898863726)
MAP: 0.674179773456 (std = 0.258354658703)




In [24]:
# Drop the feature and label lists built above; run_LR rebuilds its own.
del train_x, train_y, dev_x, dev_y

In [25]:
# Names of the question/sentence combination strategies to compare;
# each one is a value that get_features handles.
q_a_combination_method = ['concatenation', 'diff_abs', 'diff_sqr', 'sum']

def run_LR(q_a_combination_method, evaluate_train=False):
    """Train and evaluate a logistic regression for each combination strategy.

    For every strategy the features of the module-level ``train`` and ``dev``
    data sets are extracted, a model is fitted on the train features and the
    dev-set results are printed.

    Parameters
    ----------
    q_a_combination_method : iterable of str
        Strategy names, each forwarded to ``get_features``.
    evaluate_train : bool, optional
        When True, the fitted model is also scored on its own training data
        and those results are printed as well. Defaults to False, which
        reproduces the dev-only behaviour.
    """
    for q_a_comb in q_a_combination_method:
        # feature extraction
        (train_x, train_y, train_steps) = get_features(train, q_a_comb)
        (dev_x, dev_y, dev_steps) = get_features(dev, q_a_comb)
        print(q_a_comb, len(train_x[0]))

        # dev set
        (lr_model, dev_preds) = logistic_regression(train_x, train_y, dev_x)
        dev_pred_scores = extract_scores(dev_preds, dev_steps)

        dev_results = get_results(dev_pred_scores, dev_y, 'dev [' + q_a_comb + ']')
        write_results(dev_results)

        # train set, just for curiosity
        if evaluate_train:
            # Rank by the predicted probability in column 1 (same column that
            # logistic_regression returns for dev), not by the labels.
            train_preds = lr_model.predict_proba(train_x)[:, 1]
            train_pred_scores = extract_scores(train_preds, train_steps)

            train_results = get_results(train_pred_scores, train_y, 'train [' + q_a_comb + ']')
            write_results(train_results)

In [26]:
# Fit and evaluate one model per combination strategy listed above.
run_LR(q_a_combination_method)

100%|██████████| 442/442 [00:27<00:00, 15.97it/s]
100%|██████████| 48/48 [00:03<00:00, 13.92it/s]


('concatenation', 600)


  0%|          | 1/442 [00:00<01:14,  5.95it/s]

Method: dev [concatenation]
AvgPrec@1: 0.54389782403 (std = 0.498069253262)
MAP: 0.671481101023 (std = 0.252198362598)




100%|██████████| 442/442 [00:38<00:00, 11.46it/s]
100%|██████████| 48/48 [00:03<00:00, 14.28it/s]


('diff_abs', 300)


  0%|          | 1/442 [00:00<00:47,  9.38it/s]

Method: dev [diff_abs]
AvgPrec@1: 0.545789971618 (std = 0.497898863726)
MAP: 0.674179773456 (std = 0.258354658703)




100%|██████████| 442/442 [00:26<00:00, 16.53it/s]
100%|██████████| 48/48 [00:03<00:00, 14.55it/s]


('diff_sqr', 300)


  0%|          | 1/442 [00:00<00:57,  7.67it/s]

Method: dev [diff_sqr]
AvgPrec@1: 0.547209082308 (std = 0.497766313191)
MAP: 0.675022769117 (std = 0.25850526639)




100%|██████████| 442/442 [00:27<00:00, 15.87it/s]
100%|██████████| 48/48 [00:03<00:00, 13.94it/s]


('sum', 300)
Method: dev [sum]
AvgPrec@1: 0.53793755913 (std = 0.498558664158)
MAP: 0.670287341404 (std = 0.252211717281)


